# <p style="padding:10px;background-color:#87CEEB ;margin:10;color:#000000;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 10px 10px ;overflow:hidden;font-weight:50">Model Training on Personality Dataset</p>

In [74]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from joblib import dump, load
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

# <p style="padding:10px;background-color:#87CEEB ;margin:10;color:#000000;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 10px 10px ;overflow:hidden;font-weight:50">Read the Dataset</p>

In [75]:
# Location of the raw CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine until it is adjusted.
csv_path = r'/Users/user/ML_course/4th_month/wrapped/notebook/data/personality_dataset.csv'
df = pd.read_csv(csv_path)

# Print the dtype / non-null summary, then show the first rows as the cell output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2900 entries, 0 to 2899
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Time_spent_Alone           2837 non-null   float64
 1   Stage_fear                 2827 non-null   object 
 2   Social_event_attendance    2838 non-null   float64
 3   Going_outside              2834 non-null   float64
 4   Drained_after_socializing  2848 non-null   object 
 5   Friends_circle_size        2823 non-null   float64
 6   Post_frequency             2835 non-null   float64
 7   Personality                2900 non-null   object 
dtypes: float64(5), object(3)
memory usage: 181.4+ KB


Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,No,4.0,6.0,No,13.0,5.0,Extrovert
1,9.0,Yes,0.0,0.0,Yes,0.0,3.0,Introvert
2,9.0,Yes,1.0,2.0,Yes,5.0,2.0,Introvert
3,0.0,No,6.0,7.0,No,14.0,8.0,Extrovert
4,3.0,No,9.0,4.0,No,8.0,5.0,Extrovert


# <p style="padding:10px;background-color:#87CEEB ;margin:10;color:#000000;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 10px 10px ;overflow:hidden;font-weight:50">Drop the unnecessary columns and duplicated rows</p>

In [76]:
# Rebind `df` to a cleaned copy instead of mutating it in place, so this cell
# can be re-run safely: the original `inplace=True` drop raised a KeyError on
# a second run because 'Post_frequency' was already gone.
df = (
    df
    .drop(columns=['Post_frequency'], errors='ignore')  # no-op if already removed
    .drop_duplicates()  # rows that are identical across the remaining columns
)

# <p style="padding:10px;background-color:#87CEEB ;margin:10;color:#000000;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 10px 10px ;overflow:hidden;font-weight:50">Splitting the data into input (x) and output (y) features</p>

In [77]:
# Encode the target labels as integers: Extrovert -> 0, Introvert -> 1.
# Series.map yields NaN for any value that is not a key of the mapping, so
# NOTE(review): running this cell a second time (when the column already holds
# 0/1) would turn every label into NaN.
personality_codes = {'Extrovert': 0, 'Introvert': 1}
df['Personality'] = df['Personality'].map(personality_codes)

In [78]:
# Separate the target column from the predictors.
y = df['Personality']
x = df.drop(columns=['Personality'])

# Group the feature names by dtype: numeric (int64/float64) vs. object columns.
num_cols = list(x.select_dtypes(include=['int64', 'float64']).columns)
cat_cols = list(x.select_dtypes(include=['object']).columns)


# <p style="padding:10px;background-color:#87CEEB ;margin:10;color:#000000;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 10px 10px ;overflow:hidden;font-weight:50">Creating Data Transformation Pipeline</p>

In [79]:
# Numeric branch: fill missing values with the column mean, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode; categories not seen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
])

# <p style="padding:10px;background-color:#87CEEB ;margin:10;color:#000000;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 10px 10px ;overflow:hidden;font-weight:50">Splitting the data into training and testing datasets</p>

In [80]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified on y - consider `stratify=y` if the
# classes turn out to be imbalanced.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# <p style="padding:10px;background-color:#87CEEB ;margin:10;color:#000000;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 10px 10px ;overflow:hidden;font-weight:50">Baseline Models training</p>

In [81]:
# Baseline candidates, each trained inside a full preprocessing + model pipeline.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Support Vector Machine": SVC(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    # `use_label_encoder` is a deprecated XGBoost argument and the target is
    # already encoded as 0/1, so it is omitted here (matching the tuning cell).
    "XGBoost": XGBClassifier(eval_metric='logloss'),
}

# name -> {"model": estimator class name, "accuracy": test-set accuracy}
results = {}

for name, clf in models.items():
    # Chain the shared preprocessor with the current estimator; fitting the
    # pipeline re-fits the preprocessor on the training split each time.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', clf),
    ])
    pipeline.fit(x_train, y_train)

    # Score on the held-out split.
    y_pred = pipeline.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)

    model_type = clf.__class__.__name__
    print(f"Model {name}: {model_type}")
    print(f"Accuracy: {accuracy:.4f}")
    print("-" * 50)

    results[name] = {
        "model": model_type,
        "accuracy": accuracy,
    }



Model Random Forest: RandomForestClassifier
Accuracy: 0.8648
--------------------------------------------------
Model Logistic Regression: LogisticRegression
Accuracy: 0.8789
--------------------------------------------------
Model Support Vector Machine: SVC
Accuracy: 0.9155
--------------------------------------------------
Model Decision Tree: DecisionTreeClassifier
Accuracy: 0.8056
--------------------------------------------------
Model K-Nearest Neighbors: KNeighborsClassifier
Accuracy: 0.9014
--------------------------------------------------
Model XGBoost: XGBClassifier
Accuracy: 0.8732
--------------------------------------------------


# <p style="padding:10px;background-color:#87CEEB ;margin:10;color:#000000;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 10px 10px ;overflow:hidden;font-weight:50">Model Evaluation and hyperparameter tuning</p>

In [82]:
def evaluate_model(x_train, y_train, x_test, y_test, models: dict, param: dict):
    """Grid-search every candidate model and report its test-set accuracy.

    The notebook-level ``preprocessor`` is fitted on ``x_train`` (re-fitting that
    shared object as a side effect) and then applied to both splits. Each model
    is tuned with a 3-fold ``GridSearchCV`` on the transformed training data, and
    the best estimator of each search is scored with ``accuracy_score`` on the
    transformed test data.

    Args:
        x_train: Training features, as accepted by ``preprocessor.fit_transform``.
        y_train: Training labels.
        x_test: Test features, as accepted by ``preprocessor.transform``.
        y_test: Test labels.
        models: Mapping of display name -> unfitted estimator.
        param: Mapping of display name -> parameter grid. A model without an
            entry is searched with an empty grid and a warning is printed.

    Returns:
        A tuple ``(report, best_model_obj, best_model_name)`` where ``report``
        maps each model name to its test accuracy, and the other two describe
        the model with the highest test accuracy (the first one wins ties).
        When ``models`` is empty the result is ``({}, None, None)``.

    Raises:
        Exception: Any failure is re-raised wrapped in a generic ``Exception``
            with the original error attached as ``__cause__``.

    NOTE(review): the winner is chosen on test accuracy, so the reported score
    of the winner is optimistically biased. The returned estimator also expects
    features that have already gone through ``preprocessor``.
    """
    try:
        report = {}
        best_model_obj = None
        best_model_name = None
        # Start below every possible accuracy so that a model is always
        # selected, even when all candidates score exactly 0.
        best_score = float("-inf")

        # Fit the preprocessing on the training split only, then reuse it.
        x_train1 = preprocessor.fit_transform(x_train)
        x_test1 = preprocessor.transform(x_test)

        for model_name, model in models.items():
            param_grid = param.get(model_name, {})
            if not param_grid:
                print(f"⚠️ Warning: No param grid found for {model_name}, using defaults.")

            gs = GridSearchCV(model, param_grid, cv=3, n_jobs=-1)
            gs.fit(x_train1, y_train)
            best_model = gs.best_estimator_

            y_test_pred = best_model.predict(x_test1)
            test_score = accuracy_score(y_test, y_test_pred)
            report[model_name] = test_score

            # Strict comparison: on ties the earlier model is kept.
            if test_score > best_score:
                best_score = test_score
                best_model_obj = best_model
                best_model_name = model_name

        return report, best_model_obj, best_model_name

    except Exception as e:
        # Chain explicitly so the original traceback stays attached.
        raise Exception(f"Error in evaluate_model: {str(e)}") from e


In [83]:
# One seed for every estimator that accepts `random_state`, so the tuned scores
# are reproducible across kernel restarts (the baseline cell seeds its Random
# Forest and Decision Tree with 42 as well).
RANDOM_STATE = 42

# Minimum test accuracy the best model must exceed before it is written to disk.
ACCURACY_THRESHOLD = 0.6

# Candidate estimators; their hyperparameters come from the grids below.
models = {
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "Logistic Regression": LogisticRegression(random_state=RANDOM_STATE),
    "Support Vector Machine": SVC(),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=RANDOM_STATE),
}

# Parameter grids, keyed by the same display names as `models`.
params = {
    "Random Forest": {
        "n_estimators": [50, 100, 200],
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    },
    "Logistic Regression": {
        "C": [0.01, 0.1, 1, 10, 100],
        "solver": ["liblinear", "lbfgs"],
        "penalty": ["l2"],
        "max_iter": [100, 200],
    },
    "Support Vector Machine": {
        "C": [0.1, 1, 10],
        "kernel": ["linear", "rbf", "poly"],
        "gamma": ["scale", "auto"],
    },
    "Decision Tree": {
        "criterion": ["gini", "entropy", "log_loss"],
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    },
    "K-Nearest Neighbors": {
        "n_neighbors": [3, 5, 7, 9],
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan"],
    },
    "XGBoost": {
        "learning_rate": [0.01, 0.05, 0.1],
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 7],
        "subsample": [0.7, 0.8, 1.0],
    },
}

# Tune every candidate and collect its test accuracy.
model_report, best_model, best_model_name = evaluate_model(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    models=models,
    param=params,
)

best_model_accuracy = model_report[best_model_name]
print(f"✅ Best model after tuning: {best_model_name} with accuracy: {best_model_accuracy:.4f}")

if best_model_accuracy > ACCURACY_THRESHOLD:
    print("Model accuracy is above threshold. Saving the model...")
    # NOTE(review): only the bare estimator is persisted. It was trained on
    # features transformed by `preprocessor`, so the fitted preprocessor must
    # be available separately wherever this file is loaded.
    dump(best_model, 'trained_model.joblib')

    print("Model saved at: as a 'trained_model.joblib'")
else:
    print("Model accuracy below threshold. Model not saved.")
print(model_report)



✅ Best model after tuning: Random Forest with accuracy: 0.9155
Model accuracy is above threshold. Saving the model...
Model saved at: as a 'trained_model.joblib'
{'Random Forest': 0.9154929577464789, 'Logistic Regression': 0.9154929577464789, 'Support Vector Machine': 0.9154929577464789, 'Decision Tree': 0.9154929577464789, 'K-Nearest Neighbors': 0.9126760563380282, 'XGBoost': 0.9154929577464789}


In [84]:
from tabulate import tabulate

# Rank the entries of `results` from highest to lowest accuracy.
sorted_results = sorted(
    results.items(),
    key=lambda entry: entry[1]["accuracy"],
    reverse=True,
)

# One row per model: display name, estimator class name, accuracy to 4 decimals.
table_rows = [
    [name, info["model"], f"{info['accuracy']:.4f}"]
    for name, info in sorted_results
]

# Render the ranking as a grid-style text table.
column_titles = ["Model Name", "Model Type", "Accuracy"]
print("\n🔍 Model Comparison Summary:")
print(tabulate(table_rows, headers=column_titles, tablefmt="grid"))



🔍 Model Comparison Summary:
+------------------------+------------------------+------------+
| Model Name             | Model Type             |   Accuracy |
| Support Vector Machine | SVC                    |     0.9155 |
+------------------------+------------------------+------------+
| K-Nearest Neighbors    | KNeighborsClassifier   |     0.9014 |
+------------------------+------------------------+------------+
| Logistic Regression    | LogisticRegression     |     0.8789 |
+------------------------+------------------------+------------+
| XGBoost                | XGBClassifier          |     0.8732 |
+------------------------+------------------------+------------+
| Random Forest          | RandomForestClassifier |     0.8648 |
+------------------------+------------------------+------------+
| Decision Tree          | DecisionTreeClassifier |     0.8056 |
+------------------------+------------------------+------------+
