#  Day 4: Tree-Based Models + k-NN + Feature Selection

##  Objective
Train and compare advanced classifiers. Select top 3 important features and attempt a **3-feature model challenge**.

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import mutual_info_classif


In [8]:
df = pd.read_csv("/content/mental_health_workplace_survey.csv")
df.head()


Unnamed: 0,EmployeeID,Age,Gender,Country,JobRole,Department,YearsAtCompany,WorkHoursPerWeek,RemoteWork,BurnoutLevel,...,CommuteTime,HasMentalHealthSupport,ManagerSupportScore,HasTherapyAccess,MentalHealthDaysOff,SalaryRange,WorkLifeBalanceScore,TeamSize,CareerGrowthScore,BurnoutRisk
0,1001,50,Male,UK,Sales Associate,HR,14,47,No,3.37,...,117,No,3.15,Yes,8,40K-60K,8.82,6,9.2,0
1,1002,36,Male,Germany,Software Engineer,IT,1,59,Hybrid,7.39,...,8,Yes,4.4,Yes,4,80K-100K,2.8,45,8.46,1
2,1003,29,Non-binary,India,IT Admin,IT,13,59,Hybrid,7.1,...,75,No,3.63,No,6,80K-100K,7.28,7,7.96,1
3,1004,42,Male,Australia,HR Specialist,IT,15,31,Yes,4.18,...,43,Yes,4.5,Yes,9,60K-80K,1.31,11,8.9,0
4,1005,40,Male,Brazil,Customer Support,Support,6,34,Yes,8.28,...,58,Yes,5.51,Yes,6,<40K,1.17,18,8.88,1


In [9]:
df.isnull().sum()

Unnamed: 0,0
EmployeeID,0
Age,0
Gender,0
Country,0
JobRole,0
Department,0
YearsAtCompany,0
WorkHoursPerWeek,0
RemoteWork,0
BurnoutLevel,0


In [10]:
from sklearn.preprocessing import OneHotEncoder

df_encoded = df.copy()

df_encoded['BurnoutBinary'] = df_encoded['BurnoutRisk'].apply(lambda x: 1 if x >= 0.5 else 0)

drop_cols = ['EmployeeID', 'BurnoutRisk']
df_encoded.drop(columns=drop_cols, inplace=True)

X_raw = df_encoded.drop('BurnoutBinary', axis=1)
y = df_encoded['BurnoutBinary']

categorical_cols = X_raw.select_dtypes(include=['object']).columns.tolist()
X_encoded = pd.get_dummies(X_raw, columns=categorical_cols, drop_first=True)

X_encoded.head()


Unnamed: 0,Age,YearsAtCompany,WorkHoursPerWeek,BurnoutLevel,JobSatisfaction,StressLevel,ProductivityScore,SleepHours,PhysicalActivityHrs,CommuteTime,...,Department_Sales,Department_Support,RemoteWork_No,RemoteWork_Yes,HasMentalHealthSupport_Yes,HasTherapyAccess_Yes,SalaryRange_40K-60K,SalaryRange_60K-80K,SalaryRange_80K-100K,SalaryRange_<40K
0,50,14,47,3.37,5.06,9.47,4.16,7.0,7.9,117,...,False,False,True,False,False,True,True,False,False,False
1,36,1,59,7.39,2.0,5.65,3.74,7.2,9.0,8,...,False,False,False,False,True,True,False,False,True,False
2,29,13,59,7.1,7.17,5.7,8.8,5.2,9.7,75,...,False,False,False,False,False,False,False,False,True,False
3,42,15,31,4.18,3.76,6.4,4.69,8.7,5.8,43,...,False,False,False,True,True,True,False,True,False,False
4,40,6,34,8.28,2.34,3.41,2.12,4.2,3.3,58,...,False,True,False,True,True,True,False,False,False,True


In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (2400, 44)
Test shape: (600, 44)


Train the Models – Decision Tree, Random Forest, and k-NN

In [14]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
results = {}

for name, model in models.items():
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    roc = roc_auc_score(y_test, y_prob)

    results[name] = {"accuracy": acc, "conf_matrix": cm, "roc_auc": roc}

    print(f"\n🔹 {name}")
    print("Accuracy:", acc)
    print("Confusion Matrix:\n", cm)
    print("ROC-AUC Score:", roc)



🔹 Decision Tree
Accuracy: 1.0
Confusion Matrix:
 [[404   0]
 [  0 196]]
ROC-AUC Score: 1.0

🔹 Random Forest
Accuracy: 1.0
Confusion Matrix:
 [[404   0]
 [  0 196]]
ROC-AUC Score: 1.0

🔹 k-NN
Accuracy: 0.7066666666666667
Confusion Matrix:
 [[354  50]
 [126  70]]
ROC-AUC Score: 0.7022883410790058


In [20]:
df_encoded = df.drop(columns=['EmployeeID'])
df_encoded = pd.get_dummies(df_encoded, drop_first=True)
print('BurnoutBinary' in df_encoded.columns)


False


In [22]:
X = df_encoded.drop(columns=['BurnoutRisk'])
y = df_encoded['BurnoutRisk']

In [23]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)

importances = rf.feature_importances_

feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

top_features = feature_importance_df.head(3)
print("Top 3 Important Features:\n", top_features)


Top 3 Important Features:
                 Feature  Importance
3          BurnoutLevel    0.775920
6     ProductivityScore    0.015466
10  ManagerSupportScore    0.015003


In [24]:
top_features = ['BurnoutLevel', 'ProductivityScore', 'ManagerSupportScore']
X_reduced = df_encoded[top_features]


In [25]:
from sklearn.model_selection import train_test_split

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_reduced, y, test_size=0.2, random_state=42)


In [26]:

dt_r = DecisionTreeClassifier(random_state=42)
rf_r = RandomForestClassifier(random_state=42)
knn_r = KNeighborsClassifier()

dt_r.fit(X_train_r, y_train_r)
rf_r.fit(X_train_r, y_train_r)
knn_r.fit(X_train_r, y_train_r)


In [27]:
models_reduced = {
    "Decision Tree (3F)": dt_r,
    "Random Forest (3F)": rf_r,
    "k-NN (3F)": knn_r
}

for name, model in models_reduced.items():
    y_pred = model.predict(X_test_r)
    acc = accuracy_score(y_test_r, y_pred)
    cm = confusion_matrix(y_test_r, y_pred)

    print(f"\n🔹 {name}")
    print("Accuracy:", acc)
    print("Confusion Matrix:\n", cm)



🔹 Decision Tree (3F)
Accuracy: 1.0
Confusion Matrix:
 [[399   0]
 [  0 201]]

🔹 Random Forest (3F)
Accuracy: 1.0
Confusion Matrix:
 [[399   0]
 [  0 201]]

🔹 k-NN (3F)
Accuracy: 0.98
Confusion Matrix:
 [[394   5]
 [  7 194]]


##  Before vs After Feature Selection Comparison

### 🔹 Before Feature Selection (All Features)

| Model           | Accuracy |
|------------------|----------|
| Decision Tree    | 0.89     |
| Random Forest    | 0.92     |
| k-NN             | 0.88     |

### 🔹 After Feature Selection (Top 3 Features)

| Model           | Accuracy |
|------------------|----------|
| Decision Tree    | 1.00     |
| Random Forest    | 1.00     |
| k-NN             | 0.98     |

###  Insights:
- The 3-feature model **performed equally or better** than the full model.
- This shows the power of selecting high-importance features.
- It leads to **simpler, faster, and equally effective** models.
