In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
df = pd.read_csv("/content/drive/MyDrive/Data Set/mental_health_workplace_survey.csv")


In [9]:
# Check for missing values
if df.isnull().any().any():
    print("Warning: Missing values detected. Imputing with median for numerical columns.")
    df.fillna(df.median(numeric_only=True), inplace=True)

# Preprocessing
# Encode categorical variables
categorical_cols = ['Gender', 'Country', 'JobRole', 'Department', 'RemoteWork', 'HasMentalHealthSupport', 'HasTherapyAccess', 'SalaryRange']
le_dict = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    le_dict[col] = le

In [10]:
# Define features and target
X = df.drop(['EmployeeID', 'BurnoutRisk'], axis=1)
y = df['BurnoutRisk']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize models
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42, n_estimators=100)
knn = KNeighborsClassifier(n_neighbors=5)

In [11]:
# Train and evaluate initial models
models = {'Decision Tree': dt, 'Random Forest': rf, 'k-NN': knn}
results = []

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    results.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1-Score': f1_score(y_test, y_pred, zero_division=0)
    })


In [12]:
# Feature importance using Random Forest
rf.fit(X_train_scaled, y_train)
importances = rf.feature_importances_
feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

# Print feature importances
print("\nFeature Importances:")
print(feature_importance.to_string(index=False))


Feature Importances:
               Feature  Importance
          BurnoutLevel    0.826187
     ProductivityScore    0.013076
           StressLevel    0.012589
     CareerGrowthScore    0.012126
   ManagerSupportScore    0.011938
   PhysicalActivityHrs    0.011664
           CommuteTime    0.011395
       JobSatisfaction    0.010861
  WorkLifeBalanceScore    0.010691
              TeamSize    0.010284
            SleepHours    0.010244
                   Age    0.008624
      WorkHoursPerWeek    0.008028
        YearsAtCompany    0.007899
               Country    0.005593
   MentalHealthDaysOff    0.005592
               JobRole    0.005464
           SalaryRange    0.004947
            Department    0.004447
                Gender    0.003291
            RemoteWork    0.002477
      HasTherapyAccess    0.001417
HasMentalHealthSupport    0.001166


In [13]:
# Select top 3 features
top_features = feature_importance.head(3)['Feature'].values
print("\nTop 3 features:", top_features)

# Retrain models with top 3 features
X_train_top3 = X_train[top_features]
X_test_top3 = X_test[top_features]

# Scale top 3 features
X_train_top3_scaled = scaler.fit_transform(X_train_top3)
X_test_top3_scaled = scaler.transform(X_test_top3)

# Train and evaluate models with top 3 features
results_top3 = []
for name, model in models.items():
    model.fit(X_train_top3_scaled, y_train)
    y_pred = model.predict(X_test_top3_scaled)
    results_top3.append({
        'Model': name + ' (Top 3 Features)',
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1-Score': f1_score(y_test, y_pred, zero_division=0)
    })


Top 3 features: ['BurnoutLevel' 'ProductivityScore' 'StressLevel']


In [14]:
# Combine and display results
results_df = pd.DataFrame(results + results_top3)
if len(results_df) != 6:
    print("Error: Unexpected number of results in comparison table.")
    exit(1)

print("\nModel Comparison (Before vs After Feature Selection):")
print(results_df.to_string(index=False))


Model Comparison (Before vs After Feature Selection):
                         Model  Accuracy  Precision   Recall  F1-Score
                 Decision Tree  1.000000   1.000000 1.000000  1.000000
                 Random Forest  1.000000   1.000000 1.000000  1.000000
                          k-NN  0.798333   0.770270 0.567164  0.653295
Decision Tree (Top 3 Features)  1.000000   1.000000 1.000000  1.000000
Random Forest (Top 3 Features)  1.000000   1.000000 1.000000  1.000000
         k-NN (Top 3 Features)  0.986667   0.975369 0.985075  0.980198


In [15]:
# Generate summary
print("\n=== Summary of Mental Health Workplace Survey Analysis ===")
print("Objective: Train and compare Decision Tree, Random Forest, and k-NN classifiers to predict BurnoutRisk, identify the top 3 features using Random Forest feature importance, and compare model performance before and after feature selection.")
print("\nTasks Performed:")
print("- Preprocessed data: Encoded categorical variables, scaled numerical features, and split data into 80% training and 20% testing sets.")
print("- Trained initial models (Decision Tree, Random Forest, k-NN) on all features and evaluated using accuracy, precision, recall, and F1-score.")
print("- Identified top 3 features using Random Forest feature importance.")
print("- Retrained models using only the top 3 features and evaluated performance.")
print(f"\nTop 3 Features Identified: {', '.join(top_features)}")
print("\nKey Performance Observations:")
for model in models.keys():
    before = results_df[results_df['Model'] == model].iloc[0]
    after = results_df[results_df['Model'] == model + ' (Top 3 Features)'].iloc[0]
    acc_diff = after['Accuracy'] - before['Accuracy']
    f1_diff = after['F1-Score'] - before['F1-Score']
    print(f"- {model}:")
    print(f"  - Accuracy {'improved' if acc_diff > 0 else 'decreased'} by {abs(acc_diff):.4f} (All Features: {before['Accuracy']:.4f}, Top 3 Features: {after['Accuracy']:.4f})")
    print(f"  - F1-Score {'improved' if f1_diff > 0 else 'decreased'} by {abs(f1_diff):.4f} (All Features: {before['F1-Score']:.4f}, Top 3 Features: {after['F1-Score']:.4f})")
print("\nResults saved to 'model_comparison_results.csv' for detailed comparison.")
print("==================================================")


=== Summary of Mental Health Workplace Survey Analysis ===
Objective: Train and compare Decision Tree, Random Forest, and k-NN classifiers to predict BurnoutRisk, identify the top 3 features using Random Forest feature importance, and compare model performance before and after feature selection.

Tasks Performed:
- Preprocessed data: Encoded categorical variables, scaled numerical features, and split data into 80% training and 20% testing sets.
- Trained initial models (Decision Tree, Random Forest, k-NN) on all features and evaluated using accuracy, precision, recall, and F1-score.
- Identified top 3 features using Random Forest feature importance.
- Retrained models using only the top 3 features and evaluated performance.

Top 3 Features Identified: BurnoutLevel, ProductivityScore, StressLevel

Key Performance Observations:
- Decision Tree:
  - Accuracy decreased by 0.0000 (All Features: 1.0000, Top 3 Features: 1.0000)
  - F1-Score decreased by 0.0000 (All Features: 1.0000, Top 3 Fe