In [15]:
# 📦 Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from google.colab import drive, files

# 📂 Step 1: Mount Google Drive
drive.mount('/content/drive')

# 📥 Step 2: Load Dataset
file_path = '/content/drive/MyDrive/Final_dataset_cleanedf.csv'
df = pd.read_csv(file_path)

# 🧹 Step 3: Remove negative engagement duration
df = df[df['engagement_duration_days'] >= 0]

# 🧾 Step 4: Encode categorical variables
df = pd.get_dummies(df, columns=['region', 'imd_band', 'final_result', 'disability', 'gender'], drop_first=True)

# ✂️ Step 5: Remove extreme outliers before scaling
numeric_features = ['avg_clicks_per_week', 'studied_credits', 'engagement_duration_days', 'total_failed_or_missed']
for col in numeric_features:
    q_low = df[col].quantile(0.01)
    q_high = df[col].quantile(0.99)
    df = df[(df[col] >= q_low) & (df[col] <= q_high)]

# 🔄 Step 6: Standardize numeric features
scaler = StandardScaler()
df[numeric_features] = scaler.fit_transform(df[numeric_features])

# 🎯 Step 7: Separate features and target
X = df.drop('burnout', axis=1)
y = df['burnout']

# 🧪 Step 8: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ⚖️ Step 9: Handle imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# ✅ Step 10: Final check and save
print("Final class balance:\n", pd.Series(y_train_resampled).value_counts(normalize=True))

# 🧽 Step 11: Clean column names (for XGBoost compatibility)
cleaned_columns = X_train_resampled.columns.str.replace(r'[\[\]<>]', '', regex=True).str.replace(' ', '_')
X_train_resampled.columns = cleaned_columns
X_test.columns = cleaned_columns  # Ensure test set matches

# 🚀 Logistic Regression
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train_resampled, y_train_resampled)
y_pred_logreg = logreg.predict(X_test)
print("\n🔍 Logistic Regression Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_logreg) * 100:.2f}%")
print(classification_report(y_test, y_pred_logreg, target_names=['Not at risk (0)', 'At risk (1)']))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_logreg))

# 🚀 Decision Tree
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train_resampled, y_train_resampled)
y_pred_dtree = dtree.predict(X_test)
print("\n🔍 Decision Tree Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_dtree) * 100:.2f}%")
print(classification_report(y_test, y_pred_dtree, target_names=['Not at risk (0)', 'At risk (1)']))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dtree))

# 🚀 Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_resampled, y_train_resampled)
y_pred_rf = rf.predict(X_test)
print("\n🔍 Random Forest Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf) * 100:.2f}%")
print(classification_report(y_test, y_pred_rf, target_names=['Not at risk (0)', 'At risk (1)']))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

# 🚀 XGBoost
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train_resampled, y_train_resampled)
y_pred_xgb = xgb.predict(X_test)
print("\n🔍 XGBoost Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb) * 100:.2f}%")
print(classification_report(y_test, y_pred_xgb, target_names=['Not at risk (0)', 'At risk (1)']))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Final class balance:
 burnout
1    0.5
0    0.5
Name: proportion, dtype: float64


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



🔍 Logistic Regression Evaluation:
Accuracy: 93.06%
                 precision    recall  f1-score   support

Not at risk (0)       0.92      0.91      0.92      2347
    At risk (1)       0.94      0.94      0.94      3118

       accuracy                           0.93      5465
      macro avg       0.93      0.93      0.93      5465
   weighted avg       0.93      0.93      0.93      5465

Confusion Matrix:
 [[2143  204]
 [ 175 2943]]

🔍 Decision Tree Evaluation:
Accuracy: 95.33%
                 precision    recall  f1-score   support

Not at risk (0)       0.94      0.95      0.95      2347
    At risk (1)       0.96      0.96      0.96      3118

       accuracy                           0.95      5465
      macro avg       0.95      0.95      0.95      5465
   weighted avg       0.95      0.95      0.95      5465

Confusion Matrix:
 [[2231  116]
 [ 139 2979]]

🔍 Random Forest Evaluation:
Accuracy: 96.47%
                 precision    recall  f1-score   support

Not at risk (0) 

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
