In [2]:
from google.colab import drive

# Mount Google Drive inside the Colab filesystem. Every dataset path used
# further down in this notebook lives under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
import pandas as pd

# Location of the raw dataset on the mounted Google Drive.
RAW_DATA_PATH = '/content/drive/MyDrive/ColabNotebooks/ML/mental_health_data.csv'

# Load the raw dataset into `df` and preview the first rows.
#
# A missing file still gets the friendly hint, but the exception is re-raised
# in every case: swallowing it would leave `df` undefined, and the following
# cells would then fail with a NameError that hides the real cause.
try:
    df = pd.read_csv(RAW_DATA_PATH)
except FileNotFoundError:
    print("Error: Dataset not found. Please check the path to your file.")
    raise

display(df.head())

Unnamed: 0,User_ID,Age,Gender,Occupation,Country,Mental_Health_Condition,Severity,Consultation_History,Stress_Level,Sleep_Hours,Work_Hours,Physical_Activity_Hours,Social_Media_Usage,Diet_Quality,Smoking_Habit,Alcohol_Consumption,Medication_Usage
0,1,36,Male,Education,Australia,Yes,,Yes,Low,7.6,46,8,2.2,Healthy,Regular Smoker,Regular Drinker,Yes
1,2,48,Male,Engineering,Other,No,Low,No,Low,6.8,74,2,3.4,Unhealthy,Heavy Smoker,Social Drinker,No
2,3,18,Prefer not to say,Sales,India,No,,Yes,Medium,7.1,77,9,5.9,Healthy,Heavy Smoker,Social Drinker,No
3,4,30,Non-binary,Engineering,Australia,No,Medium,No,Low,6.9,57,4,5.4,Average,Regular Smoker,Regular Drinker,No
4,5,58,Male,IT,USA,Yes,,Yes,High,4.7,45,10,3.3,Unhealthy,Regular Smoker,Non-Drinker,Yes


In [7]:
# Data cleaning: operates on the `df` loaded by the previous cell and writes
# the cleaned frame back to Google Drive.

# ----------------------------
# Step 1: Drop useless column
# ----------------------------
# "Unnamed: 0" looks like a leftover row index (0, 1, 2, ... in the preview).
# It is removed *before* de-duplication: while a per-row counter is present,
# no two rows can compare equal and drop_duplicates() cannot remove anything.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# ----------------------------
# Step 2: Drop duplicates
# ----------------------------
# NOTE(review): User_ID is presumably unique per row as well, so exact-row
# duplicates may still never match. If duplicate respondents are a concern,
# consider drop_duplicates(subset=<all columns except User_ID>) - confirm
# with the data owner before changing the row count.
df = df.drop_duplicates()

# ----------------------------
# Step 3: Handle missing values
# ----------------------------
# Option A: Fill missing values in 'Severity' with "Unknown"
df["Severity"] = df["Severity"].fillna("Unknown")

# ----------------------------
# Save cleaned dataset in Google Drive path
# ----------------------------
save_path = "/content/drive/MyDrive/ColabNotebooks/ML/ML_dataset_cleaned_without_one_hot_encoding.csv"
df.to_csv(save_path, index=False)
# Report the file name that was actually written (derived from save_path so
# the message cannot drift away from the real destination again).
print(f"✅ Dataset cleaned and saved as {save_path.rsplit('/', 1)[-1]}")

✅ Dataset cleaned and saved as ML_dataset_cleaned.csv


In [11]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# ----------------------------
# Load and Clean the Dataset
# ----------------------------
# Load the cleaned (not yet encoded) dataset written by the data-cleaning cell.
CLEANED_DATA_PATH = "/content/drive/MyDrive/ColabNotebooks/ML/ML_dataset_cleaned_without_one_hot_encoding.csv"
try:
    df = pd.read_csv(CLEANED_DATA_PATH)
except FileNotFoundError:
    # Stop here instead of substituting made-up rows: this cell ends by
    # overwriting features_processed.csv and target_processed.csv, so
    # continuing with placeholder data would silently replace the real
    # processed files with a toy frame that later cells would train on.
    print("Please make sure 'ML_dataset_cleaned_without_one_hot_encoding.csv' is in the correct path.")
    raise

# Basic cleaning steps, repeated defensively so this cell also works when it
# is pointed at a file that has not been through the cleaning cell.
df = df.drop_duplicates()
df = df.drop(columns=["Unnamed: 0"], errors="ignore")
df["Severity"] = df["Severity"].fillna("Unknown")


# ----------------------------
# Step 4: Separate Target from Features
# ----------------------------
# The 'Severity' column is what we want to predict (our target, y)
y = df['Severity']

# The rest of the data, excluding identifiers like User_ID, are our features (X)
X = df.drop(columns=['Severity', 'User_ID'], errors='ignore')


# ----------------------------
# Step 5: Encode Target and Features Separately
# ----------------------------
# A. Encode the target variable 'y' using LabelEncoder.
# This converts labels like 'Low', 'Medium' into integer codes in a single
# column; the codes follow the sorted order of the label strings.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# B. One-hot encode the categorical features in 'X'. drop_first=True drops one
# level per categorical column so the dummy columns are not perfectly collinear.
X_encoded = pd.get_dummies(X, drop_first=True)


# --- Display Results to Verify ---
print("--- Preview of the Correctly Encoded Features (X) ---")
print(X_encoded.head())

print("\n--- Preview of the Correctly Encoded Target (y) ---")
print("Original labels:", y.head().values)
print("Encoded labels: ", y_encoded[:5]) # Show first 5 encoded numbers

# Show the mapping from text to number
print("\n--- Target Label Mapping ---")
for i, class_name in enumerate(label_encoder.classes_):
    print(f"'{class_name}'  ->  {i}")


# ----------------------------
# Save the Processed Data
# ----------------------------
# Features and target are saved separately so later cells can reload them
# without repeating the encoding.
X_encoded.to_csv("/content/drive/MyDrive/ColabNotebooks/ML/features_processed.csv", index=False)
pd.Series(y_encoded, name="target").to_csv("/content/drive/MyDrive/ColabNotebooks/ML/target_processed.csv", index=False)

print("\n✅ Processed features and target saved to 'features_processed.csv' and 'target_processed.csv'")

--- Preview of the Correctly Encoded Features (X) ---
   Age  Sleep_Hours  Work_Hours  Physical_Activity_Hours  Social_Media_Usage  \
0   36          7.6          46                        8                 2.2   
1   48          6.8          74                        2                 3.4   
2   18          7.1          77                        9                 5.9   
3   30          6.9          57                        4                 5.4   
4   58          4.7          45                       10                 3.3   

   Gender_Male  Gender_Non-binary  Gender_Prefer not to say  \
0         True              False                     False   
1         True              False                     False   
2        False              False                      True   
3        False               True                     False   
4         True              False                     False   

   Occupation_Engineering  Occupation_Finance  ...  Stress_Level_Medium  \
0          

In [14]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import xgboost as xgb

# --- 1. LOAD YOUR PROCESSED DATA ---
# Reload the files written by the encoding cell so this cell also works in a
# fresh session.
try:
    X_processed = pd.read_csv("/content/drive/MyDrive/ColabNotebooks/ML/features_processed.csv")
    y_processed = pd.read_csv("/content/drive/MyDrive/ColabNotebooks/ML/target_processed.csv").squeeze("columns") # .squeeze converts it to a Series
except FileNotFoundError:
    print("Processed files not found. Please ensure they are in the correct directory.")
    # Re-raise instead of calling exit(): inside a notebook exit() ends the
    # kernel session rather than reporting an ordinary, readable cell error.
    raise

# --- 2. SCALE THE FEATURES ---
# Initialize the scaler
scaler = StandardScaler()

# Scaled copy of the full feature matrix. It is kept for inspection and for
# later cells that read `X_scaled`; the cross-validation below does NOT use
# it (see the pipeline note further down).
X_scaled = scaler.fit_transform(X_processed)

print("--- Features Scaled ---")
print("The data type of X_scaled is:", type(X_scaled))
print("Shape of X_scaled:", X_scaled.shape)


# --- 3. XGBOOST WITH 5-FOLD CROSS-VALIDATION ---
# --- INITIALIZE THE XGBOOST CLASSIFIER AND K-FOLD ---
# We determine the number of classes directly from our target variable
num_classes = y_processed.nunique()

# Initialize the XGBoost model for multi-class classification
model_xgb = xgb.XGBClassifier(
    objective='multi:softmax',  # Specifies the learning task
    num_class=num_classes,      # Number of unique classes to predict
    random_state=42             # for reproducibility (scikit-learn API name for the seed)
)

# Set up the 5-fold cross-validation strategy
# StratifiedKFold is best for classification to preserve the percentage of samples for each class.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Scaling is wrapped together with the model so that, inside every fold, the
# scaler is fitted on that fold's training rows only. Scaling the whole matrix
# up front would let each validation fold influence the statistics used to
# transform its own training data.
cv_pipeline = make_pipeline(StandardScaler(), model_xgb)


# --- RUN THE CROSS-VALIDATION ---
print("🚀 Running 5-Fold Cross-Validation with XGBoost...")
scores = cross_val_score(
    cv_pipeline,
    X_processed,
    y_processed,
    cv=kfold,
    scoring='accuracy' # You can change this to other metrics like 'f1_macro'
)

# --- PRINT THE RESULTS ---
print("\n--- Cross-Validation Results ---")
print(f"Fold Accuracies: {np.round(scores, 4)}")
print(f"Average Accuracy (Mean): {scores.mean():.4f}")
print(f"Accuracy Standard Deviation: {scores.std():.4f}")

--- Features Scaled ---
The data type of X_scaled is: <class 'numpy.ndarray'>
Shape of X_scaled: (50000, 33)
🚀 Running 5-Fold Cross-Validation with XGBoost...

--- Cross-Validation Results ---
Fold Accuracies: [0.4883 0.4869 0.4861 0.4847 0.4867]
Average Accuracy (Mean): 0.4865
Accuracy Standard Deviation: 0.0012


In [15]:
# Train and evaluate the final model on a held-out test set
# ---------------------------------------------------------
# Relies on `X_processed`, `y_processed` and `num_classes` from the
# cross-validation cell above.
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder

# --- SPLIT DATA ---
# Split the *unscaled* features first, then fit the scaler on the training
# rows only, so no statistic of the test set leaks into preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_processed, y_processed, test_size=0.2, random_state=42, stratify=y_processed
)

split_scaler = StandardScaler()
X_train = split_scaler.fit_transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

# --- TRAIN THE FINAL XGBOOST MODEL ---
print("\n💪 Training the final XGBoost model on the full training set...")
final_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=num_classes,
    random_state=42  # scikit-learn API name for the seed
)
final_model.fit(X_train, y_train)
print("✅ Training complete.")

# --- MAKE PREDICTIONS AND EVALUATE ---
print("\n🔍 Evaluating the model on the unseen test set...")
y_pred = final_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nTest Set Accuracy: {accuracy:.4f}")

# Display a detailed classification report.
# The integer targets were produced by a LabelEncoder in an earlier cell; that
# encoder is rebuilt here only to recover readable class names. LabelEncoder
# stores its classes in sorted order, so this list must contain exactly the
# label strings that were present when the target was encoded.
le = LabelEncoder()
le.fit(['High', 'Low', 'Medium', 'Unknown'])


print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred, target_names=le.classes_))


💪 Training the final XGBoost model on the full training set...
✅ Training complete.

🔍 Evaluating the model on the unseen test set...

Test Set Accuracy: 0.4836

--- Classification Report ---
              precision    recall  f1-score   support

        High       0.15      0.01      0.02      1660
         Low       0.12      0.01      0.02      1652
      Medium       0.15      0.01      0.02      1687
     Unknown       0.50      0.96      0.66      5001

    accuracy                           0.48     10000
   macro avg       0.23      0.25      0.18     10000
weighted avg       0.32      0.48      0.34     10000



In [19]:
# The model is facing a classic machine-learning problem: class imbalance
# (the last class has about three times as many rows as each other class).
# Next step: handle the class imbalance with SMOTE.
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from imblearn.over_sampling import SMOTE

# --- 1. LOAD AND PREPARE DATA ---
X_processed = pd.read_csv("/content/drive/MyDrive/ColabNotebooks/ML/features_processed.csv")
y_processed = pd.read_csv("/content/drive/MyDrive/ColabNotebooks/ML/target_processed.csv").squeeze("columns")

# Split the unscaled data into training and testing sets first ...
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_processed, y_processed, test_size=0.2, random_state=42, stratify=y_processed
)

# ... then fit the scaler on the training rows only, so the test set does not
# contribute to the mean/variance used for preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


# --- 2. APPLY SMOTE TO THE TRAINING DATA ---
# Only the training split is resampled; the test split keeps its original
# class distribution so the evaluation stays representative.
print("Class distribution before SMOTE:", pd.Series(y_train).value_counts().sort_index())

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("\nClass distribution after SMOTE:", pd.Series(y_train_resampled).value_counts().sort_index())


# --- 3. TRAIN XGBOOST ON THE NEW, BALANCED DATA ---
print("\n💪 Training XGBoost model on the resampled data...")
num_classes = y_processed.nunique()

balanced_model = xgb.XGBClassifier(
    # 'multi:softprob' is the objective that yields per-class probabilities,
    # which predict_proba / ROC-AUC below need; predict() still returns the
    # most probable class.
    objective='multi:softprob',
    num_class=num_classes,
    random_state=42  # scikit-learn API name for the seed
)

balanced_model.fit(X_train_resampled, y_train_resampled)
print("✅ Training complete.")


# --- 4. EVALUATE THE NEW BALANCED MODEL ---
print("\n🔍 Evaluating the new model on the original, unseen test set...")
y_pred_balanced = balanced_model.predict(X_test)
y_pred_proba_balanced = balanced_model.predict_proba(X_test) # class probabilities for AUC

# --- Calculate Metrics ---
accuracy = accuracy_score(y_test, y_pred_balanced)

# Macro-averaged one-vs-rest AUC over all classes
macro_auc = roc_auc_score(
    y_test,
    y_pred_proba_balanced,
    multi_class='ovr', # One-vs-Rest strategy
    average='macro'
)

print(f"\nTest Set Accuracy (after SMOTE): {accuracy:.4f}")
print(f"Test Set Macro AUC (after SMOTE): {macro_auc:.4f}")

# Rebuild a LabelEncoder only to recover readable class names. LabelEncoder
# stores classes in sorted order, so this list must contain exactly the label
# strings that were present when target_processed.csv was produced.
le = LabelEncoder()
le.fit(['High', 'Low', 'Medium', 'Unknown'])

print("\n--- Classification Report (after SMOTE) ---")
print(classification_report(y_test, y_pred_balanced, target_names=le.classes_))


Class distribution before SMOTE: target
0     6641
1     6609
2     6749
3    20001
Name: count, dtype: int64

Class distribution after SMOTE: target
0    20001
1    20001
2    20001
3    20001
Name: count, dtype: int64

💪 Training XGBoost model on the resampled data...
✅ Training complete.

🔍 Evaluating the new model on the original, unseen test set...

Test Set Accuracy (after SMOTE): 0.4902
Test Set Macro AUC (after SMOTE): 0.5007

--- Classification Report (after SMOTE) ---
              precision    recall  f1-score   support

        High       0.14      0.01      0.01      1660
         Low       0.15      0.01      0.02      1652
      Medium       0.17      0.01      0.02      1687
     Unknown       0.50      0.97      0.66      5001

    accuracy                           0.49     10000
   macro avg       0.24      0.25      0.18     10000
weighted avg       0.33      0.49      0.34     10000



In [20]:
!pip install tabulate



In [21]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, f1_score
from imblearn.over_sampling import SMOTE
from tabulate import tabulate

# --- 1. LOAD AND PREPARE DATA ---
X_processed = pd.read_csv("/content/drive/MyDrive/ColabNotebooks/ML/features_processed.csv")
y_processed = pd.read_csv("/content/drive/MyDrive/ColabNotebooks/ML/target_processed.csv").squeeze("columns")

# Split the unscaled data first, then fit the scaler on the training rows
# only, so the test set does not contribute to the preprocessing statistics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_processed, y_processed, test_size=0.2, random_state=42, stratify=y_processed
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# --- 2. APPLY SMOTE ---
# Only the training split is resampled; the test split is left untouched.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# --- 3. TRAIN XGBOOST ---
print("💪 Training XGBoost model on the resampled data...")
balanced_model = xgb.XGBClassifier(
    # 'multi:softprob' is the objective that yields per-class probabilities,
    # which predict_proba / ROC-AUC below need; predict() still returns the
    # most probable class.
    objective='multi:softprob',
    num_class=y_processed.nunique(),
    random_state=42  # scikit-learn API name for the seed
)
balanced_model.fit(X_train_resampled, y_train_resampled)
print("✅ Training complete.")

# --- 4. EVALUATE THE MODEL ---
print("\n🔍 Evaluating the model...")
y_pred = balanced_model.predict(X_test)
y_pred_proba = balanced_model.predict_proba(X_test)

# --- Calculate All Metrics ---
accuracy = accuracy_score(y_test, y_pred)
macro_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='macro')
macro_f1 = f1_score(y_test, y_pred, average='macro')

# --- 5. DISPLAY RESULTS IN A TABLE ---
# One row per metric, each formatted as a percentage with two decimals.
table_data = [
    ["Accuracy", f"{accuracy * 100:.2f} %"],
    ["Macro AUC", f"{macro_auc * 100:.2f} %"],
    ["Macro F1-Score", f"{macro_f1 * 100:.2f} %"]
]

# The model name doubles as the header of the first column.
headers = ["XGBoost", ""]

# Print the formatted table
print("\n" + "="*40)
print("          MODEL EVALUATION SUMMARY")
print("="*40)
print(tabulate(table_data, headers=headers, tablefmt="grid"))
print("="*40 + "\n")


# --- (Optional) Print the detailed classification report ---
# Rebuild a LabelEncoder only to recover readable class names. LabelEncoder
# stores classes in sorted order, so this list must contain exactly the label
# strings that were present when target_processed.csv was produced.
le = LabelEncoder()
le.fit(['High', 'Low', 'Medium', 'Unknown'])
print("\n--- Detailed Classification Report ---")
print(classification_report(y_test, y_pred, target_names=le.classes_))

💪 Training XGBoost model on the resampled data...
✅ Training complete.

🔍 Evaluating the model...

          MODEL EVALUATION SUMMARY
+----------------+---------+
| XGBoost        |         |
| Accuracy       | 49.02 % |
+----------------+---------+
| Macro AUC      | 50.07 % |
+----------------+---------+
| Macro F1-Score | 17.88 % |
+----------------+---------+


--- Detailed Classification Report ---
              precision    recall  f1-score   support

        High       0.14      0.01      0.01      1660
         Low       0.15      0.01      0.02      1652
      Medium       0.17      0.01      0.02      1687
     Unknown       0.50      0.97      0.66      5001

    accuracy                           0.49     10000
   macro avg       0.24      0.25      0.18     10000
weighted avg       0.33      0.49      0.34     10000



In [22]:
# Updated pipeline: remove the "Unknown" class and retrain on three classes

import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, f1_score
from imblearn.over_sampling import SMOTE
from tabulate import tabulate

# --- 1. LOAD AND PREPARE DATA ---
# Start from the cleaned (not yet encoded) file
df = pd.read_csv("/content/drive/MyDrive/ColabNotebooks/ML/ML_dataset_cleaned_without_one_hot_encoding.csv")

# --- REMOVE THE 'UNKNOWN' CLASS ---
# Rows whose Severity was missing (filled with "Unknown" during cleaning) are
# dropped so the model only learns the real severity levels.
print(f"Original dataset shape: {df.shape}")
df = df[df['Severity'] != 'Unknown']
print(f"Dataset shape after removing 'Unknown' class: {df.shape}\n")

# --- 2. SEPARATE AND ENCODE ---
# Separate features (X) and target (y)
y = df['Severity']
X = df.drop(columns=['Severity', 'User_ID'], errors='ignore')

# Encode the remaining target classes as integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# One-hot encode the features
X_encoded = pd.get_dummies(X, drop_first=True)


# --- 3. SPLIT AND SCALE ---
# Split the unscaled features first ...
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_encoded, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# ... then fit the scaler on the training rows only, so the test set does not
# contribute to the mean/variance used for preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix transformed with the train-fitted scaler. Not used below; kept
# only so that any later cell reading `X_scaled` continues to work.
X_scaled = scaler.transform(X_encoded)


# --- 4. APPLY SMOTE ---
# SMOTE is still useful as the 3 classes might not be perfectly balanced.
# Only the training split is resampled.
print("Class distribution before SMOTE:", pd.Series(y_train).value_counts().sort_index())
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print("Class distribution after SMOTE:", pd.Series(y_train_resampled).value_counts().sort_index())


# --- 5. TRAIN FINAL XGBOOST MODEL ---
print("\n💪 Training XGBoost model on the filtered, balanced data...")
# The number of classes comes from the encoder fitted on the filtered target
final_model = xgb.XGBClassifier(
    # 'multi:softprob' is the objective that yields per-class probabilities,
    # which predict_proba / ROC-AUC below need; predict() still returns the
    # most probable class.
    objective='multi:softprob',
    num_class=len(label_encoder.classes_),
    random_state=42  # scikit-learn API name for the seed
)
final_model.fit(X_train_resampled, y_train_resampled)
print("✅ Training complete.")


# --- 6. EVALUATE THE NEW MODEL ---
print("\n🔍 Evaluating the new 3-class model...")
y_pred = final_model.predict(X_test)
y_pred_proba = final_model.predict_proba(X_test)

# --- Calculate Metrics ---
accuracy = accuracy_score(y_test, y_pred)
macro_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='macro')
macro_f1 = f1_score(y_test, y_pred, average='macro')

# --- Display Results Table ---
table_data = [
    ["Accuracy", f"{accuracy * 100:.2f} %"],
    ["Macro AUC", f"{macro_auc * 100:.2f} %"],
    ["Macro F1-Score", f"{macro_f1 * 100:.2f} %"]
]
headers = ["XGBoost (3-Class)", ""]
print("\n" + tabulate(table_data, headers=headers, tablefmt="grid") + "\n")

# --- Display Detailed Report ---
# Class names come straight from the encoder fitted above, so they always
# match the integer codes used in y_test / y_pred.
print("\n--- Detailed Classification Report (3-Class Model) ---")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

Original dataset shape: (50000, 17)
Dataset shape after removing 'Unknown' class: (24998, 17)

Class distribution before SMOTE: 0    6641
1    6609
2    6748
Name: count, dtype: int64
Class distribution after SMOTE: 0    6748
1    6748
2    6748
Name: count, dtype: int64

💪 Training XGBoost model on the filtered, balanced data...
✅ Training complete.

🔍 Evaluating the new 3-class model...

+---------------------+---------+
| XGBoost (3-Class)   |         |
| Accuracy            | 32.56 % |
+---------------------+---------+
| Macro AUC           | 49.59 % |
+---------------------+---------+
| Macro F1-Score      | 32.54 % |
+---------------------+---------+


--- Detailed Classification Report (3-Class Model) ---
              precision    recall  f1-score   support

        High       0.32      0.31      0.31      1660
         Low       0.32      0.32      0.32      1652
      Medium       0.34      0.35      0.34      1688

    accuracy                           0.33      5000
   mac