In [2]:
import pandas as pd
import numpy as np
import pickle
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report

# Load dataset
df = pd.read_csv('./data/Student Depression Dataset.csv')

# Select input features and target column
features = [
    'Gender', 'Age', 'Academic Pressure', 'CGPA', 'Sleep Duration', 'Dietary Habits', 
    'Degree', 'Have you ever had suicidal thoughts ?', 'Work/Study Hours', 'Family History of Mental Illness', 'Financial Stress'
]
target = 'Depression'

# Define categorical and numerical features
categorical_features = ['Gender', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']
numerical_features = ['Age', 'Academic Pressure', 'CGPA', 'Work/Study Hours', 'Financial Stress']

# Preprocessing pipelines
categorical_pipeline = Pipeline([
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])
numeric_pipeline = Pipeline([
    ('scaler', MinMaxScaler())
])

preprocessor = ColumnTransformer([
    ('cat', categorical_pipeline, categorical_features),
    ('num', numeric_pipeline, numerical_features)
])

# Split data
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define model and pipeline
xgb_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(eval_metric='logloss', use_label_encoder=False))
])

# Perform cross-validation
cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=5, scoring='f1_macro')
print(f'Cross-validation F1 scores: {cv_scores}')
print(f'Mean F1 score: {np.mean(cv_scores)}')

# Train model
xgb_model.fit(X_train, y_train)

# Evaluate model
y_pred = xgb_model.predict(X_test)
print(classification_report(y_test, y_pred))

# Save the model as a pickle file
with open('model.pkl', 'wb') as file:
    pickle.dump(xgb_model, file)

print("Model saved as model.pkl")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Cross-validation F1 scores: [0.83374693 0.82687125 0.82569937 0.82787022 0.83030827]
Mean F1 score: 0.8288992064342693
              precision    recall  f1-score   support

           0       0.80      0.77      0.79      2343
           1       0.84      0.87      0.85      3238

    accuracy                           0.82      5581
   macro avg       0.82      0.82      0.82      5581
weighted avg       0.82      0.82      0.82      5581

Model saved as model.pkl
