In [1]:
#Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize, OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, recall_score, make_scorer, confusion_matrix, ConfusionMatrixDisplay, f1_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
import statsmodels.api as sm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
import pickle

## Data Understanding

In [3]:
df = pd.read_csv('../data/students_mental_health_survey.csv')

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# Only 27 rows contain a null value, so every row with any missing field is discarded.
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Renumber the rows 0..n-1 after dropping nulls.
# drop=True discards the old index instead of inserting it as an extra 'index'
# column, which also keeps this cell safe to re-run.
df.reset_index(drop=True, inplace=True)

In [None]:
df

In [None]:
df.info()

In [None]:
df['Sleep_Quality'].value_counts()

In [None]:
df['Physical_Activity'].value_counts()

In [None]:
df['Diet_Quality'].value_counts()

In [None]:
df['Social_Support'].value_counts()

In [None]:
df['Relationship_Status'].value_counts()

In [None]:
df['Substance_Use'].value_counts()

In [None]:
df['Counseling_Service_Use'].value_counts()

In [None]:
df['Family_History'].value_counts()

In [None]:
df['Chronic_Illness'].value_counts()

In [None]:
df['Financial_Stress'].value_counts()

In [None]:
df['Extracurricular_Involvement'].value_counts()

In [None]:
df['Semester_Credit_Load'].value_counts()

In [None]:
df['Age'].value_counts()

In [None]:
df.info()

In [None]:
df['Extracurricular_Involvement'].value_counts()

In [None]:
df['Depression_Score'].value_counts()

## Data Preparation

### Preprocessing Data

In [None]:
# Define which columns are one-hot encoded, ordinal encoded and continuous
ohe_cols = ['Course', 'Gender', 'Relationship_Status', 'Family_History', 'Residence_Type', 'Chronic_Illness']
ordinal_cols = ['Stress_Level', 'Anxiety_Score','Sleep_Quality', 
               'Physical_Activity', 'Diet_Quality', 'Social_Support', 'Substance_Use', 
               'Counseling_Service_Use', 'Extracurricular_Involvement']
# Category orderings (lowest -> highest) for each ordinal column
stress_ord = [0, 1, 2, 3, 4, 5]
anx_ord = [0, 1, 2, 3, 4, 5]
sleep_ord = ['Poor', 'Average', 'Good']
phys_ord = ['Low', 'Moderate', 'High']
diet_ord = ['Poor', 'Average', 'Good']
social_ord = ['Low', 'Moderate', 'High']
subst_ord = ['Never', 'Occasionally', 'Frequently']
counsel_ord = ['Never', 'Occasionally', 'Frequently']
extracurric_ord = ['Low', 'Moderate', 'High']

continuous_cols = ['Age', 'Semester_Credit_Load', 'CGPA']

# Create a column transformer for preprocessing.
# sparse_threshold=0 forces a dense array so the result can be wrapped in a
# DataFrame below regardless of how sparse the one-hot block turns out to be.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), continuous_cols),  # Scale continuous variables
        ('ohe', OneHotEncoder(), ohe_cols),
        # The categories list must follow the same order as ordinal_cols
        ('ord', OrdinalEncoder(categories=[stress_ord, anx_ord, sleep_ord, phys_ord, diet_ord, 
                                           social_ord, subst_ord, counsel_ord, extracurric_ord]), 
         ordinal_cols)],
    sparse_threshold=0)

# Create a pipeline that applies the preprocessing steps
preprocess_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Transform the data
# NOTE(review): the scaler and encoders are fit on the full dataset here, before the
# train/test split, so test rows influence the fitted statistics. Consider fitting on
# the training split only.
transformed_data = preprocess_pipeline.fit_transform(df)

# Get the column names for the one-hot encoded categorical variables from the one-hot encoder
ohe_encoder = preprocess_pipeline.named_steps['preprocessor'].named_transformers_['ohe']
# get_feature_names() was removed from scikit-learn; prefer get_feature_names_out()
# and only fall back to the old method on versions that predate it.
if hasattr(ohe_encoder, 'get_feature_names_out'):
    one_hot_feature_names = ohe_encoder.get_feature_names_out(ohe_cols)
else:
    one_hot_feature_names = ohe_encoder.get_feature_names(input_features=ohe_cols)

# Column order matches the transformer order above: continuous, one-hot, ordinal
all_feature_names = list(continuous_cols) + list(one_hot_feature_names) + list(ordinal_cols)

# Create a DataFrame using the transformed data and feature names
transformed_df = pd.DataFrame(transformed_data, columns=all_feature_names)

In [None]:
transformed_df

In [None]:
df

In [None]:
# Binarise the target: a Depression_Score of 3 or more is 'Yes', anything lower is 'No'
is_depressed = df['Depression_Score'] >= 3
not_depressed = df['Depression_Score'] < 3
df.loc[is_depressed, 'Depression_Binary'] = 'Yes'
df.loc[not_depressed, 'Depression_Binary'] = 'No'

### Train_Test_Split

In [None]:
# Features are the preprocessed columns; the target is the binary depression label
X, y = transformed_df, df['Depression_Binary']

# Hold out a test set (default split proportions) with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=333,
)

In [None]:
X_train

In [None]:
y_train.value_counts()

## Modeling

## Evaluation

### Dummy Model Classifier

In [None]:
# Baseline: a classifier that answers 'Yes' for every sample
dummy_model = DummyClassifier(strategy="constant", constant='Yes').fit(X_train, y_train)
dummy_model

In [None]:
# Save the fitted dummy classifier to disk
with open('pickled_models/dummy_model.pkl', 'wb') as pkl_out:
    pickle.dump(dummy_model, pkl_out)

In [None]:
# Restore the dummy classifier from its pickle file
with open('pickled_models/dummy_model.pkl', 'rb') as pkl_in:
    dummy_model = pickle.load(pkl_in)

In [None]:
# Predictions of the dummy model on the training data
y_pred = dummy_model.predict(X_train)
# Recall, F1 and accuracy on the training data ('Yes' is the positive class)
train_recall = recall_score(y_train, y_pred, pos_label='Yes')
print('Recall: ', train_recall)
train_f1 = f1_score(y_train, y_pred, pos_label='Yes')
print('F1: ', train_f1)
train_accuracy = accuracy_score(y_train, y_pred)
print('Accuracy: ', train_accuracy)

In [None]:
# Confusion matrix of the training-set predictions
cf = confusion_matrix(y_train, y_pred)
# Render it with readable class labels
cm_display = ConfusionMatrixDisplay(confusion_matrix=cf, display_labels=['No', 'Yes'])
cm_display.plot()

In [None]:
# Scorers that treat 'Yes' as the positive class, for cross-validation and grid search
recall_scorer, f1_scorer = (
    make_scorer(metric, pos_label='Yes') for metric in (recall_score, f1_score)
)

In [None]:
# Cross Validation Scores
# A single cross_validate call scores all three metrics on the same fitted folds,
# rather than refitting the model once per metric with cross_val_score.
cv_scores = cross_validate(dummy_model, X_train, y_train,
                           scoring={'recall': recall_scorer, 'f1': f1_scorer, 'accuracy': 'accuracy'})
print('CV Recall: ', cv_scores['test_recall'].mean())
print('CV F1: ', cv_scores['test_f1'].mean())
print('CV Accuracy: ', cv_scores['test_accuracy'].mean())

### First Model: Decision Tree 

In [None]:
# # Creating steps for a Pipeline 
# tree_steps = [('smote', SMOTE(random_state=333)),
#               ('tree', DecisionTreeClassifier())]
# # Feeding the Pipeline the steps defined above
# tree_pipe = Pipeline(tree_steps)
# # Fitting the training data to the Pipeline
# tree_pipe.fit(X_train, y_train)

In [None]:
# # Pickle the model
# with open('pickled_models/tree_pipe.pkl', 'wb') as f:
#     pickle.dump(tree_pipe, f)

In [None]:
# Restore the previously fitted decision-tree pipeline from disk
with open('pickled_models/tree_pipe.pkl', 'rb') as pkl_in:
    tree_pipe = pickle.load(pkl_in)

In [None]:
# Reload the decision-tree pipeline from its pickle file
# (reads the same file, into the same variable, as the load cell directly above)
with open('pickled_models/tree_pipe.pkl', 'rb') as pkl_in:
    tree_pipe = pickle.load(pkl_in)

In [None]:
# Predictions of the decision-tree pipeline on the training data
y_pred = tree_pipe.predict(X_train)
# Recall, F1 and accuracy on the training data ('Yes' is the positive class)
train_recall = recall_score(y_train, y_pred, pos_label='Yes')
print('Recall: ', train_recall)
train_f1 = f1_score(y_train, y_pred, pos_label='Yes')
print('F1: ', train_f1)
train_accuracy = accuracy_score(y_train, y_pred)
print('Accuracy: ', train_accuracy)

In [None]:
# Confusion matrix of the training-set predictions
cf = confusion_matrix(y_train, y_pred)
# Render it with readable class labels
cm_display = ConfusionMatrixDisplay(confusion_matrix=cf, display_labels=['No', 'Yes'])
cm_display.plot()

In [None]:
# Cross Validation Scores
# A single cross_validate call scores all three metrics on the same fitted folds,
# rather than refitting the pipeline once per metric with cross_val_score.
cv_scores = cross_validate(tree_pipe, X_train, y_train,
                           scoring={'recall': recall_scorer, 'f1': f1_scorer, 'accuracy': 'accuracy'})
print('CV Recall: ', cv_scores['test_recall'].mean())
print('CV F1: ', cv_scores['test_f1'].mean())
print('CV Accuracy: ', cv_scores['test_accuracy'].mean())

### Second Model: Random Forest

In [None]:
# Pipeline steps: oversample with SMOTE, then fit a random forest (both seeded)
forest_steps = [
    ('smote', SMOTE(random_state=333)),
    ('forest', RandomForestClassifier(random_state=333)),
]
# Build the pipeline from those steps
forest_pipe = Pipeline(steps=forest_steps)
# Fit on the training split
forest_pipe.fit(X_train, y_train)

In [None]:
# Save the fitted random-forest pipeline to disk
with open('pickled_models/forest_pipe.pkl', 'wb') as pkl_out:
    pickle.dump(forest_pipe, pkl_out)

In [None]:
# Restore the random-forest pipeline from its pickle file
with open('pickled_models/forest_pipe.pkl', 'rb') as pkl_in:
    forest_pipe = pickle.load(pkl_in)

In [None]:
# Training-set scores for the random-forest pipeline
y_pred = forest_pipe.predict(X_train)
# Recall, F1 and accuracy on the training data ('Yes' is the positive class)
train_recall = recall_score(y_train, y_pred, pos_label='Yes')
print('Recall: ', train_recall)
train_f1 = f1_score(y_train, y_pred, pos_label='Yes')
print('F1: ', train_f1)
train_accuracy = accuracy_score(y_train, y_pred)
print('Accuracy: ', train_accuracy)

In [None]:
# Confusion matrix of the training-set predictions
cf = confusion_matrix(y_train, y_pred)
# Render it with readable class labels
cm_display = ConfusionMatrixDisplay(confusion_matrix=cf, display_labels=['No', 'Yes'])
cm_display.plot()

In [None]:
# Cross Validation Scores for model
model = forest_pipe
# A single cross_validate call scores all three metrics on the same fitted folds,
# rather than refitting the pipeline once per metric with cross_val_score.
cv_scores = cross_validate(model, X_train, y_train,
                           scoring={'recall': recall_scorer, 'f1': f1_scorer, 'accuracy': 'accuracy'})
print('CV Recall: ', cv_scores['test_recall'].mean())
print('CV F1: ', cv_scores['test_f1'].mean())
print('CV Accuracy: ', cv_scores['test_accuracy'].mean())

#### Random Forest with GridSearch 1

In [None]:
##Recall
# Creating parameters for GridSearch (keys target the 'forest' step of the pipeline)
params = {'forest__n_estimators': [50, 100, 150],
          'forest__criterion': ['gini', 'entropy'],
          'forest__max_depth': [10, None],
          # An integer min_samples_split must be at least 2; the value 1 made every
          # candidate using it fail to fit, so it has been removed.
          'forest__min_samples_split': [2],
          'forest__min_weight_fraction_leaf': [0, .5],
          # 'auto' meant 'sqrt' for classifiers and has since been removed from scikit-learn
          'forest__max_features': ['sqrt', None, 15],
          'forest__max_leaf_nodes': [None, 10],
          'forest__min_impurity_decrease': [0, .5],
         }
# GridSearch with the random forest pipeline, parameters above, 5 fold cross validation, scored on recall
forest_grid1 = GridSearchCV(estimator=forest_pipe, param_grid=params, cv=5, scoring=recall_scorer, verbose=2)
# Fitting the GridSearch
forest_grid1.fit(X_train, y_train)

In [None]:
# Save the fitted grid search to disk
with open('pickled_models/forest_grid1.pkl', 'wb') as pkl_out:
    pickle.dump(forest_grid1, pkl_out)

In [None]:
# Restore the grid search from its pickle file
with open('pickled_models/forest_grid1.pkl', 'rb') as pkl_in:
    forest_grid1 = pickle.load(pkl_in)

In [None]:
# Best pipeline found by the search and its mean cross-validated recall
print("Recall", forest_grid1.best_estimator_, forest_grid1.best_score_, sep='\n')

In [None]:
model = forest_grid1.best_estimator_
# Cross Validation Scores for model
# A single cross_validate call scores all three metrics on the same fitted folds,
# rather than refitting the pipeline once per metric with cross_val_score.
cv_scores = cross_validate(model, X_train, y_train,
                           scoring={'recall': recall_scorer, 'f1': f1_scorer, 'accuracy': 'accuracy'})
print('CV Recall: ', cv_scores['test_recall'].mean())
print('CV F1: ', cv_scores['test_f1'].mean())
print('CV Accuracy: ', cv_scores['test_accuracy'].mean())

#### Random Forest with GridSearch 2

In [None]:
##Recall
# Creating parameters for GridSearch (keys target the 'forest' step of the pipeline)
params = {'forest__n_estimators': [150, 160, 140],
          'forest__criterion': ['gini'],
          'forest__max_depth': [30, 10],
          'forest__min_samples_split': [3, 2],
          # min_weight_fraction_leaf must lie in [0, 0.5]; the value .75 made every
          # candidate using it fail to fit, so it has been removed.
          'forest__min_weight_fraction_leaf': [.5, .25],
          # NOTE(review): an integer max_features larger than the number of columns in
          # X_train may be rejected — confirm 30 and 20 against X_train.shape[1].
          'forest__max_features': [30, 20, None],
          'forest__max_leaf_nodes': [None, 2],
          'forest__min_impurity_decrease': [0, .25],
         }
# GridSearch with the random forest pipeline, parameters above, 5 fold cross validation, scored on recall
forest_grid2 = GridSearchCV(estimator=forest_pipe, param_grid=params, cv=5, scoring=recall_scorer, verbose=2)
# Fitting the GridSearch
forest_grid2.fit(X_train, y_train)

In [None]:
# Save the fitted grid search to disk
with open('pickled_models/forest_grid2.pkl', 'wb') as pkl_out:
    pickle.dump(forest_grid2, pkl_out)

In [None]:
# Restore the grid search from its pickle file
with open('pickled_models/forest_grid2.pkl', 'rb') as pkl_in:
    forest_grid2 = pickle.load(pkl_in)

In [None]:
# Best pipeline found by the search and its mean cross-validated recall
print("Recall", forest_grid2.best_estimator_, forest_grid2.best_score_, sep='\n')

#### Random Forest with GridSearch 3

In [None]:
##Recall
# Creating parameters for GridSearch (keys target the 'forest' step of the pipeline)
params = {'forest__n_estimators': [145, 155, 150],
          'forest__criterion': ['gini'],
          'forest__max_depth': [30, 20, 40],
          'forest__min_samples_split': [3, 4],
          'forest__min_weight_fraction_leaf': [.2, .3, .25],
          # NOTE(review): an integer max_features larger than the number of columns in
          # X_train may be rejected — confirm 30, 40 and 50 against X_train.shape[1].
          'forest__max_features': [30, 40, 50],
          # max_leaf_nodes must be None or an integer greater than 1; the value 1 made
          # every candidate using it fail to fit, so it has been removed.
          'forest__max_leaf_nodes': [2, 3],
          'forest__min_impurity_decrease': [0, .1],
         }
# GridSearch with the random forest pipeline, parameters above, 5 fold cross validation, scored on recall
forest_grid3 = GridSearchCV(estimator=forest_pipe, param_grid=params, cv=5, scoring=recall_scorer, verbose=2)
# Fitting the GridSearch
forest_grid3.fit(X_train, y_train)

In [None]:
# Save the fitted grid search to disk
with open('pickled_models/forest_grid3.pkl', 'wb') as pkl_out:
    pickle.dump(forest_grid3, pkl_out)

In [None]:
# Restore the grid search from its pickle file
with open('pickled_models/forest_grid3.pkl', 'rb') as pkl_in:
    forest_grid3 = pickle.load(pkl_in)

In [None]:
# Best pipeline found by the search and its mean cross-validated recall
print("Recall", forest_grid3.best_estimator_, forest_grid3.best_score_, sep='\n')

### Third Model: Logistic Regression

In [None]:
# Pipeline steps: oversample with SMOTE, then fit a logistic regression (both seeded)
logreg_steps = [
    ('smote', SMOTE(random_state=333)),
    ('logreg', LogisticRegression(random_state=333)),
]
# Build the pipeline from those steps
logreg_pipe = Pipeline(steps=logreg_steps)
# Fit on the training split
logreg_pipe.fit(X_train, y_train)

In [None]:
# Save the fitted logistic-regression pipeline to disk
with open('pickled_models/logreg_pipe.pkl', 'wb') as pkl_out:
    pickle.dump(logreg_pipe, pkl_out)

In [None]:
# Restore the logistic-regression pipeline from its pickle file
with open('pickled_models/logreg_pipe.pkl', 'rb') as pkl_in:
    logreg_pipe = pickle.load(pkl_in)

In [None]:
# Training-set scores for the logistic-regression pipeline
y_pred = logreg_pipe.predict(X_train)
# Recall, F1 and accuracy on the training data ('Yes' is the positive class)
train_recall = recall_score(y_train, y_pred, pos_label='Yes')
print('Recall: ', train_recall)
train_f1 = f1_score(y_train, y_pred, pos_label='Yes')
print('F1: ', train_f1)
train_accuracy = accuracy_score(y_train, y_pred)
print('Accuracy: ', train_accuracy)

In [None]:
# Confusion matrix of the training-set predictions
cf = confusion_matrix(y_train, y_pred)
# Render it with readable class labels
cm_display = ConfusionMatrixDisplay(confusion_matrix=cf, display_labels=['No', 'Yes'])
cm_display.plot()

In [None]:
# Cross Validation Scores for model
model = logreg_pipe
# A single cross_validate call scores all three metrics on the same fitted folds,
# rather than refitting the pipeline once per metric with cross_val_score.
cv_scores = cross_validate(model, X_train, y_train,
                           scoring={'recall': recall_scorer, 'f1': f1_scorer, 'accuracy': 'accuracy'})
print('CV Recall: ', cv_scores['test_recall'].mean())
print('CV F1: ', cv_scores['test_f1'].mean())
print('CV Accuracy: ', cv_scores['test_accuracy'].mean())

#### Logistic Regression with GridSearch 1

In [None]:
# Creating parameters for GridSearch
# Options shared by every solver/penalty combination below.
# class_weight must be 'balanced', None or a {label: weight} dict; a bare list is
# rejected by scikit-learn, so the custom weighting is written as a dict.
# NOTE(review): assumes the original [1, 10] was meant in ['No', 'Yes'] order — confirm.
logreg_shared = {'logreg__class_weight': ['balanced', None, {'No': 1, 'Yes': 10}],
                 'logreg__C': [1, 0.0001],
                 'logreg__max_iter': [100]}
# Only solver/penalty pairs that LogisticRegression accepts are searched:
# of the three solvers here, only 'saga' supports the 'l1' penalty.
logreg_params = [
    {**logreg_shared, 'logreg__solver': ['lbfgs', 'sag', 'saga'], 'logreg__penalty': ['l2']},
    {**logreg_shared, 'logreg__solver': ['saga'], 'logreg__penalty': ['l1']},
]
# GridSearch with the logistic regression pipeline, parameters above, 5 fold cross validation, scored on recall
logreg_grid1 = GridSearchCV(estimator=logreg_pipe, param_grid=logreg_params, cv=5, scoring=recall_scorer, verbose=2)
# Fitting the GridSearch
logreg_grid1.fit(X_train, y_train)

In [None]:
# Save the fitted grid search to disk
with open('pickled_models/logreg_grid1.pkl', 'wb') as pkl_out:
    pickle.dump(logreg_grid1, pkl_out)

In [None]:
# Restore the grid search from its pickle file
with open('pickled_models/logreg_grid1.pkl', 'rb') as pkl_in:
    logreg_grid1 = pickle.load(pkl_in)

In [None]:
# Best pipeline found by the search and its mean cross-validated recall
print("Recall", logreg_grid1.best_estimator_, logreg_grid1.best_score_, sep='\n')

#### Logistic Regression with GridSearch 2

In [None]:
# Creating parameters for GridSearch
# Options shared by every solver/penalty combination below.
# class_weight must be 'balanced', None or a {label: weight} dict; bare lists are
# rejected by scikit-learn, so the custom weightings are written as dicts.
# NOTE(review): assumes the original lists were meant in ['No', 'Yes'] order — confirm.
logreg_shared = {'logreg__class_weight': ['balanced',
                                          {'No': 1, 'Yes': 20},
                                          {'No': 20, 'Yes': 1},
                                          {'No': 1, 'Yes': 50}],
                 'logreg__max_iter': [50, 100, 150, 1000]}
# Only solver/penalty pairs that LogisticRegression accepts are searched:
#   * 'elasticnet' needs solver='saga' plus an l1_ratio, so none of the solvers
#     tried here can use it and it has been left out;
#   * 'liblinear' cannot fit an unpenalised model;
#   * C has no effect when there is no penalty, so it is not varied there.
logreg_params = [
    {**logreg_shared,
     'logreg__solver': ['lbfgs', 'liblinear', 'newton-cg'],
     'logreg__penalty': ['l2'],
     'logreg__C': [1, 10, 100]},
    {**logreg_shared,
     'logreg__solver': ['lbfgs', 'newton-cg'],
     'logreg__penalty': [None]},
]
# GridSearch with the logistic regression pipeline, parameters above, 5 fold cross validation, scored on recall
logreg_grid2 = GridSearchCV(estimator=logreg_pipe, param_grid=logreg_params, cv=5, scoring=recall_scorer, verbose=2)
# Fitting the GridSearch
logreg_grid2.fit(X_train, y_train)

#LogReg to see parameters when changing params
# LogisticRegression(random_state=333)

In [None]:
# Save the fitted grid search to disk
with open('pickled_models/logreg_grid2.pkl', 'wb') as pkl_out:
    pickle.dump(logreg_grid2, pkl_out)

In [None]:
# Restore the grid search from its pickle file
with open('pickled_models/logreg_grid2.pkl', 'rb') as pkl_in:
    logreg_grid2 = pickle.load(pkl_in)

In [None]:
# Best pipeline found by the search and its mean cross-validated recall
print("Recall", logreg_grid2.best_estimator_, logreg_grid2.best_score_, sep='\n')

#### Logistic Regression with GridSearch 3

In [None]:
# Narrowed grid: balanced class weights with the lbfgs solver and an L2 penalty,
# varying only the regularisation strength and the iteration cap
logreg_params = {
    'logreg__class_weight': ['balanced'],
    'logreg__C': [1, .01, .001],
    'logreg__solver': ['lbfgs'],
    'logreg__max_iter': [100, 125, 200, 300],
    'logreg__penalty': ['l2'],
}
# 5-fold grid search over the logistic regression pipeline, scored on recall
logreg_grid3 = GridSearchCV(
    estimator=logreg_pipe,
    param_grid=logreg_params,
    cv=5,
    scoring=recall_scorer,
    verbose=2,
)
# Run the search on the training split
logreg_grid3.fit(X_train, y_train)

In [None]:
# Save the fitted grid search to disk
with open('pickled_models/logreg_grid3.pkl', 'wb') as pkl_out:
    pickle.dump(logreg_grid3, pkl_out)

In [None]:
# Restore the grid search from its pickle file
with open('pickled_models/logreg_grid3.pkl', 'rb') as pkl_in:
    logreg_grid3 = pickle.load(pkl_in)

In [None]:
# Best pipeline found by the search and its mean cross-validated recall
print("Recall", logreg_grid3.best_estimator_, logreg_grid3.best_score_, sep='\n')

In [None]:
# Cross Validation Scores for model
model = logreg_grid3.best_estimator_
# A single cross_validate call scores all three metrics on the same fitted folds,
# rather than refitting the pipeline once per metric with cross_val_score.
cv_scores = cross_validate(model, X_train, y_train,
                           scoring={'recall': recall_scorer, 'f1': f1_scorer, 'accuracy': 'accuracy'})
print('CV Recall: ', cv_scores['test_recall'].mean())
print('CV F1: ', cv_scores['test_f1'].mean())
print('CV Accuracy: ', cv_scores['test_accuracy'].mean())

### Fourth Model: Neural Network

In [None]:
# Pipeline steps: oversample with SMOTE, then fit a multi-layer perceptron (both seeded)
neural_steps = [
    ('smote', SMOTE(random_state=333)),
    ('neural', MLPClassifier(random_state=333)),
]
# Build the pipeline from those steps
neural_pipe = Pipeline(steps=neural_steps)
# Fit on the training split
neural_pipe.fit(X_train, y_train)

In [None]:
# Save the fitted neural-network pipeline to disk
with open('pickled_models/neural_pipe.pkl', 'wb') as pkl_out:
    pickle.dump(neural_pipe, pkl_out)

In [None]:
# Restore the neural-network pipeline from its pickle file
with open('pickled_models/neural_pipe.pkl', 'rb') as pkl_in:
    neural_pipe = pickle.load(pkl_in)

In [None]:
# Training-set scores for the neural-network pipeline
y_pred = neural_pipe.predict(X_train)
# Recall, F1 and accuracy on the training data ('Yes' is the positive class)
train_recall = recall_score(y_train, y_pred, pos_label='Yes')
print('Recall: ', train_recall)
train_f1 = f1_score(y_train, y_pred, pos_label='Yes')
print('F1: ', train_f1)
train_accuracy = accuracy_score(y_train, y_pred)
print('Accuracy: ', train_accuracy)

In [None]:
# Confusion matrix of the training-set predictions
cf = confusion_matrix(y_train, y_pred)
# Render it with readable class labels
cm_display = ConfusionMatrixDisplay(confusion_matrix=cf, display_labels=['No', 'Yes'])
cm_display.plot()

In [None]:
# Cross Validation Scores for model
model = neural_pipe
# A single cross_validate call scores all three metrics on the same fitted folds,
# rather than refitting the pipeline once per metric with cross_val_score.
cv_scores = cross_validate(model, X_train, y_train,
                           scoring={'recall': recall_scorer, 'f1': f1_scorer, 'accuracy': 'accuracy'})
print('CV Recall: ', cv_scores['test_recall'].mean())
print('CV F1: ', cv_scores['test_f1'].mean())
print('CV Accuracy: ', cv_scores['test_accuracy'].mean())