In [173]:

import pandas as pd


In [174]:
student_data = pd.read_csv("students_mental_health_survey.csv")
student_data.head()

Unnamed: 0,Age,Course,Gender,CGPA,Stress_Level,Depression_Score,Anxiety_Score,Sleep_Quality,Physical_Activity,Diet_Quality,Social_Support,Relationship_Status,Substance_Use,Counseling_Service_Use,Family_History,Chronic_Illness,Financial_Stress,Extracurricular_Involvement,Semester_Credit_Load,Residence_Type
0,25,Others,Male,3.56,3,3,2,Good,Moderate,Good,Moderate,Married,Never,Never,No,No,2,Moderate,17,On-Campus
1,24,Engineering,Female,2.44,0,3,0,Average,Low,Average,Low,Single,Occasionally,Occasionally,No,No,3,Low,27,On-Campus
2,19,Business,Female,3.74,4,0,3,Good,Low,Average,Moderate,In a Relationship,Never,Occasionally,No,No,4,High,15,On-Campus
3,19,Computer Science,Male,3.65,2,1,0,Average,Low,Average,Moderate,Single,,Never,No,No,4,Moderate,20,Off-Campus
4,18,Business,Male,3.4,3,3,4,Good,Low,Average,High,Married,Never,Never,No,Yes,0,High,23,On-Campus


In [175]:
student_data.describe()

Unnamed: 0,Age,CGPA,Stress_Level,Depression_Score,Anxiety_Score,Financial_Stress,Semester_Credit_Load
count,7022.0,7010.0,7022.0,7022.0,7022.0,7022.0,7022.0
mean,23.003418,3.49127,2.427941,2.254486,2.300484,2.453005,22.010538
std,3.853978,0.28742,1.638408,1.625193,1.624305,1.708995,4.35838
min,18.0,2.44,0.0,0.0,0.0,0.0,15.0
25%,20.0,3.29,1.0,1.0,1.0,1.0,18.0
50%,22.0,3.5,2.0,2.0,2.0,2.0,22.0
75%,25.0,3.7,4.0,3.0,4.0,4.0,26.0
max,35.0,4.0,5.0,5.0,5.0,5.0,29.0


# Which Attributes are most correlated

In [176]:
student_data.corr()

  student_data.corr()


Unnamed: 0,Age,CGPA,Stress_Level,Depression_Score,Anxiety_Score,Financial_Stress,Semester_Credit_Load
Age,1.0,0.014599,-0.019901,0.000225,0.008618,-0.002008,0.005908
CGPA,0.014599,1.0,0.005143,-0.0231,0.003504,-0.011488,-0.004284
Stress_Level,-0.019901,0.005143,1.0,-0.043313,-0.035802,-0.001388,-0.005558
Depression_Score,0.000225,-0.0231,-0.043313,1.0,-0.018019,-0.01259,0.024234
Anxiety_Score,0.008618,0.003504,-0.035802,-0.018019,1.0,-0.003994,0.010819
Financial_Stress,-0.002008,-0.011488,-0.001388,-0.01259,-0.003994,1.0,-0.003203
Semester_Credit_Load,0.005908,-0.004284,-0.005558,0.024234,0.010819,-0.003203,1.0


# Taking care of all the nan values 
- Substance_use has null values
- CGPA has null values

In [177]:
from sklearn.impute import SimpleImputer
imputer_num = SimpleImputer(strategy='median')
imputer_cat = SimpleImputer(strategy='most_frequent')

CGPA_encoded = imputer_num.fit_transform(student_data[['CGPA']])
substance_encoded = imputer_cat.fit_transform(student_data[['Substance_Use']])

student_data['CGPA'] = CGPA_encoded
student_data['Substance_Use'] = substance_encoded

# Split the dataset, as soon as possible

In [178]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(student_data, test_size=0.2, random_state=43)
student = train_set.drop("Depression_Score", axis=1).copy()
student_label = train_set['Depression_Score'].copy()

# Transform the data
- categorical colummns
     [ Ordinal Encoding and OneHotEncoding]
- numerical columns:
    [MinMaxScaler]

In [179]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

ordinal_attr=[ 'Sleep_Quality','Physical_Activity','Diet_Quality', 'Social_Support', 'Substance_Use', 'Counseling_Service_Use','Extracurricular_Involvement']
one_attr= ['Course', 'Gender','Relationship_Status','Family_History','Chronic_Illness','Residence_Type']
num_attr = ['Age','CGPA','Financial_Stress','Semester_Credit_Load', 'Stress_Level', 'Anxiety_Score']

full_pipeline = ColumnTransformer([
    ("ordinal_encoding", OrdinalEncoder(), ordinal_attr),
    ("one_hot_encoding", OneHotEncoder(), one_attr),
    ("min_max_scaler", MinMaxScaler(), num_attr),
 ])

student_prepared = full_pipeline.fit_transform(student)

Unnamed: 0,Age,Course,Gender,CGPA,Stress_Level,Anxiety_Score,Sleep_Quality,Physical_Activity,Diet_Quality,Social_Support,Relationship_Status,Substance_Use,Counseling_Service_Use,Family_History,Chronic_Illness,Financial_Stress,Extracurricular_Involvement,Semester_Credit_Load,Residence_Type
156,30,Engineering,Male,3.41,1,1,Average,Moderate,Poor,Low,Single,Occasionally,Never,No,No,5,High,24,On-Campus
3109,29,Computer Science,Male,3.72,0,5,Good,Moderate,Poor,Low,Single,Never,Never,Yes,No,1,High,25,With Family
1524,32,Medical,Female,3.52,2,4,Good,Moderate,Average,Moderate,Single,Never,Occasionally,No,No,5,Low,27,On-Campus
6973,30,Law,Female,3.36,4,5,Good,Moderate,Good,Low,Single,Never,Never,Yes,No,5,Moderate,27,On-Campus
3870,25,Business,Female,3.52,1,5,Average,Low,Average,Low,Single,Never,Frequently,Yes,No,0,Low,20,On-Campus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6202,24,Law,Male,3.72,3,5,Average,Moderate,Average,Low,Single,Frequently,Never,No,No,1,Moderate,27,Off-Campus
2325,32,Computer Science,Male,3.35,5,2,Average,Moderate,Average,High,Single,Never,Never,No,No,2,Moderate,18,On-Campus
2303,19,Law,Male,3.90,2,2,Poor,Moderate,Good,High,In a Relationship,Never,Never,No,No,0,Low,16,Off-Campus
3392,25,Medical,Female,3.56,3,2,Poor,Low,Poor,Moderate,Single,Never,Frequently,Yes,No,2,High,26,On-Campus


In [180]:
# student = pd.DataFrame(student_prepared, student.columnlen(
student_prepared[0]

array([0.        , 2.        , 2.        , 1.        , 2.        ,
       1.        , 0.        , 0.        , 0.        , 1.        ,
       0.        , 0.        , 0.        , 0.        , 1.        ,
       0.        , 0.        , 1.        , 1.        , 0.        ,
       1.        , 0.        , 0.        , 1.        , 0.        ,
       0.70588235, 0.62179487, 1.        , 0.64285714, 0.2       ,
       0.2       ])

# Train Model

In [196]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

forest_reg = RandomForestRegressor()
forest_reg.fit(student_prepared, student_label)

# prediction on training data
student_predictions = forest_reg.predict(student_prepared)
forest_mse = mean_squared_error(student_label,student_predictions)

forest_rmse = np.sqrt(forest_mse)
forest_rmse

0.5946891537626956

# Scores on k-folds validation

In [182]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(forest_reg, student_prepared, student_label, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-scores)

print(forest_rmse_scores.mean())
print(forest_rmse_scores.std())


1.5886869607167569
0.04330480474422679


# Scores on test data

In [198]:
test_data = test_set.drop("Depression_Score", axis=1)
test_label = test_set["Depression_Score"]

test_predictions = forest_reg.predict(full_pipeline.transform(test_data))

# Save the Pipline and Model in two separate files

In [None]:
# from sklearn.externals import joblib

# joblib.dump(full_pipline, "mental_pipeline.pkl")
# joblib.dump(forest_reg, "mental_model.pkl")

import pickle

# Save an object
with open("mental_pipeline.pkl", "wb") as file:
    pickle.dump(full_pipeline, file)

with open("mental_model.pkl", "wb") as file:
    pickle.dump(forest_reg, file)


# How to predict new unseen data?
- open both files(mental_pipeline, mental_model) in 'rb' mode
- convert the record into dataframe, with columns=test_data.columns


In [199]:
file_pipe = open("mental_pipeline.pkl", "rb")
loaded_pipeline = pickle.load(file_pipe)

file_model = open("mental_model.pkl", "rb")
loaded_model = pickle.load(file_model)

In [200]:
our_data = pd.DataFrame([[18,'Business','Female',3.30,5, 0,'Average','Moderate','Average','Low','Married','Never','Occasionally','No','No',5,'Moderate',18,'Off-Campus']],columns=test_data.columns)
vector = loaded_pipeline.transform(our_data)
our_prediction = loaded_model.predict(vector)

In [201]:
our_prediction

array([1.94])