In [60]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from warnings import filterwarnings
filterwarnings('ignore')

In [61]:
df = pd.read_csv('depression_data.csv')
df.head()

Unnamed: 0,Name,Age,Marital Status,Education Level,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Mental Illness,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions
0,Christine Barker,31,Married,Bachelor's Degree,2,Non-smoker,Active,Unemployed,26265.67,Moderate,Moderate,Fair,Yes,No,Yes,Yes
1,Jacqueline Lewis,55,Married,High School,1,Non-smoker,Sedentary,Employed,42710.36,High,Unhealthy,Fair,Yes,No,No,Yes
2,Shannon Church,78,Widowed,Master's Degree,1,Non-smoker,Sedentary,Employed,125332.79,Low,Unhealthy,Good,No,No,Yes,No
3,Charles Jordan,58,Divorced,Master's Degree,3,Non-smoker,Moderate,Unemployed,9992.78,Moderate,Moderate,Poor,No,No,No,No
4,Michael Rich,18,Single,High School,0,Non-smoker,Sedentary,Unemployed,8595.08,Low,Moderate,Fair,Yes,No,Yes,Yes


In [62]:
df.shape

(413768, 16)

In [63]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413768 entries, 0 to 413767
Data columns (total 16 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Name                          413768 non-null  object 
 1   Age                           413768 non-null  int64  
 2   Marital Status                413768 non-null  object 
 3   Education Level               413768 non-null  object 
 4   Number of Children            413768 non-null  int64  
 5   Smoking Status                413768 non-null  object 
 6   Physical Activity Level       413768 non-null  object 
 7   Employment Status             413768 non-null  object 
 8   Income                        413768 non-null  float64
 9   Alcohol Consumption           413768 non-null  object 
 10  Dietary Habits                413768 non-null  object 
 11  Sleep Patterns                413768 non-null  object 
 12  History of Mental Illness     413768 non-nul

In [64]:
df.isnull().sum()

Name                            0
Age                             0
Marital Status                  0
Education Level                 0
Number of Children              0
Smoking Status                  0
Physical Activity Level         0
Employment Status               0
Income                          0
Alcohol Consumption             0
Dietary Habits                  0
Sleep Patterns                  0
History of Mental Illness       0
History of Substance Abuse      0
Family History of Depression    0
Chronic Medical Conditions      0
dtype: int64

In [65]:
df['Marital Status'].unique()

array(['Married', 'Widowed', 'Divorced', 'Single'], dtype=object)

In [66]:
# encode the marital status type

mapped = {'Single':0,'Married':1,'Divorced':2,'Widowed':3}
df['Marital Status'] = df['Marital Status'].map(mapped)

In [67]:
df.head(3)

Unnamed: 0,Name,Age,Marital Status,Education Level,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Mental Illness,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions
0,Christine Barker,31,1,Bachelor's Degree,2,Non-smoker,Active,Unemployed,26265.67,Moderate,Moderate,Fair,Yes,No,Yes,Yes
1,Jacqueline Lewis,55,1,High School,1,Non-smoker,Sedentary,Employed,42710.36,High,Unhealthy,Fair,Yes,No,No,Yes
2,Shannon Church,78,3,Master's Degree,1,Non-smoker,Sedentary,Employed,125332.79,Low,Unhealthy,Good,No,No,Yes,No


In [68]:
df['Education Level'].unique()

array(["Bachelor's Degree", 'High School', "Master's Degree",
       'Associate Degree', 'PhD'], dtype=object)

In [69]:
# encode the education level
mapped = {'High School':0,"Bachelor's Degree":1,"Master's Degree":2, 'Associate Degree':3,'PhD':4}
df['Education Level'] = df['Education Level'].map(mapped)

In [70]:
df.head(2)

Unnamed: 0,Name,Age,Marital Status,Education Level,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Mental Illness,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions
0,Christine Barker,31,1,1,2,Non-smoker,Active,Unemployed,26265.67,Moderate,Moderate,Fair,Yes,No,Yes,Yes
1,Jacqueline Lewis,55,1,0,1,Non-smoker,Sedentary,Employed,42710.36,High,Unhealthy,Fair,Yes,No,No,Yes


In [71]:
df['Smoking Status'].unique()

array(['Non-smoker', 'Former', 'Current'], dtype=object)

In [72]:
# encode the Smoking status
mapped = {'Non-smoker':0,'Former':1,'Current':2}
df['Smoking Status'] = df['Smoking Status'].map(mapped)

In [73]:
df.head(2)

Unnamed: 0,Name,Age,Marital Status,Education Level,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Mental Illness,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions
0,Christine Barker,31,1,1,2,0,Active,Unemployed,26265.67,Moderate,Moderate,Fair,Yes,No,Yes,Yes
1,Jacqueline Lewis,55,1,0,1,0,Sedentary,Employed,42710.36,High,Unhealthy,Fair,Yes,No,No,Yes


In [74]:
df['Physical Activity Level'].unique()

array(['Active', 'Sedentary', 'Moderate'], dtype=object)

In [75]:
# encode the physical activity level
mapped = {'Sedentary':0,'Active':1,'Moderate':2}
df['Physical Activity Level'] = df['Physical Activity Level'].map(mapped)

In [76]:
df.head(2)

Unnamed: 0,Name,Age,Marital Status,Education Level,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Mental Illness,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions
0,Christine Barker,31,1,1,2,0,1,Unemployed,26265.67,Moderate,Moderate,Fair,Yes,No,Yes,Yes
1,Jacqueline Lewis,55,1,0,1,0,0,Employed,42710.36,High,Unhealthy,Fair,Yes,No,No,Yes


In [77]:
df['Employment Status'].unique()

array(['Unemployed', 'Employed'], dtype=object)

In [78]:
# encode the Employment Status
mapped = {'Unemployed':0,'Employed':1}
df['Employment Status'] = df['Employment Status'].map(mapped)

In [79]:
df.head(2)

Unnamed: 0,Name,Age,Marital Status,Education Level,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Mental Illness,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions
0,Christine Barker,31,1,1,2,0,1,0,26265.67,Moderate,Moderate,Fair,Yes,No,Yes,Yes
1,Jacqueline Lewis,55,1,0,1,0,0,1,42710.36,High,Unhealthy,Fair,Yes,No,No,Yes


In [80]:
df['Alcohol Consumption'].unique()

array(['Moderate', 'High', 'Low'], dtype=object)

In [81]:
# encode the Alcohol Consumption
mapped = {'Low':0,'Moderate':1,'High':2}
df['Alcohol Consumption'] = df['Alcohol Consumption'].map(mapped)

In [82]:
df.head(2)

Unnamed: 0,Name,Age,Marital Status,Education Level,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Mental Illness,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions
0,Christine Barker,31,1,1,2,0,1,0,26265.67,1,Moderate,Fair,Yes,No,Yes,Yes
1,Jacqueline Lewis,55,1,0,1,0,0,1,42710.36,2,Unhealthy,Fair,Yes,No,No,Yes


In [83]:
df['Dietary Habits'].unique()

array(['Moderate', 'Unhealthy', 'Healthy'], dtype=object)

In [84]:
# encode the Dietary habits
mapped = {'Unhealthy':0,'Moderate':1,'Healthy':2}
df['Dietary Habits'] = df['Dietary Habits'].map(mapped)

In [85]:
df.head(2)

Unnamed: 0,Name,Age,Marital Status,Education Level,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Mental Illness,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions
0,Christine Barker,31,1,1,2,0,1,0,26265.67,1,1,Fair,Yes,No,Yes,Yes
1,Jacqueline Lewis,55,1,0,1,0,0,1,42710.36,2,0,Fair,Yes,No,No,Yes


In [86]:
df['Sleep Patterns'].unique()

array(['Fair', 'Good', 'Poor'], dtype=object)

In [87]:
# encode the sleep patterns
mapped = {'Poor':0,'Fair':1,'Good':2}
df['Sleep Patterns'] = df['Sleep Patterns'].map(mapped)

In [88]:
df.head(2)

Unnamed: 0,Name,Age,Marital Status,Education Level,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Mental Illness,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions
0,Christine Barker,31,1,1,2,0,1,0,26265.67,1,1,1,Yes,No,Yes,Yes
1,Jacqueline Lewis,55,1,0,1,0,0,1,42710.36,2,0,1,Yes,No,No,Yes


In [89]:
df['History of Mental Illness'].unique()

array(['Yes', 'No'], dtype=object)

In [90]:
# encode the History of Mental Illness
mapped = {'No':0,'Yes':1}
df['History of Mental Illness'] = df['History of Mental Illness'].map(mapped)

In [91]:
df.head(2)

Unnamed: 0,Name,Age,Marital Status,Education Level,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Mental Illness,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions
0,Christine Barker,31,1,1,2,0,1,0,26265.67,1,1,1,1,No,Yes,Yes
1,Jacqueline Lewis,55,1,0,1,0,0,1,42710.36,2,0,1,1,No,No,Yes


In [92]:
df['History of Substance Abuse'].unique()

array(['No', 'Yes'], dtype=object)

In [93]:
# encode the History of Substance Abuse
mapped = {'No':0,'Yes':1}
df['History of Substance Abuse'] = df['History of Substance Abuse'].map(mapped)

In [94]:
df.head(2)

Unnamed: 0,Name,Age,Marital Status,Education Level,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Mental Illness,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions
0,Christine Barker,31,1,1,2,0,1,0,26265.67,1,1,1,1,0,Yes,Yes
1,Jacqueline Lewis,55,1,0,1,0,0,1,42710.36,2,0,1,1,0,No,Yes


In [95]:
df['Family History of Depression'].unique()

array(['Yes', 'No'], dtype=object)

In [96]:
# encode the family history of depression
mapped = {'No':0,'Yes':1}
df['Family History of Depression'] = df['Family History of Depression'].map(mapped)

In [97]:
df.head(2)

Unnamed: 0,Name,Age,Marital Status,Education Level,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Mental Illness,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions
0,Christine Barker,31,1,1,2,0,1,0,26265.67,1,1,1,1,0,1,Yes
1,Jacqueline Lewis,55,1,0,1,0,0,1,42710.36,2,0,1,1,0,0,Yes


In [98]:
df['Chronic Medical Conditions'].unique()

array(['Yes', 'No'], dtype=object)

In [99]:
# encode the chronic medical conditions
mapped = {'No':0,'Yes':1}
df['Chronic Medical Conditions'] = df['Chronic Medical Conditions'].map(mapped)

In [100]:
df.head(2)

Unnamed: 0,Name,Age,Marital Status,Education Level,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Mental Illness,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions
0,Christine Barker,31,1,1,2,0,1,0,26265.67,1,1,1,1,0,1,1
1,Jacqueline Lewis,55,1,0,1,0,0,1,42710.36,2,0,1,1,0,0,1


In [101]:
df.drop('Name',inplace=True,axis=1)

In [102]:
df.head(2)

Unnamed: 0,Age,Marital Status,Education Level,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Mental Illness,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions
0,31,1,1,2,0,1,0,26265.67,1,1,1,1,0,1,1
1,55,1,0,1,0,0,1,42710.36,2,0,1,1,0,0,1


In [103]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [104]:
x = df.drop('Chronic Medical Conditions',axis=1)
y = df['Chronic Medical Conditions']

xtrain,xtest,ytrain,ytest = train_test_split(x,y,train_size=0.8,random_state=100)

In [105]:
lg = LogisticRegression()
lg.fit(xtrain,ytrain)
ypred = lg.predict(xtest)

In [106]:
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.67      1.00      0.80     55559
           1       0.00      0.00      0.00     27195

    accuracy                           0.67     82754
   macro avg       0.34      0.50      0.40     82754
weighted avg       0.45      0.67      0.54     82754



# feature engineering

In [107]:
df

Unnamed: 0,Age,Marital Status,Education Level,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Mental Illness,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions
0,31,1,1,2,0,1,0,26265.67,1,1,1,1,0,1,1
1,55,1,0,1,0,0,1,42710.36,2,0,1,1,0,0,1
2,78,3,2,1,0,0,1,125332.79,0,0,2,0,0,1,0
3,58,2,2,3,0,2,0,9992.78,1,1,0,0,0,0,0
4,18,0,0,0,0,0,0,8595.08,0,1,1,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413763,68,1,2,0,1,2,1,109233.43,0,2,2,0,0,0,0
413764,26,0,1,0,2,1,1,96760.97,0,2,0,1,1,0,1
413765,57,1,1,0,0,0,1,77353.26,1,1,1,0,0,1,1
413766,71,1,3,2,0,0,0,24557.08,1,1,0,0,1,0,0


In [108]:
df['smoking_alchol'] = df['Smoking Status'].astype(str) +"_"+ df['Alcohol Consumption'].astype(str)

from sklearn.preprocessing import PolynomialFeatures
numeric_features = ['Age','Income']
poly = PolynomialFeatures(degree=2 , interaction_only=False , include_bias=False)
poly_features = poly.fit_transform(df[numeric_features])
poly_df = pd.DataFrame(poly_features,columns=poly.get_feature_names_out(numeric_features))

df = pd.concat([df,poly_df],axis=1)

In [109]:
df

Unnamed: 0,Age,Marital Status,Education Level,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,...,History of Mental Illness,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions,smoking_alchol,Age.1,Income.1,Age^2,Age Income,Income^2
0,31,1,1,2,0,1,0,26265.67,1,1,...,1,0,1,1,0_1,31.0,26265.67,961.0,814235.77,6.898854e+08
1,55,1,0,1,0,0,1,42710.36,2,0,...,1,0,0,1,0_2,55.0,42710.36,3025.0,2349069.80,1.824175e+09
2,78,3,2,1,0,0,1,125332.79,0,0,...,0,0,1,0,0_0,78.0,125332.79,6084.0,9775957.62,1.570831e+10
3,58,2,2,3,0,2,0,9992.78,1,1,...,0,0,0,0,0_1,58.0,9992.78,3364.0,579581.24,9.985565e+07
4,18,0,0,0,0,0,0,8595.08,0,1,...,1,0,1,1,0_0,18.0,8595.08,324.0,154711.44,7.387540e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413763,68,1,2,0,1,2,1,109233.43,0,2,...,0,0,0,0,1_0,68.0,109233.43,4624.0,7427873.24,1.193194e+10
413764,26,0,1,0,2,1,1,96760.97,0,2,...,1,1,0,1,2_0,26.0,96760.97,676.0,2515785.22,9.362685e+09
413765,57,1,1,0,0,0,1,77353.26,1,1,...,0,0,1,1,0_1,57.0,77353.26,3249.0,4409135.82,5.983527e+09
413766,71,1,3,2,0,0,0,24557.08,1,1,...,0,1,0,0,0_1,71.0,24557.08,5041.0,1743552.68,6.030502e+08


# handling class imbalance

In [110]:
from imblearn.over_sampling import SMOTE
x = df.drop('Chronic Medical Conditions',axis=1)
y = df['Chronic Medical Conditions']
smote = SMOTE()
x_resampled,y_resampled = smote.fit_resample(x,y)

x_train , x_test , y_train , y_test = train_test_split(x_resampled,y_resampled,train_size=0.8,random_state=100)

In [111]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [112]:
x_train

Unnamed: 0,Age,Marital Status,Education Level,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Mental Illness,History of Substance Abuse,Family History of Depression,smoking_alchol,Age.1,Income.1,Age^2,Age Income,Income^2
481685,52,1,3,1,1,2,1,114921.407091,0,1,0,0,0,0,10.36312,52.541843,114921.407091,2771.977222,6.038195e+06,1.320693e+10
534542,37,1,0,1,0,1,1,44499.645375,1,1,1,0,0,0,4.139789,37.686021,44499.645375,1420.451580,1.677015e+06,1.980218e+09
329626,34,1,2,0,1,0,1,80068.420000,2,1,2,0,0,0,12.0,34.000000,80068.420000,1156.000000,2.722326e+06,6.410952e+09
320189,31,1,3,4,1,0,1,33345.090000,1,0,1,0,0,1,11.0,31.000000,33345.090000,961.000000,1.033698e+06,1.111895e+09
258638,30,0,3,0,0,1,1,58088.430000,0,0,2,0,0,0,0.0,30.000000,58088.430000,900.000000,1.742653e+06,3.374266e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
344414,54,1,3,0,0,0,0,14602.360000,1,1,2,1,0,0,1.0,54.000000,14602.360000,2916.000000,7.885274e+05,2.132289e+08
212912,63,1,1,1,1,0,1,93833.340000,2,0,0,0,0,0,12.0,63.000000,93833.340000,3969.000000,5.911500e+06,8.804696e+09
65615,24,0,1,0,0,1,1,70330.740000,0,2,2,0,0,0,0.0,24.000000,70330.740000,576.000000,1.687938e+06,4.946413e+09
210755,42,1,3,1,1,0,1,38174.670000,0,0,0,0,0,0,10.0,42.000000,38174.670000,1764.000000,1.603336e+06,1.457305e+09


In [113]:
rf = RandomForestClassifier()
rf.fit(x_train,y_train)
ypred = rf.predict(x_test)
print(classification_report(y_test,ypred))

              precision    recall  f1-score   support

           0       0.66      0.86      0.75     55459
           1       0.80      0.57      0.66     55566

    accuracy                           0.71    111025
   macro avg       0.73      0.71      0.71    111025
weighted avg       0.73      0.71      0.71    111025

