In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings 
warnings.filterwarnings( "ignore")

# Preprocessing Required imports
from scipy.stats import zscore
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Model Imports
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier

# Accuracy and Performance Metric Imports
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score,roc_curve,classification_report,confusion_matrix,plot_confusion_matrix

In [2]:
!pip install pandas-profiling==2.7.1



In [3]:
depression_dataset = pd.read_csv('data.csv')

In [4]:
depression_dataset=depression_dataset.dropna()

In [5]:
depression_dataset['inter_dom'] = np.where(depression_dataset['inter_dom'] =='Dom', 0, depression_dataset['inter_dom'])
depression_dataset['inter_dom'] = np.where(depression_dataset['inter_dom'] =='Inter', 1, depression_dataset['inter_dom'])

depression_dataset['Region'] = np.where(depression_dataset['Region'] =='SA', 0, depression_dataset['Region'])
depression_dataset['Region'] = np.where(depression_dataset['Region'] =='EA', 1, depression_dataset['Region'])
depression_dataset['Region'] = np.where(depression_dataset['Region'] =='SEA', 2, depression_dataset['Region'])
depression_dataset['Region'] = np.where(depression_dataset['Region'] =='JAP', 3, depression_dataset['Region'])
depression_dataset['Region'] = np.where(depression_dataset['Region'] =='Others', 4, depression_dataset['Region'])

depression_dataset['Gender'] = np.where(depression_dataset['Gender'] =='Female', 0, depression_dataset['Gender'])
depression_dataset['Gender'] = np.where(depression_dataset['Gender'] =='Male', 1, depression_dataset['Gender'])

depression_dataset['Academic'] = np.where(depression_dataset['Academic'] =='Under', 0, depression_dataset['Academic'])
depression_dataset['Academic'] = np.where(depression_dataset['Academic'] =='Grad', 1, depression_dataset['Academic'])

depression_dataset['Stay_Cate'] = np.where(depression_dataset['Stay_Cate'] =='Short', 0, depression_dataset['Stay_Cate'])
depression_dataset['Stay_Cate'] = np.where(depression_dataset['Stay_Cate'] =='Medium', 1, depression_dataset['Stay_Cate'])
depression_dataset['Stay_Cate'] = np.where(depression_dataset['Stay_Cate'] =='Long', 2, depression_dataset['Stay_Cate'])

depression_dataset['DepType'] = np.where(depression_dataset['DepType'] =='No', 0, depression_dataset['DepType'])
depression_dataset['DepType'] = np.where(depression_dataset['DepType'] =='Major', 1, depression_dataset['DepType'])
depression_dataset['DepType'] = np.where(depression_dataset['DepType'] =='Other', 2, depression_dataset['DepType'])

depression_dataset['DepSev'] = np.where(depression_dataset['DepSev'] =='Min', 0, depression_dataset['DepSev'])
depression_dataset['DepSev'] = np.where(depression_dataset['DepSev'] =='Mild', 1, depression_dataset['DepSev'])
depression_dataset['DepSev'] = np.where(depression_dataset['DepSev'] =='Mod', 2, depression_dataset['DepSev'])
depression_dataset['DepSev'] = np.where(depression_dataset['DepSev'] =='Sev', 3, depression_dataset['DepSev'])
depression_dataset['DepSev'] = np.where(depression_dataset['DepSev'] =='ModSev', 4, depression_dataset['DepSev'])

In [6]:
depression_dataset['Japanese_cate'] = np.where(depression_dataset['Japanese_cate'] =='Low', 0, depression_dataset['Japanese_cate'])
depression_dataset['Japanese_cate'] = np.where(depression_dataset['Japanese_cate'] =='Average', 1, depression_dataset['Japanese_cate'])
depression_dataset['Japanese_cate'] = np.where(depression_dataset['Japanese_cate'] =='High', 2, depression_dataset['Japanese_cate'])

depression_dataset['English_cate'] = np.where(depression_dataset['English_cate'] =='Low', 0, depression_dataset['English_cate'])
depression_dataset['English_cate'] = np.where(depression_dataset['English_cate'] =='Average', 1, depression_dataset['English_cate'])
depression_dataset['English_cate'] = np.where(depression_dataset['English_cate'] =='High', 2, depression_dataset['English_cate'])

In [7]:
depression_dataset['Intimate'] = np.where(depression_dataset['Intimate'] =='Yes', 0, depression_dataset['Intimate'])
depression_dataset['Intimate'] = np.where(depression_dataset['Intimate'] =='No', 1, depression_dataset['Intimate'])

depression_dataset['Religion'] = np.where(depression_dataset['Religion'] =='Yes', 0, depression_dataset['Religion'])
depression_dataset['Religion'] = np.where(depression_dataset['Religion'] =='No', 1, depression_dataset['Religion'])

depression_dataset['Suicide'] = np.where(depression_dataset['Suicide'] =='Yes', 0, depression_dataset['Suicide'])
depression_dataset['Suicide'] = np.where(depression_dataset['Suicide'] =='No', 1, depression_dataset['Suicide'])

depression_dataset['Dep'] = np.where(depression_dataset['Dep'] =='Yes', 0, depression_dataset['Dep'])
depression_dataset['Dep'] = np.where(depression_dataset['Dep'] =='No', 1, depression_dataset['Dep'])

depression_dataset['Partner_bi'] = np.where(depression_dataset['Partner_bi'] =='Yes', 0, depression_dataset['Partner_bi'])
depression_dataset['Partner_bi'] = np.where(depression_dataset['Partner_bi'] =='No', 1, depression_dataset['Partner_bi'])

depression_dataset['Friends_bi'] = np.where(depression_dataset['Friends_bi'] =='Yes', 0, depression_dataset['Friends_bi'])
depression_dataset['Friends_bi'] = np.where(depression_dataset['Friends_bi'] =='No', 1, depression_dataset['Friends_bi'])

depression_dataset['Parents_bi'] = np.where(depression_dataset['Parents_bi'] =='Yes', 0, depression_dataset['Parents_bi'])
depression_dataset['Parents_bi'] = np.where(depression_dataset['Parents_bi'] =='No', 1, depression_dataset['Parents_bi'])

depression_dataset['Relative_bi'] = np.where(depression_dataset['Relative_bi'] =='Yes', 0, depression_dataset['Relative_bi'])
depression_dataset['Relative_bi'] = np.where(depression_dataset['Relative_bi'] =='No', 1, depression_dataset['Relative_bi'])

depression_dataset['Professional_bi'] = np.where(depression_dataset['Professional_bi'] =='Yes', 0, depression_dataset['Professional_bi'])
depression_dataset['Professional_bi'] = np.where(depression_dataset['Professional_bi'] =='No', 1, depression_dataset['Professional_bi'])

depression_dataset['Phone_bi'] = np.where(depression_dataset['Phone_bi'] =='Yes', 0, depression_dataset['Phone_bi'])
depression_dataset['Phone_bi'] = np.where(depression_dataset['Phone_bi'] =='No', 1, depression_dataset['Phone_bi'])

depression_dataset['Doctor_bi'] = np.where(depression_dataset['Doctor_bi'] =='Yes', 0, depression_dataset['Doctor_bi'])
depression_dataset['Doctor_bi'] = np.where(depression_dataset['Doctor_bi'] =='No', 1, depression_dataset['Doctor_bi'])

depression_dataset['religion_bi'] = np.where(depression_dataset['religion_bi'] =='Yes', 0, depression_dataset['religion_bi'])
depression_dataset['religion_bi'] = np.where(depression_dataset['religion_bi'] =='No', 1, depression_dataset['religion_bi'])

depression_dataset['Alone_bi'] = np.where(depression_dataset['Alone_bi'] =='Yes', 0, depression_dataset['Alone_bi'])
depression_dataset['Alone_bi'] = np.where(depression_dataset['Alone_bi'] =='No', 1, depression_dataset['Alone_bi'])

depression_dataset['Others_bi'] = np.where(depression_dataset['Others_bi'] =='Yes', 0, depression_dataset['Others_bi'])
depression_dataset['Others_bi'] = np.where(depression_dataset['Others_bi'] =='No', 1, depression_dataset['Others_bi'])

depression_dataset['Internet_bi'] = np.where(depression_dataset['Internet_bi'] =='Yes', 0, depression_dataset['Internet_bi'])
depression_dataset['Internet_bi'] = np.where(depression_dataset['Internet_bi'] =='No', 1, depression_dataset['Internet_bi'])


In [8]:
depression_dataset = depression_dataset.drop(['Suicide','DepType','ToDep','DepSev'], axis = 1)

In [9]:
col_list = depression_dataset.columns

In [10]:
dataTypeSeries = depression_dataset.dtypes
print(dataTypeSeries)

inter_dom           object
Region              object
Gender              object
Academic            object
Age                  int64
Age_cate             int64
Stay                 int64
Stay_Cate           object
Japanese             int64
Japanese_cate       object
English              int64
English_cate        object
Intimate            object
Religion            object
Dep                 object
ToSC                 int64
APD                  int64
AHome                int64
APH                  int64
Afear                int64
ACS                  int64
AGuilt               int64
AMiscell             int64
ToAS                 int64
Partner              int64
Friends              int64
Parents              int64
Relative             int64
Profess              int64
 Phone               int64
Doctor               int64
Reli                 int64
Alone                int64
Others               int64
Internet           float64
Partner_bi          object
Friends_bi          object
P

In [11]:
X = depression_dataset.drop(['Dep'], axis=1)
y = depression_dataset['Dep']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3 , random_state=1)

In [13]:
print(X_train)

    inter_dom Region Gender Academic  Age  Age_cate  Stay Stay_Cate  Japanese  \
120         1      1      1        0   23         4     2         1         2   
84          1      2      0        0   21         3     4         2         3   
127         1      2      1        0   27         5     6         2         3   
172         1      2      1        0   21         3     3         1         3   
247         0      3      0        0   22         3     4         2         5   
..        ...    ...    ...      ...  ...       ...   ...       ...       ...   
162         1      2      0        0   19         2     1         0         2   
97          1      0      1        0   20         2     1         0         2   
165         1      0      0        0   21         3     1         0         2   
261         0      3      0        0   21         3     3         1         5   
62          1      2      0        0   21         3     3         1         1   

    Japanese_cate  ...  Fri

In [14]:
print(y_train)

120    0
84     0
127    0
172    0
247    1
      ..
162    1
97     1
165    1
261    1
62     0
Name: Dep, Length: 169, dtype: object


In [15]:
y_train=y_train.astype('int')

In [16]:
param_grid = {
   'max_depth': [7,8,9, 10, 11,12,13],
   'max_features': [5, 6, 7, 8, 9],
  'min_samples_leaf': [10, 15, 20, 25, 30],
  'min_samples_split': [30, 45, 60, 75, 90],
   'n_estimators': [71, 81, 91, 101],

}

rfCl = RandomForestClassifier()

In [17]:
param_grid = {
    'max_depth': [7,8,9,10],
    'max_features': [4, 5, 6, 7],
    'min_samples_leaf': [5, 10, 15, 20],
    'min_samples_split': [10, 20 , 30, 40, 50],
    'n_estimators': [91, 101, 111],
}

rfCl = RandomForestClassifier()

RFCl_model = GridSearchCV(estimator = rfCl, param_grid = param_grid, cv = 3)

In [18]:
RFCl_model.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [7, 8, 9, 10],
                         'max_features': [4, 5, 6, 7],
                         'min_samples_leaf': [5, 10, 15, 20],
                         'min_samples_split': [10, 20, 30, 40, 50],
                         'n_estimators': [91, 101, 111]})

In [19]:
RFCl_model.best_params_
best_grid_rfc = RFCl_model.best_estimator_

In [20]:
print(best_grid_rfc)

RandomForestClassifier(max_depth=7, max_features=4, min_samples_leaf=5,
                       min_samples_split=10, n_estimators=91)


In [21]:
Bagging_model=BaggingClassifier(base_estimator=best_grid_rfc,random_state=1)
Bagging_model.fit(X_train, y_train)

BaggingClassifier(base_estimator=RandomForestClassifier(max_depth=7,
                                                        max_features=4,
                                                        min_samples_leaf=5,
                                                        min_samples_split=10,
                                                        n_estimators=91),
                  random_state=1)

In [22]:
y_test_Bagging_predict = Bagging_model.predict(X_test)

In [23]:
print(y_test_Bagging_predict)

[1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1
 0 1 1 0 1 1 0 1 1 1 1 1 1 1 0 0 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 0 1]


In [24]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
def Performance_metrix(y_test, pred):
    TN, FP, FN, TP = confusion_matrix(y_test, pred).ravel()
    Population = TN+FN+TP+FP
    Accuracy   =(TP+TN) / Population
    Precision  = TP / (TP+FP) 
    NPV= TN / (TN+FN)
    Sensitivity= TP/(TP+FN)
    Specificity= TN/(TN+FP)
    print('Accuracy: ' , Accuracy, 'Sensitivity: ', Sensitivity, 'Specificity: ', Specificity,'\n\nPositive predictive value: ' , Precision, 'Negative predictive value: ',  NPV)

In [25]:
y_test=y_test.astype('int')

In [26]:
Performance_metrix(y_test,y_test_Bagging_predict)

Accuracy:  0.726027397260274 Sensitivity:  0.8823529411764706 Specificity:  0.36363636363636365 

Positive predictive value:  0.7627118644067796 Negative predictive value:  0.5714285714285714


In [27]:
roc=roc_auc_score(y_test, Bagging_model.predict_proba(X_test)[:,1])
print('\nAuc: ', roc)


Auc:  0.7557932263814617
