In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Goal: predict whether a student will be diagnosed with a mental health disorder.
df = pd.read_csv('./02_Student_Mental_Health_2021-10-10.csv')
# Keep only the rows whose Diagnosis value is below 3.0.
data = df.loc[df["Diagnosis"] < 3.0]

# Percentage of missing values per column. Diagnostic only: it does not drive
# the column drop below.
miss_data = data.isnull().sum() * 100 / len(data)

# Hand-picked column positions to discard (not derived from miss_data).
drops = [0, 1, 2, 3, 6, 21, 118]
data = data.drop(columns=data.columns[drops])

# Move the target to the last position: pop() removes it in place and
# assign() re-appends it as the final column.
col = data.pop("Diagnosis")
data = data.assign(Diagnosis=col)

# Make train and test sets. The target is selected by name so the split stays
# correct regardless of how many feature columns remain after the drop.
x = data.drop(columns="Diagnosis")
y = data["Diagnosis"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=500)


In [None]:
# Baseline: a single decision tree fitted on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = tree.DecisionTreeClassifier(random_state=500).fit(x_train, y_train)

# Pair each training column with its importance and rank from most to least important.
importances = model.feature_importances_
feature_importances = pd.DataFrame(
    {'Feature': x_train.columns, 'Importance': importances}
).sort_values('Importance', ascending=False)

print(feature_importances)

In [None]:
# Candidate hyper-parameters for the random forest (not consumed by this cell).
hyp_params_RF = {'n_estimators': [20, 50, 100, 300, 500, 1000, 2000], 'random_state':[25, 125, 200, 300, 500]}
# Make train and test sets; the target is selected by name instead of a fixed
# column position so the split cannot silently pick the wrong column.
x = data.drop(columns="Diagnosis")
y = data["Diagnosis"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=500)

model_RF = RandomForestClassifier(n_estimators=2250, oob_score=True, random_state=300)
# Fit the forest itself; fitting any other estimator here would discard the
# RandomForestClassifier constructed on the previous line.
model_RF = model_RF.fit(x_train, y_train)
pred = model_RF.predict(x_test)
# NOTE(review): ROC AUC is computed from hard class predictions, not
# probabilities, so it reflects a single operating point - confirm this is intended.
print(roc_auc_score(y_test, pred))
print(accuracy_score(y_test, pred))

In [None]:
# Split by column name so the target is always "Diagnosis", whatever the
# number of feature columns in `data`.
x = data.drop(columns="Diagnosis")
y = data["Diagnosis"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=500)

model_RF = RandomForestClassifier(n_estimators=2250, oob_score=True, random_state=300)
# Fit the random forest itself so the importances below come from this model.
model_RF = model_RF.fit(x_train, y_train)

# Rank every training column by the forest's importance score.
importances = model_RF.feature_importances_
feature_importances = pd.DataFrame({'Feature': x_train.columns, 'Importance': importances})
feature_importances = feature_importances.sort_values('Importance', ascending=False)

# Three nested feature sets: a lower threshold keeps more features, so
# "big" is the largest set and "sml" the smallest.
chosen_features_big = feature_importances.loc[feature_importances['Importance'] > 0.002]
chosen_features_med = feature_importances.loc[feature_importances['Importance'] > 0.005]
chosen_features_sml = feature_importances.loc[feature_importances['Importance'] > 0.011]

# Column lists for each set, with the target appended as the final column.
feat_sml = [*chosen_features_sml['Feature'], 'Diagnosis']
feat_med = [*chosen_features_med['Feature'], 'Diagnosis']
feat_big = [*chosen_features_big['Feature'], 'Diagnosis']

# Make new dataframes with only the important features (plus the target).
df_best_sml = data[feat_sml]
df_best_med = data[feat_med]
df_best_big = data[feat_big]
df_best_med.shape


In [None]:

# Make train and test sets from the smallest feature subset.
# The target is taken by name: the number of selected features depends on the
# importance threshold, so a hard-coded column position would silently pick a
# feature column as the label whenever that count changes.
x = df_best_sml.drop(columns="Diagnosis")
y = df_best_sml["Diagnosis"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=500)

model_RF = RandomForestClassifier(n_estimators=2250, oob_score=True, random_state=300)
model_RF = model_RF.fit(x_train, y_train)
pred = model_RF.predict(x_test)
print(accuracy_score(y_test, pred))

In [None]:
# Make train and test sets from the medium feature subset.
# The target is taken by name: the number of selected features depends on the
# importance threshold, so a hard-coded column position would silently pick a
# feature column as the label whenever that count changes.
x = df_best_med.drop(columns="Diagnosis")
y = df_best_med["Diagnosis"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=500)

model_RF = RandomForestClassifier(n_estimators=2250, oob_score=True, random_state=300)
model_RF = model_RF.fit(x_train, y_train)
pred = model_RF.predict(x_test)
# NOTE(review): ROC AUC is computed from hard class predictions, not
# probabilities - confirm this is intended.
print(roc_auc_score(y_test, pred))
print(accuracy_score(y_test, pred))

In [None]:
# Make train and test sets from the largest feature subset.
# The target is taken by name: the number of selected features depends on the
# importance threshold, so a hard-coded column position would silently pick a
# feature column as the label whenever that count changes.
x = df_best_big.drop(columns="Diagnosis")
y = df_best_big["Diagnosis"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=500)

# This run uses 1000 trees.
model_RF = RandomForestClassifier(n_estimators=1000, oob_score=True, random_state=300)
model_RF = model_RF.fit(x_train, y_train)
pred = model_RF.predict(x_test)
# NOTE(review): ROC AUC is computed from hard class predictions, not
# probabilities - confirm this is intended.
print(roc_auc_score(y_test, pred))
print(accuracy_score(y_test, pred))

In [None]:
# Bar chart of the current `importances` array, one bar per feature position.
fig, ax = plt.subplots()
bar_positions = range(len(importances))
ax.bar(bar_positions, importances)
ax.set_xlabel('Feature')
ax.set_ylabel('Importance Score')
plt.show()

In [None]:
# Gradient boosting: fit one fixed configuration and rank features by its importances.
# Candidate hyper-parameter grid (not consumed by this cell; no search is run here).
hyp_params_GB = {'n_estimators': [20, 50, 100, 300, 500, 1000, 2000], 'learning_rate':[0.9, 0.5, 0.3, 0.1, 0.01, 0.001],
             'random_state':[25, 125, 200, 300, 500]}
model = GradientBoostingClassifier(n_estimators=50, learning_rate=0.05, random_state=300)
model = model.fit(x_train, y_train)
# Read the importances from the boosting model fitted above, so they always
# match the columns of the x_train it was trained on.
importances = model.feature_importances_
feature_importances = pd.DataFrame({'Feature': x_train.columns, 'Importance': importances})
feature_importances = feature_importances.sort_values('Importance', ascending=False)
# A lower threshold keeps more features: "big" is the largest set, "sml" the smallest.
chosen_features_big = feature_importances.loc[feature_importances['Importance'] > 0.002]
chosen_features_med = feature_importances.loc[feature_importances['Importance'] > 0.005]
chosen_features_sml = feature_importances.loc[feature_importances['Importance'] > 0.01]

In [None]:
# Bar chart of the current `importances` array, one bar per feature position.
fig, ax = plt.subplots()
n_features = len(importances)
ax.bar(range(n_features), importances)
ax.set_xlabel('Feature')
ax.set_ylabel('Importance Score')
plt.show()

In [None]:
# Neural network
from sklearn.neural_network import MLPClassifier

# Standardise the features. The scaler is fitted on the training split only and
# then applied to both splits, so no test-set statistics enter the fit.
# Note: x_train / x_test are rebound to the scaled arrays returned by transform().
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# random_state pins the weight initialisation so repeated runs give the same
# scores, in line with the seeded tree-based models in this notebook.
model = MLPClassifier(activation='logistic', alpha=.01, hidden_layer_sizes=(3, 5), max_iter=5000,
                      random_state=300)
model = model.fit(x_train, y_train)
pred = model.predict(x_test)
# NOTE(review): ROC AUC is computed from hard class predictions, not
# probabilities - confirm this is intended.
print(roc_auc_score(y_test, pred))
print(accuracy_score(y_test, pred))