In [24]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score

In [2]:
#Reading in county demographic information
df_county_info = pd.read_csv('/content/drive/MyDrive/CSI4142_Inputs/County_Information.csv')

In [3]:
#Reading in wellness table information
df_wellness_fact = pd.read_csv('/content/drive/MyDrive/CSI4142_Inputs/wellness_fact.csv')

In [4]:
#Reading in air quality information
df_air_quality = pd.read_csv('/content/drive/MyDrive/CSI4142_Inputs/air_quality_updated.csv')

  df_air_quality = pd.read_csv('/content/drive/MyDrive/CSI4142_Inputs/air_quality_updated.csv')


In [5]:
#Reading in weather information
df_weather = pd.read_csv('/content/drive/MyDrive/CSI4142_Inputs/weather_data.csv')

In [6]:
#Adding Total Population column to wellness information
df = pd.merge(df_wellness_fact, df_county_info[["TOT_POP", "County_Key"]], left_on= 'county_key', right_on="County_Key", how='left')

In [7]:
#Adding average temperature
df = pd.merge(df, df_weather[["average_temperature", "weather_id"]], left_on= 'weather_key', right_on="weather_id", how='left')

In [8]:
#Adding air quality (PM2.5)
df = pd.merge(df, df_air_quality[["arithmetic_mean PM2.5 - Local Conditions quarterly means of daily means Micrograms/cubic meter (LC)", "index"]], left_on= 'air_key', right_on="index", how='left')

In [9]:
#Dropping unnecessary columns
df=df.drop(columns=["County_Key", "weather_id","index"])

In [10]:
#Renaming PM2.5 column
df = df.rename(columns={"arithmetic_mean PM2.5 - Local Conditions quarterly means of daily means Micrograms/cubic meter (LC)": "PM25"})

In [11]:
#Creating indicator for mentally unhealthy days (0 if less than median, 1 if greater)
df['mentally_unhealthy_indicator'] = (df['mentally_unhealthy']>df['mentally_unhealthy'].median()).astype('int')

In [12]:
df

Unnamed: 0,fact_id,weather_key,county_key,air_key,ypll,fair_poor_health,physically_unhealthy,mentally_unhealthy,smokers,obese,drinking,unemployed,uninsured,TOT_POP,average_temperature,PM25,mentally_unhealthy_indicator
0,1,244,5840,15,8221.7,13.38,3.57,4.06,23.09,24.5,17.85,4.10,20.90,2112,66.06,10.238525,1
1,2,245,36813,16,8321.8,12.80,3.40,4.10,21.90,26.2,19.40,8.80,20.90,186579,67.73,9.953913,1
2,3,246,36814,17,8091.8,12.50,3.40,4.20,22.70,25.5,19.70,8.90,16.40,190203,68.71,9.200826,1
3,4,247,36815,18,7770.2,12.50,3.30,3.80,22.60,25.5,18.70,8.10,19.10,194978,66.68,8.383784,0
4,5,248,36816,19,7770.2,12.70,3.30,3.80,20.60,26.6,17.90,6.77,16.58,199306,65.77,8.906838,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6246,6247,397563,42299,33021,5307.8,21.00,4.21,3.58,16.04,27.7,15.95,4.01,21.53,9324,45.63,6.504128,0
6247,6248,397564,42300,33022,5329.5,21.28,3.86,3.57,15.09,29.3,16.54,3.78,19.01,9236,43.88,6.440299,0
6248,6249,397565,42301,33023,8703.6,21.28,3.86,3.57,15.09,28.7,16.54,3.49,17.72,9144,42.48,5.642816,0
6249,6250,397566,42302,33024,8207.7,21.55,3.99,4.09,14.44,25.9,17.03,3.37,19.78,9237,45.47,5.433871,1


In [13]:
#Choosing columns to base classfier on
feature_cols = ['ypll', 'unemployed', 'uninsured', 'TOT_POP', 'average_temperature', 'PM25']

In [14]:
#Dropping all rows with any null value
df = df.dropna()

In [15]:
df

Unnamed: 0,fact_id,weather_key,county_key,air_key,ypll,fair_poor_health,physically_unhealthy,mentally_unhealthy,smokers,obese,drinking,unemployed,uninsured,TOT_POP,average_temperature,PM25,mentally_unhealthy_indicator
0,1,244,5840,15,8221.7,13.38,3.57,4.06,23.09,24.5,17.85,4.10,20.90,2112,66.06,10.238525,1
1,2,245,36813,16,8321.8,12.80,3.40,4.10,21.90,26.2,19.40,8.80,20.90,186579,67.73,9.953913,1
2,3,246,36814,17,8091.8,12.50,3.40,4.20,22.70,25.5,19.70,8.90,16.40,190203,68.71,9.200826,1
3,4,247,36815,18,7770.2,12.50,3.30,3.80,22.60,25.5,18.70,8.10,19.10,194978,66.68,8.383784,0
4,5,248,36816,19,7770.2,12.70,3.30,3.80,20.60,26.6,17.90,6.77,16.58,199306,65.77,8.906838,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6246,6247,397563,42299,33021,5307.8,21.00,4.21,3.58,16.04,27.7,15.95,4.01,21.53,9324,45.63,6.504128,0
6247,6248,397564,42300,33022,5329.5,21.28,3.86,3.57,15.09,29.3,16.54,3.78,19.01,9236,43.88,6.440299,0
6248,6249,397565,42301,33023,8703.6,21.28,3.86,3.57,15.09,28.7,16.54,3.49,17.72,9144,42.48,5.642816,0
6249,6250,397566,42302,33024,8207.7,21.55,3.99,4.09,14.44,25.9,17.03,3.37,19.78,9237,45.47,5.433871,1


In [16]:
X = df[feature_cols]

In [17]:
y = df.mentally_unhealthy_indicator

In [18]:
#Creating training set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [19]:
clf = DecisionTreeClassifier()

#Training Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predicting the response for test dataset
y_pred_dt = clf.predict(X_test)

In [26]:
#Printing confusion matrix and accuracy of Decision Tree classifier
print("Decision Tree Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_dt, labels=[1,0]))
print("Precision:",precision_score(y_test, y_pred_dt))
print("Recall:",recall_score(y_test, y_pred_dt))
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_dt))

Decision Tree Confusion Matrix:
[[346 141]
 [152 469]]
Precision: 0.6947791164658634
Recall: 0.7104722792607803
Accuracy: 0.7355595667870036


In [27]:
gb_clf2 = GradientBoostingClassifier()

#Training Gradient Booster classifier
gb_clf2.fit(X_train, y_train)

#Predicting the response of test dataset
y_pred_gb = gb_clf2.predict(X_test)

In [28]:
#Printing confusion matrix and accuracy of Gradient Booster classifier
print("Gradient Booster Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_gb, labels=[1,0]))
print("Precision:",precision_score(y_test, y_pred_gb))
print("Recall:",recall_score(y_test, y_pred_gb))
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_gb))

Gradient Booster Confusion Matrix:
[[372 115]
 [118 503]]
Precision: 0.7591836734693878
Recall: 0.7638603696098563
Accuracy: 0.7897111913357401


In [30]:
rf = RandomForestClassifier()

#Training Random Forest Classifier
rf.fit(X_train, y_train)

#Predicting the response of test dataset
y_pred_rf = rf.predict(X_test)

In [31]:
#Printing confusion matrix and accuracy of Random Forest classifier
print("Random Forest Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf, labels=[1,0]))
print("Precision:",precision_score(y_test, y_pred_rf))
print("Recall:",recall_score(y_test, y_pred_rf))
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_rf))

Random Forest Confusion Matrix:
[[373 114]
 [ 94 527]]
Precision: 0.7987152034261242
Recall: 0.7659137577002053
Accuracy: 0.8122743682310469
