In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, recall_score, precision_score
import psycopg2
import time

  """)


In [2]:
table_cols = [('age_group', 'reporting_phu_city', 'cause_of_infection', 'gender', 'Resolved', 'Fatal', 'Special_Measure_Key')]
fact_table = []

def connect():
    """ Connect to the PostgreSQL database server """
    conn = None
    try:
        # connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        return psycopg2.connect(
            host="www.eecs.uottawa.ca",
            database="group_21",
            user="****",
            password="****",
            port="15432"
        )
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)


connection = connect()
print('***Connection established***')

cur = connection.cursor()
        
cur.execute('SELECT I.age_group, I.reporting_phu_city,\
            I.cause_of_infection, I.gender, F."Resolved",\
            F."Fatal", F."Special_Measure_Key" FROM fact_table_v2 as F\
            INNER JOIN individuals as I on I.individual_id = F."Individual_Key"')
rows = cur.fetchall()
fact_table = rows
print(fact_table[:10])

cur.close()
connection.close()
print('***Connection terminated***')

Connecting to the PostgreSQL database...
***Connection established***
[('40s', 'Toronto', 'OB', 'FEMALE', 'yes', 'no', 11), ('30s', 'Toronto', 'OB', 'FEMALE', 'yes', 'no', 11), ('90+', 'Toronto', 'OB', 'FEMALE', 'no', 'yes', 11), ('80s', 'Toronto', 'OB', 'FEMALE', 'yes', 'no', 11), ('50s', 'Toronto', 'OB', 'FEMALE', 'yes', 'no', 11), ('30s', 'Toronto', 'OB', 'FEMALE', 'yes', 'no', 11), ('50s', 'Toronto', 'OB', 'FEMALE', 'yes', 'no', 11), ('50s', 'Toronto', 'OB', 'MALE', 'yes', 'no', 11), ('20s', 'Toronto', 'OB', 'FEMALE', 'yes', 'no', 11), ('80s', 'Toronto', 'NO KNOWN EPI LINK', 'FEMALE', 'yes', 'no', 2)]
***Connection terminated***


In [3]:
result_df = pd.DataFrame(fact_table, columns=["age_group", "reporting_phu_city", "cause_of_infection", 
                                               "gender", "Resolved","Fatal","Special_Measure_Key"])
result_df.head()

Unnamed: 0,age_group,reporting_phu_city,cause_of_infection,gender,Resolved,Fatal,Special_Measure_Key
0,40s,Toronto,OB,FEMALE,yes,no,11
1,30s,Toronto,OB,FEMALE,yes,no,11
2,90+,Toronto,OB,FEMALE,no,yes,11
3,80s,Toronto,OB,FEMALE,yes,no,11
4,50s,Toronto,OB,FEMALE,yes,no,11


In [4]:
new_result_df = pd.get_dummies(result_df, prefix=["age_group", "reporting_phu_city", "cause_of_infection", "gender", "Resolved", "Fatal"])
new_result_df = pd.concat([new_result_df, result_df["age_group"]], axis=1)
X = new_result_df.values
new_result_df.head()

Unnamed: 0,Special_Measure_Key,age_group_20s,age_group_30s,age_group_40s,age_group_50s,age_group_60s,age_group_70s,age_group_80s,age_group_90+,age_group_<20,age_group_UNKNOWN,reporting_phu_city_Ottawa,reporting_phu_city_Toronto,cause_of_infection_CC,cause_of_infection_MISSING INFORMATION,cause_of_infection_NO KNOWN EPI LINK,cause_of_infection_OB,cause_of_infection_TRAVEL,gender_FEMALE,gender_GENDER DIVERSE,gender_MALE,gender_UNSPECIFIED,Resolved_no,Resolved_yes,Fatal_no,Fatal_yes,age_group
0,11,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,1,0,40s
1,11,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,1,0,30s
2,11,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,1,90+
3,11,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,1,0,80s
4,11,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,1,0,50s


In [5]:
# Create Labeler and Encoder
labelencoder = LabelEncoder()
encoder = OneHotEncoder(dtype=int, handle_unknown='ignore')
# Assigning numerical values and storing in another column
new_result_df['Special_Measure_Key_Tmp'] = labelencoder.fit_transform(new_result_df['Special_Measure_Key'])
new_result_df.head()

Unnamed: 0,Special_Measure_Key,age_group_20s,age_group_30s,age_group_40s,age_group_50s,age_group_60s,age_group_70s,age_group_80s,age_group_90+,age_group_<20,age_group_UNKNOWN,reporting_phu_city_Ottawa,reporting_phu_city_Toronto,cause_of_infection_CC,cause_of_infection_MISSING INFORMATION,cause_of_infection_NO KNOWN EPI LINK,cause_of_infection_OB,cause_of_infection_TRAVEL,gender_FEMALE,gender_GENDER DIVERSE,gender_MALE,gender_UNSPECIFIED,Resolved_no,Resolved_yes,Fatal_no,Fatal_yes,age_group,Special_Measure_Key_Tmp
0,11,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,1,0,40s,6
1,11,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,1,0,30s,6
2,11,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,1,90+,6
3,11,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,1,0,80s,6
4,11,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,1,0,50s,6


In [6]:
# passing bridge-types-cat column (label encoded values of bridge_types)
enc_df = pd.DataFrame(encoder.fit_transform(new_result_df[['Special_Measure_Key_Tmp']]).toarray())
#  merge with main df bridge_df on key values
new_result_df = new_result_df.join(enc_df)
new_result_df.rename(columns={6: "lockdown", 10: "stage 3 extended to toronto", 2: "Restart", 1: "Gatineau-Ottawa Travel Restriction", 3: "Parks Re-opened"},inplace = True)
new_result_df = new_result_df.drop(['Special_Measure_Key_Tmp'], axis=1)
new_result_df.head()

Unnamed: 0,Special_Measure_Key,age_group_20s,age_group_30s,age_group_40s,age_group_50s,age_group_60s,age_group_70s,age_group_80s,age_group_90+,age_group_<20,age_group_UNKNOWN,reporting_phu_city_Ottawa,reporting_phu_city_Toronto,cause_of_infection_CC,cause_of_infection_MISSING INFORMATION,cause_of_infection_NO KNOWN EPI LINK,cause_of_infection_OB,cause_of_infection_TRAVEL,gender_FEMALE,gender_GENDER DIVERSE,gender_MALE,gender_UNSPECIFIED,Resolved_no,Resolved_yes,Fatal_no,Fatal_yes,age_group,0,Gatineau-Ottawa Travel Restriction,Restart,Parks Re-opened,4,5,lockdown
0,11,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,1,0,40s,0,0,0,0,0,0,1
1,11,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,1,0,30s,0,0,0,0,0,0,1
2,11,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,1,90+,0,0,0,0,0,0,1
3,11,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,1,0,80s,0,0,0,0,0,0,1
4,11,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,1,0,50s,0,0,0,0,0,0,1


In [7]:
new_result_df = new_result_df.drop(['Special_Measure_Key', 4, 5], axis=1)
new_result_df.head()

Unnamed: 0,age_group_20s,age_group_30s,age_group_40s,age_group_50s,age_group_60s,age_group_70s,age_group_80s,age_group_90+,age_group_<20,age_group_UNKNOWN,reporting_phu_city_Ottawa,reporting_phu_city_Toronto,cause_of_infection_CC,cause_of_infection_MISSING INFORMATION,cause_of_infection_NO KNOWN EPI LINK,cause_of_infection_OB,cause_of_infection_TRAVEL,gender_FEMALE,gender_GENDER DIVERSE,gender_MALE,gender_UNSPECIFIED,Resolved_no,Resolved_yes,Fatal_no,Fatal_yes,age_group,0,Gatineau-Ottawa Travel Restriction,Restart,Parks Re-opened,lockdown
0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,1,0,40s,0,0,0,0,1
1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,1,0,30s,0,0,0,0,1
2,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,1,90+,0,0,0,0,1
3,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,1,0,80s,0,0,0,0,1
4,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,1,0,50s,0,0,0,0,1


In [8]:
from sklearn.utils import resample

# put undersampling into function
def undersample(column_name):
  df_majority = new_result_df[new_result_df[column_name]==False]
  df_minority = new_result_df[new_result_df[column_name]==True]

  df_majority_upsampled = resample(df_majority, 
                                  replace=True,     # sample with replacement
                                  n_samples=len(df_minority.index),    # to match minority class
                                  random_state=42)

  df_upsampled = pd.concat([df_minority, df_majority_upsampled])

  print(df_upsampled[column_name].value_counts())

  return df_upsampled

In [11]:
df_age_group_20s = undersample("age_group_20s")
df_age_group_30s = undersample("age_group_30s")
df_age_group_40s = undersample("age_group_40s")
df_age_group_50s = undersample("age_group_50s")
df_age_group_60s = undersample("age_group_60s")
df_age_group_70s = undersample("age_group_70s")
df_age_group_80s = undersample("age_group_80s")
df_age_group_90 = undersample("age_group_90+")
df_age_group_20 = undersample("age_group_<20")

1    2131
0    2131
Name: age_group_20s, dtype: int64
1    2037
0    2037
Name: age_group_30s, dtype: int64
1    2107
0    2107
Name: age_group_40s, dtype: int64
1    2397
0    2397
Name: age_group_50s, dtype: int64
1    1644
0    1644
Name: age_group_60s, dtype: int64
1    1121
0    1121
Name: age_group_70s, dtype: int64
1    1687
0    1687
Name: age_group_80s, dtype: int64
1    1283
0    1283
Name: age_group_90+, dtype: int64
1    848
0    848
Name: age_group_<20, dtype: int64


In [24]:
df_age_group_20s.drop(['age_group_30s', 'age_group_40s', 'age_group_50s', 'age_group_60s', 'age_group_70s', 'age_group_80s', 'age_group_90+','age_group_<20', 'age_group_UNKNOWN', 'age_group', 0], axis=1, inplace=True)

In [25]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import export_text
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score
from collections import Counter

y = df_age_group_20s['age_group_20s']
df_age_group_20s.drop(['age_group_20s'], axis=1, inplace=True)
X = df_age_group_20s.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True, stratify=y)

print("Training set {} ".format(Counter(y_train)))
print("Test set {} ".format(Counter(y_test)))

Training set Counter({1: 1428, 0: 1427}) 
Test set Counter({0: 704, 1: 703}) 


In [27]:
import time
clf = GradientBoostingClassifier(n_estimators=20, max_features=2, max_depth=2, random_state=0)
t0 = time.perf_counter()
clf.fit(X_train, y_train)
t1 = time.perf_counter() - t0

In [31]:
print("Accuracy score (training): {0:.3f}".format(clf.score(X_train, y_train)))
print("Accuracy score (testing): {0:.3f}".format(clf.score(X_test, y_test)))

from sklearn.metrics import classification_report
predictions = clf.predict(X_test)
print("Classification Report")
print(classification_report(y_test, predictions))

Accuracy score (training): 0.663
Accuracy score (testing): 0.637
Classification Report
              precision    recall  f1-score   support

           0       0.67      0.55      0.60       704
           1       0.62      0.73      0.67       703

    accuracy                           0.64      1407
   macro avg       0.64      0.64      0.63      1407
weighted avg       0.64      0.64      0.63      1407

