In [None]:
import datetime
import time
#Can be very helpful to notice any imbalance in classes
from collections import Counter
from configparser import ConfigParser

import numpy as np
import pandas as pd
import psycopg2
from imblearn.under_sampling import NearMiss
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
#Sourced from https://www.postgresqltutorial.com/postgresql-python/connect/
def config(filename='psql_sample.ini', section='postgresql'):
    """Load one section of an INI file as a plain dictionary.

    Args:
        filename: Path of the INI file to read.
        section: Name of the section whose options are returned.

    Returns:
        dict mapping each option name in ``section`` to its string value.

    Raises:
        FileNotFoundError: If ``filename`` could not be read at all.
        Exception: If the file was read but does not contain ``section``.
    """
    parser = ConfigParser()

    # ConfigParser.read() skips unreadable files without complaint and returns
    # the list of files it actually parsed; an empty list therefore means the
    # file was never loaded. Report that instead of a misleading
    # "section not found" error.
    if not parser.read(filename):
        raise FileNotFoundError(
            'Config file {0} not found or unreadable'.format(filename)
        )

    if not parser.has_section(section):
        raise Exception('Section {0} not found in the {1} file'.format(section, filename))

    return dict(parser.items(section))

In [None]:
#Get the configuration file as a python dictionary
cfg = config()

#Establish the connection and create a cursor to the database.
#Both names are used by the query cell and by the cell that closes them.
try:
    print("Here's an attempt to connect to the database")
    conn = psycopg2.connect(**cfg)
    cursor = conn.cursor()
    print("Look's like it was a success")
except psycopg2.Error as error:
    # Show the driver's message, then re-raise: swallowing the failure would
    # leave `conn` and `cursor` undefined and make the following cells fail
    # with an unrelated NameError.
    print(error)
    raise

In [None]:
#SELECT QUERY
#Join the fact table to its mobility, weather, patient, special-measures,
#PHU-location and reported-date dimensions.
#The order of the selected columns fixes the layout of every tuple in
#result_list, so any column labels applied later must follow this exact order.
select_query = """
    SELECT age_group, gender, status, holiday, phu,
           daily_high, daily_low, rain_amount, snow_amount,
           retail_and_recreation_percentage,
           grocery_and_pharmacy_percentage,
           workplaces_percentage,
           residential_percentage,
           parks_percentage,
           is_fatal, is_resolved, is_unresolved
    FROM data_mart.fact_table AS fact
    INNER JOIN data_mart.mobility_dimension AS mobility
        ON fact.mobility_key = mobility.mobility_key
    INNER JOIN data_mart.weather_dimension AS weather
        ON fact.weather_key = weather.weather_key
    INNER JOIN data_mart.patient_dimension AS patient
        ON fact.patient_key = patient.patient_key
    INNER JOIN data_mart.special_measures_dimension AS measures
        ON fact.special_measures_key = measures.special_measures_key
    INNER JOIN data_mart.phu_location_dimension AS phu
        ON fact.phu_location_key = phu.phu_location_key
    INNER JOIN data_mart.reported_date_dimension AS date
        ON fact.reported_date_key = date.reported_date_key
"""

try:
    #Lets get our data
    cursor.execute(select_query)

    #Get the complete result set. It will be a list of tuples where each tuple is a row from the result set
    result_list = cursor.fetchall()
except psycopg2.Error as error:
    # Print the database message and re-raise so the notebook stops here
    # instead of continuing with `result_list` undefined.
    print(error)
    raise

In [None]:
#Ensure to run this cell at the end of all your experiments to close all connections
#The cursor is closed first, then the connection it was created from (conn.cursor()).
#After this cell runs, neither object can be used for further queries.
cursor.close()
conn.close()

In [None]:
#Build a DataFrame from the fetched rows.
#The labels are listed in exactly the order used by the SELECT statement:
#rows are plain tuples, so pandas assigns names by position only. A label list
#in any other order silently mislabels the data (e.g. `status` values ending up
#under `holiday`).
result_columns = [
    "age_group", "gender", "status", "holiday", "phu",
    "daily_high", "daily_low", "rain_amount", "snow_amount",
    "retail_and_recreation_percentage",
    "grocery_and_pharmacy_percentage",
    "workplaces_percentage",
    "residential_percentage",
    "parks_percentage",
    "is_fatal", "is_resolved", "is_unresolved",
]
result_df = pd.DataFrame(result_list, columns=result_columns)
result_df.head()

In [None]:
#Data preprocessing
#One-hot encoding: pass the whole frame through pd.get_dummies and keep the
#encoded copy under its own name; result_df itself is left untouched.
new_result = result_df.pipe(pd.get_dummies)
new_result.head()

In [None]:
#Removing null values:
#Replace missing park-mobility readings with the column mean.
#The column is called "parks_percentage" in result_df. The filled Series is
#assigned back explicitly because fillna(..., inplace=True) on a column
#selection operates on a temporary object and is not guaranteed to update
#the parent frame.
result_df["parks_percentage"] = result_df["parks_percentage"].fillna(
    result_df["parks_percentage"].mean()
)

In [None]:
#Normalizing data:
#Numeric mobility and weather columns, using the names that exist in result_df.
#(No transit column is selected by the query, so none is listed here.)
numeric_columns = [
    "retail_and_recreation_percentage",
    "grocery_and_pharmacy_percentage",
    "parks_percentage",
    "workplaces_percentage",
    "residential_percentage",
    "daily_high",
    "daily_low",
    "rain_amount",
    "snow_amount",
]
transform_data = result_df[numeric_columns]

#norm='l2' with the default axis=1 rescales every ROW to unit Euclidean length.
#NOTE(review): if per-column scaling was intended, a scaler such as
#StandardScaler/MinMaxScaler is the usual choice - confirm the intent.
X_normalized = preprocessing.normalize(transform_data, norm='l2')

#Reuse the source index so the concat below lines rows up by label.
normalize_part = pd.DataFrame(
    X_normalized, columns=transform_data.columns, index=transform_data.index
)
non_numerical = result_df[["status","age_group","is_fatal","is_resolved","is_unresolved"]]
result_data = pd.concat([non_numerical, normalize_part], axis=1)

In [None]:
#Undersampling of majority classes:
#The label is "is_resolved". Remove it (and any indicator columns get_dummies
#may have derived from it) from the feature matrix, otherwise the classifiers
#are trained on the very value they are asked to predict.
#NOTE(review): is_fatal / is_unresolved look like sibling outcome flags and may
#leak the label as well - confirm whether they should stay in the features.
label_columns = [
    col for col in new_result.columns
    if col == "is_resolved" or col.startswith("is_resolved_")
]
X = new_result.drop(columns=label_columns).values
y = result_df["is_resolved"]

#NearMiss version 1 with 3 neighbours; returns the resampled features and labels.
undersample = NearMiss(version=1, n_neighbors=3)
X_under, y_under = undersample.fit_resample(X, y)

In [None]:
#PART B
#Gradient Boosting

#Stratified hold-out split of the undersampled data; random_state makes the
#split (and therefore the reported scores) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_under, y_under, test_size=0.33, shuffle=True, stratify=y_under, random_state=0
)

#Time model construction together with prediction on the test set.
#perf_counter() is a monotonic clock meant for measuring elapsed time.
gb_start = time.perf_counter()
classifier = GradientBoostingClassifier(n_estimators=20, learning_rate=0.75, max_features=2, max_depth=2, random_state=0)
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
gb_elapsed_ms = (time.perf_counter() - gb_start) * 1000
print('time in milliseconds')
print(gb_elapsed_ms)

print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))

print("Classification Report")
print(classification_report(y_test, predictions))

#scikit-learn metrics take (y_true, y_pred). Passing them the other way round
#leaves accuracy unchanged but swaps recall and precision.
accuracy = accuracy_score(y_test, predictions) * 100
recall = recall_score(y_test, predictions) * 100
precision = precision_score(y_test, predictions) * 100

print(accuracy)
print(recall)
print(precision)


In [None]:
#Random Forest
#Stratified 80/20 split of the undersampled data, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_under, y_under, test_size=0.2, shuffle=True, stratify=y_under, random_state=0
)

#create and fit random forest, timing only the construction (fit) step
start = time.perf_counter()
rf = RandomForestClassifier(max_depth=2, random_state=0)
rf.fit(X_train, y_train)
end = time.perf_counter()

y_pred = rf.predict(X_test)

#scikit-learn metrics take (y_true, y_pred). Passing them the other way round
#leaves accuracy unchanged but swaps recall and precision.
accuracy = accuracy_score(y_test, y_pred) * 100
recall = recall_score(y_test, y_pred) * 100
precision = precision_score(y_test, y_pred) * 100

print("Accuracy of Random Forest: {:.2f} %".format(accuracy))
print("Precision of Random Forest: {:.2f} %".format(precision))
print("Recall of Random Forest: {:.2f} %".format(recall))
#Convert to milliseconds before rounding to avoid float artefacts in the output.
print("Random forest construction time: ", round((end - start) * 1000, 1), "milliseconds")


In [None]:
#Decision Tree Algorithm:
#Stratified 80/20 split of the undersampled data, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_under, y_under, test_size=0.2, shuffle=True, stratify=y_under, random_state=0
)

#Entropy-based tree; random_state fixes tie-breaking between equally good splits.
dt = DecisionTreeClassifier(criterion='entropy', random_state=0)
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)

#scikit-learn metrics take (y_true, y_pred). Passing them the other way round
#leaves accuracy unchanged but swaps recall and precision.
accuracy = accuracy_score(y_test, y_pred) * 100
recall = recall_score(y_test, y_pred) * 100
precision = precision_score(y_test, y_pred) * 100

#Separator added so the value is not glued to the label ("Decision Tree95.00 %").
print("Accuracy of Decision Tree: {:.2f} %".format(accuracy))
print("Recall of Decision Tree: {:.2f} %".format(recall))
print("precision of Decision Tree: {:.2f} %".format(precision))
