In [2]:
import numpy as np 
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import os
import re
import random
import math
import pandas as pd 
from sklearn import metrics
import matplotlib.pyplot as plt

In [3]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn import datasets

#Import scikit-learn metrics module for accuracy calculation
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

## Import Files

In [16]:
o = os.getcwd()

data = pd.read_csv(o + "\\..\\data\\cases_train_processed.csv", parse_dates = True)
data_test = pd.read_csv(o + "\\..\\data\\cases_test_processed.csv", parse_dates=True)
plot_path = o + "\\..\\plots\\"
results_path = o + "\\..\\results\\"

'''
# For Mac/Linux
data = pd.read_csv(o + "/../data/cases_train_processed.csv", parse_dates = True)
data_test = pd.read_csv(o + "/../data/cases_test_processed.csv", parse_dates = True)
plot_path = o + "/../plots/"
results_path = o + "/../results/"
'''

'\n# For Mac/Linux\ndata = pd.read_csv(o + "/../data/cases_train_processed.csv", parse_dates = True)\ndata_test = pd.read_csv(o + "/../data/cases_test_processed.csv", parse_dates = True)\nplot_path = o + "/../plots/"\nresults_path = o + "/../results/"\n'

## Categorize Functions

In [5]:
# converts string Series into a cataegorized int series for data analysis
def categorize_column(data):
    i = 0
    for value in tqdm(data.unique()):
        data.replace(value, i, inplace = True)
        i += 1
    data = data.apply(pd.to_numeric)
    return data

# converts string Series into a cataegorized int series1 for data analysis
def categorize_outcome(data):
    data = data.map({'nonhospitalized':0, 'deceased':1, 'recovered':2, 'hospitalized':3},na_action ='ignore')
    data = data.apply(pd.to_numeric)
    return data

## Data Cleanup Functions

In [19]:
def cleanup(data):
    data = data.drop(columns=['province', 'country'])
    data['sex'] = categorize_column(data['sex'])
    data['outcome'] = categorize_outcome(data['outcome'])
    data['Combined_Key'] = categorize_column(data['Combined_Key'])
    return data

## Data Output Functions

In [20]:
def output_predictions(p):
    pred = []
    arr = {0:'nonhospitalized', 1:'deceased',2:'recovered', 3:'hospitalized'}
    for i in p:
        pred.append(arr[i])
    with open(results_path+"predictions.txt", "w") as f:
        f.writelines('\n'.join(pred))

In [21]:
data = cleanup(data)
data_test = cleanup(data_test)

100%|██████████| 2/2 [00:00<00:00, 23.81it/s]
100%|██████████| 489/489 [00:34<00:00, 14.11it/s]
100%|██████████| 2/2 [00:00<00:00, 166.52it/s]
100%|██████████| 413/413 [00:04<00:00, 100.64it/s]


## Build and Run AdaBoostClassifier

In [9]:
abc  = AdaBoostClassifier()
params = {'n_estimators': [100,150], 'learning_rate': [1.5,2]}
cv = RepeatedStratifiedKFold(n_splits=15, n_repeats=3, random_state=1)
scoring = {'Overall_Accuracy': metrics.make_scorer(metrics.accuracy_score),
           'Overall_Recall': metrics.make_scorer(metrics.recall_score, average = 'macro'),
           'F1_Deceased': metrics.make_scorer(metrics.f1_score, labels=[1], average= 'micro'),
           'Recall_Deceased': metrics.make_scorer(metrics.recall_score, labels = [1], average = 'micro') }

X = data.drop(columns='outcome')
y = data['outcome']

#n_scores = cross_val_score(abc, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
gs = GridSearchCV(abc, param_grid = params, scoring= scoring, cv=cv, n_jobs=-1, refit = 'Recall_Deceased')

In [10]:
gs.fit(X,y)

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=15, random_state=1),
             estimator=AdaBoostClassifier(), n_jobs=-1,
             param_grid={'learning_rate': [1.5, 2], 'n_estimators': [100, 150]},
             refit='Recall_Deceased',
             scoring={'F1_Deceased': make_scorer(f1_score, labels=[1], average=micro),
                      'Overall_Accuracy': make_scorer(accuracy_score),
                      'Overall_Recall': make_scorer(recall_score, average=macro),
                      'Recall_Deceased': make_scorer(recall_score, labels=[1], average=micro)})

In [11]:
gs.cv_results_

{'mean_fit_time': array([50.56862567, 76.85331836, 50.84249849, 73.73908412]),
 'std_fit_time': array([ 1.90636197,  2.98639475,  2.57341293, 10.38398788]),
 'mean_score_time': array([0.95400356, 1.41500534, 0.89103434, 1.15757711]),
 'std_score_time': array([0.08280985, 0.1481631 , 0.15988959, 0.33343189]),
 'param_learning_rate': masked_array(data=[1.5, 1.5, 2, 2],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[100, 150, 100, 150],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'learning_rate': 1.5, 'n_estimators': 100},
  {'learning_rate': 1.5, 'n_estimators': 150},
  {'learning_rate': 2, 'n_estimators': 100},
  {'learning_rate': 2, 'n_estimators': 150}],
 'split0_test_Overall_Accuracy': array([0.6566082 , 0.56473602, 0.73002779, 0.73002779]),
 'split1_test_Overall_Accuracy': array([0.43322334, 0.37856026, 0.57389719, 0.57

In [12]:
gs.best_params_

{'learning_rate': 1.5, 'n_estimators': 150}

In [13]:
adb = AdaBoostClassifier(learning_rate=1.5, n_estimators=150)
adb.fit(X,y)

AdaBoostClassifier(learning_rate=1.5, n_estimators=150)

In [18]:
data_test.isna().sum()

age                        0
sex                        0
province                 573
country                    0
latitude                   0
longitude                  0
outcome                46500
Confirmed                  0
Deaths                     0
Recovered                  0
Active                     0
Combined_Key               0
Incidence_Rate             0
Case-Fatality_Ratio        0
dtype: int64

In [23]:
p = adb.predict(data_test.drop(columns='outcome'))
output_predictions(p)


In [None]:
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(cm, cmap=plt.cm.Oranges, alpha=0.3)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i,s=cm[i, j], va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()    