In [20]:
# Loading of packages
import pandas as pd  
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from xgboost import XGBClassifier, XGBRegressor
from sklearn.svm import SVC, SVR

In [2]:
from datetime import date
from datetime import datetime

date_list = []

mydateparser = lambda x: pd.datetime.strptime(x, "%d/%m/%Y")
df = pd.read_csv("2019_done.csv", parse_dates=['Date'], date_parser=mydateparser)

for row in df.iterrows():
    timestamp = datetime.timestamp(row[1][1])
    
    date_list.append(timestamp)

df["Date"] = date_list
df_new = df.copy()
df_new.info()
df_new.head()

  


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 604 entries, 0 to 603
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Closest Location        604 non-null    object 
 1   Date                    604 non-null    float64
 2   No. of Cases            604 non-null    float64
 3   precipIntensity         604 non-null    float64
 4   precipIntensityMax      604 non-null    float64
 5   precipIntensityMaxTime  604 non-null    int64  
 6   temperatureHigh         604 non-null    float64
 7   temperatureHighTime     604 non-null    int64  
 8   temperatureLow          604 non-null    float64
 9   temperatureLowTime      604 non-null    int64  
 10  humidity                604 non-null    float64
dtypes: float64(7), int64(3), object(1)
memory usage: 52.0+ KB


Unnamed: 0,Closest Location,Date,No. of Cases,precipIntensity,precipIntensityMax,precipIntensityMaxTime,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,humidity
0,Admiralty,1561997000.0,2.0,0.0147,0.0625,1560814860,84.05,1560847620,79.41,1560877200,0.92
1,Admiralty,1554221000.0,67.0,0.0001,0.0003,1553093940,92.27,1553058060,77.34,1553120640,0.78
2,Admiralty,1556813000.0,124.0,0.017,0.0578,1555665060,91.91,1555649460,78.45,1555704600,0.82
3,Admiralty,1559491000.0,343.0,0.0003,0.001,1558346100,90.53,1558338360,78.67,1558382220,0.83
4,Admiralty,1546531000.0,2.0,0.0129,0.0554,1545373560,88.6,1545367080,76.17,1545428340,0.88


In [3]:
indexed_dataSet, lookupTable = pd.factorize(df["Closest Location"].unique())

In [4]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df_cat = df_new.select_dtypes(['object']).copy()
df_int = df_new.select_dtypes(['int64']).copy()

for column in df_cat.columns:
    le.fit(df_cat[column].unique())
    df_new[column] = le.transform(df_cat[column])

In [5]:
# Get training and testing data
import math

X = df_new.drop(columns=['No. of Cases', 'precipIntensityMaxTime', 'temperatureHighTime', 'temperatureLowTime', 'Date'])
y = df["No. of Cases"]
bins = [0,30,math.pow(2, 32)]
labels= [0,1]
y = pd.cut(y, bins=bins, labels=labels, include_lowest=True)

X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.25, random_state=2)

In [59]:
# Check biased dataset
print('Not dengue cluster:', sum(y==0), 'Dengue cluster:', sum(y==1))

Not dengue cluster: 462 Dengue cluster: 142


In [42]:
# XGBoost classify
xgbc = XGBClassifier(objective='binary:logistic', random_state=42)
xgbc.fit(X_train, y_train)

print('training accuracy without tuning is: ', xgbc.score(X_train, y_train))
print('testing accuracy without tuning is: ', xgbc.score(X_test, y_test))
results  = cross_validate(xgbc, X, y, scoring='accuracy', cv=5, return_train_score = True)
print('cross validate training accuracy is:', sum(results['train_score'])/len(results['train_score']))
print('cross validate testing accuracy is:', sum(results['test_score'])/len(results['test_score']))

training accuracy without tuning is:  0.9050772626931567
testing accuracy without tuning is:  0.7947019867549668
cross validate training accuracy is: 0.9205405266670088
cross validate testing accuracy is: 0.4525068870523416


In [24]:
# SVC (test)
svc = SVC(random_state=42, probability=True)
svc.fit(X_train, y_train)

print('training accuracy without tuning is: ', svc.score(X_train, y_train))
print('testing accuracy without tuning is: ', svc.score(X_test, y_test))
results  = cross_validate(svc, X, y, scoring='accuracy', cv=5, return_train_score = True)
print('cross validate training accuracy is:', sum(results['train_score'])/len(results['train_score']))
print('cross validate testing accuracy is:', sum(results['test_score'])/len(results['test_score']))

training accuracy without tuning is:  0.7748344370860927
testing accuracy without tuning is:  0.7350993377483444
cross validate training accuracy is: 0.7649008435569701
cross validate testing accuracy is: 0.7649035812672176


In [12]:
# grid search
param_grid = {"criterion" : ["gini", "entropy"], "max_depth": [1, 2, 4, 6, 8],
              "min_samples_leaf" : [1, 5, 10], "n_estimators": [5, 10, 20, 50, 100]}

gs = GridSearchCV(estimator=xgbc, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=1)

gs = gs.fit(X, y)

print(gs.best_score_)
print(gs.best_params_)

Fitting 5 folds for each of 600 candidates, totalling 3000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed:   14.5s
[Parallel(n_jobs=-1)]: Done 1644 tasks      | elapsed:   30.3s


0.7649035812672176
{'criterion': 'gini', 'learning_rate': 0.01, 'max_depth': 2, 'min_samples_leaf': 1, 'n_estimators': 5}


[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | elapsed:   56.8s finished


In [44]:
xgbc_final = XGBClassifier(
                  objective='binary:logistic',
                  criterion= 'gini',
                  max_depth= 1,
                  min_samples_leaf= 1,
                  n_estimators= 50,
                  random_state=42)

xgbc_final.fit(X_train, y_train)

print('===Final Model===')
print('training accuracy after tuning: ', xgbc_final.score(X_train,y_train))
print('testing accuracy after tuning: ', xgbc_final.score(X_test,y_test))
results  = cross_validate(xgbc_final, X, y, scoring='accuracy', cv=5, return_train_score = True)
print('cross validate training accuracy is:', sum(results['train_score'])/len(results['train_score']))
print('cross validate testing accuracy is:', sum(results['test_score'])/len(results['test_score']))

===Final Model===
training accuracy after tuning:  0.7748344370860927
testing accuracy after tuning:  0.7350993377483444
cross validate training accuracy is: 0.7649008435569701
cross validate testing accuracy is: 0.7649035812672176


In [16]:
import joblib

filename = 'xg_boost.sav'
joblib.dump(xgbc_final, filename)

['xg_boost02.sav']

In [17]:
loaded_model = joblib.load(filename)
result = loaded_model.score(X_test, y_test)
print(loaded_model.predict_proba(X_train))

[[0.5104462  0.48955384]
 [0.51884556 0.48115447]
 [0.5104462  0.48955384]
 [0.5104462  0.48955384]
 [0.51884556 0.48115447]
 [0.51884556 0.48115447]
 [0.50724113 0.49275887]
 [0.51884556 0.48115447]
 [0.51884556 0.48115447]
 [0.51884556 0.48115447]
 [0.51884556 0.48115447]
 [0.5104462  0.48955384]
 [0.51884556 0.48115447]
 [0.5104462  0.48955384]
 [0.51884556 0.48115447]
 [0.50724113 0.49275887]
 [0.50724113 0.49275887]
 [0.51884556 0.48115447]
 [0.51884556 0.48115447]
 [0.51884556 0.48115447]
 [0.5104462  0.48955384]
 [0.5104462  0.48955384]
 [0.51884556 0.48115447]
 [0.5104462  0.48955384]
 [0.5104462  0.48955384]
 [0.50724113 0.49275887]
 [0.5104462  0.48955384]
 [0.5104462  0.48955384]
 [0.51884556 0.48115447]
 [0.5104462  0.48955384]
 [0.51884556 0.48115447]
 [0.5104462  0.48955384]
 [0.5104462  0.48955384]
 [0.51884556 0.48115447]
 [0.5104462  0.48955384]
 [0.51884556 0.48115447]
 [0.5104462  0.48955384]
 [0.51884556 0.48115447]
 [0.51884556 0.48115447]
 [0.5104462  0.48955384]


In [11]:
xgbc_final.predict_proba(X_test)

array([[0.780614  , 0.219386  ],
       [0.8405073 , 0.15949269],
       [0.7577425 , 0.2422575 ],
       [0.77152526, 0.2284747 ],
       [0.7927897 , 0.2072103 ],
       [0.7239187 , 0.27608132],
       [0.77152526, 0.2284747 ],
       [0.78315836, 0.21684165],
       [0.6814109 , 0.31858912],
       [0.77152526, 0.2284747 ],
       [0.7847487 , 0.2152513 ],
       [0.7029885 , 0.29701146],
       [0.7669456 , 0.23305443],
       [0.6959572 , 0.30404285],
       [0.7451896 , 0.2548104 ],
       [0.8638271 , 0.13617288],
       [0.79443866, 0.20556134],
       [0.9170401 , 0.0829599 ],
       [0.78315836, 0.21684165],
       [0.8493781 , 0.15062192],
       [0.9338805 , 0.06611947],
       [0.8798777 , 0.12012231],
       [0.692986  , 0.307014  ],
       [0.88623524, 0.11376477],
       [0.9170401 , 0.0829599 ],
       [0.692986  , 0.307014  ],
       [0.748351  , 0.251649  ],
       [0.7927897 , 0.2072103 ],
       [0.72548056, 0.2745194 ],
       [0.8493781 , 0.15062192],
       [0.