In [46]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [87]:
train_weather_dummied=pd.read_csv('C:/Users/frl/Desktop/DSI13/MyDSI_11Feb/Projects/project_4/train_weather_dummied.csv')
test_weather_dummied=pd.read_csv('C:/Users/frl/Desktop/DSI13/MyDSI_11Feb/Projects/project_4/test_weather_dummied.csv')

In [63]:
# WnvPresent
train_weather_dummied.columns.difference(test_weather_dummied.columns)

Index(['WnvPresent'], dtype='object')

In [94]:
#Id 
test_weather_dummied.columns.difference(train_weather_dummied.columns)
test_weather_dummied.drop(columns=['Id'],inplace=True)

In [65]:
features=[col for col in train_weather_dummied if col !='WnvPresent']

In [66]:
X=train_weather_dummied[features]
y=train_weather_dummied['WnvPresent']

In [55]:
#create performance metrics
def get_metrics(y_true, y_predict, print_scores = True):
    matrix_def = [['tn','fp'], ['fn','tp']]
    matrix = confusion_matrix(y_true, y_predict)
    tn, fp, fn, tp = matrix.ravel()
    accuracy = (tp+tn)/(tn+fp+fn+tp)
    misclass = 1-accuracy
    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)
    precision = tp/(tp+fp)
    if print_scores:
        print('Matrix Definition')
        print(np.array(matrix_def))
        print('')
        print('Confusion Matrix')
        print(matrix)
        print('')
        print('METRICS')
        print('accuracy:', accuracy)
        print('misclass:', misclass)
        print('sensitivity:', sensitivity)
        print('specificity:', specificity)
    else:
        return accuracy, misclass, sensitivity, specificity, precision

# Scaling

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=41)

In [68]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Logistic Regression without PCA

In [112]:
logreg = LogisticRegression()
np.random.seed(41)
params = {
    'penalty': ['l1', 'l2'],
    'C': np.linspace(0.1, 5, 5),
    'class_weight':['balanced']
}
gs_logreg = GridSearchCV(logreg, param_grid=params , scoring='roc_auc', return_train_score=True)

In [113]:
gs_logreg.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': array([0.1  , 1.325, 2.55 , 3.775, 5.   ]),
                         'class_weight': ['balanced'],
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='roc_auc', verbose=0)

In [114]:
gs_logreg.best_params_

{'C': 2.5500000000000003, 'class_weight': 'balanced', 'penalty': 'l1'}

In [115]:
gs_logreg.score(X_train, y_train), gs_logreg.score(X_test, y_test)

(0.8098991488228385, 0.8094547363476867)

In [116]:
get_metrics(gs_logreg.predict(X_test), y_test)

Matrix Definition
[['tn' 'fp']
 ['fn' 'tp']]

Confusion Matrix
[[1584   29]
 [ 714   97]]

METRICS
accuracy: 0.6934818481848185
misclass: 0.3065181518151815
sensitivity: 0.11960542540073983
specificity: 0.9820210787352759


In [101]:
feats = pd.Series(abs(gs_logreg.best_estimator_.coef_.T.reshape(-1)), index=X.columns)
feats[abs(feats).sort_values(ascending=False).index].head()

Depart        2.094780
month         1.242403
sun_period    0.702061
DewPoint      0.656268
Tmax          0.653354
dtype: float64

In [117]:
preds_logreg=gs_logreg.predict(test_weather_dummied)
preds_logreg = pd.Series(data=preds_logreg, index=range(1,preds_logreg.shape[0]+1))
preds_logreg.to_csv('./preds_logreg.csv', index=True, index_label='Id', header=['WnvPresent'])
#Kaggle submission score 0.50000

# Logistic Regression with PCA

In [118]:
pca = PCA()
pca.fit(X_train)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

In [119]:
gs_logreg.fit(X_train_pca, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': array([0.1  , 1.325, 2.55 , 3.775, 5.   ]),
                         'class_weight': ['balanced'],
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='roc_auc', verbose=0)

In [121]:
gs_logreg.score(X_train_pca, y_train), gs_logreg.score(X_test_pca, y_test)

(0.8099122343823847, 0.8094029314655947)

In [122]:
get_metrics(gs_logreg.predict(X_test_pca), y_test)

Matrix Definition
[['tn' 'fp']
 ['fn' 'tp']]

Confusion Matrix
[[1576   29]
 [ 722   97]]

METRICS
accuracy: 0.6901815181518152
misclass: 0.30981848184818483
sensitivity: 0.11843711843711843
specificity: 0.9819314641744549


In [123]:
feats = pd.Series(abs(gs_logreg.best_estimator_.coef_.T.reshape(-1)))
feats[abs(feats).sort_values(ascending=False).index]

22    3.812782
20    2.154476
17    1.494583
23    1.356908
16    1.213380
7     0.958757
19    0.934341
15    0.716608
6     0.647265
18    0.611723
1     0.556808
3     0.530184
12    0.479863
21    0.425033
11    0.298745
0     0.263600
4     0.178789
2     0.129619
13    0.093635
10    0.076127
14    0.061016
5     0.035934
8     0.004276
9     0.000000
25    0.000000
24    0.000000
26    0.000000
dtype: float64

In [124]:
feats = pd.Series(data=pca.components_[0], index=X.columns)
feats[abs(feats).sort_values(ascending=False).index]

WetBulb                          -0.359087
Tavg                             -0.358063
Tmin                             -0.345932
cool                             -0.342014
DewPoint                         -0.341905
Tmax                             -0.331558
Depart                           -0.294681
heat                              0.266740
SeaLevel                          0.211042
StnPressure                       0.199516
PrecipTotal                      -0.109197
sun_period                       -0.088735
ResultDir                        -0.074600
AvgSpeed                         -0.061817
year                              0.047628
Longitude                        -0.032640
month                             0.029846
Species_CULEX PIPIENS            -0.027741
ResultSpeed                      -0.026557
Species_CULEX TARSALIS            0.025483
Species_CULEX RESTUANS            0.024881
Latitude                          0.023399
Species_CULEX TERRITANS          -0.016774
Species_CUL

In [125]:
preds_logreg=gs_logreg.predict(test_weather_dummied)
preds_logreg = pd.Series(data=preds_logreg, index=range(1,preds_logreg.shape[0]+1))
preds_logreg.to_csv('./preds_logreg.csv', index=True, index_label='Id', header=['WnvPresent'])
#Kaggle submission score 0.5

# Decision Tree without PCA

In [109]:
params = {
    'min_samples_split' : range(2,10),
    'min_samples_leaf': range(2,10),
}

gs_dt = GridSearchCV(DecisionTreeClassifier(), param_grid=params, scoring='roc_auc',)

In [110]:
gs_dt.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'min_samples_leaf': range(2, 10),
                         'min_samples_split': range(2, 10)},
           

In [111]:
gs_dt.score(X_train, y_train), gs_dt.score(X_test, y_test)

(0.9480193466149197, 0.7174095486758673)

In [127]:
get_metrics(gs_dt.predict(X_test), y_test)

Matrix Definition
[['tn' 'fp']
 ['fn' 'tp']]

Confusion Matrix
[[2271  115]
 [  27   11]]

METRICS
accuracy: 0.9414191419141914
misclass: 0.058580858085808596
sensitivity: 0.2894736842105263
specificity: 0.951802179379715


In [128]:
pd.Series(gs_dt.best_estimator_.feature_importances_, index=X.columns).sort_values(ascending=False).head(10)

Latitude                          0.261703
Longitude                         0.225760
sun_period                        0.175006
Tmin                              0.064381
Species_CULEX PIPIENS/RESTUANS    0.049347
Species_CULEX RESTUANS            0.037282
cool                              0.034604
year                              0.033510
WetBulb                           0.027546
Species_CULEX PIPIENS             0.019874
dtype: float64

In [126]:
preds_dt=gs_dt.predict(test_weather_dummied)
preds_dt = pd.Series(data=preds_dt, index=range(1,preds_dt.shape[0]+1))
preds_dt.to_csv('./preds_dt.csv', index=True, index_label='Id', header=['WnvPresent'])
#Kaggle submission score 0.49983

# Random forest without PCA

In [131]:
params = {
    'class_weight': ['balanced'],
    'n_estimators' : [20,30,40],
    'max_depth' : [1,2,3,4,5],
    'min_samples_leaf': [1,2,3,4],
}

gs_rf = GridSearchCV(RandomForestClassifier(), param_grid=params, scoring='roc_auc')

In [132]:
gs_rf.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
           

In [133]:
gs_rf.best_params_

{'class_weight': 'balanced',
 'max_depth': 5,
 'min_samples_leaf': 2,
 'n_estimators': 30}

In [134]:
gs_rf.score(X_train, y_train), gs_rf.score(X_test, y_test)

(0.8657400422740548, 0.836524514070206)

In [135]:
get_metrics(gs_rf.predict(X_test), y_test)

Matrix Definition
[['tn' 'fp']
 ['fn' 'tp']]

Confusion Matrix
[[1666   24]
 [ 632  102]]

METRICS
accuracy: 0.7293729372937293
misclass: 0.2706270627062707
sensitivity: 0.13896457765667575
specificity: 0.9857988165680474


In [35]:
pd.Series(gs_rf.best_estimator_.feature_importances_, index=X.columns).sort_values(ascending=False).head(10)

sun_period     0.164013
month          0.157993
Tavg           0.083932
Tmin           0.068155
cool           0.063606
Tmax           0.056858
AvgSpeed       0.051401
Depart         0.048889
year           0.044545
ResultSpeed    0.043539
dtype: float64

In [136]:
preds_rf=gs_rf.predict(test_weather_dummied)
preds_rf = pd.Series(data=preds_rf, index=range(1,preds_rf.shape[0]+1))
preds_rf.to_csv('./preds_rf.csv', index=True, index_label='Id', header=['WnvPresent'])
#Kaggle submission score 0.42298

# AdaBoost starting with logistic regression

In [137]:
np.random.seed(41)
ada = AdaBoostClassifier()
params = {
        'base_estimator':[LogisticRegression(class_weight='balanced')],
        'n_estimators': [500,1000, 1500],
}
gs_ada = GridSearchCV(ada, param_grid=params, scoring='roc_auc', verbose=1, return_train_score=True,)

%time gs_ada.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 3 candidates, totalling 9 fits






























































































































































































































































































































































































[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  2.4min finished




































































Wall time: 2min 56s


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=None,
                                          learning_rate=1.0, n_estimators=50,
                                          random_state=None),
             iid='warn', n_jobs=None,
             param_grid={'base_estimator': [LogisticRegression(C=1.0,
                                                               class_weight='balanced',
                                                               dual=False,
                                                               fit_intercept=True,
                                                               intercept_scaling=1,
                                                               l1_ratio=None,
                                                               max_iter=100,
                                                               multi_class='warn',
 

In [138]:
gs_ada.best_params_

{'base_estimator': LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                    fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                    max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                    random_state=None, solver='warn', tol=0.0001, verbose=0,
                    warm_start=False), 'n_estimators': 1500}

In [139]:
gs_ada.score(X_train, y_train), gs_ada.score(X_test, y_test)

(0.8071026877739308, 0.8093580339011148)

In [140]:
get_metrics(gs_ada.predict(X_test), y_test)

Matrix Definition
[['tn' 'fp']
 ['fn' 'tp']]

Confusion Matrix
[[1578   27]
 [ 720   99]]

METRICS
accuracy: 0.6918316831683168
misclass: 0.3081683168316832
sensitivity: 0.12087912087912088
specificity: 0.983177570093458


In [141]:
preds_ada=gs_ada.predict(test_weather_dummied)
preds_ada = pd.Series(data=preds_ada, index=range(1,preds_ada.shape[0]+1))
preds_ada.to_csv('./preds_ada.csv', index=True, index_label='Id', header=['WnvPresent'])
#Kaggle submission score 0.50000
# long computation time

# KNN

In [142]:
knn = KNeighborsClassifier()
params = {
    'n_neighbors': range(2,20),  
    'weights':['uniform']
}
gs_knn = GridSearchCV(knn, param_grid = params, scoring='roc_auc', verbose=1, return_train_score=True)
gs_knn.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:  2.3min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'n_neighbors': range(2, 20), 'weights': ['uniform']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='roc_auc', verbose=1)

In [144]:
gs_knn.best_params_

{'n_neighbors': 19, 'weights': 'uniform'}

In [145]:
gs_knn.score(X_train, y_train), gs_knn.score(X_test, y_test)

(0.895707128243102, 0.8023799162833105)

In [143]:
preds_knn=gs_knn.predict(test_weather_dummied)
preds_knn = pd.Series(data=preds_knn, index=range(1,preds_knn.shape[0]+1))
preds_knn.to_csv('./preds_knn.csv', index=True, index_label='Id', header=['WnvPresent'])
#Kaggle submission score 0.50000

# SVC

In [149]:
svm = SVC()
params = {
    'C': [1, 10,20],
    'gamma': [0.001, 0.005],
    'kernel': ['linear','rbf','poly','sigmoid'],
    'class_weight': ['balanced'],
}

gs_svm = GridSearchCV(svm, param_grid = params, scoring='roc_auc', verbose=1, return_train_score=True)
gs_svm.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed:  5.5min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 10, 20], 'class_weight': ['balanced'],
                         'gamma': [0.001, 0.005],
                         'kernel': ['linear', 'rbf', 'poly', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='roc_auc', verbose=1)

In [151]:
gs_svm.best_params_

{'C': 20, 'class_weight': 'balanced', 'gamma': 0.005, 'kernel': 'rbf'}

In [152]:
gs_svm.score(X_train, y_train), gs_svm.score(X_test, y_test)

(0.864175355734785, 0.8553158716344094)

In [153]:
preds_svm=gs_svm.predict(test_weather_dummied)
preds_svm = pd.Series(data=preds_svm, index=range(1,preds_svm.shape[0]+1))
preds_svm.to_csv('./preds_svm.csv', index=True, index_label='Id', header=['WnvPresent'])
#Kaggle submission score 0.50000