In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_val_score

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import numpy as np
from sklearn.model_selection import GridSearchCV

#### Read in cleaned test data and train data

In [3]:
master_train = pd.read_csv("assets/master_clean_train.csv", index_col=0)
master_test = pd.read_csv("assets/master_clean_test.csv", index_col=0)

print("master_train shape: ", master_train.shape)
print("master_test shape: ", master_test.shape)

master_train shape:  (9686, 28)
master_test shape:  (116293, 27)


#### Engineer some date features

In [4]:
def clean_dates(df):
    new_df = df.copy()
    new_df['dtdate'] = pd.to_datetime(df['dtdate'])
    new_df['month'] = new_df['dtdate'].dt.month
    new_df = pd.get_dummies(new_df, columns=['month'], drop_first=True)
    return new_df

In [5]:
master_train = clean_dates(master_train)
master_test = clean_dates(master_test)
print("master_train shape: ", master_train.shape)
print("master_test shape: ", master_test.shape)

master_train shape:  (9686, 33)
master_test shape:  (116293, 31)


In [6]:
print("Columns in input dataset: ")
print(list(master_train.columns))

Columns in input dataset: 
['address', 'block', 'street', 'trap', 'addressnumberandstreet', 'latitude', 'longitude', 'addressaccuracy', 'nummosquitos', 'wnvpresent', 'neighborhood', 'dtdate', 'species_culex_pipiens', 'species_culex_pipiens_restuans', 'species_culex_restuans', 'neighborhood_infection_category_high', 'neighborhood_infection_category_low', 'neighborhood_infection_category_medium', 'neighborhood_infection_category_none', 'park_score', 'daylight', 'avg_tavg', 'avg_preciptotal', 'avg_avgspeed', 'timelaggeddaylight', 'timelaggedtemperature', 'timelaggedprecipitation', 'timelaggedwindspeed', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10']


In [58]:
print("Columns in test dataset: ")
print(list(master_test.columns))

Columns in test dataset: 
['date', 'address', 'block', 'street', 'trap', 'addressnumberandstreet', 'latitude', 'longitude', 'addressaccuracy', 'neighborhood', 'dtdate', 'species_culex_pipiens', 'species_culex_pipiens_restuans', 'species_culex_restuans', 'neighborhood_infection_category_high', 'neighborhood_infection_category_low', 'neighborhood_infection_category_medium', 'neighborhood_infection_category_none', 'park_score', 'daylight', 'avg_tavg', 'avg_preciptotal', 'avg_avgspeed', 'timelaggeddaylight', 'timelaggedtemperature', 'timelaggedprecipitation', 'timelaggedwindspeed', 'month_7', 'month_8', 'month_9', 'month_10']


Have to put a dummy column for 'month_6' = 0 into 'master_test' 

In [78]:
new_test = pd.concat([master_test[['date', 'address', 'block', 'street', 'trap', 
                                  'addressnumberandstreet', 'latitude', 'longitude', 'addressaccuracy', 
                                  'neighborhood', 'dtdate', 'species_culex_pipiens', 'species_culex_pipiens_restuans', 
                                  'species_culex_restuans', 'neighborhood_infection_category_high', 
                                  'neighborhood_infection_category_low', 'neighborhood_infection_category_medium', 
                                  'neighborhood_infection_category_none', 'park_score', 'daylight', 'avg_tavg', 
                                  'avg_preciptotal', 'avg_avgspeed', 'timelaggeddaylight', 'timelaggedtemperature', 
                                  'timelaggedprecipitation', 'timelaggedwindspeed']], 
                      pd.DataFrame({'month_6':master_test['month_7'].astype('uint8') * 0}),
                      master_test[['month_7', 'month_8', 'month_9', 'month_10']]], axis=1)

In [79]:
new_test.columns

Index(['date', 'address', 'block', 'street', 'trap', 'addressnumberandstreet',
       'latitude', 'longitude', 'addressaccuracy', 'neighborhood', 'dtdate',
       'species_culex_pipiens', 'species_culex_pipiens_restuans',
       'species_culex_restuans', 'neighborhood_infection_category_high',
       'neighborhood_infection_category_low',
       'neighborhood_infection_category_medium',
       'neighborhood_infection_category_none', 'park_score', 'daylight',
       'avg_tavg', 'avg_preciptotal', 'avg_avgspeed', 'timelaggeddaylight',
       'timelaggedtemperature', 'timelaggedprecipitation',
       'timelaggedwindspeed', 'month_6', 'month_7', 'month_8', 'month_9',
       'month_10'],
      dtype='object')

In [80]:
new_test.dtypes

date                                              object
address                                           object
block                                              int64
street                                            object
trap                                              object
addressnumberandstreet                            object
latitude                                         float64
longitude                                        float64
addressaccuracy                                    int64
neighborhood                                      object
dtdate                                    datetime64[ns]
species_culex_pipiens                              int64
species_culex_pipiens_restuans                     int64
species_culex_restuans                             int64
neighborhood_infection_category_high               int64
neighborhood_infection_category_low                int64
neighborhood_infection_category_medium             int64
neighborhood_infection_category

In [83]:
master_test = new_test
del new_test

#### Start to hone in on predictors/features

In [7]:
target = 'wnvpresent'
features = ['species_culex_pipiens',
       'species_culex_pipiens_restuans', 'species_culex_restuans',
       'neighborhood_infection_category_high',
       'neighborhood_infection_category_low',
       'neighborhood_infection_category_medium',
       'neighborhood_infection_category_none', 'park_score', 'daylight',
       'avg_tavg', 'avg_preciptotal', 'avg_avgspeed', 'timelaggeddaylight',
       'timelaggedtemperature', 'timelaggedprecipitation',
       'timelaggedwindspeed', 'month_6', 'month_7', 'month_8', 'month_9',
       'month_10']
X = master_train[features]
y = master_train[target]

In [8]:
print(" ")
print("Features and types:")
print(X.dtypes, "\n--------")

 
Features and types:
species_culex_pipiens                       int64
species_culex_pipiens_restuans              int64
species_culex_restuans                      int64
neighborhood_infection_category_high        int64
neighborhood_infection_category_low         int64
neighborhood_infection_category_medium      int64
neighborhood_infection_category_none        int64
park_score                                float64
daylight                                    int64
avg_tavg                                  float64
avg_preciptotal                           float64
avg_avgspeed                              float64
timelaggeddaylight                        float64
timelaggedtemperature                     float64
timelaggedprecipitation                   float64
timelaggedwindspeed                       float64
month_6                                     uint8
month_7                                     uint8
month_8                                     uint8
month_9                     

#### Fix imbalanced classes by bootstrapping the minoroity class (WNV+) up to 50%

In [9]:
sm = SMOTE()

X_os, y_os = sm.fit_sample(X, y)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_os, y_os, random_state=0, test_size=0.25)

ss = StandardScaler()

X_train_s = ss.fit_transform(X_train)
X_test_s = ss.transform(X_test)

#### Trial with support-vector machine (SVM)

In [11]:
supvec = SVC(probability=True)

print("Cross-val ROC-AUC score: ", cross_val_score(supvec, X_train_s, y_train, cv = 5, scoring="roc_auc"))

Cross-val ROC-AUC score:  [0.92118507 0.92595617 0.92361146 0.92758571 0.92924797]


In [12]:
import time

In [23]:
time_start = time.time()
supvec.fit(X_train_s, y_train)
print("SVC results: ")
print("Test ROC-AUC score: ", roc_auc_score(y_test, supvec.predict(X_test_s)))
print('Elapsed time is {0:.2E}s'.format(time.time()-time_start))

SVC results: 
Test ROC-AUC score:  0.8591439717508326
Elapsed time is 3.40E+01s


#### Grid search #1 over params for SVM

In [32]:
gs_params_rbf = {'C': np.logspace(-1,1,3), 'gamma': np.logspace(-1,1,3)/X_train.shape[1]}
svm_rbf = SVC(probability=True)
gs_rbf = GridSearchCV(svm_rbf, gs_params_rbf, scoring='roc_auc', cv=5, verbose=1)

In [33]:
time_start = time.time()
gs_rbf.fit(X_os, y_os)
print('Elapsed time is {0:.2E}s'.format(time.time()-time_start))

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed: 38.1min finished


Elapsed time is 2.35E+03s


In [39]:
gs_rbf1_results = pd.DataFrame(gs_rbf.cv_results_).sort_values(by='mean_test_score', ascending=False)



In [40]:
gs_rbf1_results

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_C,param_gamma,params,rank_test_score,split0_test_score,split0_train_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
8,36.736293,0.583622,0.863535,0.960905,10.0,0.47619,"{'C': 10.0, 'gamma': 0.47619047619047616}",1,0.795067,0.965726,...,0.963156,0.947331,0.904408,0.953436,0.758295,0.980599,3.894218,0.084458,0.075443,0.011517
5,43.557114,0.809503,0.840094,0.951561,1.0,0.47619,"{'C': 1.0, 'gamma': 0.47619047619047616}",2,0.766952,0.956069,...,0.947225,0.93454,0.886332,0.94402,0.723674,0.975036,4.57897,0.058018,0.082249,0.013636
2,50.95327,1.059431,0.820475,0.925831,0.1,0.47619,"{'C': 0.1, 'gamma': 0.47619047619047616}",3,0.74468,0.933122,...,0.938104,0.902534,0.871278,0.914895,0.685372,0.958331,4.134727,0.089866,0.091883,0.018988
0,61.24852,1.40928,0.717882,0.82572,0.1,0.0047619,"{'C': 0.1, 'gamma': 0.004761904761904762}",4,0.640616,0.835501,...,0.837118,0.80206,0.735766,0.818939,0.700001,0.841319,2.095494,0.036778,0.067221,0.013931
1,48.794475,1.091076,0.710691,0.897617,0.1,0.047619,"{'C': 0.1, 'gamma': 0.047619047619047616}",5,0.629076,0.906716,...,0.862632,0.863347,0.724322,0.89056,0.589642,0.936079,4.508684,0.104569,0.095874,0.023771
3,51.966283,1.110733,0.703219,0.871279,1.0,0.0047619,"{'C': 1.0, 'gamma': 0.004761904761904762}",6,0.652371,0.874334,...,0.837265,0.845959,0.684534,0.865791,0.630257,0.901441,3.05064,0.04714,0.072525,0.017867
6,47.81789,0.910508,0.63669,0.899371,10.0,0.0047619,"{'C': 10.0, 'gamma': 0.004761904761904762}",7,0.618999,0.907986,...,0.737921,0.870819,0.615683,0.887464,0.546977,0.937315,3.994515,0.076668,0.062906,0.022397
4,38.651032,0.810701,0.631445,0.922295,1.0,0.047619,"{'C': 1.0, 'gamma': 0.047619047619047616}",8,0.574463,0.930635,...,0.720504,0.89483,0.650038,0.910324,0.532288,0.959412,4.837878,0.09246,0.068851,0.021823
7,35.114116,0.657879,0.611907,0.944376,10.0,0.047619,"{'C': 10.0, 'gamma': 0.047619047619047616}",9,0.537924,0.949588,...,0.67799,0.923741,0.642024,0.935686,0.533567,0.973291,5.322042,0.090946,0.063296,0.016659


Moderate values of gamma seem to do best.
No pattern in C independent of gamma.
Best mean_test_score is 0.86; std.dev. in test score is 0.07
Fit time is about 52s per grid point per CV fold.  Not great.

In [13]:
gs_params_rbf2 = {'C': np.logspace(-1.5,1.5,3), 'gamma': np.logspace(-0.5,0.5,4)/X_train.shape[1]}
svm_rbf2 = SVC(probability=True)
gs_rbf2 = GridSearchCV(svm_rbf2, gs_params_rbf2, scoring='roc_auc', cv=3, verbose=1, n_jobs=2)

In [14]:
time_start = time.time()
gs_rbf2.fit(X_os, y_os)
print('Elapsed time is {0:.2E}s'.format(time.time()-time_start))

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=2)]: Done  36 out of  36 | elapsed: 14.4min finished


Elapsed time is 9.29E+02s


In [15]:
gs_rbf2_results = pd.DataFrame(gs_rbf2.cv_results_).sort_values(by='mean_test_score', ascending=False)



In [16]:
gs_rbf2_results

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_C,param_gamma,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
11,28.869154,0.874209,0.813576,0.965823,31.6228,0.150585,"{'C': 31.622776601683793, 'gamma': 0.150584650...",1,0.761891,0.976262,0.941103,0.937813,0.737734,0.983394,9.148527,0.279124,0.090713,0.020019
7,34.749557,1.259824,0.760569,0.944041,1.0,0.150585,"{'C': 1.0, 'gamma': 0.15058465048420855}",2,0.714425,0.958922,0.879325,0.901932,0.687957,0.97127,8.104407,0.287621,0.084666,0.0302
3,57.262867,2.156327,0.753418,0.897915,0.0316228,0.150585,"{'C': 0.03162277660168379, 'gamma': 0.15058465...",3,0.715221,0.914331,0.83549,0.837893,0.709543,0.941521,6.666359,0.278724,0.05808,0.04387
10,29.550889,0.950233,0.734075,0.959992,31.6228,0.0698952,"{'C': 31.622776601683793, 'gamma': 0.069895203...",4,0.672746,0.971837,0.846905,0.926766,0.682574,0.981373,9.583956,0.260683,0.079884,0.023815
2,59.612161,2.20357,0.73093,0.888295,0.0316228,0.0698952,"{'C': 0.03162277660168379, 'gamma': 0.06989520...",5,0.678144,0.907554,0.817306,0.824981,0.697341,0.932352,5.435923,0.366762,0.061578,0.0459
1,60.886626,2.291911,0.724218,0.860907,0.0316228,0.0324425,"{'C': 0.03162277660168379, 'gamma': 0.03244247...",6,0.64256,0.87958,0.821868,0.805478,0.708227,0.897661,5.493723,0.191373,0.074071,0.039883
0,64.954729,2.638261,0.72033,0.833866,0.0316228,0.0150585,"{'C': 0.03162277660168379, 'gamma': 0.01505846...",7,0.635776,0.854164,0.825293,0.776867,0.69992,0.870567,6.005949,0.234673,0.078704,0.040857
6,34.307545,1.26216,0.685333,0.936033,1.0,0.0698952,"{'C': 1.0, 'gamma': 0.06989520322009854}",8,0.62803,0.952945,0.789139,0.887047,0.638828,0.968106,8.356345,0.281463,0.073535,0.035186
4,41.435957,1.547291,0.672806,0.906585,1.0,0.0150585,"{'C': 1.0, 'gamma': 0.015058465048420854}",9,0.604203,0.92693,0.783654,0.85239,0.630561,0.940433,6.837783,0.267834,0.079116,0.038716
5,36.758321,1.406724,0.655854,0.924318,1.0,0.0324425,"{'C': 1.0, 'gamma': 0.032442479478950535}",10,0.587269,0.941975,0.776873,0.869874,0.60342,0.961105,8.600725,0.340799,0.085827,0.039282


In [17]:
gs_params_rbf3 = {'C': np.logspace(0.5,2.5,3), 'gamma': np.logspace(0.75,1.75,4)/X_train.shape[1]}
svm_rbf3 = SVC(probability=True)
gs_rbf3 = GridSearchCV(svm_rbf3, gs_params_rbf3, scoring='roc_auc', cv=3, verbose=1, n_jobs=2)

In [18]:
time_start = time.time()
gs_rbf3.fit(X_os, y_os)
print('Elapsed time is {0:.2E}s'.format(time.time()-time_start))

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=2)]: Done  36 out of  36 | elapsed: 18.1min finished


Elapsed time is 1.20E+03s


In [19]:
gs_rbf3_results = pd.DataFrame(gs_rbf3.cv_results_).sort_values(by='mean_test_score', ascending=False)



In [20]:
gs_rbf3_results

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_C,param_gamma,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
9,45.770565,0.737462,0.894909,0.971779,316.228,0.576918,"{'C': 316.22776601683796, 'gamma': 0.576917932...",1,0.847691,0.97955,0.971769,0.95149,0.865268,0.984296,14.959328,0.184033,0.05482,0.014477
10,79.487707,0.935774,0.893435,0.970012,316.228,1.24293,"{'C': 316.22776601683796, 'gamma': 1.242932007...",2,0.852946,0.978078,0.974853,0.950702,0.852506,0.981256,19.889888,0.159695,0.057571,0.013716
5,35.754521,0.93194,0.880859,0.967246,31.6228,0.576918,"{'C': 31.622776601683793, 'gamma': 0.576917932...",3,0.829147,0.97594,0.965213,0.945537,0.848216,0.980261,6.873324,0.177779,0.060154,0.015452
11,154.204618,1.303085,0.879957,0.96947,316.228,2.67782,"{'C': 316.22776601683796, 'gamma': 2.677815834...",4,0.830659,0.975595,0.97068,0.951211,0.838531,0.981603,24.863209,0.122307,0.064231,0.013142
6,49.9463,1.281291,0.876195,0.965877,31.6228,1.24293,"{'C': 31.622776601683793, 'gamma': 1.242932007...",5,0.827954,0.973426,0.957911,0.944088,0.84272,0.980116,9.049907,0.161433,0.058096,0.015647
7,62.318635,1.56126,0.870101,0.966245,31.6228,2.67782,"{'C': 31.622776601683793, 'gamma': 2.677815834...",6,0.823509,0.974388,0.951917,0.944269,0.834879,0.980078,11.017876,0.168769,0.058038,0.015712
8,35.988752,0.756427,0.867388,0.971852,316.228,0.267782,"{'C': 316.22776601683796, 'gamma': 0.267781583...",7,0.811465,0.979809,0.964324,0.950021,0.826375,0.985726,11.706779,0.20581,0.068814,0.015625
2,43.124982,1.437496,0.85939,0.962439,3.16228,1.24293,"{'C': 3.1622776601683795, 'gamma': 1.242932007...",8,0.813238,0.971144,0.940519,0.937159,0.824414,0.979013,6.992025,0.248892,0.057548,0.018162
3,50.113158,1.71432,0.858941,0.962678,3.16228,2.67782,"{'C': 3.1622776601683795, 'gamma': 2.677815834...",9,0.820193,0.970966,0.937777,0.938074,0.818852,0.978995,7.177942,0.232073,0.055748,0.017704
1,37.809367,1.216079,0.853862,0.960989,3.16228,0.576918,"{'C': 3.1622776601683795, 'gamma': 0.576917932...",10,0.803008,0.971092,0.934503,0.932913,0.824075,0.978961,6.49173,0.227832,0.057667,0.020111


Oops -- forgot to scale my predictors for cross-validation.

In [21]:
ss = StandardScaler()
X_os_s = ss.fit_transform(X_os)

In [22]:
svm_rbf4 = SVC(probability=True, C=300, gamma=1, tol=1e-4)
scores = cross_val_score(svm_rbf4, X_os_s, y_os, cv = 3, scoring='roc_auc')

In [23]:
scores

array([0.80787534, 0.9324521 , 0.82393403])

In [24]:
np.mean(scores)

0.8547538217342153

In [25]:
svm_rbf4 = SVC(probability=True, C=300, gamma=1, tol=1e-4)
scores = cross_val_score(svm_rbf4, X_os, y_os, cv = 3, scoring='roc_auc')

In [26]:
scores

array([0.85645608, 0.97471542, 0.86286112])

In [27]:
np.mean(scores)

0.8980108728246373

Oh, well.  The grid search gave me good parameters for the unscaled data.  I think that's what I'm going to use.

In [84]:
features = ['species_culex_pipiens',
       'species_culex_pipiens_restuans', 'species_culex_restuans',
       'neighborhood_infection_category_high',
       'neighborhood_infection_category_low',
       'neighborhood_infection_category_medium',
       'neighborhood_infection_category_none', 'park_score', 'daylight',
       'avg_tavg', 'avg_preciptotal', 'avg_avgspeed', 'timelaggeddaylight',
       'timelaggedtemperature', 'timelaggedprecipitation',
       'timelaggedwindspeed', 'month_6', 'month_7', 'month_8', 'month_9',
       'month_10']
X_test = master_test[features].values

In [102]:
def output_to_kaggle(test_df, X, model, filename):
    p_array = model.predict_proba(X)
    p_series = pd.Series(p_array[:,1])
    out_df = pd.concat([test_df.index.to_series().reset_index(drop=True),p_series], axis=1)
    out_df.columns = ['Id', 'WnvPresent']
    out_df.to_csv(filename, header=True, index=False)
    return out_df

In [130]:
svm_final = SVC(probability=True, C=300, gamma=1, tol=1e-4)
svm_final.fit(X_os, y_os)

SVC(C=300, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.0001, verbose=False)

In [132]:
roc_auc_score(y_os,svm_final.predict_proba(X_os)[:,1])

0.9649836128762236

In [133]:
outfilename = r'output/kagglesubmission'+ '_' + time.strftime('%Y-%m-%d_%H-%M-%S', time.localtime()) + '.csv'
out_df = output_to_kaggle(master_test, X_test, svm_final,outfilename)

In [111]:
master_test.describe()

Unnamed: 0,block,latitude,longitude,addressaccuracy,species_culex_pipiens,species_culex_pipiens_restuans,species_culex_restuans,neighborhood_infection_category_high,neighborhood_infection_category_low,neighborhood_infection_category_medium,...,avg_avgspeed,timelaggeddaylight,timelaggedtemperature,timelaggedprecipitation,timelaggedwindspeed,month_6,month_7,month_8,month_9,month_10
count,116293.0,116293.0,116293.0,116293.0,116293.0,116293.0,116293.0,116293.0,116293.0,116293.0,...,116293.0,116293.0,116293.0,116293.0,116293.0,116293.0,116293.0,116293.0,116293.0,116293.0
mean,41.1311,41.849389,-87.693658,7.954357,0.124866,0.132072,0.126147,0.145529,0.170268,0.605737,...,8.020708,14.708033,73.136369,0.156809,8.26103,0.0,0.306682,0.284755,0.220237,0.020835
std,24.864726,0.106593,0.080699,1.252733,0.330568,0.33857,0.332016,0.352635,0.37587,0.488694,...,2.510389,0.516376,5.461309,0.053972,0.951591,0.0,0.461118,0.4513,0.414408,0.142833
min,10.0,41.644612,-87.930995,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.8,13.32854,56.0,0.061479,6.390909,0.0,0.0,0.0,0.0,0.0
25%,18.0,41.753411,-87.750938,8.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.15,14.312298,69.6875,0.110352,7.622727,0.0,0.0,0.0,0.0,0.0
50%,39.0,41.862292,-87.694991,8.0,0.0,0.0,0.0,0.0,0.0,1.0,...,7.9,14.864196,73.3125,0.160655,8.131818,0.0,0.0,0.0,0.0,0.0
75%,61.0,41.951866,-87.64886,9.0,0.0,0.0,0.0,0.0,0.0,1.0,...,9.9,15.150455,77.125,0.193669,8.834091,0.0,1.0,1.0,0.0,0.0
max,98.0,42.01743,-87.531635,9.0,1.0,1.0,1.0,1.0,1.0,1.0,...,16.95,15.240741,86.3125,0.26757,10.540909,0.0,1.0,1.0,1.0,1.0


In [112]:
master_train.describe()

Unnamed: 0,block,latitude,longitude,addressaccuracy,nummosquitos,wnvpresent,species_culex_pipiens,species_culex_pipiens_restuans,species_culex_restuans,neighborhood_infection_category_high,...,avg_avgspeed,timelaggeddaylight,timelaggedtemperature,timelaggedprecipitation,timelaggedwindspeed,month_6,month_7,month_8,month_9,month_10
count,9686.0,9686.0,9686.0,9686.0,9686.0,9686.0,9686.0,9686.0,9686.0,9686.0,...,9686.0,9686.0,9686.0,9686.0,9686.0,9686.0,9686.0,9686.0,9686.0,9686.0
mean,37.235391,41.847634,-87.702859,7.941255,10.217117,0.051931,0.231158,0.461388,0.275862,0.238282,...,7.465879,14.714524,72.114521,0.137496,7.831706,0.153211,0.250465,0.341317,0.218047,0.028391
std,24.335559,0.109389,0.093454,1.351096,13.141203,0.221898,0.421595,0.498533,0.446971,0.426055,...,2.517195,0.546686,5.584412,0.04089,0.856468,0.360209,0.433303,0.474176,0.412941,0.166097
min,10.0,41.644612,-87.930995,3.0,1.0,0.0,0.0,0.0,0.0,0.0,...,2.4,13.104065,57.0,0.053103,6.020455,0.0,0.0,0.0,0.0,0.0
25%,13.0,41.750498,-87.760886,8.0,2.0,0.0,0.0,0.0,0.0,0.0,...,5.75,14.274578,68.625,0.100669,7.147727,0.0,0.0,0.0,0.0,0.0
50%,36.0,41.867108,-87.698457,8.0,4.0,0.0,0.0,0.0,0.0,0.0,...,7.0,14.917618,72.5,0.144472,7.793182,0.0,0.0,0.0,0.0,0.0
75%,58.0,41.95469,-87.642984,9.0,13.0,0.0,0.0,1.0,1.0,0.0,...,9.35,15.17621,76.75,0.161338,8.484091,0.0,1.0,1.0,0.0,0.0
max,98.0,42.01743,-87.531635,9.0,50.0,1.0,1.0,1.0,1.0,1.0,...,15.65,15.241285,84.0625,0.224085,10.122727,1.0,1.0,1.0,1.0,1.0


In [114]:
master_train.columns

Index(['address', 'block', 'street', 'trap', 'addressnumberandstreet',
       'latitude', 'longitude', 'addressaccuracy', 'nummosquitos',
       'wnvpresent', 'neighborhood', 'dtdate', 'species_culex_pipiens',
       'species_culex_pipiens_restuans', 'species_culex_restuans',
       'neighborhood_infection_category_high',
       'neighborhood_infection_category_low',
       'neighborhood_infection_category_medium',
       'neighborhood_infection_category_none', 'park_score', 'daylight',
       'avg_tavg', 'avg_preciptotal', 'avg_avgspeed', 'timelaggeddaylight',
       'timelaggedtemperature', 'timelaggedprecipitation',
       'timelaggedwindspeed', 'month_6', 'month_7', 'month_8', 'month_9',
       'month_10'],
      dtype='object')

In [115]:
master_test.columns

Index(['date', 'address', 'block', 'street', 'trap', 'addressnumberandstreet',
       'latitude', 'longitude', 'addressaccuracy', 'neighborhood', 'dtdate',
       'species_culex_pipiens', 'species_culex_pipiens_restuans',
       'species_culex_restuans', 'neighborhood_infection_category_high',
       'neighborhood_infection_category_low',
       'neighborhood_infection_category_medium',
       'neighborhood_infection_category_none', 'park_score', 'daylight',
       'avg_tavg', 'avg_preciptotal', 'avg_avgspeed', 'timelaggeddaylight',
       'timelaggedtemperature', 'timelaggedprecipitation',
       'timelaggedwindspeed', 'month_6', 'month_7', 'month_8', 'month_9',
       'month_10'],
      dtype='object')

In [120]:
master_train.mean()

block                                     37.235391
latitude                                  41.847634
longitude                                -87.702859
addressaccuracy                            7.941255
nummosquitos                              10.217117
wnvpresent                                 0.051931
species_culex_pipiens                      0.231158
species_culex_pipiens_restuans             0.461388
species_culex_restuans                     0.275862
neighborhood_infection_category_high       0.238282
neighborhood_infection_category_low        0.137415
neighborhood_infection_category_medium     0.601074
neighborhood_infection_category_none       0.023229
park_score                                 1.619714
daylight                                  13.556164
avg_tavg                                  72.698999
avg_preciptotal                            0.150894
avg_avgspeed                               7.465879
timelaggeddaylight                        14.714524
timelaggedte

In [126]:
master_test[['block', 'latitude', 'longitude', 'addressaccuracy',
             'species_culex_pipiens','species_culex_pipiens_restuans', 'species_culex_restuans',
             'neighborhood_infection_category_high',
             'neighborhood_infection_category_low',
             'neighborhood_infection_category_medium',
             'neighborhood_infection_category_none', 'park_score', 'daylight',
             'avg_tavg', 'avg_preciptotal', 'avg_avgspeed', 'timelaggeddaylight',
             'timelaggedtemperature', 'timelaggedprecipitation',
             'timelaggedwindspeed', 'month_6', 'month_7', 'month_8', 'month_9','month_10']].mean()

block                                     41.131100
latitude                                  41.849389
longitude                                -87.693658
addressaccuracy                            7.954357
species_culex_pipiens                      0.124866
species_culex_pipiens_restuans             0.132072
species_culex_restuans                     0.126147
neighborhood_infection_category_high       0.145529
neighborhood_infection_category_low        0.170268
neighborhood_infection_category_medium     0.605737
neighborhood_infection_category_none       0.065378
park_score                                 1.748528
daylight                                  13.590947
avg_tavg                                  72.106012
avg_preciptotal                            0.153821
avg_avgspeed                               8.020708
timelaggeddaylight                        14.708033
timelaggedtemperature                     73.136369
timelaggedprecipitation                    0.156809
timelaggedwi

In [129]:
np.mean(X.values, axis=0)

array([2.31158373e-01, 4.61387570e-01, 2.75862069e-01, 2.38282057e-01,
       1.37414826e-01, 6.01073715e-01, 2.32294033e-02, 1.61971351e+00,
       1.35561635e+01, 7.26989986e+01, 1.50893816e-01, 7.46587859e+00,
       1.47145245e+01, 7.21145210e+01, 1.37495660e-01, 7.83170626e+00,
       1.53210820e-01, 2.50464588e-01, 3.41317365e-01, 2.18046665e-01,
       2.83914929e-02])

In [128]:
np.mean(X_test, axis=0)

array([1.24865641e-01, 1.32071578e-01, 1.26146888e-01, 1.45528966e-01,
       1.70268202e-01, 6.05737233e-01, 6.53779677e-02, 1.74852842e+00,
       1.35909470e+01, 7.21060124e+01, 1.53821296e-01, 8.02070847e+00,
       1.47080333e+01, 7.31363689e+01, 1.56808823e-01, 8.26103002e+00,
       0.00000000e+00, 3.06682259e-01, 2.84754886e-01, 2.20236816e-01,
       2.08353039e-02])

In [135]:
np.mean(svm_final.predict_proba(X_os)[:,1])

0.5027524356397227