# Pre-Processing and Training Data

In [100]:
import pandas as pd
from pandas import Series
import pandas.core.algorithms as algos
import numpy as np
import seaborn as sns
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats.stats as stats
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import traceback
import re
import string
import warnings
import collections
import shap
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, confusion_matrix, precision_recall_curve
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, log_loss, auc, roc_curve, roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression, f_classif
import datetime

from library.sb_utils import save_file

In [101]:
pd.__version__

'1.1.0'

In [102]:
# import data
wnv = pd.read_csv('../data/WestNileVirus_cleaned.csv')
wnv.head().T

Unnamed: 0,0,1,2,3,4
Block,40.0,40.0,40.0,40.0,40.0
Latitude,41.953705,41.953705,41.953705,41.953705,41.953705
Longitude,-87.733974,-87.733974,-87.733974,-87.733974,-87.733974
AddressAccuracy,8.0,8.0,8.0,8.0,8.0
NumMosquitos,1.0,1.0,1.0,1.0,3.0
Day_of_week,1.0,2.0,2.0,2.0,2.0
Day_of_month,26.0,11.0,18.0,1.0,1.0
Week,26.0,28.0,29.0,31.0,31.0
Month,6.0,7.0,7.0,8.0,8.0
Year,2007.0,2007.0,2007.0,2007.0,2007.0


In [103]:
wnv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8610 entries, 0 to 8609
Data columns (total 40 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Block                           8610 non-null   int64  
 1   Latitude                        8610 non-null   float64
 2   Longitude                       8610 non-null   float64
 3   AddressAccuracy                 8610 non-null   int64  
 4   NumMosquitos                    8610 non-null   int64  
 5   Day_of_week                     8610 non-null   int64  
 6   Day_of_month                    8610 non-null   int64  
 7   Week                            8610 non-null   int64  
 8   Month                           8610 non-null   int64  
 9   Year                            8610 non-null   int64  
 10  DaytimeLength                   8610 non-null   float64
 11  Tavg                            8610 non-null   float64
 12  Depart                          86

### Train/Test Split

#### Undersampling

In [104]:
wnv['WnvPresent'].value_counts()

0    8153
1     457
Name: WnvPresent, dtype: int64

Since we are dealing with an imbalanced classification problem, we need to undersample the majority class to match the minority class.

In [105]:
wnv1 = wnv[wnv['WnvPresent']==1]
wnv0 = wnv[wnv['WnvPresent']==0]
wnv0 = wnv0.sample(n=len(wnv1), random_state=47)
wnv_balanced = pd.concat([wnv1,wnv0],axis=0)
wnv_balanced['WnvPresent'].value_counts()

1    457
0    457
Name: WnvPresent, dtype: int64

In [106]:
wnv_balanced.head().T

Unnamed: 0,36,42,45,48,49
Block,41.0,41.0,41.0,41.0,41.0
Latitude,41.95469,41.95469,41.95469,41.95469,41.95469
Longitude,-87.800991,-87.800991,-87.800991,-87.800991,-87.800991
AddressAccuracy,9.0,9.0,9.0,9.0,9.0
NumMosquitos,71.0,41.0,23.0,27.0,13.0
Day_of_week,2.0,2.0,1.0,4.0,1.0
Day_of_month,1.0,15.0,21.0,24.0,4.0
Week,31.0,33.0,34.0,34.0,36.0
Month,8.0,8.0,8.0,8.0,9.0
Year,2007.0,2007.0,2007.0,2007.0,2007.0


In [107]:
wnv_balanced.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 914 entries, 36 to 1197
Data columns (total 40 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Block                           914 non-null    int64  
 1   Latitude                        914 non-null    float64
 2   Longitude                       914 non-null    float64
 3   AddressAccuracy                 914 non-null    int64  
 4   NumMosquitos                    914 non-null    int64  
 5   Day_of_week                     914 non-null    int64  
 6   Day_of_month                    914 non-null    int64  
 7   Week                            914 non-null    int64  
 8   Month                           914 non-null    int64  
 9   Year                            914 non-null    int64  
 10  DaytimeLength                   914 non-null    float64
 11  Tavg                            914 non-null    float64
 12  Depart                          91

Now we need to make sure that there are no colums with only one unique value.

In [108]:
# find columns with only one unique value
cols = list(wnv_balanced.columns)
tally = 0
for column in cols:
    if list(wnv_balanced[column].value_counts())[0] == 914:
        print(column)
    else:
        tally += 1
if tally != 0:
    print('There are no columns with only one unique value.')

There are no columns with only one unique value.


In [109]:
# train/test split
X_train, X_test, y_train, y_test = train_test_split(wnv_balanced.drop(columns='WnvPresent'), 
                                                    wnv_balanced['WnvPresent'], 
                                                    test_size=0.3, 
                                                    random_state=47)

In [110]:
# training virus percerntage
print(y_train.value_counts())
print(y_train.value_counts(normalize=True))

1    324
0    315
Name: WnvPresent, dtype: int64
1    0.507042
0    0.492958
Name: WnvPresent, dtype: float64


In [111]:
# test virus percentage
print(y_test.value_counts())
print(y_test.value_counts(normalize=True))

0    142
1    133
Name: WnvPresent, dtype: int64
0    0.516364
1    0.483636
Name: WnvPresent, dtype: float64


In [112]:
X_train

Unnamed: 0,Block,Latitude,Longitude,AddressAccuracy,NumMosquitos,Day_of_week,Day_of_month,Week,Month,Year,...,Species_CULEX PIPIENS/RESTUANS,Species_CULEX RESTUANS,Season_Fall,Season_Spring,Season_Summer,Month_Name_Aug,Month_Name_July,Month_Name_June,Month_Name_Oct,Month_Name_Sep
6916,13,41.678618,-87.559308,5,27,4,24,25,6,2011,...,0,1,0,0,1,0,0,1,0,0
6412,63,41.776156,-87.778927,9,1,4,19,33,8,2011,...,0,0,0,0,1,1,0,0,0,0
2500,21,41.919343,-87.694259,8,1,1,29,22,5,2007,...,1,0,0,1,0,0,0,0,0,0
6933,13,41.678618,-87.559308,5,1,0,12,37,9,2011,...,0,0,1,0,0,0,0,0,0,1
3831,50,41.803423,-87.642984,8,3,2,1,31,8,2007,...,1,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2660,22,41.921965,-87.632085,8,17,3,26,39,9,2013,...,0,0,1,0,0,0,0,0,0,1
5175,12,41.673408,-87.599862,5,136,3,12,37,9,2013,...,1,0,1,0,0,0,0,0,0,1
6700,89,41.732984,-87.649642,8,19,3,29,35,8,2013,...,1,0,0,0,1,1,0,0,0,0
79,41,41.954690,-87.800991,9,14,4,17,29,7,2009,...,0,1,0,0,1,0,1,0,0,0


#### Weight of Evidence

In [113]:
wnv_balanced.columns

Index(['Block', 'Latitude', 'Longitude', 'AddressAccuracy', 'NumMosquitos',
       'Day_of_week', 'Day_of_month', 'Week', 'Month', 'Year', 'DaytimeLength',
       'Tavg', 'Depart', 'Heat', 'Cool', 'PrecipTotal', 'SeaLevel',
       'ResultSpeed', 'ResultDir', 'RelativeHumidity', 'DewPointDepression',
       'WnvPresent', 'Wnv_percent_yearly', 'Wnv_percent_seasonal',
       'Wnv_percent_monthly', 'Wnv_percent_weekly', 'Sevenday_lag',
       'Fourteenday_lag', 'Twentyoneday_lag', 'Species_CULEX PIPIENS',
       'Species_CULEX PIPIENS/RESTUANS', 'Species_CULEX RESTUANS',
       'Season_Fall', 'Season_Spring', 'Season_Summer', 'Month_Name_Aug',
       'Month_Name_July', 'Month_Name_June', 'Month_Name_Oct',
       'Month_Name_Sep'],
      dtype='object')

In [114]:
max_bin = 20
force_bin = 3

# define a binning function
def mono_bin(Y, X, n = max_bin):    
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]
    #print("justmiss", justmiss)
    #print("notmiss", notmiss)
    r = 0
    while np.abs(r) < 1:
        try:
            d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.qcut(notmiss.X, n)})
            d2 = d1.groupby('Bucket', as_index=True)
            r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
            #print("I am here 1",r, n,len(d2))
            n = n - 1 
            
        except Exception as e:
            n = n - 1
            #print("I am here e",n)

    if len(d2) == 1:
        #print("I am second step ",r, n)
        n = force_bin         
        bins = algos.quantile(notmiss.X, np.linspace(0, 1, n))
        if len(np.unique(bins)) == 2:
            bins = np.insert(bins, 0, 1)
            bins[1] = bins[1]-(bins[1]/2)
        d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.cut(notmiss.X, 
                                                                            np.unique(bins),
                                                                            include_lowest=True)}) 
        d2 = d1.groupby('Bucket', as_index=True)
    
    d3 = pd.DataFrame({},index=[])
    d3["MIN_VALUE"] = d2.min().X
    d3["MAX_VALUE"] = d2.max().X
    d3["COUNT"] = d2.count().Y
    d3["EVENT"] = d2.sum().Y
    d3["NONEVENT"] = d2.count().Y - d2.sum().Y
    d3=d3.reset_index(drop=True)
    
    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE':np.nan},index=[0])
        d4["MAX_VALUE"] = np.nan
        #print(justmiss.count().Y)
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4,ignore_index=True)
    
    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
    print(np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT))
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 
             'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]       
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    
    return(d3)

def char_bin(Y, X):
        
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]    
    df2 = notmiss.groupby('X',as_index=True)
    d3 = pd.DataFrame({},index=[])
    d3["COUNT"] = df2.count().Y
    d3["MIN_VALUE"] = df2.sum().Y.index
    d3["MAX_VALUE"] = d3["MIN_VALUE"]
    d3["EVENT"] = df2.sum().Y
    d3["NONEVENT"] = df2.count().Y - df2.sum().Y
    
    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE':np.nan},index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4,ignore_index=True)
    
    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 
             'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]      
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    #print("hi",d3.IV )
    d3 = d3.reset_index(drop=True)
    
    return(d3)

def data_vars(df1, target):
    
    stack = traceback.extract_stack()
    filename, lineno, function_name, code = stack[-2]
    vars_name = re.compile(r'\((.*?)\).*$').search(code).groups()[0]
    final = (re.findall(r"[\w']+", vars_name))[-1]
    
    x = df1.dtypes.index
    count = -1
    for i in x:
        print(i)
        if i.upper() not in (final.upper()):
            if np.issubdtype(df1[i], np.number) and len(Series.unique(df1[i])) > 2:
                #print("Number and unique value greater than 2")
                conv = mono_bin(target, df1[i])
                conv["VAR_NAME"] = i
                count = count + 1
            else:
                #print("I am here 2")
                conv = char_bin(target, df1[i])
                conv["VAR_NAME"] = i            
                count = count + 1
                
            if count == 0:
                iv_df = conv
            else:
                iv_df = iv_df.append(conv,ignore_index=True)
    
    iv = pd.DataFrame({'IV':iv_df.groupby('VAR_NAME').IV.max()})
    iv = iv.reset_index()
    return(iv_df,iv)

In [115]:
final_iv, IV = data_vars(X_train, y_train)

Block
0    0.033198
1   -0.034561
dtype: float64
Latitude


Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` 

0    0.009334
1   -0.009362
dtype: float64
Longitude
0    0.230137
1   -0.105132
2   -0.131549
dtype: float64
AddressAccuracy
0   -0.037740
1    0.071459
dtype: float64
NumMosquitos
0   -1.793955
1   -0.520647
2    0.751154
3    1.734175
dtype: float64
Day_of_week


Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` 

0    0.215305
1   -0.418369
dtype: float64
Day_of_month
0    0.085384
1   -0.093984
dtype: float64
Week
0   -0.333553
1    0.358451
dtype: float64
Month


Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` 

0    0.050130
1   -0.128814
dtype: float64
Year
0   -0.202524
1    0.206336
dtype: float64
DaytimeLength
0    -1.915241
1    -0.684950
2     0.146183
3     1.763589
4     0.607818
5     0.888120
6     1.358123
7          NaN
8     0.468266
9     0.970358
10    1.184852
11    0.259511
12   -0.182322
13   -0.721318
14   -0.088795
15   -1.532248
16        -inf
17   -3.395467
dtype: float64
Tavg


Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
divide by zero encountered in log
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `

0   -0.128254
1    0.128891
dtype: float64
Depart
0   -0.373917
1    0.123514
2    0.312434
dtype: float64
Heat
0    0.149688
1   -1.600568
dtype: float64
Cool


Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` 

0   -0.128254
1    0.128891
dtype: float64
PrecipTotal


Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` 

0    0.022634
1   -2.107612
dtype: float64
SeaLevel


Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` 

0   -0.028171
1    0.035455
dtype: float64
ResultSpeed
0    0.231049
1   -0.037740
2   -0.214070
dtype: float64
ResultDir


Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` 

0     0.138883
1    -0.433636
2          NaN
3     0.045937
4    -0.875469
5     0.045937
6     0.419854
7     0.975131
8    -0.060961
9     0.916291
10    0.216952
11    0.183138
12   -1.096012
13   -0.115182
dtype: float64
RelativeHumidity


Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` 

0    0.148760
1   -0.168871
dtype: float64
DewPointDepression
0   -0.077871
1    0.079188
dtype: float64
Wnv_percent_yearly


Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` 

0   -0.359816
1         NaN
2    0.750743
dtype: float64
Wnv_percent_seasonal
0   -1.351945
1   -0.544862
2    0.476667
3         NaN
4    0.606707
5    1.161413
dtype: float64
Wnv_percent_monthly
0   -1.739888
1    0.218926
2    1.161413
3    1.171794
dtype: float64
Wnv_percent_weekly


Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` 

0   -2.232316
1   -0.275031
2    0.934309
3    1.516027
dtype: float64
Sevenday_lag
0   -0.663498
1   -0.227660
2    1.064132
dtype: float64
Fourteenday_lag
0   -1.097369
1   -0.343688
2    0.372307
3    1.358123
dtype: float64


Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` namespace, the `scipy.stats.stats` namespace is deprecated.
Please use `spearmanr` from the `scipy.stats` 

Twentyoneday_lag
0   -0.886568
1   -0.634307
2    0.360200
3    1.514822
dtype: float64
Species_CULEX PIPIENS
Species_CULEX PIPIENS/RESTUANS
Species_CULEX RESTUANS
Season_Fall
Season_Spring
Season_Summer
Month_Name_Aug
Month_Name_July
Month_Name_June
Month_Name_Oct
Month_Name_Sep


divide by zero encountered in log


In [116]:
final_iv

Unnamed: 0,VAR_NAME,MIN_VALUE,MAX_VALUE,COUNT,EVENT,EVENT_RATE,NONEVENT,NON_EVENT_RATE,DIST_EVENT,DIST_NON_EVENT,WOE,IV
0,Block,10.000000,40.000000,326,168,0.515337,158,0.484663,0.518519,0.501587,0.033198,0.001147
1,Block,41.000000,93.000000,313,156,0.498403,157,0.501597,0.481481,0.498413,-0.034561,0.001147
2,Latitude,41.644612,41.869107,320,163,0.509375,157,0.490625,0.503086,0.498413,0.009334,0.000087
3,Latitude,41.869216,42.011601,319,161,0.504702,158,0.495298,0.496914,0.501587,-0.009362,0.000087
4,Longitude,-87.930995,-87.752411,218,123,0.564220,95,0.435780,0.379630,0.301587,0.230137,0.027323
...,...,...,...,...,...,...,...,...,...,...,...,...
120,Month_Name_June,1.000000,1.000000,49,1,0.020408,48,0.979592,0.003086,0.152381,-3.899372,0.606375
121,Month_Name_Oct,0.000000,0.000000,628,323,0.514331,305,0.485669,0.996914,0.968254,0.029170,0.067635
122,Month_Name_Oct,1.000000,1.000000,11,1,0.090909,10,0.909091,0.003086,0.031746,-2.330756,0.067635
123,Month_Name_Sep,0.000000,0.000000,471,240,0.509554,231,0.490446,0.740741,0.733333,0.010050,0.000283


In [117]:
IV

Unnamed: 0,VAR_NAME,IV
0,AddressAccuracy,0.002696
1,Block,0.001147
2,Cool,0.016508
3,Day_of_month,0.008019
4,Day_of_week,0.089407
5,DaytimeLength,1.137673
6,Depart,0.07922
7,DewPointDepression,0.006163
8,Fourteenday_lag,0.73492
9,Heat,0.235065


In [118]:
features = list(IV[(IV['IV'] >= 0.01) & (IV['IV'] <= 0.8)]['VAR_NAME'])
X2 = X_train[features]
display(X2.shape)
X2.head()

(639, 23)

Unnamed: 0,Cool,Day_of_week,Depart,Fourteenday_lag,Heat,Longitude,Month_Name_Aug,Month_Name_July,Month_Name_June,Month_Name_Oct,...,ResultSpeed,Season_Summer,Sevenday_lag,Species_CULEX PIPIENS,Species_CULEX RESTUANS,Tavg,Week,Wnv_percent_seasonal,Wnv_percent_yearly,Year
6916,0.0,4,-7.0,0.0,1.0,-87.559308,0,0,1,0,...,8.95,1,0.0,0,1,64.0,25,2.145594,2.787068,2011
6412,11.5,4,5.0,-0.347222,0.0,-87.778927,1,0,0,0,...,4.0,1,-3.309179,0,0,76.0,33,2.145594,2.787068,2011
2500,10.5,1,10.0,0.0,0.0,-87.694259,0,0,0,0,...,5.8,0,0.0,0,0,75.25,22,0.0,6.661967,2007
6933,10.0,0,10.0,4.6875,0.0,-87.559308,0,0,0,0,...,9.85,0,4.6875,1,0,75.0,37,4.498978,2.787068,2011
3831,15.5,2,8.0,4.438284,0.0,-87.642984,1,0,0,0,...,2.8,1,-8.985361,0,0,80.25,31,8.282209,6.661967,2007


In [119]:
display(X2.shape[1])
for i in range(X2.shape[1]):
    print((i, variance_inflation_factor(X2.values, i) ))

23

(0, 6045.935200089099)
(1, 8.259700340569635)
(2, 24.882171071961004)
(3, 2.664295357499848)
(4, 424.1115127572786)
(5, 668610.267959011)
(6, inf)
(7, inf)
(8, inf)
(9, 1.556785579687715)
(10, 2.732803210954599)
(11, 126.23463746651379)
(12, 6.408240353900339)
(13, 7.365140740239032)
(14, inf)
(15, 1.6698630111334474)
(16, 1.8245662437347745)
(17, 1.5949209476910857)
(18, 305209.07766064635)
(19, 488.0622009430028)
(20, 25.059846842956933)
(21, 28.104681132048313)
(22, 763023.0567815827)


divide by zero encountered in double_scalars


In [120]:
def iterate_vif(df, vif_threshold=5, max_vif=6):
    count = 0
    while max_vif > vif_threshold:
        count += 1
        print("Iteration # "+str(count))
        vif = pd.DataFrame()
        vif["VIFactor"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
        vif["features"] = df.columns
        
        if vif['VIFactor'].max() > vif_threshold:
            print('Removing %s with VIF of %f' % (vif[vif['VIFactor'] == vif['VIFactor'].max()]['features'].values[0],
                                                  vif['VIFactor'].max()))
            df = df.drop(vif[vif['VIFactor'] == vif['VIFactor'].max()]['features'].values[0], axis=1)
            max_vif = vif['VIFactor'].max()
        else:
            print('Complete')
            return df, vif.sort_values('VIFactor')  

X1 = X2._get_numeric_data()
final_df, final_vif = iterate_vif(X1)

Iteration # 1
Removing Month_Name_Aug with VIF of inf
Iteration # 2
Removing Year with VIF of 763023.056782
Iteration # 3
Removing Tavg with VIF of 259783.900306
Iteration # 4
Removing Longitude with VIF of 708.845004
Iteration # 5
Removing RelativeHumidity with VIF of 104.289066
Iteration # 6
Removing Cool with VIF of 37.534379
Iteration # 7
Removing Wnv_percent_yearly with VIF of 24.308883
Iteration # 8
Removing Week with VIF of 21.934191
Iteration # 9
Removing ResultSpeed with VIF of 6.015666
Iteration # 10
Removing Wnv_percent_seasonal with VIF of 5.756301
Iteration # 11
Complete


In [127]:
X_train = final_df
display(X_train.shape)
display(X_train.columns)
X_train.head()

(639, 13)

Index(['Day_of_week', 'Depart', 'Fourteenday_lag', 'Heat', 'Month_Name_July',
       'Month_Name_June', 'Month_Name_Oct', 'PrecipTotal', 'ResultDir',
       'Season_Summer', 'Sevenday_lag', 'Species_CULEX PIPIENS',
       'Species_CULEX RESTUANS'],
      dtype='object')

Unnamed: 0,Day_of_week,Depart,Fourteenday_lag,Heat,Month_Name_July,Month_Name_June,Month_Name_Oct,PrecipTotal,ResultDir,Season_Summer,Sevenday_lag,Species_CULEX PIPIENS,Species_CULEX RESTUANS
6916,4,-7.0,0.0,1.0,0,1,0,0.0,29.0,1,0.0,0,1
6412,4,5.0,-0.347222,0.0,0,0,0,0.0,13.5,1,-3.309179,0,0
2500,1,10.0,0.0,0.0,0,0,0,0.0,17.0,0,0.0,0,0
6933,0,10.0,4.6875,0.0,0,0,0,0.0,23.0,0,4.6875,1,0
3831,2,8.0,4.438284,0.0,0,0,0,0.0,11.0,1,-8.985361,0,0


In [128]:
X_test = X_test[X_train.columns]
display(X_test.shape)
display(X_test.columns)
X_test.head()

(275, 13)

Index(['Day_of_week', 'Depart', 'Fourteenday_lag', 'Heat', 'Month_Name_July',
       'Month_Name_June', 'Month_Name_Oct', 'PrecipTotal', 'ResultDir',
       'Season_Summer', 'Sevenday_lag', 'Species_CULEX PIPIENS',
       'Species_CULEX RESTUANS'],
      dtype='object')

Unnamed: 0,Day_of_week,Depart,Fourteenday_lag,Heat,Month_Name_July,Month_Name_June,Month_Name_Oct,PrecipTotal,ResultDir,Season_Summer,Sevenday_lag,Species_CULEX PIPIENS,Species_CULEX RESTUANS
5169,3,2.0,0.881959,0.0,0,0,0,0.295,23.5,1,6.097106,0,1
7798,3,8.0,9.55665,0.0,0,0,0,0.0,10.0,1,3.459545,0,0
7872,3,-7.0,3.882784,0.0,0,0,0,0.0,23.0,1,-5.215146,1,0
6175,4,-6.0,6.0,10.0,0,0,0,0.01,36.0,0,3.297297,0,0
5046,2,9.0,-1.875533,0.0,0,0,0,0.625,21.0,1,13.31029,1,0


In [129]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(639, 13)
(275, 13)
(639,)
(275,)


In [None]:
# save data to a new .csv files
datapath = '../data'
save_file(X_train, 'X_train.csv', datapath)
save_file(X_test, 'X_test.csv', datapath)
save_file(y_train, 'y_train.csv', datapath)
save_file(y_test, 'y_test.csv', datapath)