In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
%matplotlib inline

# We have 3 data sets, load them all in, and clean the data

In [2]:
spray = pd.read_csv('./assets/input/spray.csv')
weather = pd.read_csv('./assets/input/weather.csv')
train = pd.read_csv('./assets/input/train.csv')
test = pd.read_csv('./assets/input/test.csv')

In [3]:
#Change the date's in the train set and weather set into a datetime object
train['Date'] = pd.to_datetime(train['Date'])
weather['Date'] = pd.to_datetime(weather['Date'])


In [4]:
#Split and combine on the 2 weather stations so they are all on one date
weather_stn1 = weather[weather['Station']==1]
weather_stn2 = weather[weather['Station']==2]
weather_stn1 = weather_stn1.drop('Station', axis=1)
weather_stn2 = weather_stn2.drop('Station', axis=1)
weather = weather_stn1.merge(weather_stn2, on='Date')

#replace all the weird M's, "-"'s, and T's with -1's to be removed later
weather = weather.replace('M', -1)
weather = weather.replace('-', -1)
weather = weather.replace('T', -1)
weather = weather.replace(' T', -1)
weather = weather.replace('  T', -1)

#merge the train set with the weather set so the train set has additional features 
df = train.merge(weather, how = 'inner', on = 'Date', copy = False)

In [5]:
#Create Month, Date, and Year Columns
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year
df['Day'] = df['Date'].dt.day

In [6]:
#Label Encode the Trap values
from sklearn import preprocessing
lbl = preprocessing.LabelEncoder()
lbl.fit(list(df['Trap'].values) + list(test['Trap'].values))
df['Trap'] = lbl.transform(df['Trap'].values)
test['Trap'] = lbl.transform(test['Trap'].values)

# EDA

# Model Testing

In [7]:
df.columns

Index(['Date', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'NumMosquitos', 'WnvPresent', 'Tmax_x', 'Tmin_x', 'Tavg_x', 'Depart_x',
       'DewPoint_x', 'WetBulb_x', 'Heat_x', 'Cool_x', 'Sunrise_x', 'Sunset_x',
       'CodeSum_x', 'Depth_x', 'Water1_x', 'SnowFall_x', 'PrecipTotal_x',
       'StnPressure_x', 'SeaLevel_x', 'ResultSpeed_x', 'ResultDir_x',
       'AvgSpeed_x', 'Tmax_y', 'Tmin_y', 'Tavg_y', 'Depart_y', 'DewPoint_y',
       'WetBulb_y', 'Heat_y', 'Cool_y', 'Sunrise_y', 'Sunset_y', 'CodeSum_y',
       'Depth_y', 'Water1_y', 'SnowFall_y', 'PrecipTotal_y', 'StnPressure_y',
       'SeaLevel_y', 'ResultSpeed_y', 'ResultDir_y', 'AvgSpeed_y', 'Month',
       'Year', 'Day'],
      dtype='object')

In [8]:
#Baseline Accuracy
df['WnvPresent'].value_counts(normalize = True)

0    0.947554
1    0.052446
Name: WnvPresent, dtype: float64

In [9]:
mos_dum = pd.get_dummies(df['Species'], drop_first= True)


In [10]:
from sklearn import preprocessing
lbl = preprocessing.LabelEncoder()
lbl.fit(list(train['Trap'].values) + list(test['Trap'].values))
train['Trap'] = lbl.transform(train['Trap'].values)
test['Trap'] = lbl.transform(test['Trap'].values)

In [11]:
#to match test set
X = df.drop(['Date', 'Address', 'Street', 'AddressNumberAndStreet', 'Latitude', 
             'Longitude', 'AddressAccuracy', 'NumMosquitos', 'WnvPresent', 'SeaLevel_x', 'SeaLevel_y',
            'CodeSum_x', 'CodeSum_y', 'Species','Tmax_x', 'WetBulb_x', 'Cool_x', 'Depth_x', 'StnPressure_x', 'Tmin_y',
       'DewPoint_y', 'WetBulb_y', 'Cool_y'], axis = 1)

#make all numbers
for col_name in X:
    X[col_name] = pd.to_numeric(X[col_name])
    
    
X['UNSPECIFIED CULEX'] = 0
X = X.join(mos_dum)

In [12]:
#drop rows with -1
X = X.ix[:,(X != -1).any(axis=0)]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app


In [13]:
y = df['WnvPresent']

In [14]:
X.shape

(10506, 32)

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y)

## GridSearch LR

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
lr = LogisticRegression()
poly = PolynomialFeatures()
ss = StandardScaler()
pipe_gs = Pipeline([
    ('poly', poly),
      ('ss', ss),
    ('lr', lr)
])

params = {
        'lr__penalty':['l2'],
        'lr__C': [1.0]
    
}
gs_lr = GridSearchCV(pipe_gs, param_grid = params, cv = 5, scoring = 'roc_auc')
gs_lr.fit(X_train,y_train)

print(gs_lr.best_score_) #train score
print(gs_lr.best_params_)


0.8408828511212142
{'lr__C': 1.0, 'lr__penalty': 'l2'}


In [17]:
gs_lr.score(X_train, y_train)

0.8769840553549939

In [18]:
gs_lr.score(X_test,y_test)

0.8149332933913919

In [19]:
gs_lr.predict_proba(X_test)

array([[9.55025376e-01, 4.49746240e-02],
       [8.49490896e-01, 1.50509104e-01],
       [8.10341603e-01, 1.89658397e-01],
       ...,
       [9.48923202e-01, 5.10767980e-02],
       [9.99710683e-01, 2.89317047e-04],
       [9.99841212e-01, 1.58788302e-04]])

In [20]:
gs_lr.predict_proba(X_train)

array([[9.35614803e-01, 6.43851971e-02],
       [9.99738593e-01, 2.61406527e-04],
       [9.99802163e-01, 1.97837191e-04],
       ...,
       [9.84930724e-01, 1.50692758e-02],
       [9.31785165e-01, 6.82148348e-02],
       [9.82800699e-01, 1.71993008e-02]])

# Testing on Test set


In [21]:

test['Date'] = pd.to_datetime(test['Date'])
test['Month'] = test['Date'].dt.month
test['Year'] = test['Date'].dt.year
test['Day'] = test['Date'].dt.day

test

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,Month,Year,Day
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,1,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,6,2008,11
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,1,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,6,2008,11
2,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,1,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,6,2008,11
3,4,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,1,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,6,2008,11
4,5,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,1,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,6,2008,11
5,6,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TARSALIS,41,N OAK PARK AVE,1,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,6,2008,11
6,7,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",UNSPECIFIED CULEX,41,N OAK PARK AVE,1,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,6,2008,11
7,8,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX ERRATICUS,41,N OAK PARK AVE,1,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,6,2008,11
8,9,2008-06-11,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX PIPIENS/RESTUANS,62,N MANDELL AVE,127,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,6,2008,11
9,10,2008-06-11,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,127,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,6,2008,11


In [22]:
weather = pd.read_csv('./assets/input/weather.csv')
weather['Date'] = pd.to_datetime(weather['Date'])
weather_stn1 = weather[weather['Station']==1]
weather_stn2 = weather[weather['Station']==2]
weather_stn1 = weather_stn1.drop('Station', axis=1)
weather_stn2 = weather_stn2.drop('Station', axis=1)
weather = weather_stn1.merge(weather_stn2, on='Date')


weather = weather.replace('M', -1)
weather = weather.replace('-', -1)
weather = weather.replace('T', -1)
weather = weather.replace(' T', -1)
weather = weather.replace('  T', -1)

df_test = test.merge(weather, how = 'inner', on = 'Date', copy = False)

In [23]:
df_test

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,...,CodeSum_y,Depth_y,Water1_y,SnowFall_y,PrecipTotal_y,StnPressure_y,SeaLevel_y,ResultSpeed_y,ResultDir_y,AvgSpeed_y
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,1,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,...,,-1,-1,-1,0.00,29.34,29.97,9.4,18,10.4
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,1,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,...,,-1,-1,-1,0.00,29.34,29.97,9.4,18,10.4
2,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,1,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,...,,-1,-1,-1,0.00,29.34,29.97,9.4,18,10.4
3,4,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,1,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,...,,-1,-1,-1,0.00,29.34,29.97,9.4,18,10.4
4,5,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,1,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,...,,-1,-1,-1,0.00,29.34,29.97,9.4,18,10.4
5,6,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TARSALIS,41,N OAK PARK AVE,1,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,...,,-1,-1,-1,0.00,29.34,29.97,9.4,18,10.4
6,7,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",UNSPECIFIED CULEX,41,N OAK PARK AVE,1,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,...,,-1,-1,-1,0.00,29.34,29.97,9.4,18,10.4
7,8,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX ERRATICUS,41,N OAK PARK AVE,1,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,...,,-1,-1,-1,0.00,29.34,29.97,9.4,18,10.4
8,9,2008-06-11,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX PIPIENS/RESTUANS,62,N MANDELL AVE,127,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,...,,-1,-1,-1,0.00,29.34,29.97,9.4,18,10.4
9,10,2008-06-11,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,127,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,...,,-1,-1,-1,0.00,29.34,29.97,9.4,18,10.4


In [24]:
#drop rows with -1


#to match test set
X_t = df_test.drop(['Date', 'Address', 'Street', 'AddressNumberAndStreet', 'Latitude', 
             'Longitude', 'AddressAccuracy', 'Species', 'Id', 'SeaLevel_x', 'SeaLevel_y',
            'CodeSum_x', 'CodeSum_y','Tmax_x', 'WetBulb_x', 'Cool_x', 'Depth_x', 'StnPressure_x', 'Tmin_y',
       'DewPoint_y', 'WetBulb_y', 'Cool_y'],  axis = 1)

#make all numbers

mos_dum_test = pd.get_dummies(test['Species'], drop_first= True)
    
X_t = X_t.join(mos_dum_test)

for col_name in X_t:
    X_t[col_name] = pd.to_numeric(X_t[col_name])
X_t = X_t.ix[:,(X_t != -1).any(axis=0)]
    

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [25]:
X_t.shape

(116293, 32)

In [26]:
X_t2 = X_t[X_train.columns]

In [27]:
[x[1] for x in gs_lr.predict_proba(X_t2)]

[2.154019478846633e-05,
 5.46516870568641e-06,
 2.6896469969812563e-05,
 1.4793288433354035e-06,
 8.234184988457294e-07,
 1.1300560892274208e-07,
 2.6320778189268217e-06,
 2.6320778189268217e-06,
 8.393026561157691e-06,
 3.74951234851049e-06,
 1.3080314525291993e-05,
 2.3599972586648257e-07,
 2.573102318425332e-07,
 7.34749579062791e-08,
 1.1659684101563054e-06,
 1.1659684101563054e-06,
 6.865057938535013e-06,
 1.5800214176960199e-06,
 1.1155031911011235e-05,
 2.852728929484709e-07,
 2.1937578779203853e-07,
 3.092501589915514e-08,
 7.894814522762141e-07,
 7.894814522762141e-07,
 4.366607982694174e-06,
 2.473398276125127e-06,
 4.984312518459344e-06,
 2.035252359010454e-07,
 1.632950864696143e-07,
 5.164637952639445e-08,
 6.741373648309025e-07,
 6.741373648309025e-07,
 4.69369455300314e-06,
 2.374656746413528e-06,
 5.680924775489976e-06,
 2.0904105907354575e-07,
 1.7033696920479816e-07,
 4.903448563894363e-08,
 6.970773001644672e-07,
 6.970773001644672e-07,
 4.228071567063081e-06,
 2.561

In [28]:
arya = pd.DataFrame(gs_lr.predict_proba(X_t))

  np.exp(prob, prob)


In [29]:
arya[0].value_counts()

1.0    116293
Name: 0, dtype: int64

In [30]:
submit = pd.read_csv('./assets/input/sampleSubmission.csv')

In [31]:
submit['WnvPresent'] = [x[1] for x in gs_lr.predict_proba(X_t2)]

In [33]:
submit.to_csv('HSTTRT Submission $Poly2.csv', index= False)