In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.layers import Dropout
import csv

Using TensorFlow backend.


In [88]:
#load data
#read the four raw CSV inputs from the current working directory
train, test = pd.read_csv('train.csv'), pd.read_csv('test.csv')
weather, spray = pd.read_csv('weather.csv'), pd.read_csv('spray.csv')

In [89]:
#merge data (only using station 1 so as not to double the data)
#left join keeps every train/test row and attaches the station-1 weather row with the same Date
trw = pd.merge(train, weather[weather['Station'].eq(1)], on='Date', how='left')
tsw = pd.merge(test, weather[weather['Station'].eq(1)], on='Date', how='left')

In [126]:
#trw.columns

In [10]:
#display the column names of tsw (test rows left-joined with station-1 weather)
tsw.columns

Index(['Id', 'Date', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'Station', 'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'WetBulb',
       'Heat', 'Cool', 'Sunrise', 'Sunset', 'CodeSum', 'Depth', 'Water1',
       'SnowFall', 'PrecipTotal', 'StnPressure', 'SeaLevel', 'ResultSpeed',
       'ResultDir', 'AvgSpeed'],
      dtype='object')

In [90]:
#create y
#target vector: the WnvPresent column of the merged training frame
ytr = trw['WnvPresent']

In [91]:
#create X
#two independent, empty feature frames (train / test) that later cells fill column by column
Xtr, Xts = pd.DataFrame(), pd.DataFrame()

In [None]:
############################
#FEATURE ENGINEERING
############################

In [None]:
#clean data, eliminate nulls
#create dummies 
#engineer date data (day of year?, sunrise?)
#engineer geographic variables
#engineer weather variables (lagging data)

In [92]:
#get dummy variables for mosquito species that have Wnv
#one-hot encode Species, then keep only the three listed indicator columns
#(selecting a column that is absent from the data raises KeyError)
trw_species = pd.get_dummies(trw['Species']).loc[:, ['CULEX PIPIENS/RESTUANS','CULEX PIPIENS','CULEX RESTUANS']]
tsw_species = pd.get_dummies(tsw['Species']).loc[:, ['CULEX PIPIENS/RESTUANS','CULEX PIPIENS','CULEX RESTUANS']]

In [31]:
#trw[trw.WnvPresent==1].Species.value_counts()

In [93]:
#Build X
#both frames receive the same columns in the same order:
#Latitude, Longitude, Tmax (cast to float), then the three species indicators

#training features
Xtr['Latitude'] = trw['Latitude']
Xtr['Longitude'] = trw['Longitude']
Xtr['Tmax'] = trw['Tmax'].astype(float)
Xtr['CULEX PIPIENS/RESTUANS'] = trw_species['CULEX PIPIENS/RESTUANS']
Xtr['CULEX PIPIENS'] = trw_species['CULEX PIPIENS']
Xtr['CULEX RESTUANS'] = trw_species['CULEX RESTUANS']
#Xtr['PrecipTotal']= trw.PrecipTotal.apply(lambda x: 0.1 if x == 'T' else x)

#kaggle test features
Xts['Latitude'] = tsw['Latitude']
Xts['Longitude'] = tsw['Longitude']
Xts['Tmax'] = tsw['Tmax'].astype(float)
Xts['CULEX PIPIENS/RESTUANS'] = tsw_species['CULEX PIPIENS/RESTUANS']
Xts['CULEX PIPIENS'] = tsw_species['CULEX PIPIENS']
Xts['CULEX RESTUANS'] = tsw_species['CULEX RESTUANS']
#Xts['PrecipTotal']= tsw.PrecipTotal.apply(lambda x: 0.1 if x == 'T' else x)

In [129]:
#check dtypes
#Xtr.dtypes

In [39]:
#check for nulls
#Xtr.isnull().sum()

In [41]:
#Xts.isnull().sum()

In [None]:
#####################################
#BUILD MODELS
######################################

In [121]:
#######RANDOM FOREST model
#hold out 30% of the rows; the split itself is seeded so it is repeatable
X_train, X_test, y_train, y_test = train_test_split(Xtr, ytr, test_size=0.30, random_state=12)
#run random forest with kfold (may not be necessary, but will give an estimate of variance)
#random_state pins the forest's bootstrap and feature sampling; without it the
#scores below change on every run even though the split is seeded
model = RandomForestClassifier(max_features = 3, max_depth = 1000, random_state = 12)
#3-fold cross-validation on the training split only; no `scoring` is passed, so
#the estimator's default score (mean accuracy for classifiers) is reported
scores = cross_val_score(model, X_train, y_train, cv=3)
scores

array([0.92822186, 0.93553652, 0.92900857])

In [122]:
#fit against full training set
#refit on every training row, then report the score on those same rows
#(a training-set score, not a held-out estimate)
model.fit(X=Xtr, y=ytr)
model.score(X=Xtr, y=ytr)

0.9589758233390444

In [None]:
#####################################################

In [52]:
################################
#KERAS MODEL
##############################

In [112]:
#Create keras Model
#hold out 30% of the rows for validation (seeded split)
X_train, X_test, y_train, y_test = train_test_split(Xtr, ytr, test_size=0.30, random_state=11)

#standardize the features; the scaler is fit only to the training data and
#then reused unchanged on the held-out rows
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

input_units = X_train.shape[1] #number of features in training set
hidden_units = input_units   #hidden layer has the same number of nodes as input

#network: one relu layer as wide as the input, then a single sigmoid output unit
model = Sequential([
    #first input layer
    #to add L2 regularization pass kernel_regularizer=regularizers.l2(0.0001)
    #(note: `regularizers` is not imported by this notebook's import cell)
    Dense(hidden_units, input_dim=input_units, activation='relu'),

    #hidden layer (try with and without)
    #node_reduction = 0
    #Dense(hidden_units - node_reduction, input_dim=input_units, activation='relu'
    #      #,kernel_regularizer=regularizers.l2(0.0001)
    #     ),
    #Dropout(0.8),

    #final layer
    Dense(1, activation='sigmoid'),
])

#binary_accuracy metric was added later (not part of original solution)
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['binary_accuracy'])

In [113]:
#Run Keras model
#train for 20 epochs, evaluating on the held-out split after each epoch;
#batch_size=None leaves the batch size at the library default
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=None,
    epochs=20,
    verbose=1,
    validation_data=(X_test, y_test),
)

Train on 7354 samples, validate on 3152 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
#add some visualization here

In [None]:
##############################################
##############################################

In [None]:
#####################################
#SCORE MODEL
####################################

In [114]:
#calculate AUC-ROC against the test portion of our train-test split
#X_test here is the already-standardized held-out split
y_preds = model.predict(X_test)
metrics.roc_auc_score(y_true=y_test, y_score=y_preds)

0.670295565786386

In [123]:
#run model against the kaggle test dataset
#the network was trained on standardized features, so the kaggle features must
#go through the same scaler (fitted on the training split) before predicting;
#feeding raw Xts would put the inputs on a different scale than the model saw
test_preds = model.predict(ss.transform(Xts))

In [124]:
#tsw.Id

In [125]:
#generate output file
#a network ending in Dense(1) predicts a 2-D (n_rows, 1) array, which cannot be
#used directly as a DataFrame column; np.ravel flattens it to 1-D and leaves
#predictions that are already 1-D unchanged
output_file = pd.DataFrame({'Id':tsw.Id, 'WnvPresent':np.ravel(test_preds)})
#output_file.head()
csv_name = 'test_csv.csv'
#index=False keeps the row index out of the file so it holds only Id and WnvPresent
output_file.to_csv(csv_name, index=False)

In [None]:
##########################################