In [147]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.layers import Dropout
import csv

In [148]:
#load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
weather = pd.read_csv('weather.csv')
spray = pd.read_csv('spray.csv')

In [149]:
#merge data (only using station 1 so as not to double the data)
trw = train.merge(weather[weather['Station']==1], how='left', on='Date')
tsw = test.merge(weather[weather['Station']==1], how='left', on='Date')

In [126]:
#trw.columns

In [151]:
tsw.columns

Index(['Id', 'Date', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'Station', 'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'WetBulb',
       'Heat', 'Cool', 'Sunrise', 'Sunset', 'CodeSum', 'Depth', 'Water1',
       'SnowFall', 'PrecipTotal', 'StnPressure', 'SeaLevel', 'ResultSpeed',
       'ResultDir', 'AvgSpeed'],
      dtype='object')

In [152]:
#create y
ytr = trw.WnvPresent

In [153]:
#create X
Xtr = pd.DataFrame()
Xts = pd.DataFrame()

In [None]:
############################
#FEATURE ENGINEERING
############################

In [None]:
#clean data, eliminate nulls
#create dummies 
#engineer data data (day of year?, sunrise?)
#engineer geographic variables
#engineer weather variables (lagging data)

In [154]:
#Build X 
Xtr['Latitude'] = trw.Latitude
Xts['Latitude'] = tsw.Latitude
#
Xtr['Longitude'] = trw.Longitude
Xts['Longitude'] = tsw.Longitude
#
Xtr['Tmax'] = trw.Tmax.astype(float)
Xts['Tmax'] = tsw.Tmax.astype(float)
#
#Xtr['PrecipTotal']= trw.PrecipTotal.apply(lambda x: 0.1 if x == 'T' else x)
#Xts['PrecipTotal']= tsw.PrecipTotal.apply(lambda x: 0.1 if x == 'T' else x)

In [129]:
#check dtypes
#Xtr.dtypes

In [130]:
#check for nulls
#Xtr.isnull().sum()

In [131]:
#Xts.isnull().sum()

In [None]:
#####################################
#BUILD MODELS
######################################

In [155]:
#######RANDOM FOREST model
X_train, X_test, y_train, y_test = train_test_split(Xtr, ytr, test_size=0.30, random_state=12)
#run random forest with kfold (may not be necessary, but will give an estimate of variance)
model = RandomForestClassifier(max_features = 3, max_depth = 1000) 
scores = cross_val_score(model, X_train, y_train, cv=3)
scores

array([0.93270799, 0.93431253, 0.93023256])

In [156]:
#fit against full training set
model.fit(Xtr,ytr)
model.score(Xtr,ytr)

0.9502189225204645

In [None]:
#####################################################

In [52]:
################################
#KERAS MODEL
##############################

In [74]:
#Create keras Model
X_train, X_test, y_train, y_test = train_test_split(Xtr, ytr, test_size=0.30, random_state=11)

ss = StandardScaler()
X_train = ss.fit_transform(X_train)  #the scaler is fit only to the training data
X_test = ss.transform(X_test)

model = Sequential()

input_units = X_train.shape[1] #number of features in training set
hidden_units = input_units   #hidden layer has the same number of nodes as input

#first input layer
model.add(Dense(hidden_units            
                ,input_dim=input_units  
                ,activation='relu'
                #uncomment this to add L2 regularization
                #,kernel_regularizer=regularizers.l2(0.0001) 
               ))


#hidden layer (try with and without)
#node_reduction = 0
#model.add(Dense(hidden_units - node_reduction          
#                ,input_dim=input_units  
#                ,activation='relu'
#                #,kernel_regularizer=regularizers.l2(0.0001) 
#               ))
#model.add(Dropout(0.8))

#final layer
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy'
              ,optimizer='adam'
               #added later (not part of original solution
              ,metrics=['binary_accuracy']
             )

In [77]:
#Run Keras model
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), 
              epochs=10, batch_size=None, verbose=1)

Train on 14708 samples, validate on 6304 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
#add some visualization here

In [None]:
##############################################
##############################################

In [None]:
#####################################
#SCORE MODEL
####################################

In [157]:
#calculate AUC-ROC
y_preds = model.predict(Xtr)
metrics.roc_auc_score(ytr,y_preds)

0.5656974534224336

In [158]:
#run model against test
test_preds = model.predict(Xts)

In [135]:
#tsw.Id

In [159]:
#generate output file

output_file = pd.DataFrame({'Id':tsw.Id, 'WnvPresent':test_preds})
#output_file.head()
csv_name = 'test_csv.csv'
output_file.to_csv(csv_name, index=False)

In [None]:
##########################################