In [28]:
import pandas as pd
import numpy as np
from sklearn.lda import LDA
from geopy.distance import vincenty
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, BaggingClassifier
from sklearn import svm
from sklearn.grid_search import GridSearchCV

In [30]:
df = pd.read_csv("../assets/train.csv")
test = pd.read_csv("../assets/test.csv")
sample = pd.read_csv('../Assets/sampleSubmission.csv')

In [31]:
df["Date"] = pd.to_datetime(df["Date"])
test["Date"] = pd.to_datetime(test["Date"])

In [32]:
df["Latlong"] = zip(df["Latitude"], df["Longitude"])
test["Latlong"] = zip(test["Latitude"], test["Longitude"])

In [33]:
#Station 1: CHICAGO O'HARE INTERNATIONAL AIRPORT Lat: 41.995 Lon: -87.933 Elev: 662 ft. above sea level
#Station 2: CHICAGO MIDWAY INTL ARPT Lat: 41.786 Lon: -87.752 Elev: 612 ft. above sea level

station_1 = (41.995, -87.933)
station_2 = (41.786, -87.752)

print(vincenty(station_1, station_2).miles)

17.1810367501


In [34]:
def checkDistance(x):
    if vincenty(station_1,x).miles > vincenty(station_2,x).miles:
        return 2
    else:
        return 1

In [35]:
df["Station"] = df["Latlong"].apply(checkDistance)
test["Station"] = test["Latlong"].apply(checkDistance)

In [36]:
weather = pd.read_csv("../assets/weather.csv")

In [37]:
weather["Date"] = pd.to_datetime(weather["Date"])

In [38]:
df3 = df.merge(weather,on=["Date","Station"])
test3 = test.merge(weather,on=["Date","Station"])

In [39]:
def replaceNulls(x):
    if (x == "M") | (x==" ") | (x=="-"):
        return np.nan
    else:
        return x

In [40]:
df3 = df3.applymap(replaceNulls)
test3 = test3.applymap(replaceNulls)

In [41]:
df3['PrecipTotal'].fillna(value = 0, inplace = True)

In [42]:
df3['StnPressure'].fillna(value = 29.29, inplace=True)

In [43]:
df3.drop(["Depart","Depth","SnowFall","Water1","CodeSum","WetBulb"],axis=1, inplace=True)
test3.drop(["Depart","Depth","SnowFall","Water1","CodeSum","WetBulb"],axis=1, inplace=True)

In [44]:
weather.drop(["Depart","Depth","SnowFall","Water1","CodeSum","WetBulb"],axis=1, inplace=True)

In [45]:
def replaceTrace(x):
    if ("T" in str(x)):
        return .0001
    else:
        return x

In [53]:
df3['PrecipTotal'] = df3['PrecipTotal'].apply(replaceTrace)
test3['PrecipTotal']=test3['PrecipTotal'].apply(replaceTrace)
test = test3

In [55]:
df4 = df3.sort_values(by="Date").ffill().copy(deep=True)
test4 = test.sort_values(by="Date").ffill().copy(deep=True)

In [56]:
df2007 = df4[df4.Date.dt.year == 2007]
df2009 = df4[df4.Date.dt.year == 2009]
df2011 = df4[df4.Date.dt.year == 2011]
df2013 = df4[df4.Date.dt.year == 2013]

In [57]:
df2008 = test4[test4.Date.dt.year == 2008]
df2010 = test4[test4.Date.dt.year == 2010]
df2012 = test4[test4.Date.dt.year == 2012]
df2014 = test4[test4.Date.dt.year == 2014]

In [58]:
def score(model, name, myX,myy):
    s = cross_val_score(model, myX, myy, cv=cv, scoring = 'roc_auc')
    print "{} Score:\t{:0.3} ± {:0.3}".format(name, s.mean().round(3), s.std().round(3))

In [59]:
def fitAndPrint(model):
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    print roc_auc_score(y_test,y_pred)

In [60]:
weather_list = ["AvgSpeed","Tavg","Heat","ResultDir","SeaLevel","Station","Sunrise","Sunset","Longitude","Latitude"]

In [61]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(list(df4["Species"].values) + list(test4["Species"].values))

LabelEncoder()

In [62]:
df4["Species"] = le.transform(df4["Species"].values)
test4["Species"] = le.transform(test4["Species"].values)

In [63]:
le.fit(list(df4["Trap"].values) + list(test4["Trap"].values))
df4["Trap"] = le.transform(df4["Trap"].values)
test4["Trap"] = le.transform(test4["Trap"].values)

In [64]:
le.fit(list(df4["Street"].values) + list(test4["Street"].values))
df4["Street"] = le.transform(df4["Street"].values)
test4["Street"] = le.transform(test4["Street"].values)

In [65]:
species_dummies_2007 = pd.get_dummies(df2007.Species)
species_dummies_2009 = pd.get_dummies(df2009.Species)
species_dummies_2011 = pd.get_dummies(df2011.Species)
species_dummies_2013 = pd.get_dummies(df2013.Species)

species_dummies_2007.reset_index(drop=True,inplace=True)
species_dummies_2009.reset_index(drop=True,inplace=True)
species_dummies_2011.reset_index(drop=True,inplace=True)
species_dummies_2013.reset_index(drop=True,inplace=True)

In [66]:
df4.columns

Index([u'Date', u'Address', u'Species', u'Block', u'Street', u'Trap',
       u'AddressNumberAndStreet', u'Latitude', u'Longitude',
       u'AddressAccuracy', u'NumMosquitos', u'WnvPresent', u'Latlong',
       u'Station', u'Tmax', u'Tmin', u'Tavg', u'DewPoint', u'Heat', u'Cool',
       u'Sunrise', u'Sunset', u'PrecipTotal', u'StnPressure', u'SeaLevel',
       u'ResultSpeed', u'ResultDir', u'AvgSpeed'],
      dtype='object')

In [67]:
x_list = (weather.columns - ["Date","Heat","Station"]).tolist()

  if __name__ == '__main__':


In [81]:
x_list

['AvgSpeed',
 'Cool',
 'DewPoint',
 'PrecipTotal',
 'ResultDir',
 'ResultSpeed',
 'SeaLevel',
 'StnPressure',
 'Sunrise',
 'Sunset',
 'Tavg',
 'Tmax',
 'Tmin']

In [68]:
df_combo = pd.concat([df4[["Species","Trap","Street","Block","Latitude","Longitude"]], df4[x_list]],axis=1)
df_combo_test = pd.concat([test4[["Species","Trap","Street","Block","Latitude","Longitude"]], test4[x_list]],axis=1)

In [69]:
df_combo_scaled = StandardScaler().fit_transform(df_combo)
df_combo_scaled_test = StandardScaler().fit_transform(df_combo_test)

In [70]:
X_train, X_test, y_train, y_test = train_test_split(df_combo_scaled,df4["WnvPresent"],test_size=.70)

In [73]:
rf = RandomForestClassifier()

In [74]:
fitAndPrint(rf)

0.548800780459


In [287]:
for a,b in zip(rf.feature_importances_,df_combo.columns):
    print a,b

0.0945621227417 Species
0.0805657604619 Trap
0.0634073294241 Street
0.045970355703 Block
0.0524666643771 Latitude
0.0835604917323 Longitude
0.0180410623754 AddressAccuracy
0.0837504610889 AvgSpeed
0.0534705598329 Tavg
0.00593737913072 Heat
0.0721552164829 ResultDir
0.0671962278899 SeaLevel
0.0015558815532 Station
0.0827880829907 Sunrise
0.0739712903441 Sunset
0.0601062573094 Longitude
0.0604948565619 Latitude


In [284]:
predictions = rf.predict_proba(df_combo_scaled_test)[:,1]
sample['WnvPresent'] = predictions
sample.to_csv('df_combo_rf.csv', index=False)

In [87]:
clf = svm.SVC(probability=True,kernel="linear")

In [88]:
fitAndPrint(clf)

0.5


In [84]:
fitAndPrint(clf_test)

0.5


In [325]:
predictions = clf.predict_proba(df_combo_scaled_test)[:,1]
sample['WnvPresent'] = predictions
sample.to_csv('df_combo_clf.csv', index=False)

In [85]:
gamma_range = 10.**np.arange(-3, 2)
C_range = 10.**np.arange(-1, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(clf, param_grid, cv=10, scoring='accuracy')

In [86]:
fitAndPrint(grid)

0.509959136341


In [219]:
predictions = grid.predict_proba(df_combo_scaled_test)[:,1]
sample['WnvPresent'] = predictions
sample.to_csv('df_combo_clf_grid.csv', index=False)

In [110]:
X_weather_scaled = StandardScaler().fit_transform(df4[x_list])
X_weather_scaled_test = StandardScaler().fit_transform(test4[x_list])

In [116]:
print X_weather_scaled.shape
print X_weather_scaled_test.shape

(10506, 13)
(116293, 13)


In [258]:
X_train, X_test, y_train, y_test = train_test_split(X_weather_scaled,df4["WnvPresent"],test_size=.60)

In [259]:
fitAndPrint(clf)

0.5


In [260]:
predictions = clf.predict_proba(X_weather_scaled_test)[:,1]
sample['WnvPresent'] = predictions
sample.to_csv('weather_clf_60.csv', index=False)

In [149]:
X_species_weather = pd.concat([species_dummies, df4[x_list]],axis=1)

In [50]:
X_weather_woheat_nummos = StandardScaler().fit_transform(df4[x2_list])

In [29]:
X_species_weather_scaled = StandardScaler().fit_transform(X_species_weather)

In [245]:
X_weather_scaled = StandardScaler().fit_transform(df4[x_list])

In [31]:
X_weather_scaled_2007 = StandardScaler().fit_transform(df2007[x_list])

In [150]:
X_weather_dropcorr = StandardScaler().fit_transform(df4[weather_list])

In [48]:
x1_list = ['AvgSpeed',
 'Cool',
 'DewPoint',
 'ResultDir',
 'ResultSpeed',
 'SeaLevel',
 'Station',
 'Sunrise',
 'Sunset',
 'Tavg',
 'Tmax',
 'Tmin']

In [36]:
X_weather_woheat_scaled = StandardScaler().fit_transform(df4[x1_list])

In [61]:
pca = PCA(n_components=3)
X_species_weather_scaled_pca = pca.fit_transform(X_species_weather_scaled)
X_weather_scaled_pca = pca.fit_transform(X_weather_scaled)
X_weather_woheat_scaled_pca = pca.fit_transform(X_weather_woheat_scaled)

In [38]:
trapm = df4.groupby(["Trap"])["NumMosquitos"].median()

In [39]:
trapm = pd.DataFrame({"Trap":trapm.index,"NumMosquitos":trapm.values})

In [40]:
trapm.head()

Unnamed: 0,NumMosquitos,Trap
0,1.0,T001
1,14.0,T002
2,5.5,T003
3,2.5,T004
4,2.5,T005


In [44]:
test4 = pd.merge(test4,trapm,on="Trap", how="left")

In [46]:
test4["NumMosquitos"].fillna(df4["NumMosquitos"].median(),inplace=True)

In [52]:
y = df4["WnvPresent"]

In [54]:
def score(model, name, myX,myy):
    s = cross_val_score(model, myX, myy, cv=cv)
    print "{} Score:\t{:0.3} ± {:0.3}".format(name, s.mean().round(3), s.std().round(3))

In [55]:
def score_year(model, name, myX,myy):
    s = cross_val_score(model, myX, myy, cv=cvyear)
    print "{} Score:\t{:0.3} ± {:0.3}".format(name, s.mean().round(3), s.std().round(3))

In [112]:
dt = DecisionTreeClassifier()
bdt = BaggingClassifier(DecisionTreeClassifier())
rf = RandomForestClassifier()
et = ExtraTreesClassifier()
lr = LogisticRegression()

In [111]:
def fitAndPrint(model):
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    print roc_auc_score(y_test,y_pred)

In [85]:
X_train, X_test, y_train, y_test = train_test_split(X_species_weather_scaled,y)
fitAndPrint(lr)

0.952036543586


In [58]:
from sklearn.metrics import roc_auc_score

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X_weather_dropcorr,y)
fitAndPrint(rf)

0.590869618024


In [111]:
rf.feature_importances_

array([ 0.13397344,  0.11141725,  0.02167609,  0.08025946,  0.09622674,
        0.03378082,  0.29025642,  0.23240978])

In [64]:
X_weather_test_scaled = StandardScaler().fit_transform(test4[weather_list])
predictions = rf.predict_proba(X_weather_test_scaled)[:,1]
sample['WnvPresent'] = predictions
sample.to_csv('weather_pred_rf_nummos.csv', index=False)