In [1]:
import pandas as pd
import numpy as np
from geopy.distance import vincenty
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, BaggingClassifier
from sklearn import svm
from sklearn.grid_search import GridSearchCV

In [3]:
df = pd.read_csv("../assets/train.csv")
test = pd.read_csv("../assets/test.csv")
sample = pd.read_csv('../Assets/sampleSubmission.csv')

In [4]:
df.head(2)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0


In [5]:
df.dtypes

Date                       object
Address                    object
Species                    object
Block                       int64
Street                     object
Trap                       object
AddressNumberAndStreet     object
Latitude                  float64
Longitude                 float64
AddressAccuracy             int64
NumMosquitos                int64
WnvPresent                  int64
dtype: object

In [6]:
df.isnull().sum()

Date                      0
Address                   0
Species                   0
Block                     0
Street                    0
Trap                      0
AddressNumberAndStreet    0
Latitude                  0
Longitude                 0
AddressAccuracy           0
NumMosquitos              0
WnvPresent                0
dtype: int64

In [7]:
df["Date"] = pd.to_datetime(df["Date"])
test["Date"] = pd.to_datetime(test["Date"])

In [8]:
df["Latlong"] = zip(df["Latitude"], df["Longitude"])
test["Latlong"] = zip(test["Latitude"], test["Longitude"])

In [9]:
#Station 1: CHICAGO O'HARE INTERNATIONAL AIRPORT Lat: 41.995 Lon: -87.933 Elev: 662 ft. above sea level
#Station 2: CHICAGO MIDWAY INTL ARPT Lat: 41.786 Lon: -87.752 Elev: 612 ft. above sea level

station_1 = (41.995, -87.933)
station_2 = (41.786, -87.752)

print(vincenty(station_1, station_2).miles)

17.1810367501


In [10]:
def checkDistance(x):
    if vincenty(station_1,x).miles > vincenty(station_2,x).miles:
        return 2
    else:
        return 1

In [11]:
df["Station"] = df["Latlong"].apply(checkDistance)
test["Station"] = test["Latlong"].apply(checkDistance)

In [12]:
weather = pd.read_csv("../assets/weather.csv")

In [13]:
weather.head(2)

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,...,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,...,,0,M,0.0,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,M,51,57,0,3,...,,M,M,M,0.0,29.18,29.82,2.7,25,9.6


In [14]:
weather.dtypes

Station          int64
Date            object
Tmax             int64
Tmin             int64
Tavg            object
Depart          object
DewPoint         int64
WetBulb         object
Heat            object
Cool            object
Sunrise         object
Sunset          object
CodeSum         object
Depth           object
Water1          object
SnowFall        object
PrecipTotal     object
StnPressure     object
SeaLevel        object
ResultSpeed    float64
ResultDir        int64
AvgSpeed        object
dtype: object

In [15]:
weather["Date"] = pd.to_datetime(weather["Date"])

In [16]:
df3 = df.merge(weather,on=["Date","Station"])
test3 = test.merge(weather,on=["Date","Station"])

In [17]:
df3.dtypes

Date                      datetime64[ns]
Address                           object
Species                           object
Block                              int64
Street                            object
Trap                              object
AddressNumberAndStreet            object
Latitude                         float64
Longitude                        float64
AddressAccuracy                    int64
NumMosquitos                       int64
WnvPresent                         int64
Latlong                           object
Station                            int64
Tmax                               int64
Tmin                               int64
Tavg                              object
Depart                            object
DewPoint                           int64
WetBulb                           object
Heat                              object
Cool                              object
Sunrise                           object
Sunset                            object
CodeSum         

In [18]:
def replaceNulls(x):
    if (x == "M") | (x==" ") | (x=="-"):
        return np.nan
    else:
        return x

In [19]:
df3 = df3.applymap(replaceNulls)
test3 = test3.applymap(replaceNulls)

In [20]:
df3.isnull().sum()

Date                          0
Address                       0
Species                       0
Block                         0
Street                        0
Trap                          0
AddressNumberAndStreet        0
Latitude                      0
Longitude                     0
AddressAccuracy               0
NumMosquitos                  0
WnvPresent                    0
Latlong                       0
Station                       0
Tmax                          0
Tmin                          0
Tavg                          0
Depart                     7208
DewPoint                      0
WetBulb                      36
Heat                          0
Cool                          0
Sunrise                    7208
Sunset                     7208
CodeSum                    5941
Depth                      7208
Water1                    10506
SnowFall                   7208
PrecipTotal                  24
StnPressure                  36
SeaLevel                      0
ResultSp

In [21]:
df3.drop(["Depart","Depth","SnowFall","Water1","CodeSum","WetBulb","PrecipTotal","StnPressure"],axis=1, inplace=True)
test3.drop(["Depart","Depth","SnowFall","Water1","CodeSum","WetBulb","PrecipTotal","StnPressure"],axis=1, inplace=True)

In [22]:
weather.drop(["Depart","Depth","SnowFall","Water1","CodeSum","WetBulb","PrecipTotal","StnPressure"],axis=1, inplace=True)

In [23]:
df3.isnull().sum()

Date                         0
Address                      0
Species                      0
Block                        0
Street                       0
Trap                         0
AddressNumberAndStreet       0
Latitude                     0
Longitude                    0
AddressAccuracy              0
NumMosquitos                 0
WnvPresent                   0
Latlong                      0
Station                      0
Tmax                         0
Tmin                         0
Tavg                         0
DewPoint                     0
Heat                         0
Cool                         0
Sunrise                   7208
Sunset                    7208
SeaLevel                     0
ResultSpeed                  0
ResultDir                    0
AvgSpeed                     0
dtype: int64

In [24]:
weather.isnull().sum()

Station        0
Date           0
Tmax           0
Tmin           0
Tavg           0
DewPoint       0
Heat           0
Cool           0
Sunrise        0
Sunset         0
SeaLevel       0
ResultSpeed    0
ResultDir      0
AvgSpeed       0
dtype: int64

In [25]:
def replaceTrace(x):
    if (x == "T"):
        return .00001
    else:
        return x

In [26]:
df3 = df3.applymap(replaceTrace).copy(deep=True)
test=test3.applymap(replaceTrace).copy(deep=True)

In [27]:
df3.drop("NumMosquitos",axis=1,inplace=True)

In [28]:
df4 = df3.sort_values(by="Date").ffill().copy(deep=True)
test4 = test.sort_values(by="Date").ffill().copy(deep=True)

In [29]:
species_dummies = pd.get_dummies(df4.Species)
species_dummies_test = pd.get_dummies(test4.Species)

In [30]:
species_dummies.reset_index(drop=True,inplace=True)
species_dummies_test.reset_index(drop=True,inplace=True)

In [31]:
len(species_dummies)

10506

In [32]:
x_list = (weather.columns - ["Date"]).tolist()

  if __name__ == '__main__':


In [33]:
species_dummies_test.drop(["UNSPECIFIED CULEX","CULEX TARSALIS","CULEX ERRATICUS"],axis=1,inplace=True)

In [34]:
species_dummies.drop(["CULEX TARSALIS","CULEX ERRATICUS"],axis=1,inplace=True)

In [35]:
X_species_weather = pd.concat([species_dummies, df4[x_list]],axis=1)

In [36]:
X_species_weather_scaled = StandardScaler().fit_transform(X_species_weather)

In [37]:
X_weather_scaled = StandardScaler().fit_transform(df4[x_list])

In [38]:
x1_list = ['AvgSpeed',
 'Cool',
 'DewPoint',
 'ResultDir',
 'ResultSpeed',
 'SeaLevel',
 'Station',
 'Sunrise',
 'Sunset',
 'Tavg',
 'Tmax',
 'Tmin']

In [39]:
X_weather_woheat_scaled = StandardScaler().fit_transform(df4[x1_list])

In [40]:
pca = PCA(n_components=3)
X_species_weather_scaled_pca = pca.fit_transform(X_species_weather_scaled)
X_weather_scaled_pca = pca.fit_transform(X_weather_scaled)
X_weather_woheat_scaled_pca = pca.fit_transform(X_weather_woheat_scaled)

In [41]:
y = df4["WnvPresent"]

In [42]:
cv = StratifiedKFold(y, n_folds=5,shuffle=True)

In [43]:
def score(model, name, myX):
    s = cross_val_score(model, myX, y, cv=cv, n_jobs=-1)
    print "{} Score:\t{:0.3} ± {:0.3}".format(name, s.mean().round(3), s.std().round(3))

In [44]:
dt = DecisionTreeClassifier(random_state=42)
bdt = BaggingClassifier(DecisionTreeClassifier(random_state=42))
rf = RandomForestClassifier(random_state=42)
et = ExtraTreesClassifier(random_state=42)
lr = LogisticRegression(random_state=42)

In [45]:
def fitAndPrint(model):
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    print accuracy_score(y_test,y_pred)

In [85]:
X_train, X_test, y_train, y_test = train_test_split(X_species_weather_scaled,y)
fitAndPrint(lr)

0.952036543586


In [88]:
fitAndPrint(dt)

0.952036543586


In [92]:
fitAndPrint(bdt)

0.950894556528


In [86]:
fitAndPrint(rf)

0.951275218881


In [94]:
fitAndPrint(et)

0.952036543586


In [95]:
#predictions for species + weather
X_species_weather_test = pd.concat([species_dummies_test, test4[x_list]],axis=1)
X_species_weather_test_scaled= StandardScaler().fit_transform(X_species_weather_test)
predictions = et.predict_proba(X_species_weather_test_scaled)[:,1]
sample['WnvPresent'] = predictions
sample.to_csv('species_weather_et.csv', index=False)

In [96]:
X_train, X_test, y_train, y_test = train_test_split(X_weather_scaled,y)
fitAndPrint(lr)

0.949371907118


In [99]:
fitAndPrint(dt)

0.949371907118


In [101]:
fitAndPrint(bdt)

0.949371907118


In [57]:
fitAndPrint(rf)

0.949752569471


In [104]:
fitAndPrint(et)

0.949371907118


In [105]:
#predictions for weather
X_weather_test_scaled = StandardScaler().fit_transform(test4[x_list])
predictions = et.predict_proba(X_weather_test_scaled)[:,1]
sample['WnvPresent'] = predictions
sample.to_csv('weather_pred_et.csv', index=False)

In [111]:
X_train, X_test, y_train, y_test = train_test_split(X_weather_woheat_scaled,y)
fitAndPrint(lr)

0.947849257708


In [107]:
fitAndPrint(dt)

0.950513894176


In [109]:
fitAndPrint(bdt)

0.950513894176


In [113]:
fitAndPrint(rf)

0.947849257708


In [115]:
fitAndPrint(et)

0.947849257708


In [116]:
#predictions for weather w/o heat
X_weather_woheat_test_scaled = StandardScaler().fit_transform(test4[x1_list])
predictions = et.predict_proba(X_weather_woheat_test_scaled)[:,1]
sample['WnvPresent'] = predictions
sample.to_csv('weather_woheat_et.csv', index=False)

In [117]:
X_train, X_test, y_train, y_test = train_test_split(X_species_weather_scaled_pca,y)
fitAndPrint(lr)

0.950133231823


In [118]:
fitAndPrint(dt)

0.950133231823


In [120]:
fitAndPrint(bdt)

0.948991244766


In [122]:
fitAndPrint(rf)

0.946707270651


In [124]:
fitAndPrint(et)

0.950133231823


In [125]:
#PCA1
X_species_weather_test_scaled_pca = pca.fit_transform(X_species_weather_test_scaled)
predictions = et.predict_proba(X_species_weather_test_scaled_pca)[:,1]
sample['WnvPresent'] = predictions
sample.to_csv('species_weather_pca_et.csv', index=False)

In [126]:
X_train, X_test, y_train, y_test = train_test_split(X_weather_scaled_pca,y)
fitAndPrint(lr)

0.940616673011


In [129]:
fitAndPrint(dt)

0.940616673011


In [131]:
fitAndPrint(bdt)

0.940616673011


In [127]:
fitAndPrint(rf)

0.940616673011


In [134]:
fitAndPrint(et)

0.940616673011


In [135]:
#PCA2
X_weather_test_scaled_pca = pca.fit_transform(X_weather_test_scaled)
predictions = et.predict_proba(X_weather_test_scaled_pca)[:,1]
sample['WnvPresent'] = predictions
sample.to_csv('weather_pca_et.csv', index=False)

In [136]:
X_train, X_test, y_train, y_test = train_test_split(X_weather_woheat_scaled_pca,y)
fitAndPrint(lr)

0.943281309478


In [137]:
fitAndPrint(dt)

0.943281309478


In [138]:
fitAndPrint(bdt)

0.943281309478


In [139]:
fitAndPrint(rf)

0.943281309478


In [140]:
fitAndPrint(et)

0.943281309478


In [142]:
#PCA3
X_weather_woheat_test_scaled_pca = pca.fit_transform(X_weather_woheat_test_scaled)
predictions = rf.predict_proba(X_weather_woheat_test_scaled_pca)[:,1]
sample['WnvPresent'] = predictions
sample.to_csv('weather_woheat_pca_rf.csv', index=False)

In [None]:
# clf = svm.SVC()
# gamma_range = 10.**np.arange(-5, 2)
# C_range = 10.**np.arange(-2, 3)
# kernel_range = ['rbf', 'sigmoid', 'linear', 'poly']
# param_grid = dict(gamma=gamma_range, C=C_range, kernel=kernel_range)
# grid = GridSearchCV(clf, param_grid, cv=10, scoring='accuracy')
# fitAndPrint(grid)