In [150]:
import pandas as pd
import numpy as np
from geopy.distance import vincenty
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [151]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, BaggingClassifier
from sklearn import svm
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import roc_auc_score

In [152]:
df = pd.read_csv("../assets/train.csv")
test = pd.read_csv("../assets/test.csv")
sample = pd.read_csv('../Assets/sampleSubmission.csv')

In [167]:
df.head(2)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,Latlong,Station
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,"(41.95469, -87.800991)",1
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,"(41.95469, -87.800991)",1


In [5]:
df.dtypes

Date                       object
Address                    object
Species                    object
Block                       int64
Street                     object
Trap                       object
AddressNumberAndStreet     object
Latitude                  float64
Longitude                 float64
AddressAccuracy             int64
NumMosquitos                int64
WnvPresent                  int64
dtype: object

In [6]:
df.isnull().sum()

Date                      0
Address                   0
Species                   0
Block                     0
Street                    0
Trap                      0
AddressNumberAndStreet    0
Latitude                  0
Longitude                 0
AddressAccuracy           0
NumMosquitos              0
WnvPresent                0
dtype: int64

In [153]:
df["Date"] = pd.to_datetime(df["Date"])
test["Date"] = pd.to_datetime(test["Date"])

In [154]:
df["Latlong"] = zip(df["Latitude"], df["Longitude"])
test["Latlong"] = zip(test["Latitude"], test["Longitude"])

In [155]:
#Station 1: CHICAGO O'HARE INTERNATIONAL AIRPORT Lat: 41.995 Lon: -87.933 Elev: 662 ft. above sea level
#Station 2: CHICAGO MIDWAY INTL ARPT Lat: 41.786 Lon: -87.752 Elev: 612 ft. above sea level

station_1 = (41.995, -87.933)
station_2 = (41.786, -87.752)

print(vincenty(station_1, station_2).miles)

17.1810367501


In [156]:
def checkDistance(x):
    if vincenty(station_1,x).miles > vincenty(station_2,x).miles:
        return 2
    else:
        return 1

In [157]:
df["Station"] = df["Latlong"].apply(checkDistance)
test["Station"] = test["Latlong"].apply(checkDistance)

In [158]:
weather = pd.read_csv("../assets/weather.csv")

In [159]:
weather.head(2)

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,...,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,...,,0,M,0.0,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,M,51,57,0,3,...,,M,M,M,0.0,29.18,29.82,2.7,25,9.6


In [168]:
weather["Date"] = pd.to_datetime(weather["Date"])

In [169]:
df3 = df.merge(weather,on=["Date","Station"])
test3 = test.merge(weather,on=["Date","Station"])

In [170]:
df3.dtypes

Date                      datetime64[ns]
Address                           object
Species                           object
Block                              int64
Street                            object
Trap                              object
AddressNumberAndStreet            object
Latitude                         float64
Longitude                        float64
AddressAccuracy                    int64
NumMosquitos                       int64
WnvPresent                         int64
Latlong                           object
Station                            int64
Tmax                               int64
Tmin                               int64
Tavg                              object
Depart                            object
DewPoint                           int64
WetBulb                           object
Heat                              object
Cool                              object
Sunrise                           object
Sunset                            object
CodeSum         

In [40]:
block_mosq=df3.groupby(["Block"])["NumMosquitos"].mean()

block_mosq=pd.DataFrame({'Block':block_mosq.index, 'NumMosquitos':block_mosq.values})

test3 = pd.merge(test3,block_mosq,on='Block', how='left')
test3.head(10)

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,...,DewPoint_y,Heat_y,Cool_y,Sunrise_y,Sunset_y,SeaLevel_y,ResultSpeed_y,ResultDir_y,AvgSpeed_y,NumMosquitos
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,56,0,9,416,1926,29.99,8.9,18,10.0,20.054054
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,56,0,9,416,1926,29.99,8.9,18,10.0,20.054054
2,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,56,0,9,416,1926,29.99,8.9,18,10.0,20.054054
3,4,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,56,0,9,416,1926,29.99,8.9,18,10.0,20.054054
4,5,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,56,0,9,416,1926,29.99,8.9,18,10.0,20.054054
5,6,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TARSALIS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,56,0,9,416,1926,29.99,8.9,18,10.0,20.054054
6,7,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",UNSPECIFIED CULEX,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,56,0,9,416,1926,29.99,8.9,18,10.0,20.054054
7,8,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX ERRATICUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,56,0,9,416,1926,29.99,8.9,18,10.0,20.054054
8,9,2008-06-11,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX PIPIENS/RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,...,56,0,9,416,1926,29.99,8.9,18,10.0,8.071429
9,10,2008-06-11,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,...,56,0,9,416,1926,29.99,8.9,18,10.0,8.071429


In [171]:
def replaceNulls(x):
    if (x == "M") | (x==" ") | (x=="-"):
        return np.nan
    else:
        return x

In [172]:
df3 = df3.applymap(replaceNulls)
test3 = test3.applymap(replaceNulls)

In [179]:
df3['PrecipTotal'].fillna(value = 0, inplace = True)

In [185]:
df3['StnPressure'].fillna(value = 29.29, inplace=True)

In [186]:
df3.drop(["Depart","Depth","SnowFall","Water1","CodeSum","WetBulb",],axis=1, inplace=True)
test3.drop(["Depart","Depth","SnowFall","Water1","CodeSum","WetBulb",],axis=1, inplace=True)

In [187]:
weather.drop(["Depart","Depth","SnowFall","Water1","CodeSum","WetBulb"],axis=1, inplace=True)

In [188]:
df3.isnull().sum()

Date                         0
Address                      0
Species                      0
Block                        0
Street                       0
Trap                         0
AddressNumberAndStreet       0
Latitude                     0
Longitude                    0
AddressAccuracy              0
NumMosquitos                 0
WnvPresent                   0
Latlong                      0
Station                      0
Tmax                         0
Tmin                         0
Tavg                         0
DewPoint                     0
Heat                         0
Cool                         0
Sunrise                   7208
Sunset                    7208
PrecipTotal                  0
StnPressure                  0
SeaLevel                     0
ResultSpeed                  0
ResultDir                    0
AvgSpeed                     0
dtype: int64

In [60]:
weather.isnull().sum()

Station        0
Date           0
Tmax           0
Tmin           0
Tavg           0
DewPoint       0
Heat           0
Cool           0
Sunrise        0
Sunset         0
SeaLevel       0
ResultSpeed    0
ResultDir      0
AvgSpeed       0
dtype: int64

In [202]:
def replaceTrace(x):
    if (x == "T"):
        return .00001
    else:
        return x

In [203]:
df3 = df3.applymap(replaceTrace).copy(deep=True)
test=test3.applymap(replaceTrace).copy(deep=True)

In [25]:
#df3.drop("NumMosquitos",axis=1,inplace=True)

In [191]:
df4 = df3.sort_values(by="Date").ffill().copy(deep=True)
test4 = test.sort_values(by="Date").ffill().copy(deep=True)

In [192]:
species_dummies = pd.get_dummies(df4.Species)
species_dummies_test = pd.get_dummies(test4.Species)

In [65]:
species_dummies.reset_index(drop=True,inplace=True)
species_dummies_test.reset_index(drop=True,inplace=True)

In [66]:
len(species_dummies)

10506

In [210]:
df4['PrecipTotal'].replace(to_replace = 'T', value = '0.001', inplace = True)

In [225]:
df4[df4['PrecipTotal']=='T']

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,...,Cool,Sunrise,Sunset,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,DateMD


In [221]:
df4['PrecipTotal'].value_counts()

0.00    5924
  T      862
0.01     307
0.06     291
0.23     217
0.83     201
0.36     197
0.09     176
0.02     173
0.88     139
0.20     135
0.16     132
0.03     130
0.84     110
0.92     108
0.13      90
0.08      85
0.12      80
0.59      76
0.11      75
0.17      71
0.87      65
1.31      58
0.58      57
0.39      56
0.70      54
0.19      51
0.24      51
1.55      51
0.33      43
0.50      43
0.14      42
0.79      41
0.04      40
0.52      40
0.27      35
1.19      34
3.97      28
0.42      25
0         24
0.46      23
0.44      22
0.89      21
0.95      20
0.05       3
Name: PrecipTotal, dtype: int64

In [193]:
def datemd (x):
    return (x.month*100)+x.day

In [194]:
df4['DateMD'] = df4['Date'].apply(datemd)

In [195]:
df4.columns

Index([u'Date', u'Address', u'Species', u'Block', u'Street', u'Trap',
       u'AddressNumberAndStreet', u'Latitude', u'Longitude',
       u'AddressAccuracy', u'NumMosquitos', u'WnvPresent', u'Latlong',
       u'Station', u'Tmax', u'Tmin', u'Tavg', u'DewPoint', u'Heat', u'Cool',
       u'Sunrise', u'Sunset', u'PrecipTotal', u'StnPressure', u'SeaLevel',
       u'ResultSpeed', u'ResultDir', u'AvgSpeed', u'DateMD'],
      dtype='object')

In [226]:
x_list = ['Station', 'Tavg', 'DewPoint', 'SeaLevel', 'ResultSpeed', 
        'ResultDir', 'AvgSpeed','DateMD','Latitude','Longitude', 'StnPressure']

In [197]:
x_list

['Station',
 'Tavg',
 'DewPoint',
 'SeaLevel',
 'ResultSpeed',
 'ResultDir',
 'AvgSpeed',
 'DateMD',
 'Latitude',
 'Longitude',
 'PrecipTotal',
 'StnPressure']

In [198]:
species_dummies_test.drop(["UNSPECIFIED CULEX","CULEX TARSALIS","CULEX ERRATICUS"],axis=1,inplace=True)

In [199]:
species_dummies.drop(["CULEX TARSALIS","CULEX ERRATICUS"],axis=1,inplace=True)

In [227]:
X_species_weather = pd.concat([species_dummies, df4[x_list]],axis=1)

In [228]:
X_species_weather_scaled = StandardScaler().fit_transform(X_species_weather)

In [106]:
X_weather_scaled = StandardScaler().fit_transform(df4[x_list])

In [36]:
x1_list = ['AvgSpeed',
 'Cool',
 'DewPoint',
 'ResultDir',
 'ResultSpeed',
 'SeaLevel',
 'Station',
 'Sunrise',
 'Sunset',
 'Tavg',
 'Tmax',
 'Tmin']

In [107]:
X_weather_woheat_scaled = StandardScaler().fit_transform(df4[x1_list])

In [230]:
pca = PCA(n_components=5)
X_species_weather_scaled_pca = pca.fit_transform(X_species_weather_scaled)
X_weather_scaled_pca = pca.fit_transform(X_weather_scaled)
X_weather_woheat_scaled_pca = pca.fit_transform(X_weather_woheat_scaled)

In [108]:
y = df4["WnvPresent"]

In [109]:
cv = StratifiedKFold(y, n_folds=5,shuffle=True)

In [110]:
def score(model, name, myX):
    s = cross_val_score(model, myX, y, cv=cv, n_jobs=-1)
    print "{} Score:\t{:0.3} ± {:0.3}".format(name, s.mean().round(3), s.std().round(3))

In [111]:
dt = DecisionTreeClassifier(random_state=42)
bdt = BaggingClassifier(DecisionTreeClassifier(random_state=42))
rf = RandomForestRegressor(random_state=42)
et = ExtraTreesClassifier(random_state=42)
lr = LogisticRegression(random_state=42)
svc = SVM(kernel='polynomial')

In [112]:
def fitAndPrint(model):
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    print roc_auc_score(y_test, y_pred)
    print accuracy_score(y_test,y_pred)

In [245]:
X_train, X_test, y_train, y_test = train_test_split(X_weather_scaled,y)
#fitAndPrint(lr)

fitAndPrint(lr)

0.5
0.948229920061


In [253]:
params = {'penalty':['l1','l2'], 'C':[0.1,1,5,10,20]}

gslr = GridSearchCV(lr, params, scoring = 'roc_auc')
fitAndPrint(gslr)

0.5
0.948229920061


In [246]:
fitAndPrint(dt)

0.563894726899
0.937571374191


In [247]:
fitAndPrint(bdt)

0.574522693461
0.937952036544


In [254]:
params = {'n_estimators':[2,3,5,7,10,12], 'max_samples':[1,3,5,7,10,12,15,17,20], 'max_features':[1,2,3,4,5,6,7,8,9,10]}
bdt = BaggingClassifier(DecisionTreeClassifier(random_state=42))
gsbdt = GridSearchCV(bdt, params, scoring = 'roc_auc')
fitAndPrint(gsdt)

0.513702269346
0.947849257708


In [255]:
params = {'max_depth':[3,6,8,10,12,14,20], 'min_samples_split':[2,5,8,9,11]}
bdt = DecisionTreeClassifier()
gsdt = GridSearchCV(dt, params, scoring = 'roc_auc')
fitAndPrint(gsdt)

0.513902991947
0.948229920061


In [248]:
fitAndPrint(rf)

0.573318357853
0.935668062429


In [249]:
fitAndPrint(et)

0.557344676128
0.938332698896


In [None]:
#predictions for species + weather
X_species_weather_test = pd.concat([species_dummies_test, test4[x_list]],axis=1)
X_species_weather_test_scaled= StandardScaler().fit_transform(X_species_weather_test)
predictions = et.predict_proba(X_species_weather_test_scaled)[:,1]
sample['WnvPresent'] = predictions
sample.to_csv('species_weather_et.csv', index=False)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_weather_scaled,y)
fitAndPrint(lr)

In [None]:
fitAndPrint(dt)

In [None]:
fitAndPrint(bdt)

In [None]:
fitAndPrint(rf)

In [None]:
fitAndPrint(et)

In [None]:
#predictions for weather
X_weather_test_scaled = StandardScaler().fit_transform(test4[x_list])
predictions = et.predict_proba(X_weather_test_scaled)[:,1]
sample['WnvPresent'] = predictions
sample.to_csv('weather_pred_et.csv', index=False)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_weather_woheat_scaled,y)
fitAndPrint(lr)

In [None]:
fitAndPrint(dt)

In [None]:
fitAndPrint(bdt)

In [None]:
fitAndPrint(rf)

In [None]:
fitAndPrint(et)

In [None]:
#predictions for weather w/o heat
X_weather_woheat_test_scaled = StandardScaler().fit_transform(test4[x1_list])
predictions = et.predict_proba(X_weather_woheat_test_scaled)[:,1]
sample['WnvPresent'] = predictions
sample.to_csv('weather_woheat_et.csv', index=False)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_species_weather_scaled_pca,y)
fitAndPrint(lr)

In [None]:
fitAndPrint(dt)

In [None]:
fitAndPrint(bdt)

In [None]:
fitAndPrint(rf)

In [None]:
fitAndPrint(et)

In [None]:
#PCA1
X_species_weather_test_scaled_pca = pca.fit_transform(X_species_weather_test_scaled)
predictions = et.predict_proba(X_species_weather_test_scaled_pca)[:,1]
sample['WnvPresent'] = predictions
sample.to_csv('species_weather_pca_et.csv', index=False)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_weather_scaled_pca,y)
fitAndPrint(lr)

In [None]:
fitAndPrint(dt)

In [None]:
fitAndPrint(bdt)

In [None]:
fitAndPrint(rf)

In [None]:
fitAndPrint(et)

In [None]:
#PCA2
X_weather_test_scaled_pca = pca.fit_transform(X_weather_test_scaled)
predictions = et.predict_proba(X_weather_test_scaled_pca)[:,1]
sample['WnvPresent'] = predictions
sample.to_csv('weather_pca_et.csv', index=False)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_weather_woheat_scaled_pca,y)
fitAndPrint(lr)

In [None]:
fitAndPrint(dt)

In [None]:
fitAndPrint(bdt)

In [None]:
fitAndPrint(rf)

In [None]:
fitAndPrint(et)

In [None]:
#PCA3
X_weather_woheat_test_scaled_pca = pca.fit_transform(X_weather_woheat_test_scaled)
predictions = rf.predict_proba(X_weather_woheat_test_scaled_pca)[:,1]
sample['WnvPresent'] = predictions
sample.to_csv('weather_woheat_pca_rf.csv', index=False)

In [None]:
# clf = svm.SVC()
# gamma_range = 10.**np.arange(-5, 2)
# C_range = 10.**np.arange(-2, 3)
# kernel_range = ['rbf', 'sigmoid', 'linear', 'poly']
# param_grid = dict(gamma=gamma_range, C=C_range, kernel=kernel_range)
# grid = GridSearchCV(clf, param_grid, cv=10, scoring='accuracy')
# fitAndPrint(grid)