In [56]:
import pandas as pd
import numpy as np
from geopy.distance import vincenty

In [57]:
df = pd.read_csv("../assets/train.csv")
test = pd.read_csv("../assets/test.csv")

In [58]:
df.head(2)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0


In [59]:
df.dtypes

Date                       object
Address                    object
Species                    object
Block                       int64
Street                     object
Trap                       object
AddressNumberAndStreet     object
Latitude                  float64
Longitude                 float64
AddressAccuracy             int64
NumMosquitos                int64
WnvPresent                  int64
dtype: object

In [60]:
df.isnull().sum()

Date                      0
Address                   0
Species                   0
Block                     0
Street                    0
Trap                      0
AddressNumberAndStreet    0
Latitude                  0
Longitude                 0
AddressAccuracy           0
NumMosquitos              0
WnvPresent                0
dtype: int64

In [61]:
df["Date"] = pd.to_datetime(df["Date"])
test["Date"] = pd.to_datetime(test["Date"])

In [62]:
df["Latlong"] = zip(df["Latitude"], df["Longitude"])
test["Latlong"] = zip(test["Latitude"], test["Longitude"])

In [63]:
#Station 1: CHICAGO O'HARE INTERNATIONAL AIRPORT Lat: 41.995 Lon: -87.933 Elev: 662 ft. above sea level
#Station 2: CHICAGO MIDWAY INTL ARPT Lat: 41.786 Lon: -87.752 Elev: 612 ft. above sea level

station_1 = (41.995, -87.933)
station_2 = (41.786, -87.752)

print(vincenty(station_1, station_2).miles)

17.1810367501


In [64]:
def checkDistance(x):
    if vincenty(station_1,x).miles > vincenty(station_2,x).miles:
        return 2
    else:
        return 1

In [65]:
df["Station"] = df["Latlong"].apply(checkDistance)
test["Station"] = test["Latlong"].apply(checkDistance)

In [66]:
weather = pd.read_csv("../assets/weather.csv")
spray = pd.read_csv("../assets/spray.csv")

In [24]:
weather.head(2)

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,...,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,...,,0,M,0.0,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,M,51,57,0,3,...,,M,M,M,0.0,29.18,29.82,2.7,25,9.6


In [25]:
spray.head(2)

Unnamed: 0,Date,Time,Latitude,Longitude
0,2011-08-29,6:56:58 PM,42.391623,-88.089163
1,2011-08-29,6:57:08 PM,42.391348,-88.089163


In [26]:
spray["Latlong"] = zip(spray["Latitude"], spray["Longitude"])

In [27]:
weather.dtypes

Station          int64
Date            object
Tmax             int64
Tmin             int64
Tavg            object
Depart          object
DewPoint         int64
WetBulb         object
Heat            object
Cool            object
Sunrise         object
Sunset          object
CodeSum         object
Depth           object
Water1          object
SnowFall        object
PrecipTotal     object
StnPressure     object
SeaLevel        object
ResultSpeed    float64
ResultDir        int64
AvgSpeed        object
dtype: object

In [28]:
weather["Date"] = pd.to_datetime(weather["Date"])

In [29]:
df3 = df.merge(weather,on=["Date","Station"])
test3 = test.merge(weather,on=["Date","Station"])

In [30]:
df3.dtypes

Date                      datetime64[ns]
Address                           object
Species                           object
Block                              int64
Street                            object
Trap                              object
AddressNumberAndStreet            object
Latitude                         float64
Longitude                        float64
AddressAccuracy                    int64
NumMosquitos                       int64
WnvPresent                         int64
Latlong                           object
Station                            int64
Tmax                               int64
Tmin                               int64
Tavg                              object
Depart                            object
DewPoint                           int64
WetBulb                           object
Heat                              object
Cool                              object
Sunrise                           object
Sunset                            object
CodeSum         

In [88]:
def replaceNulls(x):
    if (x == "M") | (x==" ") | (x=="-"):
        return np.nan
    else:
        return x

In [89]:
df3 = df3.applymap(replaceNulls)
test3 = test3.applymap(replaceNulls)
weather = weather.applymap(replaceNulls)

In [68]:
df3.isnull().sum()

Date                      0
Address                   0
Species                   0
Block                     0
Street                    0
Trap                      0
AddressNumberAndStreet    0
Latitude                  0
Longitude                 0
AddressAccuracy           0
WnvPresent                0
Latlong                   0
Station                   0
Tmax                      0
Tmin                      0
Tavg                      0
DewPoint                  0
Heat                      0
Cool                      0
Sunrise                   0
Sunset                    0
SeaLevel                  0
ResultSpeed               0
ResultDir                 0
AvgSpeed                  0
dtype: int64

In [34]:
len(df3)

10506

In [None]:
X

In [35]:
#drop water -> all data is 0
#drop Depart, Depth, SnowFall (69% of data missing)

In [36]:
df3.drop(["Depart","Depth","SnowFall","Water1","CodeSum","WetBulb","PrecipTotal","StnPressure"],axis=1, inplace=True)
test3.drop(["Depart","Depth","SnowFall","Water1","CodeSum","WetBulb","PrecipTotal","StnPressure"],axis=1, inplace=True)

In [69]:
weather.drop(["Depart","Depth","SnowFall","Water1","CodeSum","WetBulb","PrecipTotal","StnPressure"],axis=1, inplace=True)

In [37]:
df3.isnull().sum()

Date                      0
Address                   0
Species                   0
Block                     0
Street                    0
Trap                      0
AddressNumberAndStreet    0
Latitude                  0
Longitude                 0
AddressAccuracy           0
NumMosquitos              0
WnvPresent                0
Latlong                   0
Station                   0
Tmax                      0
Tmin                      0
Tavg                      0
DewPoint                  0
Heat                      0
Cool                      0
Sunrise                   0
Sunset                    0
SeaLevel                  0
ResultSpeed               0
ResultDir                 0
AvgSpeed                  0
dtype: int64

In [70]:
weather.isnull().sum()

Station         0
Date            0
Tmax            0
Tmin            0
Tavg           11
DewPoint        0
Heat           11
Cool           11
Sunrise         0
Sunset          0
SeaLevel        9
ResultSpeed     0
ResultDir       0
AvgSpeed        3
dtype: int64

In [38]:
def replaceTrace(x):
    if (x == "T"):
        return .00001
    else:
        return x

In [120]:
df3 = df3.applymap(replaceTrace).copy(deep=True)
test=test3.applymap(replaceTrace).copy(deep=True)

In [41]:
df3.drop("NumMosquitos",axis=1,inplace=True)

In [233]:
species_dummies = pd.get_dummies(df4.Species)
species_dummies_test = pd.get_dummies(test4.Species)

In [269]:
species_dummies.reset_index(drop=True,inplace=True)
species_dummies_test.reset_index(drop=True,inplace=True)

In [262]:
len(species_dummies)

10506

In [261]:
len(df4[x_list])

10506

In [236]:
species_dummies_test.drop(["UNSPECIFIED CULEX","CULEX TARSALIS","CULEX ERRATICUS"],axis=1,inplace=True)

In [237]:
species_dummies.drop(["CULEX TARSALIS","CULEX ERRATICUS"],axis=1,inplace=True)

In [213]:
df4.Species.value_counts()

CULEX PIPIENS/RESTUANS    4752
CULEX RESTUANS            2740
CULEX PIPIENS             2699
CULEX TERRITANS            222
CULEX SALINARIUS            86
CULEX TARSALIS               6
CULEX ERRATICUS              1
Name: Species, dtype: int64

In [214]:
test4.Species.value_counts()

CULEX PIPIENS/RESTUANS    15359
CULEX RESTUANS            14670
CULEX PIPIENS             14521
CULEX SALINARIUS          14355
CULEX TERRITANS           14351
CULEX TARSALIS            14347
CULEX ERRATICUS           14345
UNSPECIFIED CULEX         14345
Name: Species, dtype: int64

In [77]:
x_list = (weather.columns - ["Date"]).tolist()

  if __name__ == '__main__':


In [121]:
df4 = df3.sort_values(by="Date").ffill().copy(deep=True)
test4 = test.sort_values(by="Date").ffill().copy(deep=True)

In [265]:
X = pd.concat([species_dummies, df4[x_list]],axis=1)

In [373]:
X_scaled = StandardScaler().fit_transform(df4[x_list])

In [315]:
type(X_scaled)

numpy.ndarray

In [348]:
x1_list = ['AvgSpeed',
 'Cool',
 'DewPoint',
 'ResultDir',
 'ResultSpeed',
 'SeaLevel',
 'Station',
 'Sunrise',
 'Sunset',
 'Tavg',
 'Tmax',
 'Tmin']

In [369]:
X1 = StandardScaler().fit_transform(df4[x1_list])

In [371]:
X_train, X_test, y_train, y_test = train_test_split(X1,y)

In [294]:
from sklearn.preprocessing import StandardScaler
X_scaled= StandardScaler().fit_transform(X)

In [370]:
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
X1 = pca.fit_transform(X1)

In [374]:
X_scaled = pca.fit_transform(X_scaled)

In [254]:
y = df4["WnvPresent"]

In [190]:
len(y)

10506

In [291]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, BaggingClassifier

In [191]:
cv = StratifiedKFold(y, n_folds=5,shuffle=True)

In [318]:
dt = DecisionTreeClassifier()
bdt = BaggingClassifier(DecisionTreeClassifier())
rf = RandomForestClassifier()
et = ExtraTreesClassifier()

In [301]:
def fitAndPrint(model):
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    print accuracy_score(y_test,y_pred)

In [365]:
fitAndPrint(dt)

0.942900647126


In [220]:
rf = RandomForestClassifier()

In [276]:
lr = LogisticRegression()

In [372]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test,y_pred)

0.94594594594594594

In [367]:
np.mean(cross_val_score(rf,X_scaled, y, cv=cv))

ValueError: Found arrays with inconsistent numbers of samples: [ 10506 116293]

In [375]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled,y)

In [376]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test,y_pred)

0.94366197183098588

In [368]:
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
accuracy_score(y_test,y_pred)

0.94290064712599919

In [287]:
for i in range(2,10):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    print i, accuracy_score(y_test,y_pred)

2 0.940236010659
3 0.931100114199
4 0.940236010659
5 0.933764750666
6 0.945184621241
7 0.940236010659
8 0.947087933003
9 0.947087933003


In [288]:
knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy_score(y_test,y_pred)

0.94708793300342597

In [303]:
for a,b in zip(rf.feature_importances_,X.columns):
    print a, b

0.0532596418185 AvgSpeed
0.0441461374067 Cool
0.140950905872 DewPoint
0.000698445020753 Heat
0.0382380498337 ResultDir
0.057908953043 ResultSpeed
0.0606292326948 SeaLevel
0.0467918148297 Station
0.185763678463 Sunrise
0.174714540847 Sunset
0.0333239560426 Tavg
0.0686349575194 Tmax
0.0949396866088 Tmin


In [178]:
from sklearn import svm

clf = svm.SVC(probability=True)

X_scaled = StandardScaler().fit_transform(X_cut)
X_train, X_test, y_train, y_test = train_test_split(X_scaled,y)

In [197]:
np.mean(cross_val_score(clf,X_scaled, y, cv=cv))

0.94755381312238163

In [179]:
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test,y_pred)

0.94328130947849254

In [270]:
#predictions
X = pd.concat([test4[x_list],species_dummies_test],axis=1)
X_scaled= StandardScaler().fit_transform(X)
predictions = rf.predict_proba(X_scaled)[:,1]
sample['WnvPresent'] = predictions

In [279]:
#predictions
X = test4[x_list]
X_scaled= StandardScaler().fit_transform(X)
predictions = lr.predict_proba(X_scaled)[:,1]
sample['WnvPresent'] = predictions

In [379]:
#predictions
X = test4[x_list]
X_scaled= StandardScaler().fit_transform(X)
X_scaled = pca.fit_transform(X_scaled)
predictions = rf.predict_proba(X_scaled)[:,1]
sample['WnvPresent'] = predictions

In [358]:
#predictions
X = test4[x1_list]
X_scaled= StandardScaler().fit_transform(X)
#X_pca = pca.fit_transform(X_scaled)
predictions = lr.predict_proba(X_scaled)[:,1]
sample['WnvPresent'] = predictions

In [289]:
#predictions
X = test4[x_list]
X_scaled= StandardScaler().fit_transform(X)
predictions = knn.predict_proba(X_scaled)[:,1]
sample['WnvPresent'] = predictions

In [180]:
X = test4[["Sunset","Sunrise"]]
X_scaled= StandardScaler().fit_transform(X)
predictions = clf.predict_proba(X_scaled)[:,1]
sample['WnvPresent'] = predictions

In [132]:
len(predictions)

116293

In [280]:
sample

Unnamed: 0,Id,WnvPresent
0,1,0.020280
1,2,0.011183
2,3,0.011183
3,4,0.011183
4,5,0.011183
5,6,0.011183
6,7,0.011183
7,8,0.011183
8,9,0.011183
9,10,0.011183


In [113]:
sample = pd.read_csv('../Assets/sampleSubmission.csv')

In [24]:
df3.head()

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,...,Cool,Sunrise,Sunset,CodeSum,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,...,9,421,1917,BR HZ,0.0,29.39,30.11,5.8,18,6.5
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,...,9,421,1917,BR HZ,0.0,29.39,30.11,5.8,18,6.5
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,...,9,421,1917,BR HZ,0.0,29.39,30.11,5.8,18,6.5
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,...,9,421,1917,BR HZ,0.0,29.39,30.11,5.8,18,6.5
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,...,9,421,1917,BR HZ,0.0,29.39,30.11,5.8,18,6.5


In [114]:
labels = df3.WnvPresent.values

In [115]:
X = df3[['Station','Latitude','Longitude','DewPoint']]
y = labels

from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(X,y)
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)

rf.score(X, y[, sample_weight])

SyntaxError: invalid syntax (<ipython-input-115-9e45a00d5be4>, line 12)

In [116]:
#random forest
X = df3[['Station','Latitude','Longitude','DewPoint']]
y = df3['WnvPresent']
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X,labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [38]:
#sheena
#random forest
X = df3[['Station','Latitude','Longitude','DewPoint']]
y = df3['WnvPresent']
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y)
rf = RandomForestClassifier()
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [39]:
y_pred = rf.predict(X_test)

2627

In [30]:
rf.feature_importances_

array([ 0.00630645,  0.25998343,  0.24072007,  0.49299004])

In [118]:
# predictions
test=test3[['Station','Latitude','Longitude','DewPoint']]
predictions = rf.predict_proba(test)[:,1]
sample['WnvPresent'] = predictions

In [133]:
len(sample)

116293

In [380]:
#creating a csv file
sample.to_csv('trialrfpca.csv', index=False)

In [37]:
from sklearn.metrics import confusion_matrix

In [None]:
confusion_matrix(predictions,)