In [73]:
import pandas as pd
import numpy as np
from geopy.distance import vincenty

In [74]:
df = pd.read_csv("../assets/train.csv")
test = pd.read_csv("../assets/test.csv")

In [75]:
df.head(2)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0


In [76]:
df.dtypes

Date                       object
Address                    object
Species                    object
Block                       int64
Street                     object
Trap                       object
AddressNumberAndStreet     object
Latitude                  float64
Longitude                 float64
AddressAccuracy             int64
NumMosquitos                int64
WnvPresent                  int64
dtype: object

In [77]:
df["Date"] = pd.to_datetime(df["Date"])
test["Date"] = pd.to_datetime(test["Date"])

In [78]:
df["Latlong"] = zip(df["Latitude"], df["Longitude"])
test["Latlong"] = zip(test["Latitude"], test["Longitude"])

In [79]:
#Station 1: CHICAGO O'HARE INTERNATIONAL AIRPORT Lat: 41.995 Lon: -87.933 Elev: 662 ft. above sea level
#Station 2: CHICAGO MIDWAY INTL ARPT Lat: 41.786 Lon: -87.752 Elev: 612 ft. above sea level

station_1 = (41.995, -87.933)
station_2 = (41.786, -87.752)

print(vincenty(station_1, station_2).miles)

17.1810367501


In [80]:
def checkDistance(x):
    if vincenty(station_1,x).miles > vincenty(station_2,x).miles:
        return 2
    else:
        return 1

In [81]:
df["Station"] = df["Latlong"].apply(checkDistance)
test["Station"] = test["Latlong"].apply(checkDistance)

In [82]:
weather = pd.read_csv("../assets/weather.csv")

In [83]:
weather.head(2)

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,...,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,...,,0,M,0.0,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,M,51,57,0,3,...,,M,M,M,0.0,29.18,29.82,2.7,25,9.6


In [84]:
weather.dtypes

Station          int64
Date            object
Tmax             int64
Tmin             int64
Tavg            object
Depart          object
DewPoint         int64
WetBulb         object
Heat            object
Cool            object
Sunrise         object
Sunset          object
CodeSum         object
Depth           object
Water1          object
SnowFall        object
PrecipTotal     object
StnPressure     object
SeaLevel        object
ResultSpeed    float64
ResultDir        int64
AvgSpeed        object
dtype: object

In [85]:
weather["Date"] = pd.to_datetime(weather["Date"])

In [86]:
df3 = df.merge(weather,on=["Date","Station"])
test3 = test.merge(weather,on=["Date","Station"])

In [87]:
df3.dtypes

Date                      datetime64[ns]
Address                           object
Species                           object
Block                              int64
Street                            object
Trap                              object
AddressNumberAndStreet            object
Latitude                         float64
Longitude                        float64
AddressAccuracy                    int64
NumMosquitos                       int64
WnvPresent                         int64
Latlong                           object
Station                            int64
Tmax                               int64
Tmin                               int64
Tavg                              object
Depart                            object
DewPoint                           int64
WetBulb                           object
Heat                              object
Cool                              object
Sunrise                           object
Sunset                            object
CodeSum         

In [88]:
def replaceNulls(x):
    if (x == "M") | (x==" "):
        return np.nan
    else:
        return x

In [89]:
df3 = df3.applymap(replaceNulls)
test3 = test3.applymap(replaceNulls)

In [90]:
df3.isnull().sum()

Date                          0
Address                       0
Species                       0
Block                         0
Street                        0
Trap                          0
AddressNumberAndStreet        0
Latitude                      0
Longitude                     0
AddressAccuracy               0
NumMosquitos                  0
WnvPresent                    0
Latlong                       0
Station                       0
Tmax                          0
Tmin                          0
Tavg                          0
Depart                     7208
DewPoint                      0
WetBulb                      36
Heat                          0
Cool                          0
Sunrise                       0
Sunset                        0
CodeSum                    5941
Depth                      7208
Water1                    10506
SnowFall                   7208
PrecipTotal                  24
StnPressure                  36
SeaLevel                      0
ResultSp

In [91]:
len(df3)

10506

In [92]:
#drop water -> all data is 0
#drop Depart, Depth, SnowFall (69% of data missing)

In [93]:
df3.drop(["Depart","Depth","SnowFall","Water1"],axis=1, inplace=True)
test3.drop(["Depart","Depth","SnowFall","Water1"],axis=1, inplace=True)

In [94]:
df3.dtypes

Date                      datetime64[ns]
Address                           object
Species                           object
Block                              int64
Street                            object
Trap                              object
AddressNumberAndStreet            object
Latitude                         float64
Longitude                        float64
AddressAccuracy                    int64
NumMosquitos                       int64
WnvPresent                         int64
Latlong                           object
Station                            int64
Tmax                               int64
Tmin                               int64
Tavg                              object
DewPoint                           int64
WetBulb                           object
Heat                              object
Cool                              object
Sunrise                           object
Sunset                            object
CodeSum                           object
PrecipTotal     

In [95]:
sample = pd.read_csv('../Assets/sampleSubmission.csv')

In [96]:
labels = df3.WnvPresent.values

In [100]:
#random forest
X = df3[['Station','Latitude','Longitude','DewPoint']]
y = df3['WnvPresent']
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X,labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [101]:
# predictions
test=test3[['Station','Latitude','Longitude','DewPoint']]
predictions = rf.predict_proba(test)[:,1]
sample['WnvPresent'] = predictions

In [102]:
sample

Unnamed: 0,Id,WnvPresent
0,1,0.344340
1,2,0.344340
2,3,0.344340
3,4,0.344340
4,5,0.344340
5,6,0.344340
6,7,0.344340
7,8,0.344340
8,9,0.000000
9,10,0.000000


In [103]:
#creating a csv file
sample.to_csv('trial1.csv', index=False)