In [27]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.tree import ExtraTreeClassifier

In [28]:
combine = pd.read_csv('../data/combine.csv')

In [29]:
combine.head()

Unnamed: 0,Date,Tmax,Tmin,Tavg,DewPoint,WetBulb,Heat,Cool,PrecipTotal,StnPressure,...,ResultSpeed,ResultDir,AvgSpeed,Species,Trap,AddressNumberAndStreet,Latitude,Longitude,NumMosquitos,WnvPresent
0,2007-05-29,88,60,74.0,58,65,0,9,0.0,29.39,...,5.8,18,6.5,CULEX PIPIENS/RESTUANS,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,1,0
1,2007-05-29,88,60,74.0,58,65,0,9,0.0,29.39,...,5.8,18,6.5,CULEX RESTUANS,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,1,0
2,2007-05-29,88,60,74.0,58,65,0,9,0.0,29.39,...,5.8,18,6.5,CULEX RESTUANS,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,1,0
3,2007-05-29,88,60,74.0,58,65,0,9,0.0,29.39,...,5.8,18,6.5,CULEX PIPIENS/RESTUANS,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,1,0
4,2007-05-29,88,60,74.0,58,65,0,9,0.0,29.39,...,5.8,18,6.5,CULEX RESTUANS,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,4,0


In [30]:
combine.dtypes

Date                       object
Tmax                        int64
Tmin                        int64
Tavg                      float64
DewPoint                    int64
WetBulb                     int64
Heat                        int64
Cool                        int64
PrecipTotal               float64
StnPressure               float64
SeaLevel                  float64
ResultSpeed               float64
ResultDir                   int64
AvgSpeed                  float64
Species                    object
Trap                       object
AddressNumberAndStreet     object
Latitude                  float64
Longitude                 float64
NumMosquitos                int64
WnvPresent                  int64
dtype: object

In [31]:
combine.Date = pd.to_datetime(combine.Date)

In [32]:
combine.drop(['Date'], axis=1, inplace=True)

In [33]:
combine.drop(['NumMosquitos'], axis=1, inplace=True)

In [34]:
combine.drop(['AddressNumberAndStreet'], axis=1, inplace=True)

In [35]:
combine_dum = pd.get_dummies(combine)

#### Setting up my X and y:

In [36]:
X = combine_dum.drop(['WnvPresent'], axis=1)
y = combine_dum.WnvPresent

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [38]:
et = ExtraTreeClassifier(random_state=42)

In [39]:
params = {
        'criterion' : ['gini', 'entropy'],
        'min_samples_split' : [2, 4, 6],
        'min_weight_fraction_leaf' : [0.0, 0.2, 0.4],
        'min_impurity_decrease' : [0.0, 0.2, 0.4]
}

In [40]:
gs = GridSearchCV(et, param_grid=params)

In [41]:
gs.fit(X_train, y_train)
print(gs.best_params_)
print(gs.best_score_)
print(gs.score(X_train, y_train))
print(gs.score(X_test, y_test))

{'criterion': 'gini', 'min_impurity_decrease': 0.0, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.2}
0.9475821804797563
0.9475821804797563
0.9474685953559193


#### Awesome! Let's submit to Kaggle ; )

In [42]:
combine2 = pd.read_csv('../data/combine2.csv')

In [43]:
combine.head()

Unnamed: 0,Tmax,Tmin,Tavg,DewPoint,WetBulb,Heat,Cool,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,Species,Trap,Latitude,Longitude,WnvPresent
0,88,60,74.0,58,65,0,9,0.0,29.39,30.11,5.8,18,6.5,CULEX PIPIENS/RESTUANS,T002,41.95469,-87.800991,0
1,88,60,74.0,58,65,0,9,0.0,29.39,30.11,5.8,18,6.5,CULEX RESTUANS,T002,41.95469,-87.800991,0
2,88,60,74.0,58,65,0,9,0.0,29.39,30.11,5.8,18,6.5,CULEX RESTUANS,T007,41.994991,-87.769279,0
3,88,60,74.0,58,65,0,9,0.0,29.39,30.11,5.8,18,6.5,CULEX PIPIENS/RESTUANS,T015,41.974089,-87.824812,0
4,88,60,74.0,58,65,0,9,0.0,29.39,30.11,5.8,18,6.5,CULEX RESTUANS,T015,41.974089,-87.824812,0


In [44]:
combine2.head()

Unnamed: 0,Date,Station,Tmax,Tmin,Tavg,DewPoint,WetBulb,Heat,Cool,PrecipTotal,...,Id,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy
0,2008-06-11,1,86,61,74.0,56,64,0,9,0.0,...,1,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
1,2008-06-11,1,86,61,74.0,56,64,0,9,0.0,...,2,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
2,2008-06-11,1,86,61,74.0,56,64,0,9,0.0,...,3,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
3,2008-06-11,1,86,61,74.0,56,64,0,9,0.0,...,4,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
4,2008-06-11,1,86,61,74.0,56,64,0,9,0.0,...,5,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9


In [45]:
combine2.drop(['Date', 'Id','Station', 'Street', 'Address', 'AddressAccuracy', 'AddressNumberAndStreet', 'Block'], axis=1, inplace=True)

In [46]:
combine2.dtypes

Tmax             int64
Tmin             int64
Tavg           float64
DewPoint         int64
WetBulb          int64
Heat             int64
Cool             int64
PrecipTotal    float64
StnPressure    float64
SeaLevel       float64
ResultSpeed    float64
ResultDir        int64
AvgSpeed       float64
Species         object
Trap            object
Latitude       float64
Longitude      float64
dtype: object

In [47]:
combine.dtypes

Tmax             int64
Tmin             int64
Tavg           float64
DewPoint         int64
WetBulb          int64
Heat             int64
Cool             int64
PrecipTotal    float64
StnPressure    float64
SeaLevel       float64
ResultSpeed    float64
ResultDir        int64
AvgSpeed       float64
Species         object
Trap            object
Latitude       float64
Longitude      float64
WnvPresent       int64
dtype: object

In [48]:
combine2_dum = pd.get_dummies(combine2)

In [49]:
test_drop = list(set(combine2_dum.columns) - set(X.columns))

In [51]:
test_drop

['Trap_T065A',
 'Trap_T200B',
 'Trap_T218C',
 'Trap_T128A',
 'Trap_T090B',
 'Trap_T200A',
 'Trap_T234',
 'Trap_T090A',
 'Trap_T218A',
 'Trap_T002B',
 'Trap_T218B',
 'Species_UNSPECIFIED CULEX',
 'Trap_T002A',
 'Trap_T090C']

In [53]:
X_2 = combine2_dum.drop(test_drop, axis=1)

In [54]:
X_2

Unnamed: 0,Tmax,Tmin,Tavg,DewPoint,WetBulb,Heat,Cool,PrecipTotal,StnPressure,SeaLevel,...,Trap_T230,Trap_T231,Trap_T232,Trap_T233,Trap_T235,Trap_T236,Trap_T237,Trap_T238,Trap_T900,Trap_T903
0,86,61,74.0,56,64,0,9,0.00,29.28,29.99,...,0,0,0,0,0,0,0,0,0,0
1,86,61,74.0,56,64,0,9,0.00,29.28,29.99,...,0,0,0,0,0,0,0,0,0,0
2,86,61,74.0,56,64,0,9,0.00,29.28,29.99,...,0,0,0,0,0,0,0,0,0,0
3,86,61,74.0,56,64,0,9,0.00,29.28,29.99,...,0,0,0,0,0,0,0,0,0,0
4,86,61,74.0,56,64,0,9,0.00,29.28,29.99,...,0,0,0,0,0,0,0,0,0,0
5,86,61,74.0,56,64,0,9,0.00,29.28,29.99,...,0,0,0,0,0,0,0,0,0,0
6,86,61,74.0,56,64,0,9,0.00,29.28,29.99,...,0,0,0,0,0,0,0,0,0,0
7,86,61,74.0,56,64,0,9,0.00,29.28,29.99,...,0,0,0,0,0,0,0,0,0,0
8,86,61,74.0,56,64,0,9,0.00,29.28,29.99,...,0,0,0,0,0,0,0,0,0,0
9,86,61,74.0,56,64,0,9,0.00,29.28,29.99,...,0,0,0,0,0,0,0,0,0,0


In [55]:
predictions = gs.predict_proba(X_2)[:,1]

In [56]:
sample = pd.read_csv('../../input/sampleSubmission.csv')

In [57]:
sample.WnvPresent = predictions

In [58]:
sample.to_csv('../data/et.csv', index=False)