# Project Steps:
#### 1.Import data and python functions
#### 2.Run null model: calculate the percetage(%) of WMV
#### 3.Pick 5 features and use KNN/Logistic regression to get the model
#### 4. Creat confusion matrix: Recall, Precision, Accuracy
#### 5.pick another 5 features and use Knn/Logsitic regression to get the model

## Import data and Python functions

In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore') #help ignore warning, make it more clean
pd.set_option('display.max_columns', None)


# import train data
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
spray=pd.read_csv('spray.csv')

In [50]:
train.head(2)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0


In [51]:
print 'Train Table - Num of Rows: '+str(train.shape[0])
print 'Train Table - Num of Columns: ' + str(train.shape[1])
print 'Test Table - Num of Rows: ' + str(test.shape[0])
print 'Test Table - Num of Columns: ' + str(test.shape[0])
print 'Spray Table - Num of Rows: '+str(spray.shape[0])
print 'Spray Table - Num of Columns: ' + str(spray.shape[1])

Train Table - Num of Rows: 10506
Train Table - Num of Columns: 12
Test Table - Num of Rows: 116293
Test Table - Num of Columns: 116293
Spray Table - Num of Rows: 14835
Spray Table - Num of Columns: 4


In [52]:
train.columns

Index([u'Date', u'Address', u'Species', u'Block', u'Street', u'Trap',
       u'AddressNumberAndStreet', u'Latitude', u'Longitude',
       u'AddressAccuracy', u'NumMosquitos', u'WnvPresent'],
      dtype='object')

In [53]:
train.dtypes

Date                       object
Address                    object
Species                    object
Block                       int64
Street                     object
Trap                       object
AddressNumberAndStreet     object
Latitude                  float64
Longitude                 float64
AddressAccuracy             int64
NumMosquitos                int64
WnvPresent                  int64
dtype: object

In [54]:
spray.head(2)

Unnamed: 0,Date,Time,Latitude,Longitude
0,2011-08-29,6:56:58 PM,42.391623,-88.089163
1,2011-08-29,6:57:08 PM,42.391348,-88.089163


In [55]:
test.head(2)

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9


### Run null model: calculate the percetage(%) of WMV


In [56]:
# check the null value
train.WnvPresent.isnull().sum()


0

In [57]:
# run the null model
train_null= train.WnvPresent.mean()
train_null

0.05244622120692937

In [58]:
# add the prediction value to train datafram as MnVpresent_null
train['MnVpresent_null']=train.WnvPresent.map({0:0.0524, 1:0.0524})

In [59]:
test.Species.value_counts()

CULEX PIPIENS/RESTUANS    15359
CULEX RESTUANS            14670
CULEX PIPIENS             14521
CULEX SALINARIUS          14355
CULEX TERRITANS           14351
CULEX TARSALIS            14347
CULEX ERRATICUS           14345
UNSPECIFIED CULEX         14345
Name: Species, dtype: int64

### Pick up 5 Features to Run Model 2

#### Pick the first feature: Species--the species of mosquitos

In [60]:
train.groupby('Species').sum() #there are 7 types of Mosquito, three types of them have virus

Unnamed: 0_level_0,Block,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,MnVpresent_null
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CULEX ERRATICUS,10,41.974689,-87.890615,9,7,0,0.0524
CULEX PIPIENS,88527,112854.372337,-236649.617578,20040,44671,240,141.4276
CULEX PIPIENS/RESTUANS,174429,198865.648503,-416775.307552,37729,66268,262,249.0048
CULEX RESTUANS,100789,114693.922296,-240334.622989,21974,23431,49,143.576
CULEX SALINARIUS,2806,3596.961565,-7541.6658,668,145,0,4.5064
CULEX TARSALIS,183,250.908138,-525.969405,45,7,0,0.3144
CULEX TERRITANS,8192,9279.22185,-19460.159333,1687,510,0,11.6328


In [61]:
# convert the string value of species to categorical variables 
train['Species']=train['Species'].astype('category')
train.dtypes

Date                        object
Address                     object
Species                   category
Block                        int64
Street                      object
Trap                        object
AddressNumberAndStreet      object
Latitude                   float64
Longitude                  float64
AddressAccuracy              int64
NumMosquitos                 int64
WnvPresent                   int64
MnVpresent_null            float64
dtype: object

In [62]:
# code the species to categorical numbers and add the new variable into train dataset
train['Species_c']=train['Species'].cat.codes
train.head(5)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,MnVpresent_null,Species_c
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,0.0524,2
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,0.0524,3
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0,0.0524,3
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,0,0.0524,2
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,0,0.0524,3


### Run Supervised Learning Models 

### Model Training

### Train-Test Split

In [80]:
# Scale the data
feature_cols=['Species_c']

X=train[feature_cols]
y=train.WnvPresent


from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X=scaler.fit_transform(X)

In [81]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
LR.predict_proba(x)[0:20]

array([[ 0.95349657,  0.04650343],
       [ 0.97883192,  0.02116808],
       [ 0.97883192,  0.02116808],
       [ 0.95349657,  0.04650343],
       [ 0.97883192,  0.02116808],
       [ 0.97883192,  0.02116808],
       [ 0.97883192,  0.02116808],
       [ 0.95349657,  0.04650343],
       [ 0.97883192,  0.02116808],
       [ 0.97883192,  0.02116808],
       [ 0.95349657,  0.04650343],
       [ 0.95349657,  0.04650343],
       [ 0.97883192,  0.02116808],
       [ 0.95349657,  0.04650343],
       [ 0.97883192,  0.02116808],
       [ 0.97883192,  0.02116808],
       [ 0.97883192,  0.02116808],
       [ 0.97883192,  0.02116808],
       [ 0.90090794,  0.09909206],
       [ 0.95349657,  0.04650343]])

In [82]:
# calculate Accuracy Score of using Logistic Regression
np.mean(y_test==y_pred)

0.94635131237381021

### K-Fold Cross-validation

In [83]:
from sklearn.cross_validation import KFold
#This program does 5-fold. It saves the result at each time as different parts of y_pred. 
#In the end, it returns the y_pred as the result of all the five 5-fold.

def run_cv(X, y, clf_class, **kwargs):
    #construct a kfolds object
    kf=KFold(len(y), n_folds=5, shuffle=True)
    y_pred=y.copy()
    clf=clf_class(**kwargs)
    #Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test=X[train_index], X[test_index]
        y_train=y[train_index]
        clf.fit(X_train, y_train)
        y_pred[test_index]=clf.predict(X_test)
    return y_pred
        

In [84]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

def accuracy(y_true, y_pred):
    return np.mean(y_true==y_pred) # numpy interprets True and False as 1 and 0

LR_CV_result=run_cv(X, y, LogisticRegression)
RF_CV_result=run_cv(X, y, RandomForestClassifier)

### RandomForest

In [85]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics


RFC=RandomForestClassifier()
RFC.fit(x, y)
pred=LR.predict(x)
y_pred=RFC.predict(x_test)


In [86]:
# calculate Accuracy Score of using RandomForest
np.mean(y_pred==y_test)

0.94635131237381021

### KNN 

In [34]:
# fit the model with data
knn=KNeighborsClassifier()
knn.fit(x, y)
# store the predicted response values.
MnV_Knn_predict=knn.predict(x)
knn.predict_proba(x)

array([[ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       ..., 
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.]])

In [35]:
# store the predicted probabilities of MNVpredcition to train data.
train['MnV_Knn_predict']=knn.predict_proba(x)[:, 1]
train.head(5)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,MnVpresent_null,Species_c,MnV_Species_pred_Prob,MnV_Knn_predict
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,0.0524,2,0.046503,0.0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,0.0524,3,0.021168,0.0
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0,0.0524,3,0.021168,0.0
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,0,0.0524,2,0.046503,0.0
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,0,0.0524,3,0.021168,0.0


In [36]:
train.groupby('Species').sum()

Unnamed: 0_level_0,Block,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,MnVpresent_null,Species_c,MnV_Species_pred_Prob,MnV_Knn_predict
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
CULEX ERRATICUS,10,41.974689,-87.890615,9,7,0,0.0524,0.0,0.198754,0.0
CULEX PIPIENS,88527,112854.372337,-236649.617578,20040,44671,240,141.4276,2699.0,267.449461,0.0
CULEX PIPIENS/RESTUANS,174429,198865.648503,-416775.307552,37729,66268,262,249.0048,9504.0,220.984317,0.0
CULEX RESTUANS,100789,114693.922296,-240334.622989,21974,23431,49,143.576,8220.0,58.00054,0.0
CULEX SALINARIUS,2806,3596.961565,-7541.6658,668,145,0,4.5064,344.0,0.816835,0.0
CULEX TARSALIS,183,250.908138,-525.969405,45,7,0,0.3144,30.0,0.025404,0.0
CULEX TERRITANS,8192,9279.22185,-19460.159333,1687,510,0,11.6328,1332.0,0.417764,0.0


In [37]:
test.head(5)

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
2,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
3,4,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
4,5,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9


In [38]:
train['Species']=train['Species'].astype('category')
train.head()

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,MnVpresent_null,Species_c,MnV_Species_pred_Prob,MnV_Knn_predict
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,0.0524,2,0.046503,0.0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,0.0524,3,0.021168,0.0
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0,0.0524,3,0.021168,0.0
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,0,0.0524,2,0.046503,0.0
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,0,0.0524,3,0.021168,0.0
