# Project Steps:
#### 1. Import data and Python functions
#### 2. Run null model: calculate the percentage (%) of WNV
#### 3. Pick 5 features and use KNN/logistic regression to get the model
#### 4. Pick another 5 features and use KNN/logistic regression to get the model

## Import data and Python functions

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore') #help ignore warning, make it more clean


# Load the train, test and spray data sets from CSV files in the working directory
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
spray=pd.read_csv('spray.csv')
# Display the (rows, columns) shape of the training frame
train.shape

(10506, 12)

In [10]:
# List the column names of the training data
train.columns

Index([u'Date', u'Address', u'Species', u'Block', u'Street', u'Trap',
       u'AddressNumberAndStreet', u'Latitude', u'Longitude',
       u'AddressAccuracy', u'NumMosquitos', u'WnvPresent'],
      dtype='object')

In [11]:
# Inspect the dtype pandas inferred for each training column
train.dtypes

Date                       object
Address                    object
Species                    object
Block                       int64
Street                     object
Trap                       object
AddressNumberAndStreet     object
Latitude                  float64
Longitude                 float64
AddressAccuracy             int64
NumMosquitos                int64
WnvPresent                  int64
dtype: object

## Run null model: calculate the percentage (%) of WNV


In [12]:
# Count missing values in the target column WnvPresent
train.WnvPresent.isnull().sum()


0

In [13]:
# Null model: the mean of the WnvPresent target is the share of positive rows,
# which serves as a constant baseline prediction
train_null = train['WnvPresent'].mean()
train_null

0.05244622120692937

In [6]:
# Add the null-model prediction to the train dataframe as MnVpresent_null.
# Every row gets the same value: the WnvPresent mean held in train_null,
# rounded to 4 decimals (0.0524) instead of being re-typed by hand in a lookup dict.
train['MnVpresent_null'] = round(train_null, 4)
train.head(5)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,MnVpresent_null
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,0.0524
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,0.0524
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0,0.0524
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,0,0.0524
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,0,0.0524


In [7]:
# Preview the first five rows of the test set
test.head(5)

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
2,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
3,4,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
4,5,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9


In [8]:
# Count the test rows per Species value
test.Species.value_counts()

CULEX PIPIENS/RESTUANS    15359
CULEX RESTUANS            14670
CULEX PIPIENS             14521
CULEX SALINARIUS          14355
CULEX TERRITANS           14351
CULEX TARSALIS            14347
CULEX ERRATICUS           14345
UNSPECIFIED CULEX         14345
Name: Species, dtype: int64

## Pick 5 Features to Run Model 2

#### Pick the first feature: Species -- the species of mosquitoes

In [9]:
# Total the numeric columns per species: train has 7 mosquito species,
# and three of them have WNV-positive rows (non-zero WnvPresent sum).
# numeric_only=True keeps text columns (Date, Address, ...) out of the sum.
train.groupby('Species').sum(numeric_only=True)

Unnamed: 0_level_0,Block,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,MnVpresent_null
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CULEX ERRATICUS,10,41.974689,-87.890615,9,7,0,0.0524
CULEX PIPIENS,88527,112854.372337,-236649.617578,20040,44671,240,141.4276
CULEX PIPIENS/RESTUANS,174429,198865.648503,-416775.307552,37729,66268,262,249.0048
CULEX RESTUANS,100789,114693.922296,-240334.622989,21974,23431,49,143.576
CULEX SALINARIUS,2806,3596.961565,-7541.6658,668,145,0,4.5064
CULEX TARSALIS,183,250.908138,-525.969405,45,7,0,0.3144
CULEX TERRITANS,8192,9279.22185,-19460.159333,1687,510,0,11.6328


In [10]:
# Store Species as a pandas categorical instead of plain strings
train['Species'] = train.Species.astype('category')
train.dtypes

Date                        object
Address                     object
Species                   category
Block                        int64
Street                      object
Trap                        object
AddressNumberAndStreet      object
Latitude                   float64
Longitude                  float64
AddressAccuracy              int64
NumMosquitos                 int64
WnvPresent                   int64
MnVpresent_null            float64
dtype: object

In [11]:
# Encode each species as its integer category code in a new train column, Species_c
train['Species_c'] = train.Species.cat.codes
train.head(5)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,MnVpresent_null,Species_c
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,0.0524,2
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,0.0524,3
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0,0.0524,3
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,0,0.0524,2
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,0,0.0524,3


In [12]:
# One-hot encode the unordered species codes: one indicator column per code,
# named Species_<code>
train1 = pd.get_dummies(train['Species_c'], prefix='Species')
train1.head(5)

Unnamed: 0,Species_0,Species_1,Species_2,Species_3,Species_4,Species_5,Species_6
0,0,0,1,0,0,0,0
1,0,0,0,1,0,0,0
2,0,0,0,1,0,0,0
3,0,0,1,0,0,0,0
4,0,0,0,1,0,0,0


In [13]:
# Drop the first dummy column (Species_0) so it acts as the reference category.
# Dropping by name and rebinding train1, rather than dropping position 0 in place,
# keeps this cell safe to re-run: a second run no longer removes Species_1.
train1 = train1.drop('Species_0', axis=1, errors='ignore')
train1.head()

Unnamed: 0,Species_1,Species_2,Species_3,Species_4,Species_5,Species_6
0,0,1,0,0,0,0
1,0,0,1,0,0,0
2,0,0,1,0,0,0
3,0,1,0,0,0,0
4,0,0,1,0,0,0


In [14]:
# Concatenate the train dataframe and the species dummy dataframe column-wise
train2=pd.concat([train, train1], axis=1)
train2.head(3)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,MnVpresent_null,Species_c,Species_1,Species_2,Species_3,Species_4,Species_5,Species_6
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,0.0524,2,0,1,0,0,0,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,0.0524,3,0,0,1,0,0,0
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0,0.0524,3,0,0,1,0,0,0


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Predictors: the six species dummy columns; target: the WnvPresent label
feature_cols = [
    'Species_1', 'Species_2', 'Species_3',
    'Species_4', 'Species_5', 'Species_6',
]
x = train2[feature_cols]
y = train2['WnvPresent']

# Fit a default logistic regression on all rows, then predict those same rows
LR = LogisticRegression().fit(x, y)
pred = LR.predict(x)


In [16]:
# Show the per-class predicted probabilities for the first 20 rows
LR.predict_proba(x)[0:20]

array([[ 0.94491721,  0.05508279],
       [ 0.98178892,  0.01821108],
       [ 0.98178892,  0.01821108],
       [ 0.94491721,  0.05508279],
       [ 0.98178892,  0.01821108],
       [ 0.98178892,  0.01821108],
       [ 0.98178892,  0.01821108],
       [ 0.94491721,  0.05508279],
       [ 0.98178892,  0.01821108],
       [ 0.98178892,  0.01821108],
       [ 0.94491721,  0.05508279],
       [ 0.94491721,  0.05508279],
       [ 0.98178892,  0.01821108],
       [ 0.94491721,  0.05508279],
       [ 0.98178892,  0.01821108],
       [ 0.98178892,  0.01821108],
       [ 0.98178892,  0.01821108],
       [ 0.98178892,  0.01821108],
       [ 0.9113592 ,  0.0886408 ],
       [ 0.94491721,  0.05508279]])

In [17]:
# Evaluate on a held-out split: fit on the training part, score accuracy on the test part.
# random_state=99 matches the KNN split used later in this notebook, so the split is
# reproducible and both models are scored on the same rows.
# NOTE: this refits LR on x_train only, so later LR.predict_proba calls use this split-fitted model.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=99)
LR.fit(x_train, y_train)
y_pred = LR.predict(x_test)
LR.score(x_test, y_test)

0.94594594594594594

In [18]:
# Per-class probabilities for the first 20 rows again, now from LR after it was refit on the training split
LR.predict_proba(x)[0:20]

array([[ 0.94656001,  0.05343999],
       [ 0.98124449,  0.01875551],
       [ 0.98124449,  0.01875551],
       [ 0.94656001,  0.05343999],
       [ 0.98124449,  0.01875551],
       [ 0.98124449,  0.01875551],
       [ 0.98124449,  0.01875551],
       [ 0.94656001,  0.05343999],
       [ 0.98124449,  0.01875551],
       [ 0.98124449,  0.01875551],
       [ 0.94656001,  0.05343999],
       [ 0.94656001,  0.05343999],
       [ 0.98124449,  0.01875551],
       [ 0.94656001,  0.05343999],
       [ 0.98124449,  0.01875551],
       [ 0.98124449,  0.01875551],
       [ 0.98124449,  0.01875551],
       [ 0.98124449,  0.01875551],
       [ 0.91046804,  0.08953196],
       [ 0.94656001,  0.05343999]])

In [19]:
# Store column 1 of LR.predict_proba (used here as the WNV-present probability) in train2
train2['MnV_Species_pred_Prob']=LR.predict_proba(x)[:, 1]
train2.head(5)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,...,WnvPresent,MnVpresent_null,Species_c,Species_1,Species_2,Species_3,Species_4,Species_5,Species_6,MnV_Species_pred_Prob
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,...,0,0.0524,2,0,1,0,0,0,0,0.05344
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,...,0,0.0524,3,0,0,1,0,0,0,0.018756
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,...,0,0.0524,3,0,0,1,0,0,0,0.018756
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,...,0,0.0524,2,0,1,0,0,0,0,0.05344
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,...,0,0.0524,3,0,0,1,0,0,0,0.018756


In [20]:
# KNN with a train/test split to predict WnvPresent
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=99)

# 1-nearest-neighbour classifier fitted on the training part
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(x_train, y_train)

# Accuracy of the predicted classes on the held-out part
y_pred_class = knn.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred_class))

0.947849257708


In [21]:
# Refit the KNN model on the full data set (x, y)
knn.fit(x, y)
# Store the predicted class for every row
MnV_Knn_predict=knn.predict(x)
# Display the per-class predicted probabilities for every row
knn.predict_proba(x)

array([[ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       ..., 
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.]])

In [22]:
# Store column 1 of knn.predict_proba (used here as the WNV-present probability) in train2.
# Note: despite the column name, this holds probabilities, not the class labels kept in the MnV_Knn_predict variable.
train2['MnV_Knn_predict']=knn.predict_proba(x)[:, 1]
train2.head(5)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,...,MnVpresent_null,Species_c,Species_1,Species_2,Species_3,Species_4,Species_5,Species_6,MnV_Species_pred_Prob,MnV_Knn_predict
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,...,0.0524,2,0,1,0,0,0,0,0.05344,0.0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,...,0.0524,3,0,0,1,0,0,0,0.018756,0.0
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,...,0.0524,3,0,0,1,0,0,0,0.018756,0.0
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,...,0.0524,2,0,1,0,0,0,0,0.05344,0.0
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,...,0.0524,3,0,0,1,0,0,0,0.018756,0.0


In [23]:
# Per-species totals, so the summed predicted probabilities can be compared with the WnvPresent counts.
# numeric_only=True keeps text columns (Date, Address, ...) out of the sum.
train2.groupby('Species').sum(numeric_only=True)

Unnamed: 0_level_0,Block,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,MnVpresent_null,Species_c,Species_1,Species_2,Species_3,Species_4,Species_5,Species_6,MnV_Species_pred_Prob,MnV_Knn_predict
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
CULEX ERRATICUS,10,41.974689,-87.890615,9,7,0,0.0524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047523,0.0
CULEX PIPIENS,88527,112854.372337,-236649.617578,20040,44671,240,141.4276,2699.0,2699.0,0.0,0.0,0.0,0.0,0.0,241.64676,0.0
CULEX PIPIENS/RESTUANS,174429,198865.648503,-416775.307552,37729,66268,262,249.0048,9504.0,0.0,4752.0,0.0,0.0,0.0,0.0,253.946855,0.0
CULEX RESTUANS,100789,114693.922296,-240334.622989,21974,23431,49,143.576,8220.0,0.0,0.0,2740.0,0.0,0.0,0.0,51.390105,0.0
CULEX SALINARIUS,2806,3596.961565,-7541.6658,668,145,0,4.5064,344.0,0.0,0.0,0.0,86.0,0.0,0.0,1.489839,0.0
CULEX TARSALIS,183,250.908138,-525.969405,45,7,0,0.3144,30.0,0.0,0.0,0.0,0.0,6.0,0.0,0.254114,0.0
CULEX TERRITANS,8192,9279.22185,-19460.159333,1687,510,0,11.6328,1332.0,0.0,0.0,0.0,0.0,0.0,222.0,2.137249,0.0


In [24]:
# Preview the first five rows of the test set
test.head(5)

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
2,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
3,4,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
4,5,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9


In [25]:
# Cast train.Species to categorical (the same cast was already applied to train earlier in this notebook).
# NOTE(review): this cell follows a test.head() preview — possibly meant for `test`; confirm.
train['Species']=train['Species'].astype('category')
train.head()

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,MnVpresent_null,Species_c
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,0.0524,2
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,0.0524,3
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0,0.0524,3
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,0,0.0524,2
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,0,0.0524,3
