# Project Steps:
#### 1.Import data and python functions
#### 2.Run null model: calculate the percetage(%) of WMV
#### 3.Pick 1 features and use Random Forest/KNN/Logistic regression to get the model
#### 4. Creat confusion matrix: Recall, Precision, Accuracy
#### 5. pick other features and run models again

### Part1: Import data and Python functions

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore') #help ignore warning, make it more clean
pd.set_option('display.max_columns', None)


# import train data
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
spray=pd.read_csv('spray.csv')

In [2]:
train.head(2)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0


In [3]:
print 'Train Table - Num of Rows: '+str(train.shape[0])
print 'Train Table - Num of Columns: ' + str(train.shape[1])
print 'Test Table - Num of Rows: ' + str(test.shape[0])
print 'Test Table - Num of Columns: ' + str(test.shape[0])
print 'Spray Table - Num of Rows: '+str(spray.shape[0])
print 'Spray Table - Num of Columns: ' + str(spray.shape[1])

Train Table - Num of Rows: 10506
Train Table - Num of Columns: 12
Test Table - Num of Rows: 116293
Test Table - Num of Columns: 116293
Spray Table - Num of Rows: 14835
Spray Table - Num of Columns: 4


In [4]:
train.columns

Index([u'Date', u'Address', u'Species', u'Block', u'Street', u'Trap',
       u'AddressNumberAndStreet', u'Latitude', u'Longitude',
       u'AddressAccuracy', u'NumMosquitos', u'WnvPresent'],
      dtype='object')

In [5]:
train.dtypes

Date                       object
Address                    object
Species                    object
Block                       int64
Street                     object
Trap                       object
AddressNumberAndStreet     object
Latitude                  float64
Longitude                 float64
AddressAccuracy             int64
NumMosquitos                int64
WnvPresent                  int64
dtype: object

In [6]:
spray.head(2)

Unnamed: 0,Date,Time,Latitude,Longitude
0,2011-08-29,6:56:58 PM,42.391623,-88.089163
1,2011-08-29,6:57:08 PM,42.391348,-88.089163


In [7]:
test.head(2)

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9


### Part 2 Run null model: calculate the percetage(%) of WMV


In [8]:
# check the null value
train.WnvPresent.isnull().sum()


0

In [9]:
# run the null model
train_null= train.WnvPresent.mean()
train_null

0.05244622120692937

In [10]:
# add the prediction value to train datafram as MnVpresent_null
train['MnVpresent_null']=train.WnvPresent.map({0:0.0524, 1:0.0524})

In [11]:
test.Species.value_counts()

CULEX PIPIENS/RESTUANS    15359
CULEX RESTUANS            14670
CULEX PIPIENS             14521
CULEX SALINARIUS          14355
CULEX TERRITANS           14351
CULEX TARSALIS            14347
CULEX ERRATICUS           14345
UNSPECIFIED CULEX         14345
Name: Species, dtype: int64

### Part 3: Pick 1 Features to Run Model 

#### Pick the feature: Species--the species of mosquitos

In [12]:
train.groupby('Species').sum() #there are 7 types of Mosquito, three types of them have virus

Unnamed: 0_level_0,Block,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,MnVpresent_null
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CULEX ERRATICUS,10,41.974689,-87.890615,9,7,0,0.0524
CULEX PIPIENS,88527,112854.372337,-236649.617578,20040,44671,240,141.4276
CULEX PIPIENS/RESTUANS,174429,198865.648503,-416775.307552,37729,66268,262,249.0048
CULEX RESTUANS,100789,114693.922296,-240334.622989,21974,23431,49,143.576
CULEX SALINARIUS,2806,3596.961565,-7541.6658,668,145,0,4.5064
CULEX TARSALIS,183,250.908138,-525.969405,45,7,0,0.3144
CULEX TERRITANS,8192,9279.22185,-19460.159333,1687,510,0,11.6328


In [13]:
# convert the string value of species to categorical variables 
train['Species']=train['Species'].astype('category')
train.dtypes

Date                        object
Address                     object
Species                   category
Block                        int64
Street                      object
Trap                        object
AddressNumberAndStreet      object
Latitude                   float64
Longitude                  float64
AddressAccuracy              int64
NumMosquitos                 int64
WnvPresent                   int64
MnVpresent_null            float64
dtype: object

In [14]:
# code the species to categorical numbers and add the new variable into train dataset
train['Species_c']=train['Species'].cat.codes
train.head(5)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,MnVpresent_null,Species_c
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,0.0524,2
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,0.0524,3
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0,0.0524,3
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,0,0.0524,2
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,0,0.0524,3


### Run Supervised Learning Models 

### Model Training

### Train-Test Split

In [19]:
# Scale the data
feature_cols=['Species_c']

X=train[feature_cols]
y=train.WnvPresent


from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X=scaler.fit_transform(X)

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
LR=LogisticRegression()
LR.fit(X, y)
LR.predict_proba(X)

array([[ 0.95182385,  0.04817615],
       [ 0.97640456,  0.02359544],
       [ 0.97640456,  0.02359544],
       ..., 
       [ 0.95182385,  0.04817615],
       [ 0.95182385,  0.04817615],
       [ 0.95182385,  0.04817615]])

### K-Fold Cross-validation/Logistic Regression/KNN

In [23]:
from sklearn.cross_validation import KFold

#This program does 5-fold. It saves the result at each time as different parts of y_pred. 
#In the end, it returns the y_pred as the result of all the five 5-fold.
def run_cv(X, y, clf_class, **kwargs):
    #construct a kfolds object
    kf=KFold(len(y), n_folds=5, shuffle=True)
    y_pred=y.copy()
    clf=clf_class(**kwargs)
    #Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test=X[train_index], X[test_index]
        y_train=y[train_index]
        clf.fit(X_train, y_train)
        y_pred[test_index]=clf.predict(X_test)
    return y_pred
        



In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

def accuracy(y_true, y_pred):
    return np.mean(y_true==y_pred) # numpy interprets True and False as 1 and 0

LR_CV_result=run_cv(X, y, LogisticRegression)
RF_CV_result=run_cv(X, y, RandomForestClassifier)
KNN_CV_result=run_cv(X, y, KNeighborsClassifier) # default: N-neighbors=5

In [25]:
#print accuracy for each model
print "Logistic Regression(L2 is default): " + str(accuracy(y, LR_CV_result))
print "Random Forest: " + str(accuracy(y, RF_CV_result))
print "K-nearest-neighbors: " + str(accuracy(y, KNN_CV_result))

Logistic Regression(L2 is default): 0.947553778793
Random Forest: 0.947553778793
K-nearest-neighbors: 0.947553778793


### Find optimal Parameters-KNN

In [26]:
def print_grid_search_metrics(gs):
    print "Best Score: %0.3f" % gs.best_score_
    print "Best parameters set: "
    best_parameters=gs.best_params_
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [27]:
from sklearn.grid_search import GridSearchCV
parameters={
    'n_neighbors': [3, 5, 7, 10]
}
Grid_KNN=GridSearchCV(KNeighborsClassifier(), parameters, cv=5, verbose=1, refit=False)
Grid_KNN.fit(X, y)



Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    1.9s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [3, 5, 7, 10]}, pre_dispatch='2*n_jobs',
       refit=False, scoring=None, verbose=1)

In [28]:
print_grid_search_metrics(Grid_KNN)

Best Score: 0.948
Best parameters set: 
	n_neighbors: 3


In [29]:
from sklearn.cross_validation import cross_val_score
score=cross_val_score(KNeighborsClassifier(n_neighbors=5), X,y,cv=5)
print "5-fold cross validation accuracy: " + str(np.mean(score))

5-fold cross validation accuracy: 0.947553813122


### Part 4: Calculate Confusion Matrix

In [30]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

def cal_evaluation(classifier, cm):
    tn=cm[0][0]
    fp=cm[0][1]
    fn=cm[1][0]
    tp=cm[1][1]
    accuracy=(tp+tn)/(tp+fp+fn+tn+0.0)
    precision=tp(tp+fp+0.0)
    recall=tp/(tp+fn+0.0)
    print classifier
    print "Accuracy is " + str(accuracy)
    print "Precision is " + str(precision)
    print "Recall is " + str(recall)
    
def show_confusion_matrices(confusion_matrices, class_names):
    class_names=['No','Yes']
    for cm in confusion_matrices:
        classifier, cm=cm[0], cm[1]
        cal_evaluation(classifier, cm)
        fig=plt.figure()
        ax=fig.add_subplot(111)
        cax=ax.matshow(cm, interpolation='nearest', cmap=plt.get_cmap('Reds'))
        plt.title('Confusion matrix for %s' % classifier)
        fig.colorbar(cax)
        ax.set_xticklabels(['']+class_names)
        ax.set_yticklabels(['']+class_names)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.show()

In [31]:
%matplotlib inline
y=np.array(y)
class_names=np.unique(y)
print class_names

confusion_matrices=[
    ("Random Forest", confusion_matrix(y, RF_CV_result)),
    ("K-Nearest-Neighbors", confusion_matrix(y, KNN_CV_result))
]

show_confusion_matrices(confusion_matrices, class_names)

[0 1]


TypeError: 'numpy.int64' object is not callable

### Part 5: Test other Features

### pick date (month) as one feature

In [32]:
# define a function to extract month from the date
def month_extractor(date):
    date=str(date)
    month=date[5:7]
    return int(month)

month_extractor("2007-12-23")

12

In [33]:
# create new column with month with the month_extractor function
train['Month']=train['Date'].map(lambda x: month_extractor(x))
train.head()

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,MnVpresent_null,Species_c,Month
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,0.0524,2,5
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,0.0524,3,5
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0,0.0524,3,5
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,0,0.0524,2,5
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,0,0.0524,3,5


In [34]:
train_month=train.groupby('Month').WnvPresent.sum().sort_values(ascending=False)
train_month.head()

Month
8     377
9     125
7      46
10      2
6       1
Name: WnvPresent, dtype: int64

### Model Training

In [40]:
# Scale the data
feature_cols=['Month', 'Species_c']

X=train[feature_cols]
y=train.WnvPresent


from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X=scaler.fit_transform(X)

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
LR=LogisticRegression()
LR.fit(X, y)
LR.predict_proba(X)[0:20]

array([[ 0.98074475,  0.01925525],
       [ 0.98944904,  0.01055096],
       [ 0.98944904,  0.01055096],
       [ 0.98074475,  0.01925525],
       [ 0.98944904,  0.01055096],
       [ 0.98944904,  0.01055096],
       [ 0.98944904,  0.01055096],
       [ 0.98074475,  0.01925525],
       [ 0.98944904,  0.01055096],
       [ 0.98944904,  0.01055096],
       [ 0.98074475,  0.01925525],
       [ 0.98074475,  0.01925525],
       [ 0.98944904,  0.01055096],
       [ 0.98074475,  0.01925525],
       [ 0.98944904,  0.01055096],
       [ 0.98944904,  0.01055096],
       [ 0.98944904,  0.01055096],
       [ 0.98944904,  0.01055096],
       [ 0.96511282,  0.03488718],
       [ 0.98074475,  0.01925525]])

In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

def accuracy(y_true, y_pred):
    return np.mean(y_true==y_pred) # numpy interprets True and False as 1 and 0

LR_CV_result=run_cv(X, y, LogisticRegression)
RF_CV_result=run_cv(X, y, RandomForestClassifier)
KNN_CV_result=run_cv(X, y, KNeighborsClassifier) # default: N-neighbors=5

In [43]:
#print accuracy for each model
print "Logistic Regression(L2 is default): " + str(accuracy(y, LR_CV_result))
print "Random Forest: " + str(accuracy(y, RF_CV_result))
print "K-nearest-neighbors: " + str(accuracy(y, KNN_CV_result))

Logistic Regression(L2 is default): 0.947553778793
Random Forest: 0.947553778793
K-nearest-neighbors: 0.947553778793
