# Project 4: West Nile Virus Prediction
____________

#  Kaggle Predictions

This notebook predicts the presence of West Nile Virus (WNV) for the Kaggle test dataset.

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.preprocessing import StandardScaler,  PolynomialFeatures
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier

### Import data

In [2]:
# Import the cleaned weather data; it is merged onto the test data further down
weather = pd.read_csv('../assets/cleaned_weather.csv')

# Import the training feature matrix; in this notebook only its column names
# are used, to select the same features from the test data
X_train = pd.read_csv('../assets/X_train.csv')

# Import the cleaned Kaggle test data (this is the test set, not the train set)
test = pd.read_csv('../assets/cleaned_test.csv')

In [3]:
weather.head()

Unnamed: 0.1,Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,...,Sunset,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,day,month,year
0,0,1,2007-05-01,83,50,67,14,51,56.0,0,...,1849,0.0,29.1,29.82,1.7,27,9.2,1,5,2007
1,1,2,2007-05-01,84,52,68,14,51,57.0,0,...,1849,0.0,29.18,29.82,2.7,25,9.6,1,5,2007
2,2,1,2007-05-02,59,42,51,-3,42,47.0,14,...,1850,0.0,29.38,30.09,13.0,4,13.4,2,5,2007
3,3,2,2007-05-02,60,43,52,-3,42,47.0,13,...,1850,0.0,29.44,30.08,13.3,2,13.4,2,5,2007
4,4,1,2007-05-03,66,46,56,2,40,48.0,9,...,1851,0.0,29.39,30.12,11.7,7,11.9,3,5,2007


In [4]:
# Import Kaggle Test data
# NOTE(review): this re-reads the same file that the first data-loading cell
# already loaded into `test`, so one of the two loads is redundant.
test = pd.read_csv('../assets/cleaned_test.csv')

In [5]:
test.head()

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,day,month,year
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,11,6,2008
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,11,6,2008
2,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,11,6,2008
3,4,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,11,6,2008
4,5,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,11,6,2008


In [6]:
test.shape

(116293, 14)

In [7]:
# Parse the `Date` column of both frames into pandas datetimes; `Date` is one
# of the keys used later to merge the weather data onto the test data
for frame in (test, weather):
    frame['Date'] = pd.to_datetime(frame['Date'])

In [8]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116293 entries, 0 to 116292
Data columns (total 14 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   Id                      116293 non-null  int64         
 1   Date                    116293 non-null  datetime64[ns]
 2   Address                 116293 non-null  object        
 3   Species                 116293 non-null  object        
 4   Block                   116293 non-null  int64         
 5   Street                  116293 non-null  object        
 6   Trap                    116293 non-null  object        
 7   AddressNumberAndStreet  116293 non-null  object        
 8   Latitude                116293 non-null  float64       
 9   Longitude               116293 non-null  float64       
 10  AddressAccuracy         116293 non-null  int64         
 11  day                     116293 non-null  int64         
 12  month                   116293

We see that the test data is complete with no missing/null values. The data types are also correct.

### Check for duplicates

In [9]:
test[test.duplicated()]

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,day,month,year


Test data is clean with no duplicates.

## Transformation of Test data

The test data we have contains only the mosquito trap information. We want to add the weather data and transform the features to match those of the train data, so that we can pass it to our production model for predictions.

In [10]:
# Location is represented by latitude and longitude, so the address-based
# location columns are not needed
location_columns = ['Address', 'AddressNumberAndStreet', 'AddressAccuracy']
test = test.drop(columns=location_columns)

In [11]:
# Encode the mosquito species as a number:
#   2 -> CULEX PIPIENS/RESTUANS and CULEX PIPIENS
#   1 -> CULEX RESTUANS
#   0 -> every other species (no entry in the mapping)
species_codes = {'CULEX PIPIENS/RESTUANS': 2, 'CULEX PIPIENS': 2, 'CULEX RESTUANS': 1}

# Assign the filled result back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form can operate on a temporary copy
# and leave the NaNs in `test` (pandas warns about it under copy-on-write).
test['species_no'] = test['Species'].map(species_codes).fillna(0)

# `Species` is intentionally kept: it is one of the group-by keys used when
# `test_final` is built.

In [12]:
# Check Species after update
test.head()

Unnamed: 0,Id,Date,Species,Block,Street,Trap,Latitude,Longitude,day,month,year,species_no
0,1,2008-06-11,CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,41.95469,-87.800991,11,6,2008,2.0
1,2,2008-06-11,CULEX RESTUANS,41,N OAK PARK AVE,T002,41.95469,-87.800991,11,6,2008,1.0
2,3,2008-06-11,CULEX PIPIENS,41,N OAK PARK AVE,T002,41.95469,-87.800991,11,6,2008,2.0
3,4,2008-06-11,CULEX SALINARIUS,41,N OAK PARK AVE,T002,41.95469,-87.800991,11,6,2008,0.0
4,5,2008-06-11,CULEX TERRITANS,41,N OAK PARK AVE,T002,41.95469,-87.800991,11,6,2008,0.0


In [13]:
# Create a new dataset with the columns arranged in the order of the group keys.
# The non-key numeric column (`Block`) is summed per group and appended last.
# `numeric_only=True` keeps the text column `Street` out of the result on every
# pandas version (newer versions would otherwise "sum" the strings), and the
# former trailing `.reindex()` was dropped because, called without arguments,
# it changed nothing (`as_index=False` already yields a fresh integer index).
group_keys = ['Id', 'Date', 'day', 'month', 'year', 'Trap', 'Latitude', 'Longitude',
              'Species', 'species_no']
test_final = test.groupby(group_keys, as_index=False).sum(numeric_only=True)

In [14]:
test_final.head()

Unnamed: 0,Id,Date,day,month,year,Trap,Latitude,Longitude,Species,species_no,Block
0,1,2008-06-11,11,6,2008,T002,41.95469,-87.800991,CULEX PIPIENS/RESTUANS,2.0,41
1,2,2008-06-11,11,6,2008,T002,41.95469,-87.800991,CULEX RESTUANS,1.0,41
2,3,2008-06-11,11,6,2008,T002,41.95469,-87.800991,CULEX PIPIENS,2.0,41
3,4,2008-06-11,11,6,2008,T002,41.95469,-87.800991,CULEX SALINARIUS,0.0,41
4,5,2008-06-11,11,6,2008,T002,41.95469,-87.800991,CULEX TERRITANS,0.0,41


In [15]:
# Add in column to indicate the weather station for each trap in test data
import geopy.distance

station1 = (41.995, -87.933) #Station 1: CHICAGO O'HARE INTERNATIONAL AIRPORT
station2 = (41.786, -87.752) #Station 2: CHICAGO MIDWAY INTL ARPT
def get_station(lat, long):
    """Return the number (1 or 2) of the weather station closer to a point.

    Parameters
    ----------
    lat, long : float
        Latitude and longitude of the point, in decimal degrees.

    Returns
    -------
    int
        1 if the point is strictly closer to `station1`, otherwise 2
        (a tie therefore goes to station 2).
    """
    dist1 = geopy.distance.distance((lat, long), station1).km
    dist2 = geopy.distance.distance((lat, long), station2).km
    return 1 if dist1 < dist2 else 2

# Many rows share the same trap coordinates, so resolve each distinct
# (latitude, longitude) pair once and look the answer up per row, instead of
# doing two distance computations and two `.loc` lookups for every row.
trap_coords = list(zip(test_final['Latitude'], test_final['Longitude']))
station_by_coord = {coord: get_station(*coord) for coord in set(trap_coords)}
test_final['Station'] = [station_by_coord[coord] for coord in trap_coords]

Combining the test dataset with the weather data

In [16]:
# Merge weather data with test data (left join keeps every test row).
# Each test row must match at most one weather row for its (Date, Station).
# validate='many_to_one' makes pandas raise a MergeError if the weather keys
# are duplicated, instead of silently adding rows and breaking the one-to-one
# correspondence between test rows and predictions.
combined_test = pd.merge(test_final, weather, on=['Date','Station'], how='left',
                         validate='many_to_one')

In [17]:
# Add column for trap_sprayed to be zero since we do not have data of spraying effort in the years of test data
#test['trap_sprayed'] = 0

In [18]:
# Remove the weather file's leftover index column and the weather-side copies
# of day/month/year (merge suffix `_y`), then strip the `_x` suffix from the
# test-side copies
weather_side_columns = ['Unnamed: 0', 'day_y', 'month_y', 'year_y']
suffix_free_names = {'day_x': 'day', 'month_x': 'month', 'year_x': 'year'}
combined_test = (
    combined_test
    .drop(columns=weather_side_columns)
    .rename(columns=suffix_free_names)
)

In [19]:
# Check data after update
combined_test.columns

Index(['Id', 'Date', 'day', 'month', 'year', 'Trap', 'Latitude', 'Longitude',
       'Species', 'species_no', 'Block', 'Station', 'Tmax', 'Tmin', 'Tavg',
       'Depart', 'DewPoint', 'WetBulb', 'Heat', 'Cool', 'Sunrise', 'Sunset',
       'PrecipTotal', 'StnPressure', 'SeaLevel', 'ResultSpeed', 'ResultDir',
       'AvgSpeed'],
      dtype='object')

In [20]:
combined_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 116293 entries, 0 to 116292
Data columns (total 28 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Id           116293 non-null  int64         
 1   Date         116293 non-null  datetime64[ns]
 2   day          116293 non-null  int64         
 3   month        116293 non-null  int64         
 4   year         116293 non-null  int64         
 5   Trap         116293 non-null  object        
 6   Latitude     116293 non-null  float64       
 7   Longitude    116293 non-null  float64       
 8   Species      116293 non-null  object        
 9   species_no   116293 non-null  float64       
 10  Block        116293 non-null  int64         
 11  Station      116293 non-null  int64         
 12  Tmax         116293 non-null  int64         
 13  Tmin         116293 non-null  int64         
 14  Tavg         116293 non-null  int64         
 15  Depart       116293 non-null  int6

### Feature Engineering

In [21]:
# Lowercase every column name so they follow the lowercase naming of the
# training feature columns
combined_test.columns = combined_test.columns.str.lower()

In [22]:
# Collapse latitude and longitude into a single `long_lat` feature (their
# product) and discard the two source columns
long_lat_product = combined_test['latitude'] * combined_test['longitude']
combined_test = (
    combined_test
    .assign(long_lat=long_lat_product)
    .drop(columns=['latitude', 'longitude'])
)

In [23]:
combined_test.head()

Unnamed: 0,id,date,day,month,year,trap,species,species_no,block,station,...,cool,sunrise,sunset,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed,long_lat
0,1,2008-06-11,11,6,2008,T002,CULEX PIPIENS/RESTUANS,2.0,41,1,...,9,416,1926,0.0,29.28,29.99,8.9,18,10.0,-3683.663359
1,2,2008-06-11,11,6,2008,T002,CULEX RESTUANS,1.0,41,1,...,9,416,1926,0.0,29.28,29.99,8.9,18,10.0,-3683.663359
2,3,2008-06-11,11,6,2008,T002,CULEX PIPIENS,2.0,41,1,...,9,416,1926,0.0,29.28,29.99,8.9,18,10.0,-3683.663359
3,4,2008-06-11,11,6,2008,T002,CULEX SALINARIUS,0.0,41,1,...,9,416,1926,0.0,29.28,29.99,8.9,18,10.0,-3683.663359
4,5,2008-06-11,11,6,2008,T002,CULEX TERRITANS,0.0,41,1,...,9,416,1926,0.0,29.28,29.99,8.9,18,10.0,-3683.663359


In [24]:
# One-hot encode the traps: the `trap` column is replaced by one indicator
# column per trap ID (named `trap_<ID>`), the naming used by the trap features
# in the training feature list
combined_test = pd.get_dummies(data=combined_test, columns=['trap'])

In [25]:
# The feature names (and their order) used for the test data are taken from
# the columns of the training feature matrix
features = X_train.columns
features

Index(['species_no', 'month', 'sunrise', 'dewpoint', 'wetbulb', 'trap_T900',
       'tmin', 'tavg', 'cool', 'station', 'depart', 'sunset', 'tmax', 'heat',
       'resultspeed', 'long_lat', 'year', 'avgspeed', 'trap_T003', 'trap_T086',
       'trap_T225', 'trap_T143', 'trap_T115', 'trap_T002', 'trap_T223',
       'trap_T046', 'trap_T006', 'trap_T233', 'trap_T013', 'trap_T235',
       'trap_T017', 'trap_T014', 'trap_T230', 'trap_T148', 'trap_T200',
       'trap_T018', 'trap_T043', 'trap_T103', 'trap_T015', 'trap_T096'],
      dtype='object')

In [26]:
# Select the training feature columns, in training order, from the test data.
# A trap that is in the training features but absent from the test data has no
# dummy column here; such a column would be all zeros anyway, so it is added as
# 0 rather than failing with a KeyError. Any other missing feature is a real
# error and is still raised.
missing_features = [col for col in features if col not in combined_test.columns]
unexpected_missing = [col for col in missing_features if not col.startswith('trap_')]
if unexpected_missing:
    raise KeyError(f"Features missing from test data: {unexpected_missing}")
X_test = combined_test.reindex(columns=features, fill_value=0)
X_test.head()

Unnamed: 0,species_no,month,sunrise,dewpoint,wetbulb,trap_T900,tmin,tavg,cool,station,...,trap_T017,trap_T014,trap_T230,trap_T148,trap_T200,trap_T018,trap_T043,trap_T103,trap_T015,trap_T096
0,2.0,6,416,56,64.0,0,61,74,9,1,...,0,0,0,0,0,0,0,0,0,0
1,1.0,6,416,56,64.0,0,61,74,9,1,...,0,0,0,0,0,0,0,0,0,0
2,2.0,6,416,56,64.0,0,61,74,9,1,...,0,0,0,0,0,0,0,0,0,0
3,0.0,6,416,56,64.0,0,61,74,9,1,...,0,0,0,0,0,0,0,0,0,0
4,0.0,6,416,56,64.0,0,61,74,9,1,...,0,0,0,0,0,0,0,0,0,0


## Predictions

Now that the test data is ready, we can load in our production model and use it for predictions.

In [27]:
# Load in production model and scaler
def load_pickle(path):
    """Unpickle and return the object stored at `path`.

    The file is opened in a `with` block so the handle is closed as soon as the
    object has been read (the bare `open(...)` form never closed it).
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as fh:
        return pickle.load(fh)

poly = load_pickle('polyfeat.pkl')
ss = load_pickle('scaler.pkl')
model = load_pickle('production_model.pkl')

print(ss)
print(model)

StandardScaler()
AdaBoostClassifier(learning_rate=1, n_estimators=20, random_state=42)


In [28]:
# Expand the features with the polynomial transformer loaded from
# 'polyfeat.pkl' (original note: default degree=2 — the degree is not shown
# anywhere in this notebook; TODO confirm).
# NOTE(review): `X_test` is overwritten with its own transform, so re-running
# this cell on its own would feed the already-expanded data back into `poly`.

X_test = poly.transform(X_test)

In [29]:
# Scale test data with the scaler loaded from 'scaler.pkl'; only `transform`
# is called, so nothing is fitted on the test data
X_test_sc = ss.transform(X_test)

In [30]:
# Hard class predictions from the production model, shown for inspection only:
# the submission built below uses the predicted probabilities, not these labels
predictions = model.predict(X_test_sc)
predictions

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [35]:
# Keep the second column of predict_proba (index 1) for every test row
class_probabilities = model.predict_proba(X_test_sc)
pred_proba_t = list(class_probabilities[:, 1])

In [36]:
# Set up dataframe to save as .csv for submission.
# The Ids are taken from `combined_test` — the frame the model features were
# built from — rather than from the raw `test` frame, so each probability is
# paired with the Id of the row it was predicted for even if the group-by and
# merge steps changed the row order. Passing plain arrays also makes pandas
# raise if the number of Ids and predictions ever differ.
submission = pd.DataFrame({
    'Id': combined_test['id'].to_numpy(),
    # Add in the predictions
    'WnvPresent': pred_proba_t,
})

In [37]:
submission.head(10)

Unnamed: 0,Id,WnvPresent
0,1,0.39823
1,2,0.367207
2,3,0.39823
3,4,0.367207
4,5,0.367207
5,6,0.367207
6,7,0.367207
7,8,0.367207
8,9,0.39823
9,10,0.367207


In [38]:
submission['WnvPresent'].value_counts()

0.376943    13253
0.367207     7133
0.412900     3725
0.420645     3500
0.393706     3387
            ...  
0.505588        1
0.453388        1
0.510419        1
0.509474        1
0.479950        1
Name: WnvPresent, Length: 442, dtype: int64

#### Save predictions as `.csv` and submit

In [39]:
# Save csv: write the `Id` and `WnvPresent` columns only (index=False leaves
# the DataFrame index out of the file)
submission.to_csv('../assets/submission_predictions.csv', index=False)

![Submission](../images/kaggle_submission.png)