In [1]:
#### PACKAGE AND DATA IMPORTS
##########################
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
train_df = pd.read_csv('./kaggle_data/train.csv')
weather_df = pd.read_csv('./kaggle_data/weather.csv')
spray_df = pd.read_csv('./kaggle_data/spray.csv')

In [3]:
###  EDA
##########################
train_df.head()

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,0
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,0


In [4]:
weather_df.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,...,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,...,,0,M,0.0,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,M,51,57,0,3,...,,M,M,M,0.0,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,51,-3,42,47,14,0,...,BR,0,M,0.0,0.0,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,52,M,42,47,13,0,...,BR HZ,M,M,M,0.0,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56,2,40,48,9,0,...,,0,M,0.0,0.0,29.39,30.12,11.7,7,11.9


In [6]:
#sns.heatmap(train_df.corr(), annot=True)

Unsurprisingly, the likelihood of WNV increases as the number of mosquitos increase, but NumMosquitos is not a feature in the test set

In [7]:
# print(train_df.shape)
# train_df.isnull().sum()

In [8]:
# print(spray_df.shape)
# spray_df.isnull().sum()
                 ## We will need to deal with these time nulls, but it may make sense to drop the time column
                  # since the other dfs dont have time

In [9]:
# print(spray_df.shape)
# weather_df.isnull().sum()

In [10]:
 # dropping time from spray data because it is not in any other dfs
spray_df.drop('Time', axis = 1, inplace = True)

In [12]:
# creating weather data df using only one station
daily_weather = weather_df[weather_df['Station'] == 1] 

In [13]:
# dropping station label since all are station 1
daily_weather.drop('Station', axis= 1, inplace=True) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [14]:
 # datetime index on weather data
daily_weather.reset_index(inplace=True, drop=True) 

daily_weather['Date'] = pd.to_datetime(daily_weather['Date']) 

daily_weather.set_index('Date',inplace=True, drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [15]:
#datetime index on training data
train_df['Date'] = pd.to_datetime(train_df['Date'])

train_df.set_index('Date', inplace=True, drop=True)

In [16]:
# new_df = combined weather and train data
new_df = pd.merge(train_df, 
                  daily_weather, 
                  left_on = train_df.index, right_on = daily_weather.index)

In [18]:
#assiging date as index of new_df 
new_df['key_0'] = pd.to_datetime(new_df['key_0'])

new_df.set_index('key_0', inplace=True, drop=True)

new_df.index.rename('Date', inplace=True) 

In [19]:
# dropping these columns since they provided no info
new_df.drop(['Water1','SnowFall'], axis=1, inplace=True)

In [20]:
# storing mode precip value for replacement in next step
mode_precip = float(new_df[new_df['PrecipTotal'] != '  T'].StnPressure.mode()[0]) 

In [21]:
# replacing '  T' in PrecipTotal column with mode precip value
precip_totals = []
for total in new_df.PrecipTotal:
    if total == '  T':
        precip_totals.append(mode_precip)
    else:
        precip_totals.append(total)

new_df.PrecipTotal = pd.to_numeric(precip_totals) 

In [22]:
# storing mode pressure for replacement in next step
mode_pressure = new_df[new_df['StnPressure'] != 'M'].StnPressure.mode() 

In [23]:
# replacing 'M' in StnPressure column with mode pressure value
pressures = []
for pressure in new_df.StnPressure:
    if pressure == 'M':
        pressures.append(mode_pressure)
    else:
        pressures.append(pressure)
pressures = [float(pressure) for pressure in pressures]

new_df.StnPressure = pd.to_numeric(pressures) 

In [24]:
pressures = [float(pressure) for pressure in pressures] # converting strings to floats

In [25]:
#new_df.dtypes
cols_to_change = ['Tavg',
                 'Depart',
                 'Cool',
                 'Sunrise',
                 'Sunset',
                 'Depth',
                 'PrecipTotal',
                 'StnPressure',
                 'SeaLevel',
                 'AvgSpeed'
                 ] # columns of type object that can be coerced to numeric values

In [26]:
for col in cols_to_change:
    new_df[col] = pd.to_numeric(new_df[col])#changing columns above to numeric

In [27]:
new_df

Unnamed: 0_level_0,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,...,Sunrise,Sunset,CodeSum,Depth,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1,...,421,1917,BR HZ,0,0.00,29.39,30.11,5.8,18,6.5
2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1,...,421,1917,BR HZ,0,0.00,29.39,30.11,5.8,18,6.5
2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,...,421,1917,BR HZ,0,0.00,29.39,30.11,5.8,18,6.5
2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,...,421,1917,BR HZ,0,0.00,29.39,30.11,5.8,18,6.5
2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,...,421,1917,BR HZ,0,0.00,29.39,30.11,5.8,18,6.5
2007-05-29,"1500 West Webster Avenue, Chicago, IL 60614, USA",CULEX RESTUANS,15,W WEBSTER AVE,T045,"1500 W WEBSTER AVE, Chicago, IL",41.921600,-87.666455,8,2,...,421,1917,BR HZ,0,0.00,29.39,30.11,5.8,18,6.5
2007-05-29,"2500 West Grand Avenue, Chicago, IL 60654, USA",CULEX RESTUANS,25,W GRAND AVE,T046,"2500 W GRAND AVE, Chicago, IL",41.891118,-87.654491,8,1,...,421,1917,BR HZ,0,0.00,29.39,30.11,5.8,18,6.5
2007-05-29,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX PIPIENS/RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,8,1,...,421,1917,BR HZ,0,0.00,29.39,30.11,5.8,18,6.5
2007-05-29,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,8,2,...,421,1917,BR HZ,0,0.00,29.39,30.11,5.8,18,6.5
2007-05-29,"1100 West Chicago Avenue, Chicago, IL 60642, USA",CULEX RESTUANS,11,W CHICAGO,T049,"1100 W CHICAGO, Chicago, IL",41.896282,-87.655232,8,1,...,421,1917,BR HZ,0,0.00,29.39,30.11,5.8,18,6.5


In [28]:
new_df

Unnamed: 0_level_0,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,...,Sunrise,Sunset,CodeSum,Depth,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1,...,421,1917,BR HZ,0,0.00,29.39,30.11,5.8,18,6.5
2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1,...,421,1917,BR HZ,0,0.00,29.39,30.11,5.8,18,6.5
2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,...,421,1917,BR HZ,0,0.00,29.39,30.11,5.8,18,6.5
2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,...,421,1917,BR HZ,0,0.00,29.39,30.11,5.8,18,6.5
2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,...,421,1917,BR HZ,0,0.00,29.39,30.11,5.8,18,6.5
2007-05-29,"1500 West Webster Avenue, Chicago, IL 60614, USA",CULEX RESTUANS,15,W WEBSTER AVE,T045,"1500 W WEBSTER AVE, Chicago, IL",41.921600,-87.666455,8,2,...,421,1917,BR HZ,0,0.00,29.39,30.11,5.8,18,6.5
2007-05-29,"2500 West Grand Avenue, Chicago, IL 60654, USA",CULEX RESTUANS,25,W GRAND AVE,T046,"2500 W GRAND AVE, Chicago, IL",41.891118,-87.654491,8,1,...,421,1917,BR HZ,0,0.00,29.39,30.11,5.8,18,6.5
2007-05-29,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX PIPIENS/RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,8,1,...,421,1917,BR HZ,0,0.00,29.39,30.11,5.8,18,6.5
2007-05-29,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,8,2,...,421,1917,BR HZ,0,0.00,29.39,30.11,5.8,18,6.5
2007-05-29,"1100 West Chicago Avenue, Chicago, IL 60642, USA",CULEX RESTUANS,11,W CHICAGO,T049,"1100 W CHICAGO, Chicago, IL",41.896282,-87.655232,8,1,...,421,1917,BR HZ,0,0.00,29.39,30.11,5.8,18,6.5


In [30]:
# make new_df lat / long a Point obj
# drop lat/long
#make spray_df lat/long a Point obj
#drop lat/long
#combine spray_df with new_df
new_df.to_csv('./train-weather_df')