# Data Cleaning

## Imports

In [226]:
import numpy as np
import pandas as pd

from datetime import date
from datetime import datetime


## Training Data

In [277]:
train = pd.read_csv('../data/train.csv')

In [278]:
train.columns = np.array([col.lower() for col in train.columns])

In [279]:
train['date'] = pd.to_datetime(train['date'])

In [282]:
train.drop_duplicates()

Unnamed: 0,date,address,species,block,street,trap,addressnumberandstreet,latitude,longitude,addressaccuracy,nummosquitos,wnvpresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9,1,0
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,0
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...
10501,2013-09-26,"5100 West 72nd Street, Chicago, IL 60638, USA",CULEX PIPIENS/RESTUANS,51,W 72ND ST,T035,"5100 W 72ND ST, Chicago, IL",41.763733,-87.742302,8,6,1
10502,2013-09-26,"5800 North Ridge Avenue, Chicago, IL 60660, USA",CULEX PIPIENS/RESTUANS,58,N RIDGE AVE,T231,"5800 N RIDGE AVE, Chicago, IL",41.987280,-87.666066,8,5,0
10503,2013-09-26,"1700 North Ashland Avenue, Chicago, IL 60622, USA",CULEX PIPIENS/RESTUANS,17,N ASHLAND AVE,T232,"1700 N ASHLAND AVE, Chicago, IL",41.912563,-87.668055,9,1,0
10504,2013-09-26,"7100 North Harlem Avenue, Chicago, IL 60631, USA",CULEX PIPIENS/RESTUANS,71,N HARLEM AVE,T233,"7100 N HARLEM AVE, Chicago, IL",42.009876,-87.807277,9,5,0


In [271]:
# identifying duplicate rows given that all other column data apart from species is duplicated
train[train.duplicated(subset=['date', 'address', 'block', 'street', 'trap',
       'addressnumberandstreet', 'latitude', 'longitude', 'addressaccuracy',
       'nummosquitos', 'wnvpresent']) == 1].head()

Unnamed: 0,date,address,species,block,street,trap,addressnumberandstreet,latitude,longitude,addressaccuracy,nummosquitos,wnvpresent


In [266]:

train['year'] = train['date'].apply(lambda x: x.year)
train['month'] = train['date'].apply(lambda x: x.month)
train['week'] = train['date'].apply(lambda x: x.week)
train['dayofweek'] = train['date'].apply(lambda x: x.dayofweek)

KeyError: 'date'

In [264]:
# Rearranging our columns
train = train[['date', 
    'address', 
    'species', 
    'block', 
    'street', 
    'trap', 
    'addressnumberandstreet', 
    'latitude', 
    'longitude',
    'addressaccuracy',
    'nummosquitos',
    'wnvpresent',
   ]]

In [265]:
train.set_index('date', inplace=True)
train.sort_index(inplace=True)

In [219]:
# We are going to work off the basis that there is good reason for there to be two rows with duplicated values 
# apart from species
# One good reason could be that they have to separate the species 
# identifying duplicate rows given that all other column data apart from species is duplicated
 
train[train.duplicated(subset=['date', 'address', 'block', 'street', 'trap',
       'addressnumberandstreet', 'latitude', 'longitude', 'addressaccuracy',
       'nummosquitos', 'wnvpresent']) == 1].head()

Unnamed: 0,date,address,species,block,street,trap,addressnumberandstreet,latitude,longitude,addressaccuracy,nummosquitos,wnvpresent
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
14,2007-05-29,"2200 West 113th Street, Chicago, IL 60643, USA",CULEX RESTUANS,22,W 113TH ST,T086,"2200 W 113TH ST, Chicago, IL",41.688324,-87.676709,8,1,0
32,2007-06-05,"1500 West Webster Avenue, Chicago, IL 60614, USA",CULEX PIPIENS,15,W WEBSTER AVE,T045,"1500 W WEBSTER AVE, Chicago, IL",41.9216,-87.666455,8,1,0
35,2007-06-05,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,8,1,0
39,2007-06-05,"2100 North Stave Street, Chicago, IL 60647, USA",CULEX RESTUANS,21,N STAVE ST,T050,"2100 N STAVE ST, Chicago, IL",41.919343,-87.694259,8,1,0


In [275]:
train = train.drop_duplicates(subset=['date', 'address', 'block', 'street', 'trap',
       'addressnumberandstreet', 'latitude', 'longitude', 'addressaccuracy',
       'nummosquitos', 'wnvpresent'], keep='first')

In [241]:
train['species'].value_counts()

CULEX PIPIENS/RESTUANS    4469
CULEX RESTUANS            2413
CULEX PIPIENS             1891
CULEX TERRITANS            159
CULEX SALINARIUS            57
CULEX TARSALIS               4
CULEX ERRATICUS              1
Name: species, dtype: int64

In [219]:
# identifying duplicate rows given that all other column data apart from species is duplicated
train[train.duplicated(subset=['date', 'address', 'block', 'street', 'trap',
       'addressnumberandstreet', 'latitude', 'longitude', 'addressaccuracy',
       'nummosquitos', 'wnvpresent']) == 1].head()

Unnamed: 0,date,address,species,block,street,trap,addressnumberandstreet,latitude,longitude,addressaccuracy,nummosquitos,wnvpresent
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
14,2007-05-29,"2200 West 113th Street, Chicago, IL 60643, USA",CULEX RESTUANS,22,W 113TH ST,T086,"2200 W 113TH ST, Chicago, IL",41.688324,-87.676709,8,1,0
32,2007-06-05,"1500 West Webster Avenue, Chicago, IL 60614, USA",CULEX PIPIENS,15,W WEBSTER AVE,T045,"1500 W WEBSTER AVE, Chicago, IL",41.9216,-87.666455,8,1,0
35,2007-06-05,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,8,1,0
39,2007-06-05,"2100 North Stave Street, Chicago, IL 60647, USA",CULEX RESTUANS,21,N STAVE ST,T050,"2100 N STAVE ST, Chicago, IL",41.919343,-87.694259,8,1,0


In [219]:
# identifying duplicate rows given that all other column data apart from species is duplicated
train[train.duplicated(subset=['date', 'address', 'block', 'street', 'trap',
       'addressnumberandstreet', 'latitude', 'longitude', 'addressaccuracy',
       'nummosquitos', 'wnvpresent']) == 1].head()

Unnamed: 0,date,address,species,block,street,trap,addressnumberandstreet,latitude,longitude,addressaccuracy,nummosquitos,wnvpresent
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
14,2007-05-29,"2200 West 113th Street, Chicago, IL 60643, USA",CULEX RESTUANS,22,W 113TH ST,T086,"2200 W 113TH ST, Chicago, IL",41.688324,-87.676709,8,1,0
32,2007-06-05,"1500 West Webster Avenue, Chicago, IL 60614, USA",CULEX PIPIENS,15,W WEBSTER AVE,T045,"1500 W WEBSTER AVE, Chicago, IL",41.9216,-87.666455,8,1,0
35,2007-06-05,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,8,1,0
39,2007-06-05,"2100 North Stave Street, Chicago, IL 60647, USA",CULEX RESTUANS,21,N STAVE ST,T050,"2100 N STAVE ST, Chicago, IL",41.919343,-87.694259,8,1,0


In [170]:
train.to_csv('../data/train_clean.csv')

## Test data

In [49]:
test = pd.read_csv('../data/test.csv')

In [50]:
test.columns = np.array([col.lower() for col in test.columns])

In [61]:
test[test.duplicated(subset=['date', 'address', 'block', 'street', 'trap',
       'addressnumberandstreet', 'latitude', 'longitude', 'addressaccuracy',]) == 1]

Unnamed: 0,id,date,address,species,block,street,trap,addressnumberandstreet,latitude,longitude,addressaccuracy
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9
2,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9
3,4,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9
4,5,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9
5,6,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TARSALIS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9
...,...,...,...,...,...,...,...,...,...,...,...
116288,116289,2014-10-02,"2100 North Cannon Drive, Chicago, IL 60614, USA",CULEX SALINARIUS,21,N CANNON DR,T054C,"2100 N CANNON DR, Chicago, IL",41.925652,-87.633590,8
116289,116290,2014-10-02,"2100 North Cannon Drive, Chicago, IL 60614, USA",CULEX TERRITANS,21,N CANNON DR,T054C,"2100 N CANNON DR, Chicago, IL",41.925652,-87.633590,8
116290,116291,2014-10-02,"2100 North Cannon Drive, Chicago, IL 60614, USA",CULEX TARSALIS,21,N CANNON DR,T054C,"2100 N CANNON DR, Chicago, IL",41.925652,-87.633590,8
116291,116292,2014-10-02,"2100 North Cannon Drive, Chicago, IL 60614, USA",UNSPECIFIED CULEX,21,N CANNON DR,T054C,"2100 N CANNON DR, Chicago, IL",41.925652,-87.633590,8


In [60]:
test.drop_duplicates(subset=['date', 'address', 'block', 'street', 'trap',
       'addressnumberandstreet', 'latitude', 'longitude', 'addressaccuracy',
                            ], keep='first')

Unnamed: 0,id,date,address,species,block,street,trap,addressnumberandstreet,latitude,longitude,addressaccuracy
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9
8,9,2008-06-11,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX PIPIENS/RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9
16,17,2008-06-11,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8
24,25,2008-06-11,"1500 West Webster Avenue, Chicago, IL 60614, USA",CULEX PIPIENS/RESTUANS,15,W WEBSTER AVE,T045,"1500 W WEBSTER AVE, Chicago, IL",41.921600,-87.666455,8
32,33,2008-06-11,"2500 West Grand Avenue, Chicago, IL 60654, USA",CULEX PIPIENS/RESTUANS,25,W GRAND AVE,T046,"2500 W GRAND AVE, Chicago, IL",41.891118,-87.654491,8
...,...,...,...,...,...,...,...,...,...,...,...
116253,116254,2014-10-02,"2900 West 85th Street, Chicago, IL 60652, USA",CULEX PIPIENS/RESTUANS,29,W 85TH ST,T237,"2900 W 85TH ST, Chicago, IL",41.738903,-87.695443,8
116261,116262,2014-10-02,"3400 West 77th Street, Chicago, IL 60652, USA",CULEX PIPIENS/RESTUANS,34,W 77TH ST,T238,"3400 W 77TH ST, Chicago, IL",41.753391,-87.707394,8
116269,116270,2014-10-02,"5100 West 63rd Place, Chicago, IL 60638, USA",CULEX PIPIENS/RESTUANS,51,W 63RD PL,T065A,"5100 W 63RD PL, Chicago, IL",41.777689,-87.749149,9
116277,116278,2014-10-02,"9600 South Longwood Drive, Chicago, IL 60643, USA",CULEX PIPIENS/RESTUANS,96,S LONGWOOD DR,T094B,"9600 S LONGWOOD DR, Chicago, IL",41.719140,-87.669539,9


In [63]:
test = test.drop_duplicates(subset=['date', 'address', 'block', 'street', 'trap',
       'addressnumberandstreet', 'latitude', 'longitude', 'addressaccuracy',
                                   ], keep='first')

In [181]:
test['date'] = pd.to_datetime(test['date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['date'] = pd.to_datetime(test['date'])


In [64]:
test = test[['date', 
             'datetime', 
             'address', 
             'species', 
             'block', 
             'street', 
             'trap', 
             'addressnumberandstreet', 
             'latitude', 
             'longitude',
             'addressaccuracy',
             'nummosquitos',
             'wnvpresent',
             ]]

Unnamed: 0,id,date,address,species,block,street,trap,addressnumberandstreet,latitude,longitude,addressaccuracy
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.954690,-87.800991,9
8,9,2008-06-11,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX PIPIENS/RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9
16,17,2008-06-11,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8
24,25,2008-06-11,"1500 West Webster Avenue, Chicago, IL 60614, USA",CULEX PIPIENS/RESTUANS,15,W WEBSTER AVE,T045,"1500 W WEBSTER AVE, Chicago, IL",41.921600,-87.666455,8
32,33,2008-06-11,"2500 West Grand Avenue, Chicago, IL 60654, USA",CULEX PIPIENS/RESTUANS,25,W GRAND AVE,T046,"2500 W GRAND AVE, Chicago, IL",41.891118,-87.654491,8
...,...,...,...,...,...,...,...,...,...,...,...
116253,116254,2014-10-02,"2900 West 85th Street, Chicago, IL 60652, USA",CULEX PIPIENS/RESTUANS,29,W 85TH ST,T237,"2900 W 85TH ST, Chicago, IL",41.738903,-87.695443,8
116261,116262,2014-10-02,"3400 West 77th Street, Chicago, IL 60652, USA",CULEX PIPIENS/RESTUANS,34,W 77TH ST,T238,"3400 W 77TH ST, Chicago, IL",41.753391,-87.707394,8
116269,116270,2014-10-02,"5100 West 63rd Place, Chicago, IL 60638, USA",CULEX PIPIENS/RESTUANS,51,W 63RD PL,T065A,"5100 W 63RD PL, Chicago, IL",41.777689,-87.749149,9
116277,116278,2014-10-02,"9600 South Longwood Drive, Chicago, IL 60643, USA",CULEX PIPIENS/RESTUANS,96,S LONGWOOD DR,T094B,"9600 S LONGWOOD DR, Chicago, IL",41.719140,-87.669539,9


In [65]:
test.to_csv('../data/test_clean.csv')

# Spray

In [190]:
spray = pd.read_csv('../data/spray.csv')

In [191]:
spray.head()

Unnamed: 0,Date,Time,Latitude,Longitude
0,2011-08-29,6:56:58 PM,42.391623,-88.089163
1,2011-08-29,6:57:08 PM,42.391348,-88.089163
2,2011-08-29,6:57:18 PM,42.391022,-88.089157
3,2011-08-29,6:57:28 PM,42.390637,-88.089158
4,2011-08-29,6:57:38 PM,42.39041,-88.088858


In [194]:
spray.columns = np.array([col.lower() for col in spray.columns])

In [195]:
spray.columns

Index(['date', 'time', 'latitude', 'longitude'], dtype='object')

In [224]:
spray[spray.duplicated()]

Unnamed: 0,date,time,latitude,longitude
485,2011-09-07,7:43:40 PM,41.983917,-87.793088
490,2011-09-07,7:44:32 PM,41.986460,-87.794225
491,2011-09-07,7:44:32 PM,41.986460,-87.794225
492,2011-09-07,7:44:32 PM,41.986460,-87.794225
493,2011-09-07,7:44:32 PM,41.986460,-87.794225
...,...,...,...,...
1025,2011-09-07,7:44:32 PM,41.986460,-87.794225
1026,2011-09-07,7:44:32 PM,41.986460,-87.794225
1027,2011-09-07,7:44:32 PM,41.986460,-87.794225
1028,2011-09-07,7:44:32 PM,41.986460,-87.794225


In [196]:
spray[spray.duplicated(subset=list(spray.columns)) == 1]

Unnamed: 0,date,time,latitude,longitude
485,2011-09-07,7:43:40 PM,41.983917,-87.793088
490,2011-09-07,7:44:32 PM,41.986460,-87.794225
491,2011-09-07,7:44:32 PM,41.986460,-87.794225
492,2011-09-07,7:44:32 PM,41.986460,-87.794225
493,2011-09-07,7:44:32 PM,41.986460,-87.794225
...,...,...,...,...
1025,2011-09-07,7:44:32 PM,41.986460,-87.794225
1026,2011-09-07,7:44:32 PM,41.986460,-87.794225
1027,2011-09-07,7:44:32 PM,41.986460,-87.794225
1028,2011-09-07,7:44:32 PM,41.986460,-87.794225


In [94]:
spray = spray.drop_duplicates(subset=list(spray.columns))

In [197]:
spray.isnull().sum()

date           0
time         584
latitude       0
longitude      0
dtype: int64

In [200]:
# not null rows
spray[spray['time']==spray['time']]

Unnamed: 0,date,time,latitude,longitude
0,2011-08-29,6:56:58 PM,42.391623,-88.089163
1,2011-08-29,6:57:08 PM,42.391348,-88.089163
2,2011-08-29,6:57:18 PM,42.391022,-88.089157
3,2011-08-29,6:57:28 PM,42.390637,-88.089158
4,2011-08-29,6:57:38 PM,42.390410,-88.088858
...,...,...,...,...
14830,2013-09-05,8:34:11 PM,42.006587,-87.812355
14831,2013-09-05,8:35:01 PM,42.006192,-87.816015
14832,2013-09-05,8:35:21 PM,42.006022,-87.817392
14833,2013-09-05,8:35:31 PM,42.005453,-87.817423


In [202]:
# are null rows
spray[spray['time']!=spray['time']]

Unnamed: 0,date,time,latitude,longitude
1030,2011-09-07,,41.987092,-87.794286
1031,2011-09-07,,41.987620,-87.794382
1032,2011-09-07,,41.988004,-87.794574
1033,2011-09-07,,41.988292,-87.795486
1034,2011-09-07,,41.988100,-87.796014
...,...,...,...,...
1609,2011-09-07,,41.995876,-87.811615
1610,2011-09-07,,41.995972,-87.810271
1611,2011-09-07,,41.995684,-87.810319
1612,2011-09-07,,41.994724,-87.810415


In [178]:
# Time not important, just the instance of spraying
spray = spray.fillna("7:00:00 PM")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14294 entries, 0 to 14834
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   date       14294 non-null  object 
 1   time       13710 non-null  object 
 2   latitude   14294 non-null  float64
 3   longitude  14294 non-null  float64
dtypes: float64(2), object(2)
memory usage: 558.4+ KB
