# Agenda: Cleaning data
1. NaN and cleaning it up
   * Series
   * Data frames
   * Two techniques: (a) replacing and (b) removing
2. Nullable types
3. Interpolation
4. Replacement of values

# Why clean our data? Because the real world is messy
* Sensors go dead
* People make mistakes
* People don't report data on time
* Weird errors

In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame




In [3]:
# five readings, one of them missing; the NaN makes pandas store the data as floats
s = Series(data=[10, 20, np.nan, 40, 50])


In [4]:
# display the series: the missing entry shows up as NaN and the dtype is float64
s

0    10.0
1    20.0
2     NaN
3    40.0
4    50.0
dtype: float64

In [6]:
# NaN has no integer representation, so casting this series to int64 fails.
# The failure is the point of the demonstration, so catch it and show the
# message rather than leaving a raising cell that would break
# "Restart Kernel -> Run All".
try:
    s.astype(np.int64)
except ValueError as err:  # pandas' IntCastingNaNError is a ValueError subclass
    print(f'{type(err).__name__}: {err}')


IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [7]:
# .fillna is one way of getting rid of NaN: it gives back a series
# in which each NaN has been swapped for the value we pass in

s.fillna(value=5)

0    10.0
1    20.0
2     5.0
3    40.0
4    50.0
dtype: float64

In [19]:
# ten integer readings labelled by weekday; note that the labels
# Tue, Wed and Thu each appear twice in the index
s = Series(
    data=[22, 23, 22, 19, 18, 22, 23, 22, 22, 23],
    index='Tue Wed Thu Fri Sat Sun Mon Tue Wed Thu'.split(),
)

In [20]:
# display the series: every value is present, so the dtype is int64
s


Tue    22
Wed    23
Thu    22
Fri    19
Sat    18
Sun    22
Mon    23
Tue    22
Wed    22
Thu    23
dtype: int64

In [11]:
# NOTE(review): this cell's execution count (11) is lower than that of the cell
# defining `s` above (19), and `s` as defined there contains no NaN. The output
# shown below appears to come from an earlier kernel state; re-run to refresh.
s.fillna(s.mean())

Tue    22.000000
Wed    23.000000
Thu    21.857143
Fri    19.000000
Sat    21.857143
Sun    22.000000
Mon    23.000000
Tue    22.000000
Wed    22.000000
Thu    21.857143
dtype: float64

In [13]:
# NOTE(review): executed (count 13) before the cell that defines `s` (count 19);
# the NaN-filled output shown below does not match `s` as defined above.
s.fillna(s.median())


Tue    22.0
Wed    23.0
Thu    22.0
Fri    19.0
Sat    22.0
Sun    22.0
Mon    23.0
Tue    22.0
Wed    22.0
Thu    22.0
dtype: float64

In [14]:
# NOTE(review): executed (count 14) before the cell that defines `s` (count 19);
# the shortened output shown below reflects an earlier `s` that contained NaN.
s.dropna()

Tue    22.0
Wed    23.0
Fri    19.0
Sun    22.0
Mon    23.0
Tue    22.0
Wed    22.0
dtype: float64

In [21]:
# s has no NaN at this point, so dropna returns all ten values and the dtype stays int64
s.dropna()

Tue    22
Wed    23
Thu    22
Fri    19
Sat    18
Sun    22
Mon    23
Tue    22
Wed    22
Thu    23
dtype: int64

In [22]:
# Mark the 'Wed' and 'Sat' readings as missing. The index holds 'Wed' twice,
# so both of those rows are affected.
# NaN is a float and cannot be stored in an int64 series, so cast explicitly
# first: recent pandas versions deprecate the silent int64 -> float64 upcast
# that an in-place assignment would otherwise trigger.
s = s.astype(np.float64)
s.loc[['Wed', 'Sat']] = np.nan
s

Tue    22.0
Wed     NaN
Thu    22.0
Fri    19.0
Sat     NaN
Sun    22.0
Mon    23.0
Tue    22.0
Wed     NaN
Thu    23.0
dtype: float64

In [23]:
# drop the three NaN entries (both 'Wed' rows and 'Sat'), leaving seven values
s.dropna()

Tue    22.0
Thu    22.0
Fri    19.0
Sun    22.0
Mon    23.0
Tue    22.0
Thu    23.0
dtype: float64

In [24]:
# replace each NaN with the mean of the seven remaining values (153 / 7 = 21.857143)
s.fillna(s.mean())

Tue    22.000000
Wed    21.857143
Thu    22.000000
Fri    19.000000
Sat    21.857143
Sun    22.000000
Mon    23.000000
Tue    22.000000
Wed    21.857143
Thu    23.000000
dtype: float64

In [25]:
# replace each NaN with the median of the remaining values (22.0)
s.fillna(s.median())

Tue    22.0
Wed    22.0
Thu    22.0
Fri    19.0
Sat    22.0
Sun    22.0
Mon    23.0
Tue    22.0
Wed    22.0
Thu    23.0
dtype: float64

# Data frames and `NaN`

As a general rule, anything that we can do with a series, we can also do with a data frame. In most cases, the result comes from applying the method to each column separately; a few methods, such as `dropna`, instead work on whole rows by default.

In [26]:
# seed the legacy global generator so the same 3x4 frame of
# integers in [-500, 500) is produced on every run
np.random.seed(0)
df = DataFrame(
    data=np.random.randint(low=-500, high=500, size=(3, 4)),
    index=['a', 'b', 'c'],
    columns=['w', 'x', 'y', 'z'],
)
df

Unnamed: 0,w,x,y,z
a,184,59,129,-308
b,335,263,207,-141
c,-491,223,-223,254


In [28]:
# Knock out three cells to simulate missing data.
# Columns w and y are cast to float64 up front because NaN cannot be stored
# in an int64 column; relying on the implicit upcast during assignment is
# deprecated in recent pandas versions. Columns x and z stay integer.
df = df.astype({'w': np.float64, 'y': np.float64})
df.loc['a', 'w'] = np.nan
df.loc['a', 'y'] = np.nan
df.loc['c', 'y'] = np.nan

df

Unnamed: 0,w,x,y,z
a,,59,,-308
b,335.0,263,207.0,-141
c,-491.0,223,,254


In [29]:
# fillna also works on a data frame: a single scalar replaces NaN in every column
df.fillna(value=9999)


Unnamed: 0,w,x,y,z
a,9999.0,59,9999.0,-308
b,335.0,263,207.0,-141
c,-491.0,223,9999.0,254


In [30]:
# fillna returned a new frame; df itself still holds its NaN values
df

Unnamed: 0,w,x,y,z
a,,59,,-308
b,335.0,263,207.0,-141
c,-491.0,223,,254


In [31]:
# dropna on a data frame works row-wise by default: any row holding
# at least one NaN is left out of the new data frame that is returned

df.dropna(axis='index', how='any')

Unnamed: 0,w,x,y,z
b,335.0,263,207.0,-141


In [33]:
# path to the weather CSV, relative to the notebook's working directory
# (the comma is part of the file name)
filename = '../data/boston,ma.csv'

In [34]:
# peek at the first lines of the raw CSV via the shell;
# IPython substitutes $filename with the value of the Python variable
!head $filename

date_time,"boston,ma_maxtempC","boston,ma_mintempC","boston,ma_totalSnow_cm","boston,ma_sunHour","boston,ma_uvIndex","boston,ma_uvIndex","boston,ma_moon_illumination","boston,ma_moonrise","boston,ma_moonset","boston,ma_sunrise","boston,ma_sunset","boston,ma_DewPointC","boston,ma_FeelsLikeC","boston,ma_HeatIndexC","boston,ma_WindChillC","boston,ma_WindGustKmph","boston,ma_cloudcover","boston,ma_humidity","boston,ma_precipMM","boston,ma_pressure","boston,ma_tempC","boston,ma_visibility","boston,ma_winddirDegree","boston,ma_windspeedKmph"
2018-12-11 00:00:00,1,-4,0.0,8.7,2,0,21,10:19 AM,08:12 PM,07:03 AM,04:11 PM,-7,-3,0,-3,10,0,57,0.0,1022,-3,10,339,8
2018-12-11 03:00:00,1,-4,0.0,8.7,2,0,21,10:19 AM,08:12 PM,07:03 AM,04:11 PM,-7,-1,1,-1,7,2,57,0.0,1023,-3,10,319,6
2018-12-11 06:00:00,1,-4,0.0,8.7,2,0,21,10:19 AM,08:12 PM,07:03 AM,04:11 PM,-9,-5,-3,-5,8,4,60,0.0,1023,-4,10,334,7
2018-12-11 09:00:00,1,-4,0.0,8.7,2,2,21,10:19 AM,08:12 PM,07:03 AM,04:11 PM,-9,1,1,1,3,6,49,0.0,1022,-1,10,334,

In [42]:
# load only the daily maximum and minimum temperature columns
df = pd.read_csv(
    filename,
    usecols=['boston,ma_maxtempC', 'boston,ma_mintempC'],
)

df


Unnamed: 0,"boston,ma_maxtempC","boston,ma_mintempC"
0,1,-4
1,1,-4
2,1,-4
3,1,-4
4,1,-4
...,...,...
723,8,2
724,8,2
725,8,2
726,8,2


In [43]:
# The row labels 'a'/'c' and column labels 'w'/'y' belong to the small toy
# frame used earlier, not to the weather data. Assigning to them via .loc
# would append extra all-NaN rows and columns to this frame, so those
# copy-pasted assignments have been removed. Just display the frame as loaded.
df

Unnamed: 0,"boston,ma_maxtempC","boston,ma_mintempC"
0,1,-4
1,1,-4
2,1,-4
3,1,-4
4,1,-4
...,...,...
723,8,2
724,8,2
725,8,2
726,8,2


In [45]:
# Blank out every 5th maximum-temperature reading to simulate missing data.
# Cast the column to float64 first: NaN cannot be stored in an integer column,
# and recent pandas versions deprecate the implicit upcast on assignment.
df = df.astype({'boston,ma_maxtempC': np.float64})
df.loc[::5,'boston,ma_maxtempC'] = np.nan

In [46]:
# every 5th row (0, 5, 10, ...) of the max-temperature column is now NaN
df

Unnamed: 0,"boston,ma_maxtempC","boston,ma_mintempC"
0,,-4
1,1.0,-4
2,1.0,-4
3,1.0,-4
4,1.0,-4
...,...,...
723,8.0,2
724,8.0,2
725,,2
726,8.0,2


In [48]:
# Blank out every 3rd minimum-temperature reading as well.
# As with the max column, cast to float64 explicitly before inserting NaN
# rather than relying on the deprecated implicit upcast.
df = df.astype({'boston,ma_mintempC': np.float64})
df.loc[::3,'boston,ma_mintempC'] = np.nan

In [49]:
# both columns now have gaps: every 5th max reading and every 3rd min reading is NaN
df

Unnamed: 0,"boston,ma_maxtempC","boston,ma_mintempC"
0,,
1,1.0,-4.0
2,1.0,-4.0
3,1.0,
4,1.0,-4.0
...,...,...
723,8.0,
724,8.0,2.0
725,,2.0
726,8.0,


In [51]:
# first five rows: row 0 is missing both values, row 3 is missing only the minimum
df.head()

Unnamed: 0,"boston,ma_maxtempC","boston,ma_mintempC"
0,,
1,1.0,-4.0
2,1.0,-4.0
3,1.0,
4,1.0,-4.0


In [53]:
# (rows, columns) of the frame, NaN entries included
df.shape

(728, 2)

In [54]:
# keep only the rows in which both temperature columns are present
df.dropna()

Unnamed: 0,"boston,ma_maxtempC","boston,ma_mintempC"
1,1.0,-4.0
2,1.0,-4.0
4,1.0,-4.0
7,1.0,-4.0
8,2.0,-3.0
...,...,...
719,2.0,-4.0
721,8.0,2.0
722,8.0,2.0
724,8.0,2.0


In [55]:
# fill the gaps from neighbouring readings and show the first 20 rows.
# Row 0 is still NaN in the output, presumably because there is no earlier
# reading to interpolate from (confirm against the interpolate defaults).
df.interpolate().head(20)

Unnamed: 0,"boston,ma_maxtempC","boston,ma_mintempC"
0,,
1,1.0,-4.0
2,1.0,-4.0
3,1.0,-4.0
4,1.0,-4.0
5,1.0,-4.0
6,1.0,-4.0
7,1.0,-4.0
8,2.0,-3.0
9,2.0,-3.0


In [57]:
# NOTE(review): `filename` and `df` are rebound here to a different dataset
# (the NYC parking-violations CSV); the weather frame is no longer reachable
# under the name `df` after this cell runs.
filename = 'nyc-mini-parking-violations-2020.csv'
df = pd.read_csv(filename)

In [58]:
# first five rows; the frame is wide, so the display elides the middle columns ('...')
df.head()

Unnamed: 0,Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Violation Code,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,...,Vehicle Color,Unregistered Vehicle?,Vehicle Year,Meter Number,Feet From Curb,Violation Post Code,Violation Description,No Standing or Stopping Violation,Hydrant Violation,Double Parking Violation
0,1477633194,J58JKX,NJ,PAS,05/08/1972 12:00:00 AM,16,SDN,HONDA,P,8730,...,BK,0.0,0,-,0,,,,,
1,1449715424,KRE6058,PA,PAS,08/29/1977 12:00:00 AM,98,SUBN,ME/BE,P,86530,...,BLK,0.0,0,-,0,,,,,
2,1455779155,444326R,NJ,PAS,10/03/1988 12:00:00 AM,20,SDN,LEXUS,P,27030,...,BLACK,0.0,0,-,0,,,,,
3,1458800908,F728330,OH,PAS,01/03/1990 12:00:00 AM,21,SDN,CHEVR,P,33030,...,,0.0,0,-,0,,,,,
4,1466038676,FMY9090,NY,PAS,02/14/1990 12:00:00 AM,21,SUBN,JEEP,S,45130,...,GREY,0.0,2015,-,0,,,,,
