In [1]:
# cleaning 'NaN'

import pandas as pd
import numpy as np
from pandas import Series, DataFrame

# Five readings with one gap; the NaN forces the Series to float64.
s = pd.Series(
    [10, 20, np.nan, 40, 50],
)
s

0    10.0
1    20.0
2     NaN
3    40.0
4    50.0
dtype: float64

In [2]:
# fillna: returns a new Series in which every NaN is replaced by a constant.
# `s` itself is left unchanged.
s.fillna(value=5)

0    10.0
1    20.0
2     5.0
3    40.0
4    50.0
dtype: float64

In [3]:
# Impute the gap with the mean of the observed values (mean() skips NaN):
# (10 + 20 + 40 + 50) / 4 = 30.
s.fillna(value=s.mean())

0    10.0
1    20.0
2    30.0
3    40.0
4    50.0
dtype: float64

In [6]:
# dropna: return a new Series without the NaN entries.
# The surviving rows keep their original index labels (0, 1, 3, 4).
s.dropna()

0    10.0
1    20.0
3    40.0
4    50.0
dtype: float64

In [9]:
# Missing weather: ten readings labelled by weekday, three of them missing (NaN).
# The day labels repeat (Tue, Wed and Thu appear twice), so the index is not unique.

s = Series([19,22,22,np.nan,19,22,np.nan,22,19,np.nan],
          index='Tue Wed Thu Fri Sat Sun Mon Tue Wed Thu'.split())
s  # a bare assignment displays nothing; end the cell with the Series to show it

Tue    19.0
Wed    22.0
Thu    22.0
Fri     NaN
Sat    19.0
Sun    22.0
Mon     NaN
Tue    22.0
Wed    19.0
Thu     NaN
dtype: float64

In [10]:
# Fill the three gaps with the mean of the seven observed readings
# (145 / 7 ≈ 20.714286).
s.fillna(value=s.mean())

Tue    19.000000
Wed    22.000000
Thu    22.000000
Fri    20.714286
Sat    19.000000
Sun    22.000000
Mon    20.714286
Tue    22.000000
Wed    19.000000
Thu    20.714286
dtype: float64

In [11]:
# Fill the gaps with the median of the observed readings (22.0), which,
# unlike the mean, is a value that actually occurs in the data.
s.fillna(value=s.median())

Tue    19.0
Wed    22.0
Thu    22.0
Fri    22.0
Sat    19.0
Sun    22.0
Mon    22.0
Tue    22.0
Wed    19.0
Thu    22.0
dtype: float64

In [12]:
# Alternative to imputing: discard the three days that have no reading.
s.dropna()

Tue    19.0
Wed    22.0
Thu    22.0
Sat    19.0
Sun    22.0
Tue    22.0
Wed    19.0
dtype: float64

In [14]:
# Seed NumPy's global RNG so the random 3x4 integer frame is reproducible.
np.random.seed(0)
df = DataFrame(
    np.random.randint(-500, 500, size=(3, 4)),  # integers in [-500, 500)
    index=['a', 'b', 'c'],
    columns=['w', 'x', 'y', 'z'],
)
df

Unnamed: 0,w,x,y,z
a,184,59,129,-308
b,335,263,207,-141
c,-491,223,-223,254


In [16]:
# Inject missing values at (a, w), (a, y) and (c, y) to get a frame for the
# fillna/dropna demonstrations.
# The columns hold integers and NaN is a float, so cast the two affected
# columns to float64 first instead of relying on implicit upcasting during
# assignment (deprecated in recent pandas, see PDEP-6).
df = df.astype({'w': 'float64', 'y': 'float64'})
df.loc['a', 'w'] = np.nan
df.loc['a', 'y'] = np.nan
df.loc['c', 'y'] = np.nan
df

Unnamed: 0,w,x,y,z
a,,59,,-308
b,335.0,263,207.0,-141
c,-491.0,223,,254


In [18]:
# Replace every NaN in the frame with one sentinel value.
df.fillna(value=9999)

Unnamed: 0,w,x,y,z
a,9999.0,59,9999.0,-308
b,335.0,263,207.0,-141
c,-491.0,223,9999.0,254


In [19]:
# Column-wise means; NaN entries are skipped (e.g. w = (335 - 491) / 2 = -78).
df.mean(axis=0)

w    -78.000000
x    181.666667
y    207.000000
z    -65.000000
dtype: float64

In [21]:
# Passing a Series of per-column means fills each column's NaN with that
# column's own mean (the Series is matched to the frame by column label).
df.fillna(value=df.mean())

Unnamed: 0,w,x,y,z
a,-78.0,59,207.0,-308
b,335.0,263,207.0,-141
c,-491.0,223,207.0,254


In [24]:
# A dict gives a separate fill value per column: a sentinel for 'w' and the
# column mean for 'y'.
df.fillna(value={'w': 9999, 'y': df['y'].mean()})

Unnamed: 0,w,x,y,z
a,9999.0,59,207.0,-308
b,335.0,263,207.0,-141
c,-491.0,223,207.0,254


In [25]:
# The fillna calls above returned new frames; df itself still holds its NaN values.
df

Unnamed: 0,w,x,y,z
a,,59,,-308
b,335.0,263,207.0,-141
c,-491.0,223,,254


In [27]:
# Default dropna(): drop every row that contains at least one NaN.
# Only row 'b' is complete, so it is the only row kept.
df.dropna()

Unnamed: 0,w,x,y,z
b,335.0,263,207.0,-141


In [29]:
# Keyword arguments to dropna()

# 1. thresh: integer; a row is kept only if it has at least this many non-NaN values.
# 2. subset: list of column labels (strings) to inspect when deciding whether to drop a row.
#    NaN in a column outside subset is ignored, so it never causes a row to be dropped.

# thresh=3 keeps 'b' (4 non-NaN) and 'c' (3 non-NaN); 'a' has only 2 and is dropped.
df.dropna(thresh=3)

Unnamed: 0,w,x,y,z
b,335.0,263,207.0,-141
c,-491.0,223,,254


In [34]:
# Only columns 'w' and 'z' are checked for NaN: row 'a' is dropped (NaN in 'w').
df.dropna(subset=['w','z'])
# The NaN in column 'y' (row 'c') is kept because 'y' is outside the subset.

Unnamed: 0,w,x,y,z
b,335.0,263,207.0,-141
c,-491.0,223,,254


In [61]:
# Exercise: Boston temperatures
# Read only the max and min temperature columns from the CSV file.

filename = '../data/boston,ma.csv'
df = pd.read_csv(
    filename,
    usecols=['boston,ma_maxtempC', 'boston,ma_mintempC'],
)
df

Unnamed: 0,"boston,ma_maxtempC","boston,ma_mintempC"
0,1,-4
1,1,-4
2,1,-4
3,1,-4
4,1,-4
...,...,...
723,8,2
724,8,2
725,8,2
726,8,2


In [59]:
# Preview every fifth row (positions 0, 5, 10, ...), all columns.
df.iloc[::5]

Unnamed: 0,"boston,ma_maxtempC","boston,ma_mintempC"
0,1,-4
5,1,-4
10,2,-3
15,2,-3
20,0,-5
...,...,...
705,6,-7
710,6,-7
715,2,-4
720,8,2


In [65]:
# Simulate missing data: blank out the max temperature in every fifth row.
# The column is read from the CSV as integers and NaN is a float, so cast it
# to float64 first instead of relying on implicit upcasting during assignment
# (deprecated in recent pandas, see PDEP-6).
df['boston,ma_maxtempC'] = df['boston,ma_maxtempC'].astype('float64')
df.loc[::5, 'boston,ma_maxtempC'] = np.nan
df

Unnamed: 0,"boston,ma_maxtempC","boston,ma_mintempC"
0,,-4
1,1.0,-4
2,1.0,-4
3,1.0,-4
4,1.0,-4
...,...,...
723,8.0,2
724,8.0,2
725,,2
726,8.0,2


In [67]:
# Simulate missing data: blank out the min temperature in every third row.
# As with the max column, cast to float64 before writing NaN so the assignment
# does not rely on implicit int -> float upcasting (deprecated, see PDEP-6).
df['boston,ma_mintempC'] = df['boston,ma_mintempC'].astype('float64')
df.loc[::3, 'boston,ma_mintempC'] = np.nan
df

Unnamed: 0,"boston,ma_maxtempC","boston,ma_mintempC"
0,,
1,1.0,-4.0
2,1.0,-4.0
3,1.0,
4,1.0,-4.0
...,...,...
723,8.0,
724,8.0,2.0
725,,2.0
726,8.0,


In [68]:
# (rows, columns) of the full frame, NaN rows included: the baseline for the
# dropna comparisons that follow.
df.shape

(728, 2)

In [70]:
# Shape after the default dropna(): only rows where both temperature columns
# are present survive.
df.dropna().shape

(388, 2)

In [72]:
# Drop rows only when the min temperature is missing; NaN in the max column is
# tolerated (rows 5 and 725 stay). subset is passed as a list: a bare string
# is only accepted by newer pandas releases, while a list works everywhere.
df.dropna(subset=['boston,ma_mintempC'])

Unnamed: 0,"boston,ma_maxtempC","boston,ma_mintempC"
1,1.0,-4.0
2,1.0,-4.0
4,1.0,-4.0
5,,-4.0
7,1.0,-4.0
...,...,...
721,8.0,2.0
722,8.0,2.0
724,8.0,2.0
725,,2.0


In [74]:
# thresh=1 keeps any row with at least one non-NaN value, so only rows where
# both columns are NaN are dropped (same effect as how='all' here).
df.dropna(thresh=1).shape

(679, 2)

In [75]:
# Impute each column's gaps with that column's own mean.
df.fillna(value=df.mean())

Unnamed: 0,"boston,ma_maxtempC","boston,ma_mintempC"
0,2.850515,-3.142268
1,1.000000,-4.000000
2,1.000000,-4.000000
3,1.000000,-3.142268
4,1.000000,-4.000000
...,...,...
723,8.000000,-3.142268
724,8.000000,2.000000
725,2.850515,2.000000
726,8.000000,-3.142268


In [76]:
# Interpolation

# interpolate() already returns the frame with its gaps filled (linearly, by
# default), so wrapping it in fillna() is redundant.
# By default only gaps that follow a valid observation are filled, so the
# leading NaN in row 0 remains; pass limit_direction='both' to fill it as well.
df.interpolate()

Unnamed: 0,"boston,ma_maxtempC","boston,ma_mintempC"
0,,
1,1.0,-4.0
2,1.0,-4.0
3,1.0,-4.0
4,1.0,-4.0
...,...,...
723,8.0,2.0
724,8.0,2.0
725,8.0,2.0
726,8.0,2.0
