## Handling Missing Data

In [3]:
import pandas as pd
import numpy as np

In [5]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [6]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [7]:
string_data[0] = None

In [9]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

## NA Handling Methods 
- dropna
- fillna
- isnull
- notnull

## Filtering Out Missing Data

In [14]:
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [13]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [15]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [18]:
##
df = pd.DataFrame(np.random.randn(7,3))
df

Unnamed: 0,0,1,2
0,-0.400801,-1.03196,-0.712493
1,-0.898912,1.231192,1.397029
2,1.782975,-0.898898,0.555878
3,-0.672921,1.257953,0.464799
4,0.146677,0.463664,-0.579702
5,-0.229784,-1.382777,-0.377679
6,-0.934725,-1.191837,0.537229


In [19]:
df.ix[:4,1] = np.nan; df.ix[:2, 2] = np.nan

In [21]:
df

Unnamed: 0,0,1,2
0,-0.400801,,
1,-0.898912,,
2,1.782975,,
3,-0.672921,,0.464799
4,0.146677,,-0.579702
5,-0.229784,-1.382777,-0.377679
6,-0.934725,-1.191837,0.537229


## Thresh argument

In [25]:
'''Suppose you want to keep only rows containing a certain number of observations'''
df.dropna(thresh = 2) # drops rows that don't have at least two values

Unnamed: 0,0,1,2
3,-0.672921,,0.464799
4,0.146677,,-0.579702
5,-0.229784,-1.382777,-0.377679
6,-0.934725,-1.191837,0.537229


## Filling in Missing Data

In [26]:
df

Unnamed: 0,0,1,2
0,-0.400801,,
1,-0.898912,,
2,1.782975,,
3,-0.672921,,0.464799
4,0.146677,,-0.579702
5,-0.229784,-1.382777,-0.377679
6,-0.934725,-1.191837,0.537229


In [27]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.400801,0.0,0.0
1,-0.898912,0.0,0.0
2,1.782975,0.0,0.0
3,-0.672921,0.0,0.464799
4,0.146677,0.0,-0.579702
5,-0.229784,-1.382777,-0.377679
6,-0.934725,-1.191837,0.537229


In [29]:
df.fillna({1:0.5, 3: -1})

Unnamed: 0,0,1,2
0,-0.400801,0.5,
1,-0.898912,0.5,
2,1.782975,0.5,
3,-0.672921,0.5,0.464799
4,0.146677,0.5,-0.579702
5,-0.229784,-1.382777,-0.377679
6,-0.934725,-1.191837,0.537229


In [32]:
_ = df.fillna(0, inplace = True)
_

In [33]:
df

Unnamed: 0,0,1,2
0,-0.400801,0.0,0.0
1,-0.898912,0.0,0.0
2,1.782975,0.0,0.0
3,-0.672921,0.0,0.464799
4,0.146677,0.0,-0.579702
5,-0.229784,-1.382777,-0.377679
6,-0.934725,-1.191837,0.537229
