In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [2]:
# Example Series of strings with a single missing value (np.nan) at position 2
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])

In [3]:
# Series has no `is_null` method (the correct spelling is `isnull`).
# Catch the error and print it so the demonstration no longer aborts "Run All".
try:
    string_data.is_null()
except AttributeError as err:
    print("AttributeError: {}".format(err))

AttributeError: 'Series' object has no attribute 'is_null'

In [6]:
# Boolean mask: True where the entry is missing
pd.isnull(string_data)

0    False
1    False
2     True
3    False
dtype: bool

In [7]:
# Boolean mask: True where the entry is present (the inverse of isnull)
pd.notnull(string_data)

0     True
1     True
2    False
3     True
dtype: bool

# Filtering out missing data

### dropna method

In [8]:
from numpy import nan as NA

In [9]:
# Numeric Series with missing values at positions 1 and 3
data = pd.Series([1, NA, 3.5, NA, 7])

In [10]:
# Drop the NA entries; the surviving values keep their original index labels (0, 2, 4)
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [12]:
# Selecting with a boolean mask gives the same result as data.dropna()
data.loc[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [17]:
# 4x3 frame: row 0 is complete, rows 1 and 3 are partly missing, row 2 is all NA
data = DataFrame([
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [15]:
# how='any' is the default: every row containing at least one NA value is dropped
cleaned = data.dropna(how='any')
cleaned

Unnamed: 0,0,1,2
0,1,6.5,3


In [16]:
# how='all' drops only the rows in which every value is NA
data.dropna(axis=0, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [18]:
# Add a new column labelled 4 in which every value is NA
data[4] = NA

In [19]:
# Show the frame again: column 4 is entirely NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [21]:
# Same idea along the other axis: drop the columns in which every value is NA
data.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


### using the thresh argument to keep only rows that contain at least a given number of non-NA values

In [24]:
# 7x3 frame of standard-normal draws (no seed is set, so the values differ on every run)
df = DataFrame(np.random.standard_normal(size=(7, 3)))
df

Unnamed: 0,0,1,2
0,-0.38778,0.482077,0.108558
1,1.71865,-0.047224,0.586034
2,-1.155535,-0.748406,-1.107947
3,0.357879,-0.15478,-0.337258
4,-0.542029,1.424575,0.266382
5,-0.492386,-0.105122,1.820586
6,-0.016947,-1.236699,-0.229008


In [26]:
# `.ix` was deprecated in pandas 0.20 and removed in 1.0. `.loc` is the
# label-based equivalent here because both axes carry the default integer labels.
# Label slices include the end point: rows 0-4 of column 1 and rows 0-2 of
# column 2 are set to NA.
df.loc[:4, 1] = NA
df.loc[:2, 2] = NA
df

Unnamed: 0,0,1,2
0,-0.38778,,
1,1.71865,,
2,-1.155535,,
3,0.357879,,-0.337258
4,-0.542029,,0.266382
5,-0.492386,-0.105122,1.820586
6,-0.016947,-1.236699,-0.229008


In [27]:
# thresh=3 keeps only the rows that have at least 3 non-NA values (rows 5 and 6 here)
df.dropna(thresh=3)

Unnamed: 0,0,1,2
5,-0.492386,-0.105122,1.820586
6,-0.016947,-1.236699,-0.229008


### filling in missing data

In [28]:
# Replace every NA with 0; the result is displayed, df itself is not reassigned
df.fillna(value=0)

Unnamed: 0,0,1,2
0,-0.38778,0.0,0.0
1,1.71865,0.0,0.0
2,-1.155535,0.0,0.0
3,0.357879,0.0,-0.337258
4,-0.542029,0.0,0.266382
5,-0.492386,-0.105122,1.820586
6,-0.016947,-1.236699,-0.229008


#### filling columns with different values by calling fillna with a dict 

In [30]:
# Dict keys are column labels, values are the per-column fill values.
# Note: df has columns 0, 1 and 2, so key 3 matches nothing and column 2 keeps its NAs.
df.fillna({1: 0.5, 3: -1})

Unnamed: 0,0,1,2
0,-0.38778,0.5,
1,1.71865,0.5,
2,-1.155535,0.5,
3,0.357879,0.5,-0.337258
4,-0.542029,0.5,0.266382
5,-0.492386,-0.105122,1.820586
6,-0.016947,-1.236699,-0.229008


In [32]:
# Build a mapping that sends each of the keys 2 and 3 to the value 4
dict.fromkeys((2, 3), 4)

{2: 4, 3: 4}

In [33]:
# The fillna calls above did not modify df: its NA values are still present
df

Unnamed: 0,0,1,2
0,-0.38778,,
1,1.71865,,
2,-1.155535,,
3,0.357879,,-0.337258
4,-0.542029,,0.266382
5,-0.492386,-0.105122,1.820586
6,-0.016947,-1.236699,-0.229008


In [36]:
# Fill the NA values of df in place. The return value of an in-place call is
# version-dependent (None in pandas 1.x/2.x), so bind `_` to the frame
# explicitly instead of to whatever the call returns.
df.fillna(0, inplace=True)
_ = df

In [37]:
# After the in-place fill, df itself no longer contains NA values
df

Unnamed: 0,0,1,2
0,-0.38778,0.0,0.0
1,1.71865,0.0,0.0
2,-1.155535,0.0,0.0
3,0.357879,0.0,-0.337258
4,-0.542029,0.0,0.266382
5,-0.492386,-0.105122,1.820586
6,-0.016947,-1.236699,-0.229008


In [38]:
# Display the filled frame by name rather than through `_`, whose value depends
# on execution history and on what the in-place fillna call returned.
df

Unnamed: 0,0,1,2
0,-0.38778,0.0,0.0
1,1.71865,0.0,0.0
2,-1.155535,0.0,0.0
3,0.357879,0.0,-0.337258
4,-0.542029,0.0,0.266382
5,-0.492386,-0.105122,1.820586
6,-0.016947,-1.236699,-0.229008


### filling with an interpolation method (forward fill)

In [39]:
# Fresh 6x3 frame of standard-normal draws (no seed is set, so the values differ on every run)
df = DataFrame(np.random.standard_normal(size=(6, 3)))

In [40]:
# `.ix` was removed in pandas 1.0; `.loc` gives the same label-based selection
# on this default integer index.
# Column 1 becomes NA from row 2 onward, column 2 from row 4 onward.
df.loc[2:, 1] = NA
df.loc[4:, 2] = NA

In [43]:
# Forward fill: each NA takes the last valid value above it in the same column.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` keyword is deprecated.
df.ffill()

Unnamed: 0,0,1,2
0,1.06954,-1.580313,1.988621
1,-0.653108,0.069843,0.140765
2,-1.492688,0.069843,0.135063
3,0.123737,0.069843,-0.796437
4,-0.346973,0.069843,-0.796437
5,-0.509076,0.069843,-0.796437


In [44]:
# Forward fill, but fill at most 2 consecutive NA values per gap; later NAs are left as they are.
# DataFrame.ffill(limit=...) replaces fillna(method='ffill', limit=...), whose `method` keyword is deprecated.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,1.06954,-1.580313,1.988621
1,-0.653108,0.069843,0.140765
2,-1.492688,0.069843,0.135063
3,0.123737,0.069843,-0.796437
4,-0.346973,,-0.796437
5,-0.509076,,-0.796437


### passing mean or median value

In [46]:
# Numeric Series with missing values at positions 1 and 3
ser = pd.Series([1., NA, 3.5, NA, 7])

In [47]:
# Replace the missing entries with the mean of the observed values
ser.fillna(value=ser.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [50]:
# Replace the missing entries with the median of the observed values
ser.fillna(value=ser.median())

0    1.0
1    3.5
2    3.5
3    3.5
4    7.0
dtype: float64