# Missing Data

In [1]:
import numpy as np 
import pandas as pd

In [2]:
# Zero, False, None and empty containers are all falsy, so any() finds nothing truthy.
falsy_values = (0, False, None, {}, [])
any(falsy_values)

False

In [4]:
np.nan + 3


nan

In [5]:
3 + np.nan

nan

In [8]:
a = np.array([1,2,np.nan,np.nan,4])

In [9]:
a

array([ 1.,  2., nan, nan,  4.])

In [10]:
a.sum()

nan

In [11]:
a.mean()

nan

In [12]:
a = np.array([1,2,np.nan,np.nan,4],dtype='float')

In [13]:
a

array([ 1.,  2., nan, nan,  4.])

In [17]:
a = np.array([1, 2, 3, np.nan, np.nan, 4])

In [18]:
a.mean()

nan

In [19]:
a.sum()

nan

In [22]:
# NumPy also provides a floating-point "infinity" value (a value, not a separate type):

np.inf

inf

In [23]:
3 + np.inf

inf

In [24]:
np.inf / 3

inf

In [25]:
np.inf / np.inf

nan

In [26]:
# `np.float` was only an alias of the builtin `float` and has been removed from
# NumPy; the builtin yields the same float64 array.
b = np.array([1, 2, 3, np.inf, np.nan, 4], dtype=float)

In [27]:
b.sum()

nan

In [28]:
np.isnan(np.nan)

True

In [29]:
np.isinf(np.inf)

True

In [30]:
np.isfinite(np.nan), np.isfinite(np.inf)

(False, False)

In [31]:
np.isnan(np.array([1, 2, 3, np.nan, np.inf, 4]))

array([False, False, False,  True, False, False])

In [32]:
np.isinf(np.array([1, 2, 3, np.nan, np.inf, 4]))

array([False, False, False, False,  True, False])

In [34]:
np.isfinite(np.array([1, 2, 3, np.nan, np.inf, 4]))

array([ True,  True,  True, False, False,  True])

In [35]:
a = np.array([1, 2, 3, np.nan, np.nan, 4])

In [36]:
a[~np.isnan(a)]

array([1., 2., 3., 4.])

In [37]:
a[np.isfinite(a)]

array([1., 2., 3., 4.])

In [38]:
a[np.isfinite(a)].sum()

10.0

In [39]:
a[np.isfinite(a)].mean()

2.5

# Handling Missing Data with Pandas

In [40]:
import numpy as np 
import pandas as pd 

In [41]:
pd.isnull(np.nan)

True

In [43]:
pd.isna(None)

True

In [44]:
pd.notnull(None)

False

In [46]:
pd.notnull(np.nan)

False

In [47]:
pd.isna(pd.Series([1,2,np.nan,23]))

0    False
1    False
2     True
3    False
dtype: bool

In [48]:
pd.notnull(pd.Series([1,2,np.nan,23]))

0     True
1     True
2    False
3     True
dtype: bool

In [49]:
pd.isna(pd.DataFrame({
    'Column A' :[1,np.nan,3],
    'Cloumn B' :[np.nan,2,3],
    'Column C' :[np.nan,2,np.nan]
}))

Unnamed: 0,Column A,Cloumn B,Column C
0,False,True,True
1,True,False,False
2,False,False,True


# Pandas Operations with Missing Values

In [50]:
pd.Series([1,2,np.nan,np.nan,3]).count()

3

In [51]:
pd.Series([1,2,np.nan,3]).sum()

6.0

In [52]:
pd.Series([1,2,np.nan,3]).mean()

2.0

In [53]:
# Filtering missing data 

s = pd.Series([1,2,3,np.nan,np.nan,4])

In [54]:
pd.isna(s)

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

In [55]:
pd.notnull(s)

0     True
1     True
2     True
3    False
4    False
5     True
dtype: bool

In [56]:
pd.notnull(s).sum()

4

In [59]:
s[pd.notnull(s)]

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

In [60]:
s.isnull()

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

In [61]:
s.notnull()

0     True
1     True
2     True
3    False
4    False
5     True
dtype: bool

In [63]:
s[s.notnull()]

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

In [64]:
a = pd.isna(pd.DataFrame({
    'Column A' :[1,np.nan,3],
    'Cloumn B' :[np.nan,2,3],
    'Column C' :[np.nan,2,np.nan]
}))

In [65]:
# `a` is the boolean result of pd.isna(...), so it contains no missing entries:
# a.notnull() is True everywhere and this selection shows `a` unchanged.
a[a.notnull()]

Unnamed: 0,Column A,Cloumn B,Column C
0,False,True,True
1,True,False,False
2,False,False,True


In [66]:
a

Unnamed: 0,Column A,Cloumn B,Column C
0,False,True,True
1,True,False,False
2,False,False,True


In [68]:
# Dropping null values on a Series (`s` is a pd.Series, not a DataFrame)
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [69]:
s.dropna()

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

In [71]:
# Dropping null values on DataFrames

# Sample frame: columns A, B and C each contain NaN; Column D is fully populated.
df = pd.DataFrame({
    'Column A': [1, np.nan, 30, np.nan],
    'Column B': [2, 8, 31, np.nan],
    'Column C': [np.nan, 9, 32, 100],
    'Column D': [5, 8, 34, 100],
})

In [72]:
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,100


In [73]:
df.dropna()

Unnamed: 0,Column A,Column B,Column C,Column D
2,30.0,31.0,32.0,34


In [74]:
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,100


In [75]:
df.shape

(4, 4)

In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
Column A    2 non-null float64
Column B    3 non-null float64
Column C    3 non-null float64
Column D    4 non-null int64
dtypes: float64(3), int64(1)
memory usage: 208.0 bytes


In [78]:
df.isnull().sum()

Column A    2
Column B    1
Column C    1
Column D    0
dtype: int64

In [79]:
df.dropna()

Unnamed: 0,Column A,Column B,Column C,Column D
2,30.0,31.0,32.0,34


In [80]:
df.dropna(axis=1)

Unnamed: 0,Column D
0,5
1,8
2,34
3,100


In [81]:
# Row 1 is missing in every column; row 0 is missing only 'Column 3'.
df2 = pd.DataFrame({
    'Column 1': [1, np.nan, 30],
    'Column 2': [2, np.nan, 31],
    'Column 3': [np.nan, np.nan, 100],
})

In [82]:
df2

Unnamed: 0,Column 1,Column 2,Column 3
0,1.0,2.0,
1,,,
2,30.0,31.0,100.0


In [83]:
# how='all' drops only rows in which every value is missing. `df` has no such
# row, so it comes back unchanged here.
# NOTE(review): `df2` (defined just above, with an all-NaN row 1) is never used;
# this cell was probably meant to be `df2.dropna(how='all')` - confirm and re-run.
df.dropna(how='all')

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,100


In [84]:
df.dropna(how='any')

Unnamed: 0,Column A,Column B,Column C,Column D
2,30.0,31.0,32.0,34


In [85]:
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,100


In [87]:
df.dropna(thresh=3)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34


In [88]:
df.dropna(thresh=3,axis='columns')

Unnamed: 0,Column B,Column C,Column D
0,2.0,,5
1,8.0,9.0,8
2,31.0,32.0,34
3,,100.0,100


In [89]:
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [91]:
s.fillna(0)

0    1.0
1    2.0
2    3.0
3    0.0
4    0.0
5    4.0
dtype: float64

In [92]:
s.fillna(s.mean())

0    1.0
1    2.0
2    3.0
3    2.5
4    2.5
5    4.0
dtype: float64

In [93]:
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [94]:
# Forward fill: each NaN takes the last valid value that precedes it.
# Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
s.ffill()

0    1.0
1    2.0
2    3.0
3    3.0
4    3.0
5    4.0
dtype: float64

In [95]:
# Backward fill: each NaN takes the next valid value that follows it.
# Series.bfill() replaces the deprecated fillna(method='bfill') spelling.
s.bfill()

0    1.0
1    2.0
2    3.0
3    4.0
4    4.0
5    4.0
dtype: float64

In [96]:
# A leading NaN has no earlier value to copy, so forward fill leaves it as NaN.
# .ffill() replaces the deprecated fillna(method='ffill') spelling.
pd.Series([np.nan, 3, np.nan, 9]).ffill()

0    NaN
1    3.0
2    3.0
3    9.0
dtype: float64

In [97]:
# Trailing NaNs have no later value to copy, so backward fill leaves them as NaN.
# .bfill() replaces the deprecated fillna(method='bfill') spelling.
pd.Series([1, np.nan, 3, np.nan, np.nan]).bfill()

0    1.0
1    3.0
2    3.0
3    NaN
4    NaN
dtype: float64

In [98]:
df


Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,100


In [99]:
# Forward fill down each column (the default axis); a NaN in the first row stays NaN.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
df.ffill()

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,1.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,30.0,31.0,100.0,100


In [100]:
df.fillna({'Column A': 0, 'Column B': 99, 'Column C': df['Column C'].mean()})

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,47.0,5
1,0.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,0.0,99.0,100.0,100


In [101]:
# axis=0 fills downwards within each column (same as the default).
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
df.ffill(axis=0)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,1.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,30.0,31.0,100.0,100


In [102]:
# axis=1 fills across each row: a NaN takes the next valid value to its right.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill') spelling.
df.bfill(axis=1)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,5.0,5.0
1,8.0,8.0,9.0,8.0
2,30.0,31.0,32.0,34.0
3,100.0,100.0,100.0,100.0


In [104]:
# Forward fill down each column again, for comparison with the row-wise fill.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
df.ffill(axis=0)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,1.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,30.0,31.0,100.0,100


In [105]:
s.dropna().count()

4

In [107]:
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [108]:
# Detect missing data: if dropping NaNs makes the Series shorter, something was missing.
missing_values = len(s.dropna()) != len(s)
missing_values

True

In [109]:
len(s)

6

In [110]:
s.count()

4

In [111]:
# Same check without building a new Series: count() tallies only non-null
# entries, so it differs from len(s) whenever values are missing.
missing_values = s.count() != len(s)
missing_values

True

In [115]:
pd.Series([True,False, False]).any()

True

In [116]:
pd.Series([True,False,False]).all()

False

In [118]:
s.isna()

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

In [119]:
pd.Series([1, np.nan]).isnull().any()

True

In [120]:
pd.Series([1, 2]).isnull().any()

False

In [121]:
s.isnull().any()

True

In [122]:
s.isnull().values


array([False, False, False,  True,  True, False])

In [124]:
s.isna().values.any()

True