In [1]:
import pandas as pd

In [2]:
import numpy as np

### Missing Values

In [3]:
# Two spellings of "missing" in one Series: NumPy's NaN and Python's None.
# Both are shown as NaN and the Series becomes float64.
S = pd.Series(data=[11, 21, np.nan, None])
S

0    11.0
1    21.0
2     NaN
3     NaN
dtype: float64

In [4]:
# np.nan is a plain Python float, not a special pandas object.
type(np.nan)

float

The default missing-value representation in pandas is `NaN`, but Python’s `None` is also detected as a missing value.

In [6]:
# Boolean mask: True at every missing position (both the NaN and the None entry).
S.isna()

0    False
1    False
2     True
3     True
dtype: bool

In [7]:
# isnull() is an alias of isna(); it produces the same boolean mask.
S.isnull()

0    False
1    False
2     True
3     True
dtype: bool

In [58]:
# Same kind of data, but stored with the nullable integer extension dtype.
# "Int16" (capital I) is the string alias for pd.Int16Dtype(); the values stay
# integers and both NaN and None are shown as <NA>.
S = pd.Series([11, 13, np.nan, None], dtype="Int16")
S

0      11
1      13
2    <NA>
3    <NA>
dtype: Int16

pandas 1.0 introduced a new representation for missing values, `<NA>` (`pd.NA`), which is used by nullable extension dtypes such as `Int16`.

In [59]:
# isna() also reports <NA> entries of the nullable Int16 Series as missing.
S.isna()

0    False
1    False
2     True
3     True
dtype: bool

### Missing data while reading file

In [26]:
# First pass: read the CSV with read_csv's default missing-value handling.
# Only the empty Age cell becomes NaN; placeholder strings such as '?', '-',
# 'na' and 'unavailable' are kept as ordinary text.
data = pd.read_csv(r'./data/sample_data.csv')

In [27]:
data

Unnamed: 0,Name,Age,Gender,Location
0,Vijay,?,M,-
1,Sujoy,,M,na
2,Diksha,23,F,unavailable


In [28]:
# Placeholder strings this file uses for "no value". Passing them as na_values
# makes read_csv convert each of them to NaN while parsing.
missing_values = [
    '?',
    '-',
    'NaN',
    'na',
    'unavailable',
]

data = pd.read_csv(r'./data/sample_data.csv', na_values=missing_values)
data

Unnamed: 0,Name,Age,Gender,Location
0,Vijay,,M,
1,Sujoy,,M,
2,Diksha,23.0,F,


In [29]:
# Missing cells per column: isna() yields a boolean frame and sum() counts
# the True values column by column.
data.isna().sum()

Name        0
Age         2
Gender      0
Location    3
dtype: int64

In [32]:
# Add a datetime column holding one consecutive calendar day per row.
# The ISO 8601 literal (YYYY-MM-DD) cannot be misread, whereas '01-06-2020'
# could mean 6 January or 1 June. Sizing the range with len(data) keeps the
# assignment valid for any number of rows instead of exactly three.
data['doj'] = pd.date_range('2020-01-06', periods=len(data))

In [33]:
data

Unnamed: 0,Name,Age,Gender,Location,doj
0,Vijay,,M,,2020-01-06
1,Sujoy,,M,,2020-01-07
2,Diksha,23.0,F,,2020-01-08


In [35]:
# Blank out 'doj' in the first row. The column is addressed by label rather
# than by position 4, so the cell keeps targeting the right column if columns
# are added or reordered. data.index[0] still selects the first row by position.
# np.nan assigned into a datetime column is stored as NaT.
data.loc[data.index[0], 'doj'] = np.nan

In [36]:
data

Unnamed: 0,Name,Age,Gender,Location,doj
0,Vijay,,M,,NaT
1,Sujoy,,M,,2020-01-07
2,Diksha,23.0,F,,2020-01-08


`NaT` ("Not a Time") is used to represent missing datetime values.

In [37]:
# Recount missing cells per column; isna() treats NaT as missing too, so
# 'doj' now shows one missing value.
data.isna().sum()

Name        0
Age         2
Gender      0
Location    3
doj         1
dtype: int64

In [38]:
# Small single-column frame with one gap, used to show how reductions and
# cumulative operations treat NaN.
data = pd.DataFrame([10, 20, np.nan, 30], columns=['A'])
data

Unnamed: 0,A
0,10.0
1,20.0
2,
3,30.0


### Aggregate functions like `max()` and `cumsum()` skip missing values by default (`skipna=True`)

In [39]:
# Running total with the default NaN handling spelled out: the NaN row stays
# NaN in the result, but the total carries on past it (10, 30, NaN, 60).
data.cumsum(skipna=True)

Unnamed: 0,A
0,10.0
1,30.0
2,
3,60.0


In [43]:
# Maximum of column A with the default NaN handling spelled out: the missing
# entry is ignored, giving 30.0.
data['A'].max(skipna=True)

30.0

### Groupby with missing values

In [46]:
# Colour/price table with two missing prices (rows 3 and 5); reused by the
# groupby, dropna and fillna examples that follow.
df = pd.DataFrame(
    {
        'Color': ['Red', 'Blue', 'Black', 'Red', 'Red', 'Black'],
        'Price': [10, 15, 30, np.nan, 40, np.nan],
    }
)
df

Unnamed: 0,Color,Price
0,Red,10.0
1,Blue,15.0
2,Black,30.0
3,Red,
4,Red,40.0
5,Black,


In [47]:
# Total price per colour. Missing prices are skipped inside each group:
# Black = 30 (its NaN is ignored), Red = 10 + 40 = 50.
df.groupby('Color')['Price'].agg('sum')

Color
Black    30.0
Blue     15.0
Red      50.0
Name: Price, dtype: float64

Group-wise aggregations such as `sum()` also skip missing values within each group.

### Dealing with missing values

In [48]:
df

Unnamed: 0,Color,Price
0,Red,10.0
1,Blue,15.0
2,Black,30.0
3,Red,
4,Red,40.0
5,Black,


In [50]:
# Drop every row that has at least one missing value; rows 3 and 5 (no Price)
# are removed. Returns a new frame, df itself is unchanged.
df.dropna(axis=0, how='any')

Unnamed: 0,Color,Price
0,Red,10.0
1,Blue,15.0
2,Black,30.0
4,Red,40.0


In [51]:
# Drop only rows in which every value is missing. No row here is completely
# empty (Color is always present), so all six rows are kept.
df.dropna(axis=0, how='all')

Unnamed: 0,Color,Price
0,Red,10.0
1,Blue,15.0
2,Black,30.0
3,Red,
4,Red,40.0
5,Black,


In [53]:
# Replace missing prices with the column mean. mean() ignores NaN, so the
# fill value is (10 + 15 + 30 + 40) / 4 = 23.75.
df['Price'].fillna(value=df['Price'].mean())

0    10.00
1    15.00
2    30.00
3    23.75
4    40.00
5    23.75
Name: Price, dtype: float64

### Forward fill (`ffill`): carry the last valid value forward

In [54]:
df

Unnamed: 0,Color,Price
0,Red,10.0
1,Blue,15.0
2,Black,30.0
3,Red,
4,Red,40.0
5,Black,


In [55]:
# Forward fill: each NaN takes the most recent non-missing value above it in
# the same column (row 3 gets 30.0, row 5 gets 40.0).
# DataFrame.ffill() is the supported spelling; fillna(method='ffill') is
# deprecated since pandas 2.1.
df.ffill()

Unnamed: 0,Color,Price
0,Red,10.0
1,Blue,15.0
2,Black,30.0
3,Red,30.0
4,Red,40.0
5,Black,40.0


### Backward fill (`bfill`): pull the next valid value backward

In [57]:
df

Unnamed: 0,Color,Price
0,Red,10.0
1,Blue,15.0
2,Black,30.0
3,Red,
4,Red,40.0
5,Black,


In [56]:
# Backward fill: each NaN takes the next non-missing value below it in the
# same column (row 3 gets 40.0). Row 5 stays NaN because no later value exists.
# DataFrame.bfill() is the supported spelling; fillna(method='bfill') is
# deprecated since pandas 2.1.
df.bfill()

Unnamed: 0,Color,Price
0,Red,10.0
1,Blue,15.0
2,Black,30.0
3,Red,40.0
4,Red,40.0
5,Black,
