## [Filling missing data](https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html#filling-missing-data)

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Same values and the same two gaps stored two ways: a plain Python list
# that marks missing entries with np.nan, and a pyarrow-backed float array
# that marks them with pd.NA.
data = {
    'np': [1, np.nan, np.nan, 2],
    'arrow': pd.array([1, pd.NA, pd.NA, 2], dtype='float64[pyarrow]'),
}
data

{'np': [1, nan, nan, 2],
 'arrow': <ArrowExtensionArray>
 [1.0, <NA>, <NA>, 2.0]
 Length: 4, dtype: double[pyarrow]}

In [3]:
# Build the example frame from the dict: each key becomes a column.
df = pd.DataFrame(data)
df

Unnamed: 0,np,arrow
0,1.0,1.0
1,,
2,,
3,2.0,2.0


In [4]:
# Replace every missing entry, in both columns, with the constant 0.
df.fillna(0)

Unnamed: 0,np,arrow
0,1.0,1.0
1,0.0,0.0
2,0.0,0.0
3,2.0,2.0


In [5]:
# Forward fill: each gap takes the last valid value above it.
df.ffill()

Unnamed: 0,np,arrow
0,1.0,1.0
1,1.0,1.0
2,1.0,1.0
3,2.0,2.0


In [6]:
# Backward fill: each gap takes the next valid value below it.
df.bfill()

Unnamed: 0,np,arrow
0,1.0,1.0
1,2.0,2.0
2,2.0,2.0
3,2.0,2.0


In [7]:
# Forward fill, but fill at most one consecutive missing value per gap,
# so the second missing row (index 2) stays empty.
df.ffill(limit=1)

Unnamed: 0,np,arrow
0,1.0,1.0
1,1.0,1.0
2,,
3,2.0,2.0


NA values can be replaced with the corresponding value from a `Series` or `DataFrame` where the index and columns align between the original object and the filled object.

In [25]:
# 10 x 3 frame of floats 0.0 .. 29.0, laid out row by row under columns A, B, C.
dff = pd.DataFrame(
    np.arange(30, dtype='float64').reshape(10, 3),
    columns=list("ABC"),
)
dff

Unnamed: 0,A,B,C
0,0.0,1.0,2.0
1,3.0,4.0,5.0
2,6.0,7.0,8.0
3,9.0,10.0,11.0
4,12.0,13.0,14.0
5,15.0,16.0,17.0
6,18.0,19.0,20.0
7,21.0,22.0,23.0
8,24.0,25.0,26.0
9,27.0,28.0,29.0


In [30]:
# Punch holes into dff in place, using positional (row, column) indexing.
# Row slices exclude their end position.
dff.iloc[3:5, 0] = np.nan  # rows 3-4 of the first column (A)
dff.iloc[4:6, 1] = np.nan  # rows 4-5 of the second column (B)
dff.iloc[5:8, 2] = np.nan  # rows 5-7 of the third column (C)
dff

Unnamed: 0,A,B,C
0,0.0,1.0,2.0
1,3.0,4.0,5.0
2,6.0,7.0,8.0
3,,10.0,11.0
4,,,14.0
5,15.0,,
6,18.0,19.0,
7,21.0,22.0,
8,24.0,25.0,26.0
9,27.0,28.0,29.0


In [21]:
# Per-column mean; the missing entries are left out of each average.
dff.mean()

A    14.250000
B    14.500000
C    13.571429
dtype: float64

In [24]:
# Fill each column's gaps with that column's own mean: the Series returned
# by mean() is indexed by column label, so A, B and C each get their own value.
dff.fillna(dff.mean())

Unnamed: 0,A,B,C
0,0.0,1.0,2.0
1,3.0,4.0,5.0
2,6.0,7.0,8.0
3,14.25,10.0,11.0
4,14.25,14.5,14.0
5,15.0,14.5,13.571429
6,18.0,19.0,13.571429
7,21.0,22.0,13.571429
8,24.0,25.0,26.0
9,27.0,28.0,29.0


In [26]:
# Show dff itself: the fillna result above was not assigned back to dff,
# so this displays whatever dff currently holds.
dff

Unnamed: 0,A,B,C
0,0.0,1.0,2.0
1,3.0,4.0,5.0
2,6.0,7.0,8.0
3,,10.0,11.0
4,,,14.0
5,15.0,,
6,18.0,19.0,
7,21.0,22.0,
8,24.0,25.0,26.0
9,27.0,28.0,29.0


In [31]:
# Boolean mask shaped like dff: True where a value is present,
# False where it is missing.
pd.notna(dff)

Unnamed: 0,A,B,C
0,True,True,True
1,True,True,True
2,True,True,True
3,False,True,True
4,False,False,True
5,True,False,False
6,True,True,False
7,True,True,False
8,True,True,True
9,True,True,True


In [32]:
# Keep the entries that are present and take the column mean everywhere else.
# axis='columns' lines the means up with the column labels.
dff.where(
    pd.notna(dff),
    other=dff.mean(),
    axis='columns',
)

Unnamed: 0,A,B,C
0,0.0,1.0,2.0
1,3.0,4.0,5.0
2,6.0,7.0,8.0
3,14.25,10.0,11.0
4,14.25,14.5,14.0
5,15.0,14.5,13.571429
6,18.0,19.0,13.571429
7,21.0,22.0,13.571429
8,24.0,25.0,26.0
9,27.0,28.0,29.0
