# Handling Missing Data Using pandas

In [1]:
import pandas as pd
# Load the raw weather observations with default parsing (no date handling yet).
df = pd.read_csv(filepath_or_buffer="weather_data2.csv")
df

Unnamed: 0,day,temperature,windspeed,event
0,01-01-2022,32F,6.0,Rain
1,01-02-2022,,8.0,Sunny
2,01-03-2022,28,5.0,
3,01-04-2022,,,Rain
4,01-05-2022,32,7.0,Snow
5,01-06-2022,,,Sunny
6,01-07-2022,,9.0,
7,01-08-2022,35C,6.0,Cloudy
8,01-09-2022,40,,Snow
9,01-10-2022,37,8.0,Cloudy


In [2]:
# Selecting a single column from a DataFrame yields a Series.
type(df["day"])

pandas.core.series.Series

In [3]:
# Inspect one entry of 'day' (row label 3) to see how the dates were read.
type(df.loc[3, "day"])

str

In [4]:
# Re-read the file, this time asking pandas to convert the 'day' column to datetimes.
df = pd.read_csv(
    "weather_data2.csv",
    parse_dates=["day"],
)
df

Unnamed: 0,day,temperature,windspeed,event
0,2022-01-01,32F,6.0,Rain
1,2022-01-02,,8.0,Sunny
2,2022-01-03,28,5.0,
3,2022-01-04,,,Rain
4,2022-01-05,32,7.0,Snow
5,2022-01-06,,,Sunny
6,2022-01-07,,9.0,
7,2022-01-08,35C,6.0,Cloudy
8,2022-01-09,40,,Snow
9,2022-01-10,37,8.0,Cloudy


In [5]:
# After parse_dates, the same entry of 'day' (row label 3) is a Timestamp rather than a str.
type(df.loc[3, "day"])

pandas._libs.tslibs.timestamps.Timestamp

In [6]:
# Use the parsed dates as the row index so later time-based operations
# (e.g. interpolate(method='time')) can use them.
# Rebind instead of mutating in place, and guard on the column's presence so
# the cell can be re-run without raising KeyError once 'day' is already the index.
if 'day' in df.columns:
    df = df.set_index('day')
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,32F,6.0,Rain
2022-01-02,,8.0,Sunny
2022-01-03,28,5.0,
2022-01-04,,,Rain
2022-01-05,32,7.0,Snow
2022-01-06,,,Sunny
2022-01-07,,9.0,
2022-01-08,35C,6.0,Cloudy
2022-01-09,40,,Snow
2022-01-10,37,8.0,Cloudy


In [7]:
# Replace every missing value, in every column, with the same placeholder.
new_df = df.fillna(value='-')
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,32F,6.0,Rain
2022-01-02,-,8.0,Sunny
2022-01-03,28,5.0,-
2022-01-04,-,-,Rain
2022-01-05,32,7.0,Snow
2022-01-06,-,-,Sunny
2022-01-07,-,9.0,-
2022-01-08,35C,6.0,Cloudy
2022-01-09,40,-,Snow
2022-01-10,37,8.0,Cloudy


In [8]:
# Per-column replacements: each keyword is a column name, its value fills that column's NaNs.
new_df = df.fillna(value=dict(temperature=0, windspeed='-', event='no event'))
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,32F,6.0,Rain
2022-01-02,0,8.0,Sunny
2022-01-03,28,5.0,no event
2022-01-04,0,-,Rain
2022-01-05,32,7.0,Snow
2022-01-06,0,-,Sunny
2022-01-07,0,9.0,no event
2022-01-08,35C,6.0,Cloudy
2022-01-09,40,-,Snow
2022-01-10,37,8.0,Cloudy


In [9]:
# Fill gaps from neighbouring rows instead of using a constant:
#   ffill -> forward fill: carry the previous valid value down
#   bfill -> backward fill: pull the next valid value up
# DataFrame.ffill()/DataFrame.bfill() are used because fillna(method=...)
# is deprecated in recent pandas releases.
# Alternatives to try:
#   new_df = df.ffill()
#   new_df = df.bfill()
#   new_df = df.bfill(axis='columns')  # fill along each row instead of down each column
# limit=1 fills at most one consecutive NaN per gap, so longer gaps stay partly empty.
new_df = df.bfill(limit=1)

new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,32F,6.0,Rain
2022-01-02,28,8.0,Sunny
2022-01-03,28,5.0,Rain
2022-01-04,32,7.0,Rain
2022-01-05,32,7.0,Snow
2022-01-06,,9.0,Sunny
2022-01-07,35C,9.0,Cloudy
2022-01-08,35C,6.0,Cloudy
2022-01-09,40,8.0,Snow
2022-01-10,37,8.0,Cloudy


In [10]:
# Estimate missing numeric values by interpolation. method='time' weights by
# the spacing between the index dates, so it needs the datetime index.
# Alternative: .interpolate() with no method treats rows as equally spaced.
# Only numeric columns are interpolated: 'temperature' holds strings such as
# '32F'/'35C' and 'event' is text, and calling interpolate() on a frame that
# still contains object-dtype columns is deprecated in recent pandas releases.
numeric_cols = df.select_dtypes(include='number').columns
new_df = df.copy()
new_df[numeric_cols] = df[numeric_cols].interpolate(method='time')
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,32F,6.0,Rain
2022-01-02,,8.0,Sunny
2022-01-03,28,5.0,
2022-01-04,,6.0,Rain
2022-01-05,32,7.0,Snow
2022-01-06,,8.0,Sunny
2022-01-07,,9.0,
2022-01-08,35C,6.0,Cloudy
2022-01-09,40,7.0,Snow
2022-01-10,37,8.0,Cloudy


In [11]:
# Keep only the rows in which every column has a value.
new_df1 = df.dropna(axis='index', how='any')
new_df1

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,32F,6.0,Rain
2022-01-05,32,7.0,Snow
2022-01-08,35C,6.0,Cloudy
2022-01-10,37,8.0,Cloudy


In [12]:
# how='all' drops a row only when every one of its columns is missing.
new_df1 = df.dropna(axis='index', how='all')
new_df1

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,32F,6.0,Rain
2022-01-02,,8.0,Sunny
2022-01-03,28,5.0,
2022-01-04,,,Rain
2022-01-05,32,7.0,Snow
2022-01-06,,,Sunny
2022-01-07,,9.0,
2022-01-08,35C,6.0,Cloudy
2022-01-09,40,,Snow
2022-01-10,37,8.0,Cloudy


In [13]:
# thresh=1 keeps any row that has at least one non-missing value.
new_df1 = df.dropna(axis='index', thresh=1)
new_df1

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,32F,6.0,Rain
2022-01-02,,8.0,Sunny
2022-01-03,28,5.0,
2022-01-04,,,Rain
2022-01-05,32,7.0,Snow
2022-01-06,,,Sunny
2022-01-07,,9.0,
2022-01-08,35C,6.0,Cloudy
2022-01-09,40,,Snow
2022-01-10,37,8.0,Cloudy


In [14]:
# Build the calendar of expected daily dates and align the frame to it.
# Per the reindex contract, any date absent from the data would show up as a row of NaNs.
dt = pd.date_range(start="01-01-2022", end="01-10-2022")
idx = pd.DatetimeIndex(dt)
df.reindex(index=idx)

Unnamed: 0,temperature,windspeed,event
2022-01-01,32F,6.0,Rain
2022-01-02,,8.0,Sunny
2022-01-03,28,5.0,
2022-01-04,,,Rain
2022-01-05,32,7.0,Snow
2022-01-06,,,Sunny
2022-01-07,,9.0,
2022-01-08,35C,6.0,Cloudy
2022-01-09,40,,Snow
2022-01-10,37,8.0,Cloudy
