In [59]:
import pandas as pd
import numpy as np
# Read the CSV, parsing the 'day' column as datetimes, then index the frame
# by that column using a chained (non-mutating) set_index.
df = (
    pd.read_csv('data/weather_data.csv', parse_dates=['day'])
    .set_index('day')
)

In [60]:
# Show the frame as loaded; its missing entries are what the cells below fill or drop.
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,32.0,6.0,Rain
2022-01-04,,9.0,Sunny
2022-01-05,28.0,,Snow
2022-01-06,,7.0,
2022-01-07,32.0,,Rain
2022-01-08,,,Sunny
2022-01-09,,,
2022-01-10,34.0,8.0,Cloudy
2022-01-11,40.0,12.0,Sunny


In [61]:
# fillna() accepts a per-column mapping: each key names a column and its value
# is the replacement used for that column's missing entries.
new_df = df.fillna(
    value={'temperature': 0, 'windspeed': 0, 'event': 'No event'},
)
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,32.0,6.0,Rain
2022-01-04,0.0,9.0,Sunny
2022-01-05,28.0,0.0,Snow
2022-01-06,0.0,7.0,No event
2022-01-07,32.0,0.0,Rain
2022-01-08,0.0,0.0,Sunny
2022-01-09,0.0,0.0,No event
2022-01-10,34.0,8.0,Cloudy
2022-01-11,40.0,12.0,Sunny


In [62]:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.ffill.html
# Forward fill: each NaN takes the last valid value found above it in the same
# column. DataFrame.ffill() is used instead of fillna(method="ffill") because
# the `method` argument of fillna is deprecated (FutureWarning since pandas 2.1).
new_df = df.ffill()

new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,32.0,6.0,Rain
2022-01-04,32.0,9.0,Sunny
2022-01-05,28.0,9.0,Snow
2022-01-06,28.0,7.0,Snow
2022-01-07,32.0,7.0,Rain
2022-01-08,32.0,7.0,Sunny
2022-01-09,32.0,7.0,Sunny
2022-01-10,34.0,8.0,Cloudy
2022-01-11,40.0,12.0,Sunny


In [63]:
# Sample standard deviation of 'temperature' over the first three rows (by position).
new_df['temperature'].iloc[:3].std()

2.3094010767585034

In [64]:
# Fill NaN values by interpolation.
# method='time' weights each estimate by the real gap between index timestamps,
# so days absent from the index are taken into account: if temperature changes
# linearly, the value estimated for 2022-01-04 lies nearer the 2022-01-05
# reading than the 2022-01-01 reading, rather than midway between the two.
#
# Only the numeric columns are interpolated. Calling interpolate() on a frame
# that still contains the non-numeric 'event' column is deprecated
# (FutureWarning in pandas 2.1+), and text cannot be interpolated anyway, so
# 'event' is carried over unchanged.
numeric_cols = df.select_dtypes(include='number').columns
new_df = df.copy()
new_df[numeric_cols] = df[numeric_cols].interpolate(method='time')

new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,32.0,6.0,Rain
2022-01-04,29.0,9.0,Sunny
2022-01-05,28.0,8.0,Snow
2022-01-06,30.0,7.0,
2022-01-07,32.0,7.25,Rain
2022-01-08,32.666667,7.5,Sunny
2022-01-09,33.333333,7.75,
2022-01-10,34.0,8.0,Cloudy
2022-01-11,40.0,12.0,Sunny


In [77]:
# dropna() takes an axis argument; axis=0 (the default) works row-wise and
# discards every row that contains at least one missing value.
new_df = df.dropna(axis=0, how='any')
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,32.0,6.0,Rain
2022-01-10,34.0,8.0,Cloudy
2022-01-11,40.0,12.0,Sunny


In [86]:
# With axis=1 (equivalently axis='columns'), dropna() removes every column
# that contains a missing value -- here 'windspeed', because of its NaN.
# NOTE(review): the 'data' key looks like it was meant to be 'date'; it is left
# unchanged so the displayed column name stays the same -- confirm with the author.
example = {
    'data': ['3/1/2022', '3/2/2022', '3/3/2022'],
    'temperature': [65, 71, 72],
    'windspeed': [12, 8, np.nan],
    'event': ['Sunny', 'Sunny', 'Sunny'],
}
example_df = pd.DataFrame(data=example).dropna(axis=1)
example_df

Unnamed: 0,data,temperature,event
0,3/1/2022,65,Sunny
1,3/2/2022,71,Sunny
2,3/3/2022,72,Sunny
