### Create a DataFrame representing weather and air quality data, with some missing values

In [2]:
import pandas as pd
import numpy as np

# Draw every random value from one seeded generator so that Restart & Run All
# rebuilds exactly the same frame (and the same missing values) each time.
rng = np.random.default_rng(seed=42)

df = pd.DataFrame(data={
    # Sampling pool: ten random integers in [70, 100] plus NaN as the "missing" marker
    'Humidity': rng.choice([*rng.integers(70, 101, 10), np.nan], size=20),
    # Sampling pool: ratings 1-5 plus NaN
    'Air_Quality': rng.choice([*range(1, 6), np.nan], size=20),
    # None stands in for a missing label in the two text columns
    'Day_Of_Week': rng.choice(['Mon', 'Tue', 'Wed', 'Thur', 'Fri', 'Sat', 'Sun', None], size=20),
    # p gives the draw probability of each option, in order (0.2 for the missing label)
    'Prevailing_Wind': rng.choice(a=['West', 'East', 'South', 'North', None], p=[0.4, 0.2, 0.1, 0.1, 0.2], size=20)
})

# Number of missing values per column, followed by the frame itself
print(df.isnull().sum())
df



Humidity           2
Air_Quality        4
Day_Of_Week        3
Prevailing_Wind    5
dtype: int64


Unnamed: 0,Humidity,Air_Quality,Day_Of_Week,Prevailing_Wind
0,81.0,4.0,Mon,West
1,81.0,1.0,Sat,
2,86.0,5.0,Sat,East
3,70.0,1.0,Thur,West
4,92.0,3.0,,West
5,86.0,,Wed,South
6,95.0,,Fri,West
7,,5.0,Sun,East
8,99.0,1.0,Fri,North
9,90.0,,Mon,West


### Fill the Day_Of_Week column missing values with a constant

In [3]:
# Locate the rows with no weekday label and give them all the same constant day
missing_day = df['Day_Of_Week'].isna()
df.loc[missing_day, 'Day_Of_Week'] = 'Mon'
df

Unnamed: 0,Humidity,Air_Quality,Day_Of_Week,Prevailing_Wind
0,81.0,4.0,Mon,West
1,81.0,1.0,Sat,
2,86.0,5.0,Sat,East
3,70.0,1.0,Thur,West
4,92.0,3.0,Mon,West
5,86.0,,Wed,South
6,95.0,,Fri,West
7,,5.0,Sun,East
8,99.0,1.0,Fri,North
9,90.0,,Mon,West


### Forward fill the 'Humidity' column with the preceding valid value.

Forward fill has no earlier value to copy into a gap in the first row, so check that the first value is not still NaN!

In [4]:
# Forward fill copies the last valid reading into each gap. A gap in the first
# row(s) has nothing before it and would stay NaN, so back fill afterwards to
# cover it with the first valid reading (a no-op when the column starts valid).
df['Humidity'] = df['Humidity'].ffill().bfill()
df


Unnamed: 0,Humidity,Air_Quality,Day_Of_Week,Prevailing_Wind
0,81.0,4.0,Mon,West
1,81.0,1.0,Sat,
2,86.0,5.0,Sat,East
3,70.0,1.0,Thur,West
4,92.0,3.0,Mon,West
5,86.0,,Wed,South
6,95.0,,Fri,West
7,95.0,5.0,Sun,East
8,99.0,1.0,Fri,North
9,90.0,,Mon,West


### Fill missing values for 'Air_Quality' based on the means for weekdays and weekends

In [5]:
# Flag Saturday/Sunday rows so the two groups can be averaged separately
df['Is_Weekend'] = df['Day_Of_Week'].isin(['Sun', 'Sat'])
weekend_rows = df['Is_Weekend']

# Mean air quality of each group (missing values are skipped by mean())
weekday_mean = df.loc[~weekend_rows, 'Air_Quality'].mean()
weekend_mean = df.loc[weekend_rows, 'Air_Quality'].mean()
print(f'Weekday mean: {weekday_mean}, Weekend mean: {weekend_mean}')

# Build a per-row replacement (the mean of that row's group) and use it only
# where 'Air_Quality' is missing
group_means = weekend_rows.map({True: weekend_mean, False: weekday_mean})
df['Air_Quality'] = df['Air_Quality'].fillna(group_means)

df



Weekday mean: 2.5, Weekend mean: 3.25


Unnamed: 0,Humidity,Air_Quality,Day_Of_Week,Prevailing_Wind,Is_Weekend
0,81.0,4.0,Mon,West,False
1,81.0,1.0,Sat,,True
2,86.0,5.0,Sat,East,True
3,70.0,1.0,Thur,West,False
4,92.0,3.0,Mon,West,False
5,86.0,2.5,Wed,South,False
6,95.0,2.5,Fri,West,False
7,95.0,5.0,Sun,East,True
8,99.0,1.0,Fri,North,False
9,90.0,2.5,Mon,West,False


### Use the most common value for prevailing wind to fill missing values

In [9]:
# mode() returns a Series (there can be ties), so take its first entry
wind = df['Prevailing_Wind']
most_common_wind = wind.mode().iat[0]
print(f'most_common_wind: {most_common_wind}')

# Write the most frequent direction into the rows that have none
df.loc[wind.isna(), 'Prevailing_Wind'] = most_common_wind
df

most_common_wind: West


Unnamed: 0,Humidity,Air_Quality,Day_Of_Week,Prevailing_Wind,Is_Weekend
0,81.0,4.0,Mon,West,False
1,81.0,1.0,Sat,West,True
2,86.0,5.0,Sat,East,True
3,70.0,1.0,Thur,West,False
4,92.0,3.0,Mon,West,False
5,86.0,2.5,Wed,South,False
6,95.0,2.5,Fri,West,False
7,95.0,5.0,Sun,East,True
8,99.0,1.0,Fri,North,False
9,90.0,2.5,Mon,West,False


### Remove the intermediate 'Is_Weekend' feature, if appropriate

In [11]:
# The helper flag was only needed for the group-mean fill; drop the column again
df = df.drop(columns='Is_Weekend')
df

Unnamed: 0,Humidity,Air_Quality,Day_Of_Week,Prevailing_Wind
0,81.0,4.0,Mon,West
1,81.0,1.0,Sat,West
2,86.0,5.0,Sat,East
3,70.0,1.0,Thur,West
4,92.0,3.0,Mon,West
5,86.0,2.5,Wed,South
6,95.0,2.5,Fri,West
7,95.0,5.0,Sun,East
8,99.0,1.0,Fri,North
9,90.0,2.5,Mon,West
