In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

### Read CSV file(s)

In [2]:
# Load the raw air-quality and weather tables from their CSV exports
# (air first, then weather).
air_df, weather_df = (
    pd.read_csv(csv_path)
    for csv_path in ("data/aqi.csv", "data/weather.csv")
)

In [3]:
# The timestamp column is labelled "Unnamed: 0" after loading; name it "time".
air_df = air_df.rename({"Unnamed: 0": "time"}, axis="columns")
# Preview the first rows to confirm the new column label.
air_df.head()

Unnamed: 0,time,carbon_monoxide,pm10,pm2_5,nitrogen_dioxide,ozone,sulphur_dioxide
0,2022-08-04T00:00:00,595.0,58.0,40.3,29.7,24.0,16.8
1,2022-08-04T01:00:00,552.0,43.5,30.0,25.0,49.0,18.2
2,2022-08-04T02:00:00,492.0,47.3,32.7,18.4,84.0,20.2
3,2022-08-04T03:00:00,429.0,50.3,34.9,11.2,128.0,22.0
4,2022-08-04T04:00:00,414.0,55.2,38.3,8.6,154.0,21.5


In [4]:
# The timestamp column is labelled "Unnamed: 0" after loading; name it "time".
weather_df = weather_df.rename({"Unnamed: 0": "time"}, axis="columns")
# Preview the first rows to confirm the new column label.
weather_df.head()

Unnamed: 0,time,temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,surface_pressure,cloud_cover,wind_speed_10m,wind_direction_10m
0,2020-12-01T00:00:00,16.9,61,9.5,0.0,1022.1,76,10.6,28
1,2020-12-01T01:00:00,17.4,59,9.3,0.0,1022.9,99,12.1,27
2,2020-12-01T02:00:00,17.8,58,9.5,0.0,1023.9,62,12.9,23
3,2020-12-01T03:00:00,18.7,56,9.9,0.0,1023.7,100,12.7,15
4,2020-12-01T04:00:00,19.4,55,10.3,0.0,1022.8,100,12.6,13


Set datetime types

In [5]:
# Convert the "time" column of each frame to second-resolution datetimes;
# every other column keeps its current dtype.
weather_df = weather_df.astype({'time': 'datetime64[s]'})
air_df = air_df.astype({'time': 'datetime64[s]'})

In [6]:
# Show the first five rows to verify the converted "time" column.
weather_df.head(5)

Unnamed: 0,time,temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,surface_pressure,cloud_cover,wind_speed_10m,wind_direction_10m
0,2020-12-01 00:00:00,16.9,61,9.5,0.0,1022.1,76,10.6,28
1,2020-12-01 01:00:00,17.4,59,9.3,0.0,1022.9,99,12.1,27
2,2020-12-01 02:00:00,17.8,58,9.5,0.0,1023.9,62,12.9,23
3,2020-12-01 03:00:00,18.7,56,9.9,0.0,1023.7,100,12.7,15
4,2020-12-01 04:00:00,19.4,55,10.3,0.0,1022.8,100,12.6,13


Check for null values (interpolation may be needed later if any such entries exist)

In [7]:
# Print, for the air frame and then the weather frame, whether any cell is null.
for frame in (air_df, weather_df):
    print(frame.isna().to_numpy().any())

False
False


Check for negative values (attribute columns only, i.e. excluding the time column)

In [8]:
# Keep every column except the first one (the "time" column at this point),
# leaving only the measured attributes.
air_df_attr, weather_df_attr = (
    frame.iloc[:, 1:] for frame in (air_df, weather_df)
)


In [9]:
# Print, for the air attributes and then the weather attributes, whether any
# value is below zero.
for attr_frame in (air_df_attr, weather_df_attr):
    print(attr_frame.lt(0).to_numpy().any())

False
True


In [10]:
# Count the negative entries in each numeric column of the air data.
print(air_df.select_dtypes(include='number').lt(0).sum())

carbon_monoxide     0
pm10                0
pm2_5               0
nitrogen_dioxide    0
ozone               0
sulphur_dioxide     0
dtype: int64


For the weather dataframe, however, negative values are observed only in the `dew_point_2m` column, which is measured in degrees Celsius and can therefore legitimately be smaller than 0.

In [11]:
# Count the negative entries in each numeric column of the weather data.
print(weather_df.select_dtypes(include='number').lt(0).sum())

temperature_2m            0
relative_humidity_2m      0
dew_point_2m            186
precipitation             0
surface_pressure          0
cloud_cover               0
wind_speed_10m            0
wind_direction_10m        0
dtype: int64


Hence, neither dataset contains invalid entries.

### Check missing timestamps

Set time as index

In [12]:
# Promote the "time" column to the index of each frame.
# The column is selected by name and the result is rebound (no inplace=True),
# so accidentally re-running this cell raises a KeyError instead of silently
# promoting the next column (a measurement) to the index and losing the
# timestamps.
weather_df = weather_df.set_index("time")
air_df = air_df.set_index("time")

Match the time ranges of the air data and the weather data (keep only the period covered by both)

In [13]:
# Print the earliest and latest timestamp of the air data, then of the
# weather data, to compare the periods they cover.
for frame in (air_df, weather_df):
    print(frame.index.min(), frame.index.max())

2022-08-04 00:00:00 2025-12-15 00:00:00
2020-12-01 00:00:00 2025-12-15 00:00:00


In [14]:
# Overlapping window: the later of the two start times and the earlier of the
# two end times.
starting_timestamp = max(air_df.index.min(), weather_df.index.min())
ending_timestamp = min(air_df.index.max(), weather_df.index.max())

# Sort before slicing: a label slice on an unsorted index requires both bounds
# to be exact members of the index and selects by position between them, so it
# can raise a KeyError or return the wrong rows. On a sorted index the slice
# keeps every row whose timestamp lies inside the window (both ends inclusive).
air_df = air_df.sort_index().loc[starting_timestamp:ending_timestamp]
weather_df = weather_df.sort_index().loc[starting_timestamp:ending_timestamp]

Save dataframes

In [15]:
# Write each cleaned frame, ordered by timestamp, to its CSV file.
cleaned_outputs = {
    "data/processed/cleaned/cleaned_air.csv": air_df,
    "data/processed/cleaned/cleaned_weather.csv": weather_df,
}
for csv_path, frame in cleaned_outputs.items():
    # to_csv does not create missing directories; make sure the target folder
    # exists so the notebook also runs on a fresh checkout.
    Path(csv_path).parent.mkdir(parents=True, exist_ok=True)
    frame.sort_index().to_csv(csv_path)