In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import pprint
import os


### Read CSV file(s)

In [2]:
air_df = pd.read_csv("data/location_7740/aqi.csv")

In [3]:
air_df = air_df.rename(columns={"Unnamed: 0": "time"})
air_df.head()

Unnamed: 0,time,co,no2,o3,pm10,pm25,so2
0,2022-01-01T00:00:00Z,769.0,34.1,58.4,47.9,33.9,5.2
1,2022-01-01T01:00:00Z,891.0,49.9,55.0,56.1,37.9,6.1
2,2022-01-01T02:00:00Z,914.0,53.4,59.1,61.9,41.0,6.6
3,2022-01-01T03:00:00Z,846.0,43.1,68.8,63.2,44.1,6.9
4,2022-01-01T04:00:00Z,821.0,40.6,74.3,52.7,38.1,6.9


Check null values (Later on interpolation may be needed)

In [4]:
print(air_df.isnull().values.any())

True


Check negative values (Attributes only)

In [5]:
air_df_attr = air_df.iloc[:,1:-1]
print((air_df_attr.values < 0).any())

False


### Setup and data cleaning

Directory for output files

In [6]:
os.makedirs("data/processed", exist_ok=True)

Standardize time and set index

In [7]:
air_df["time"] = pd.to_datetime(air_df["time"], utc=True).dt.tz_localize(None)
air_df = air_df.dropna(subset=["time"]).set_index("time").sort_index()

# Round to nearest hour and handle duplicates
air_df.index = air_df.index.round("h")
air_df = air_df.groupby(air_df.index).mean()

Remove negative values (if exist), also save a cleaned dataframe and dropped dataframe (for inavild entries)

In [8]:
numeric_cols = air_df.select_dtypes(include=[np.number]).columns
invalid_mask = (air_df[numeric_cols] < 0).any(axis=1)
dropped_air_df = air_df[invalid_mask].copy()

cleaned_air_df = air_df.copy()
cleaned_air_df[numeric_cols] = cleaned_air_df[numeric_cols].mask(cleaned_air_df[numeric_cols] < 0)

print(f"Total Rows: {len(air_df)}")
print(f"Invalid Rows identified (negatives): {len(dropped_air_df)}")

Total Rows: 34484
Invalid Rows identified (negatives): 0


### Interpolation setup

Define the full hourly range for the cleaned dataframe

In [9]:
start_date = air_df.index.min()
end_date = air_df.index.max()
full_range = pd.date_range(start=start_date, end=end_date, freq="h")

In [10]:
aligned_air_df = cleaned_air_df.reindex(full_range)

Apply time interpolation to fill in for the invalid entries

In [11]:
interpolated_air_df = aligned_air_df.interpolate(method="time", axis=0, limit_direction="both")

### Saving dataframes

In [12]:
dropped_air_df.to_csv("data/processed/dropped_air.csv")
cleaned_air_df.to_csv("data/processed/cleaned_air.csv")
interpolated_air_df.to_csv("data/processed/interpolated_air.csv")

### Interpolation Evaluation

Creating artificial random gaps from longest consecutive timestamps to calculate losses for different interpolation methods

Identify the start of each sequence of consecutive timestamps (True if the start of sequence)

In [13]:
start_of_consec_seq = air_df.index.get_level_values(0).drop_duplicates().to_series().diff() != pd.Timedelta("0 days 01:00:00")
start_of_consec_seq.head(5)

time
2022-01-01 00:00:00     True
2022-01-01 01:00:00    False
2022-01-01 02:00:00    False
2022-01-01 03:00:00    False
2022-01-01 04:00:00    False
Name: time, dtype: bool

Counter/identifer for each sequence

In [14]:
consec_seq_id = start_of_consec_seq.cumsum().to_frame(name="group")
consec_seq_id

Unnamed: 0_level_0,group
time,Unnamed: 1_level_1
2022-01-01 00:00:00,1
2022-01-01 01:00:00,1
2022-01-01 02:00:00,1
2022-01-01 03:00:00,1
2022-01-01 04:00:00,1
...,...
2025-12-07 15:00:00,1
2025-12-07 16:00:00,1
2025-12-07 17:00:00,1
2025-12-07 18:00:00,1


Group id with their group length

In [15]:
grouped = consec_seq_id.groupby("group").size()
grouped

group
1    34484
dtype: int64

Select group with the longest consecutive timestamps

In [16]:
longest_sequence = consec_seq_id[consec_seq_id["group"] == grouped.idxmax()].index
longest_sequence

DatetimeIndex(['2022-01-01 00:00:00', '2022-01-01 01:00:00',
               '2022-01-01 02:00:00', '2022-01-01 03:00:00',
               '2022-01-01 04:00:00', '2022-01-01 05:00:00',
               '2022-01-01 06:00:00', '2022-01-01 07:00:00',
               '2022-01-01 08:00:00', '2022-01-01 09:00:00',
               ...
               '2025-12-07 10:00:00', '2025-12-07 11:00:00',
               '2025-12-07 12:00:00', '2025-12-07 13:00:00',
               '2025-12-07 14:00:00', '2025-12-07 15:00:00',
               '2025-12-07 16:00:00', '2025-12-07 17:00:00',
               '2025-12-07 18:00:00', '2025-12-07 19:00:00'],
              dtype='datetime64[ns]', name='time', length=34484, freq=None)

Randomly drop 5% of timestamps (rows) from the longest sequence

In [17]:
full_seq_air = air_df.loc[longest_sequence]

In [18]:
n_drop = int(len(full_seq_air.index) * 0.05)
np.random.seed(24)
drop_indices = np.random.choice(full_seq_air.index, size=n_drop, replace=False)
drop_indices.shape

(1724,)

In [19]:
missing_seq_air = full_seq_air.drop(index=drop_indices)

Re-index missing timestamps for interpolation

In [20]:
missing_seq_air = missing_seq_air.reindex(full_seq_air.index)

In [21]:
linear_interpolated_air = missing_seq_air.interpolate(
    method="linear", axis=0, limit=None, limit_direction="both")
time_interpolated_air = missing_seq_air.interpolate(
    method="time", axis=0, limit=None, limit_direction="both")
spline2_interpolated_air = missing_seq_air.interpolate(
    method="spline", axis=0, limit=None, limit_direction="both", order=2)
spline3_interpolated_air = missing_seq_air.interpolate(
    method="spline", axis=0, limit=None, limit_direction="both", order=3)

Function for calculating errors for the interpolation methods

In [22]:
def calculate_errors(original, interpolated):
    orig_arr = np.array(original)
    interp_arr = np.array(interpolated)
    
    # Filter NaNs
    mask = ~np.isnan(orig_arr) & ~np.isnan(interp_arr)
    clean_orig = orig_arr[mask]
    clean_interp = interp_arr[mask]

    if len(clean_orig) == 0:
        return np.nan, np.nan, np.nan

    mae = mean_absolute_error(clean_orig, clean_interp)
    mse = mean_squared_error(clean_orig, clean_interp)
    mape = mean_absolute_percentage_error(clean_orig, clean_interp)

    rmse = float(np.sqrt(mse))
    
    return mae, rmse, mape

def calculate_errors_all_columns(original_df, interpolated_df, interpolated_indices):
    errors_dict = {}
    for column in original_df.columns:
        if column in interpolated_df.columns:
            original_values = original_df.loc[interpolated_indices, column]
            interpolated_values = interpolated_df.loc[interpolated_indices, column]
            
            mae, rmse, mape = calculate_errors(original_values, interpolated_values)
            
            errors_dict[column] = {"MAE": mae, "RMSE": rmse, "MAPE": mape}
    return errors_dict

interpolation_results = {
    "Linear": linear_interpolated_air,
    "Time": time_interpolated_air,
    "Spline (Ord 2)": spline2_interpolated_air,
    "Spline (Ord 3)": spline3_interpolated_air
}

Loss between interpolated and real values

In [23]:
final_scores = {}

for method_name, df_result in interpolation_results.items():
    
    method_errors = calculate_errors_all_columns(
        full_seq_air,  
        df_result,       
        drop_indices     
    )
    
    final_scores[method_name] = method_errors

for method, scores in final_scores.items():
    print(f"\n--- Results for Method: {method} ---")
    pprint.pprint(scores)


--- Results for Method: Linear ---
{'co': {'MAE': 26.269200396643374,
        'MAPE': 4.116208971329504,
        'RMSE': 66.06835616559806},
 'no2': {'MAE': 4.702284705827704,
         'MAPE': 0.13560705138045936,
         'RMSE': 6.808341349737708},
 'o3': {'MAE': 5.022741216696381,
        'MAPE': 0.20705850677756796,
        'RMSE': 7.386604631637298},
 'pm10': {'MAE': 2.3159539049787887,
          'MAPE': 25121191045831.562,
          'RMSE': 4.393090329439082},
 'pm25': {'MAE': 1.7866039061668841,
          'MAPE': 23125590297014.836,
          'RMSE': 2.4714236261052513},
 'so2': {'MAE': 0.4248616029986399,
         'MAPE': 0.08017810813819636,
         'RMSE': 0.8325931753451933}}

--- Results for Method: Time ---
{'co': {'MAE': 26.269200396643374,
        'MAPE': 4.116208971329504,
        'RMSE': 66.06835616559806},
 'no2': {'MAE': 4.702284705827704,
         'MAPE': 0.13560705138045936,
         'RMSE': 6.808341349737708},
 'o3': {'MAE': 5.022741216696381,
        'MAPE': 0.