In [1]:
import pandas as pd
import numpy as np

file_path = "data/india_states/india_all_states_aq_last30days.csv"
df = pd.read_csv(file_path)

print("Shape:", df.shape)
df.head()

Shape: (1034106, 14)


Unnamed: 0,state,district,location_id,location_name,sensor_id,parameter,parameter_original,parameter_display,value,unit,datetime_utc,datetime_local,latitude,longitude
0,Andhra Pradesh,Visakhapatnam,5628,"GVM Corporation, Visakhapatnam - APPCB",12235460,pm25,pm25,PM2.5,107.0,µg/m³,2025-11-24T13:00:00Z,2025-11-24T18:30:00+05:30,17.722682,83.308197
1,Andhra Pradesh,Visakhapatnam,5628,"GVM Corporation, Visakhapatnam - APPCB",12235460,pm25,pm25,PM2.5,107.0,µg/m³,2025-11-24T13:15:00Z,2025-11-24T18:45:00+05:30,17.722682,83.308197
2,Andhra Pradesh,Visakhapatnam,5628,"GVM Corporation, Visakhapatnam - APPCB",12235460,pm25,pm25,PM2.5,107.0,µg/m³,2025-11-24T13:30:00Z,2025-11-24T19:00:00+05:30,17.722682,83.308197
3,Andhra Pradesh,Visakhapatnam,5628,"GVM Corporation, Visakhapatnam - APPCB",12235460,pm25,pm25,PM2.5,107.0,µg/m³,2025-11-24T13:45:00Z,2025-11-24T19:15:00+05:30,17.722682,83.308197
4,Andhra Pradesh,Visakhapatnam,5628,"GVM Corporation, Visakhapatnam - APPCB",12235460,pm25,pm25,PM2.5,107.0,µg/m³,2025-11-24T14:00:00Z,2025-11-24T19:30:00+05:30,17.722682,83.308197


# Data Cleaning

In [6]:
#step1: Remove Duplicates
before = len(df)
df.drop_duplicates(inplace=True)
after = len(df)

print(f"Removed {before - after} duplicate rows")

Removed 0 duplicate rows


In [12]:
#Step2: Handle Missing Values
df.isnull().sum()
(df == "").sum()
#filling missing values
"""# Numeric columns → use median
num_cols = df.select_dypes(include=np.number).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Categorical columns → fill with "Unknown"
cat_cols = df.select_dtypes(include='object').columns
df[cat_cols] = df[cat_cols].fillna("Unknown")"""

state                 0
district              0
location_id           0
location_name         0
sensor_id             0
parameter             0
parameter_original    0
parameter_display     0
value                 0
unit                  0
datetime_utc          0
datetime_local        0
latitude              0
longitude             0
dtype: int64

In [22]:
#Step3: Standardize Timestamp
if 'datetime_utc' in df.columns:
    df['datetime_utc'] = pd.to_datetime(df['datetime_utc'], errors='coerce')

In [24]:
df['hour'] = df['datetime_utc'].dt.hour
df['day'] = df['datetime_utc'].dt.day
df['month'] = df['datetime_utc'].dt.month
df['weekday'] = df['datetime_utc'].dt.weekday

In [26]:
#Step4: Convert Values to Numeric
df['value'] = pd.to_numeric(df['value'], errors='coerce')
df['value'].fillna(df['value'].median(), inplace=True)

In [28]:
#Step5: Remove Extreme Outliers(Safe filtering)
Q1 = df['value'].quantile(0.25)
Q3 = df['value'].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

before = len(df)
df = df[(df['value'] >= lower) & (df['value'] <= upper)]
after = len(df)

print(f"Removed {before - after} outlier records")

Removed 80351 outlier records


In [32]:
len_before = before     # from original dataset size
len_after = after       # after outlier filtering

print("Before:", len_before)
print("After:", len_after)
print("Removed:", len_before - len_after)
print("Percent removed:", (len_before - len_after) / len_before * 100)

Before: 1034106
After: 953755
Removed: 80351
Percent removed: 7.770093201277238


In [36]:
#Step6: Normalise Pollutant Values
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

df['value_norm'] = scaler.fit_transform(df[['value']])

In [38]:
#Step7: save the clean dataset
output_file = "data/india_states/clean_india_aq_dataset.csv"
df.to_csv(output_file, index=False)

print("Clean dataset saved:", output_file)

Clean dataset saved: data/india_states/clean_india_aq_dataset.csv


# Feature Engineering

In [41]:
#STEP-1 — Load the cleaned dataset
import pandas as pd

df = pd.read_csv("data/india_states/clean_india_aq_dataset.csv")
df.head()

Unnamed: 0,state,district,location_id,location_name,sensor_id,parameter,parameter_original,parameter_display,value,unit,datetime_utc,datetime_local,latitude,longitude,hour,day,month,weekday,value_norm
0,Andhra Pradesh,Visakhapatnam,5628,"GVM Corporation, Visakhapatnam - APPCB",12235460,pm25,pm25,PM2.5,107.0,µg/m³,2025-11-24 13:00:00+00:00,2025-11-24T18:30:00+05:30,17.722682,83.308197,13,24,11,0,0.773684
1,Andhra Pradesh,Visakhapatnam,5628,"GVM Corporation, Visakhapatnam - APPCB",12235460,pm25,pm25,PM2.5,107.0,µg/m³,2025-11-24 13:15:00+00:00,2025-11-24T18:45:00+05:30,17.722682,83.308197,13,24,11,0,0.773684
2,Andhra Pradesh,Visakhapatnam,5628,"GVM Corporation, Visakhapatnam - APPCB",12235460,pm25,pm25,PM2.5,107.0,µg/m³,2025-11-24 13:30:00+00:00,2025-11-24T19:00:00+05:30,17.722682,83.308197,13,24,11,0,0.773684
3,Andhra Pradesh,Visakhapatnam,5628,"GVM Corporation, Visakhapatnam - APPCB",12235460,pm25,pm25,PM2.5,107.0,µg/m³,2025-11-24 13:45:00+00:00,2025-11-24T19:15:00+05:30,17.722682,83.308197,13,24,11,0,0.773684
4,Andhra Pradesh,Visakhapatnam,5628,"GVM Corporation, Visakhapatnam - APPCB",12235460,pm25,pm25,PM2.5,107.0,µg/m³,2025-11-24 14:00:00+00:00,2025-11-24T19:30:00+05:30,17.722682,83.308197,14,24,11,0,0.773684


In [43]:
#STEP-2 — Temporal Features (VERY useful)
df['datetime_utc'] = pd.to_datetime(df['datetime_utc'])

df['hour'] = df['datetime_utc'].dt.hour
df['day'] = df['datetime_utc'].dt.day
df['month'] = df['datetime_utc'].dt.month
df['weekday'] = df['datetime_utc'].dt.weekday
df['is_weekend'] = df['weekday'].isin([5,6]).astype(int)

In [45]:
#STEP-3 — Pollution Trend Features
df = df.sort_values(by=['location_id', 'datetime_utc'])

df['pm_trend_mean_3h'] = df.groupby('location_id')['value'].rolling(3).mean().reset_index(0,drop=True)
df['pm_trend_mean_6h'] = df.groupby('location_id')['value'].rolling(6).mean().reset_index(0,drop=True)

df['recent_spike'] = (df['value'] > df['pm_trend_mean_3h']).astype(int)

df.fillna(method='bfill', inplace=True)

  df.fillna(method='bfill', inplace=True)


In [47]:
#STEP-4 — Basic Spatial Features (before OSM step)
df['lat_round'] = df['latitude'].round(2)
df['lon_round'] = df['longitude'].round(2)

In [49]:
#STEP-5 — Encode Categorical Columns
cat_cols = ['state', 'district', 'parameter']

df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

In [51]:
#STEP-6 — Save Feature-Engineered Dataset
output = "data/india_states/feature_engineered_dataset.csv"
df.to_csv(output, index=False)
print("Saved:", output)

Saved: data/india_states/feature_engineered_dataset.csv
