In [4]:
import pandas as pd

# Load the merged dataset from the Excel workbook and preview the first rows.
df = pd.read_excel(io="Finalised_merged_dataset.xlsx")
df.head(n=5)

Unnamed: 0,state,district,location_id,location_name,datetime_utc,datetime_local,latitude,longitude,pm25,pm10,...,pm25_scaled,pm10_scaled,no2_scaled,co_scaled,so2_scaled,o3_scaled,temperature_scaled,humidity_scaled,wind_speed_scaled,wind_direction_scaled
0,Andhra Pradesh,Tirupati,5649,"Tirumala, Tirupati - APPCB",2025-11-11T15:00:00Z,2025-11-11 20:30:00,13.67,79.35,81.0,115.0,...,-0.016369,-0.048343,-0.02771,0.093771,-0.372981,-0.005949,0.202941,1.059692,-0.188803,2.060052
1,Andhra Pradesh,Tirupati,5649,"Tirumala, Tirupati - APPCB",2025-11-11T15:15:00Z,2025-11-11 20:45:00,13.67,79.35,81.0,115.0,...,-0.016369,-0.048343,-0.035966,0.001364,-0.372981,-0.005949,0.181327,1.059692,-0.194356,2.060052
2,Andhra Pradesh,Tirupati,5649,"Tirumala, Tirupati - APPCB",2025-11-11T15:45:00Z,2025-11-11 21:15:00,13.67,79.35,81.0,115.0,...,-0.016369,-0.048343,-0.030092,0.186179,-0.369372,-0.005949,0.170521,1.059692,-0.199909,2.060052
3,Andhra Pradesh,Tirupati,5649,"Tirumala, Tirupati - APPCB",2025-11-11T16:15:00Z,2025-11-11 21:45:00,13.67,79.35,90.0,114.0,...,-0.01489,-0.049869,-0.032791,0.067369,-0.372981,-0.005949,0.202941,0.93924,-0.199909,2.060052
4,Andhra Pradesh,Tirupati,5649,"Tirumala, Tirupati - APPCB",2025-11-11T16:30:00Z,2025-11-11 22:00:00,13.67,79.35,90.0,114.0,...,-0.01489,-0.049869,-0.043429,-0.091043,-0.376589,-0.005949,0.192134,0.979391,-0.194356,2.060052


In [5]:
# Parse the UTC timestamp column; errors='coerce' turns unparseable entries
# into NaT instead of raising.
parsed_utc = pd.to_datetime(df['datetime_utc'], errors='coerce')
df = df.assign(datetime_utc=parsed_utc)

# Show the resulting column dtypes.
df.dtypes

state                                      object
district                                   object
location_id                                 int64
location_name                              object
datetime_utc                  datetime64[ns, UTC]
datetime_local                     datetime64[ns]
latitude                                  float64
longitude                                 float64
pm25                                      float64
pm10                                      float64
no2                                       float64
co                                        float64
so2                                       float64
o3                                        float64
temperature                               float64
humidity                                  float64
wind_speed                                float64
wind_direction                            float64
Roads_count                                 int64
Industrial_zones_count                      int64


In [6]:
# Persist the frame as CSV (without the index), then load it back with both
# timestamp columns parsed as datetimes.
df.to_csv(path_or_buf="Cleaned_dataset.csv", index=False)

df = pd.read_csv(
    filepath_or_buffer="Cleaned_dataset.csv",
    parse_dates=['datetime_local', 'datetime_utc'],
)

In [7]:
# Number of missing values in each column.
df.isna().sum()

state                             0
district                          0
location_id                       0
location_name                     0
datetime_utc                      0
datetime_local                    0
latitude                          0
longitude                         0
pm25                           5360
pm10                           8972
no2                            6092
co                             9165
so2                            6219
o3                            12926
temperature                   13372
humidity                      14519
wind_speed                    14406
wind_direction                14459
Roads_count                       0
Industrial_zones_count            0
Dump_sites_count                  0
Agricultural_fields_count         0
Query_status                      0
Urban_density_score               0
Industrial_presence               0
Pollution_source_risk             0
Green_area_ratio                  0
dist_nearest_road_m         

In [2]:
#Filling null values using interpolate and median
import pandas as pd
import numpy as np

# --------------------------------------------------
# Load data
# --------------------------------------------------
# Read the cleaned CSV with both timestamp columns parsed, then order rows by
# station and local time so that the per-location interpolation below walks
# each station's readings chronologically. The index is rebuilt as 0..n-1.
df = (
    pd.read_csv(
        filepath_or_buffer="Cleaned_dataset.csv",
        parse_dates=['datetime_local', 'datetime_utc'],
    )
    .sort_values(by=['location_id', 'datetime_local'])
    .reset_index(drop=True)
)

# --------------------------------------------------
#  POLLUTANTS
# --------------------------------------------------
pollutants = ['pm25', 'pm10', 'no2', 'co', 'so2', 'o3']

# Step 1: linear interpolation inside each location_id group.
# NOTE: method='linear' interpolates by row position (rows were sorted by
# datetime_local earlier in this cell), not by the actual time difference,
# and no `limit` is passed, so the gap length is not capped here.
by_location = df.groupby('location_id')[pollutants]
df[pollutants] = by_location.transform(
    lambda series: series.interpolate(method='linear')
)

# Step 2: whatever is still missing gets that location's median for the
# column. A location with no valid value at all keeps its NaNs.
location_medians = df.groupby('location_id')[pollutants].transform('median')
for pollutant in pollutants:
    df[pollutant] = df[pollutant].fillna(location_medians[pollutant])

# --------------------------------------------------
#  WEATHER VARIABLES
# --------------------------------------------------
weather = ['temperature', 'humidity', 'wind_speed']

# Same two-stage scheme as the pollutants, applied one column at a time:
# per-location linear interpolation first, then the per-location median for
# anything interpolation could not reach.
for variable in weather:
    df[variable] = df.groupby('location_id')[variable].transform(
        lambda series: series.interpolate(method='linear')
    )
    location_median = df.groupby('location_id')[variable].transform('median')
    df[variable] = df[variable].fillna(location_median)

# --------------------------------------------------
#  WIND DIRECTION (CIRCULAR DATA)
# --------------------------------------------------
if 'wind_direction' in df.columns:
    helper_cols = ['wind_sin', 'wind_cos']

    # Work on the sine/cosine components of the angle (degrees -> radians)
    # so the imputation is not distorted by the wrap-around at 0/360.
    wind_radians = np.deg2rad(df['wind_direction'])
    df['wind_sin'] = np.sin(wind_radians)
    df['wind_cos'] = np.cos(wind_radians)

    # Per-location linear interpolation of both components.
    df[helper_cols] = df.groupby('location_id')[helper_cols].transform(
        lambda component: component.interpolate(method='linear')
    )

    # Per-location median for any component value still missing.
    for helper in helper_cols:
        location_median = df.groupby('location_id')[helper].transform('median')
        df[helper] = df[helper].fillna(location_median)

    # Rebuild the angle from the components and wrap it into [0, 360).
    angle_deg = np.rad2deg(np.arctan2(df['wind_sin'], df['wind_cos']))
    df['wind_direction'] = angle_deg % 360

    # The component columns were only scaffolding; remove them again.
    df = df.drop(columns=helper_cols)

# --------------------------------------------------
# SPATIAL / LOCATION FEATURES
# --------------------------------------------------
spatial_features = [
    'Roads_count',
    'Industrial_zones_count',
    'Dump_sites_count',
    'Agricultural_fields_count',
    'Urban_density_score'
]

# Fill gaps in each spatial feature with the median observed for the same
# location; features that are not present in the frame are skipped.
for feature in spatial_features:
    if feature not in df.columns:
        continue
    location_median = df.groupby('location_id')[feature].transform('median')
    df[feature] = df[feature].fillna(location_median)

# --------------------------------------------------
# FINAL CHECK
# --------------------------------------------------
# Report only the columns that still contain at least one NaN.
missing_per_column = df.isna().sum()
print("Remaining missing values:")
print(missing_per_column[missing_per_column > 0])

# --------------------------------------------------
#  SAVE OUTPUT
# --------------------------------------------------
# Write the frame as it stands after the per-location imputation (index not
# written). NOTE: columns listed by the check above still contain NaNs at
# this point.
df.to_csv(path_or_buf="Imputed_dataset.csv", index=False)
print("✅ Imputation complete. Saved as Imputed_dataset.csv")


Remaining missing values:
pm25                        2301
pm10                        4683
no2                         2301
co                          4960
so2                         2301
o3                          8962
temperature                12155
humidity                   13336
wind_speed                 13371
wind_direction             13371
dist_nearest_road_m         7016
dist_nearest_industry_m    64555
dtype: int64
✅ Imputation complete. Saved as Imputed_dataset.csv


In [3]:
# Global fallback: fill whatever the per-location imputation left missing
# with a dataset-wide statistic for the column.
fallback_cols = [
    'pm25','pm10','no2','co','so2','o3',
    'temperature','humidity','wind_speed','wind_direction'
]
for col in fallback_cols:
    if col == 'wind_direction':
        # Wind direction is an angle in degrees, so a plain median is
        # misleading around the 0/360 wrap (350 and 10 would give 180).
        # Use the same sin/cos-median construction as the per-location
        # wind-direction step and convert back to an angle in [0, 360).
        wind_rad = np.deg2rad(df[col])
        fill_value = np.rad2deg(
            np.arctan2(np.sin(wind_rad).median(), np.cos(wind_rad).median())
        ) % 360
    else:
        # Linear quantities: dataset-wide median of the column.
        fill_value = df[col].median()
    # If the column has no valid value at all, fill_value is NaN and the
    # column is left unchanged.
    df[col] = df[col].fillna(fill_value)


In [4]:

# Fill values for the distance features:
#   - nearest road: dataset-wide median of the column
#   - nearest industry: fixed value 50000 (presumably a "nothing nearby"
#     sentinel in the column's units; TODO confirm the intended meaning)
distance_fill_values = {
    'dist_nearest_road_m': df['dist_nearest_road_m'].median(),
    'dist_nearest_industry_m': 50000,
}
for distance_col, fill_value in distance_fill_values.items():
    df[distance_col] = df[distance_col].fillna(fill_value)


In [5]:
# Total number of missing cells across the whole frame.
df.isnull().sum().sum()

np.int64(0)

In [6]:
# Per-column missing-value counts after all fallbacks were applied.
df.isna().sum()

state                         0
district                      0
location_id                   0
location_name                 0
datetime_utc                  0
datetime_local                0
latitude                      0
longitude                     0
pm25                          0
pm10                          0
no2                           0
co                            0
so2                           0
o3                            0
temperature                   0
humidity                      0
wind_speed                    0
wind_direction                0
Roads_count                   0
Industrial_zones_count        0
Dump_sites_count              0
Agricultural_fields_count     0
Query_status                  0
Urban_density_score           0
Industrial_presence           0
Pollution_source_risk         0
Green_area_ratio              0
dist_nearest_road_m           0
dist_nearest_industry_m       0
dist_nearest_dump_m           0
dist_nearest_agriculture_m    0
hour    

In [7]:
# Overwrite the imputed CSV with the fully filled frame (index not written).
df.to_csv(path_or_buf="Imputed_dataset.csv", index=False)
print("Data cleaning done ")

Data cleaning done 


In [1]:
import pandas as pd 
# Reload the imputed dataset. Parse both timestamp columns, as the other CSV
# loads in this notebook do, so they come back as datetimes rather than
# plain strings.
df = pd.read_csv(
    "Imputed_dataset.csv",
    parse_dates=['datetime_local', 'datetime_utc']
)
df.head()

Unnamed: 0,state,district,location_id,location_name,datetime_utc,datetime_local,latitude,longitude,pm25,pm10,...,pm25_scaled,pm10_scaled,no2_scaled,co_scaled,so2_scaled,o3_scaled,temperature_scaled,humidity_scaled,wind_speed_scaled,wind_direction_scaled
0,Haryana,Faridabad,17,"R K Puram, Delhi - DPCC",2025-11-11 15:00:00+00:00,2025-11-11 20:30:00,28.563262,77.186937,237.0,390.0,...,0.009259,0.371213,0.137582,1.255464,0.738325,-0.005949,-0.229326,0.25668,-0.199909,0.894188
1,Haryana,Faridabad,17,"R K Puram, Delhi - DPCC",2025-11-11 15:15:00+00:00,2025-11-11 20:45:00,28.563262,77.186937,234.0,337.0,...,0.008766,0.290353,0.143616,1.268665,0.759974,-0.005949,-0.207712,0.21653,-0.199909,0.332453
2,Haryana,Faridabad,17,"R K Puram, Delhi - DPCC",2025-11-11 15:30:00+00:00,2025-11-11 21:00:00,28.563262,77.186937,234.0,337.0,...,0.008766,0.290353,0.142822,1.493083,0.756366,-0.005949,-0.207712,0.21653,-0.199909,0.205268
3,Haryana,Faridabad,17,"R K Puram, Delhi - DPCC",2025-11-11 16:00:00+00:00,2025-11-11 21:30:00,28.563262,77.186937,234.0,337.0,...,0.008766,0.290353,0.111383,1.453479,0.846569,-0.005949,-0.294165,0.336981,-0.199909,0.300657
4,Haryana,Faridabad,17,"R K Puram, Delhi - DPCC",2025-11-11 16:30:00+00:00,2025-11-11 22:00:00,28.563262,77.186937,258.0,366.0,...,0.012709,0.334598,0.100586,1.493083,0.861002,-0.005949,-0.380619,0.457433,-0.199909,0.194669
