In [3]:
import pandas as pd
# Read the cleaned dataset, parsing both timestamp columns as datetimes.
df = pd.read_csv(
    "Final_clean_dataset.csv",
    parse_dates=['datetime_local', 'datetime_utc'],
)
# Show which columns the file provides.
print(df.columns)
# Show the distinct values found in the 'month' column.
print(df['month'].unique())


Index(['state', 'district', 'location_id', 'location_name', 'datetime_utc',
       'latitude', 'longitude', 'pm25', 'pm10', 'no2', 'co', 'so2', 'o3',
       'temperature', 'humidity', 'wind_speed', 'wind_direction',
       'Roads_count', 'Industrial_zones_count', 'Dump_sites_count',
       'Agricultural_fields_count', 'Query_status', 'Urban_density_score',
       'Industrial_presence', 'Pollution_source_risk', 'Green_area_ratio',
       'dist_nearest_road_m', 'dist_nearest_industry_m', 'dist_nearest_dump_m',
       'dist_nearest_agriculture_m', 'hour', 'is_peak_hour', 'day_of_week',
       'is_weekend', 'month', 'season', 'season_code', 'pm25_normalized',
       'pm10_normalized', 'no2_normalized', 'co_normalized', 'so2_normalized',
       'o3_normalized', 'temperature_normalized', 'humidity_normalized',
       'wind_speed_normalized', 'wind_direction_normalized', 'pm25_scaled',
       'pm10_scaled', 'no2_scaled', 'co_scaled', 'so2_scaled', 'o3_scaled',
       'temperature_scaled', 'hu

In [4]:
# ============================================================
# SOURCE LABELING WITH SUMMARY (KEEPING ALL EXISTING COLUMNS)
# ============================================================

import pandas as pd
import numpy as np

# ------------------------------------------------------------
# STEP 0: LOAD YOUR DATASET
# ------------------------------------------------------------
# Reload the cleaned dataset so this cell does not depend on earlier cells;
# both timestamp columns are parsed as datetimes.
df = pd.read_csv(
    "Final_clean_dataset.csv",
    parse_dates=['datetime_local', 'datetime_utc'],
)

# ------------------------------------------------------------
# STEP 1: DERIVE SEASON (if not already present)
# ------------------------------------------------------------
def get_season(month):
    """Map a month number to a season label.

    Parameters
    ----------
    month : month number, checked by membership against the groups below.

    Returns
    -------
    str
        'Winter' for 12, 1, 2; 'Summer' for 3, 4, 5; 'Monsoon' for 6-9;
        'Post-Monsoon' for every other value (including values that are
        not valid month numbers, since there is no explicit validation).
    """
    season_table = (
        ('Winter', (12, 1, 2)),
        ('Summer', (3, 4, 5)),  # Dry season assumption
        ('Monsoon', (6, 7, 8, 9)),
    )
    for label, months in season_table:
        if month in months:
            return label
    # Anything not matched above falls through to the last season.
    return 'Post-Monsoon'

# Derive a season only when the loaded data does not already carry one.
# NOTE(review): the column listing printed for this file already includes
# 'season', so this branch is skipped for the current dataset, and the
# printed source distribution contains no 'Agricultural' rows. Confirm that
# the existing labels match the 'Summer' / 'Post-Monsoon' strings that
# assign_source compares against.
if 'season' not in df.columns:
    df['season'] = df['month'].apply(get_season)

# ------------------------------------------------------------
# STEP 2: DEFINE "HIGH" POLLUTANT THRESHOLDS (75th percentile)
# ------------------------------------------------------------
# One threshold per pollutant, computed in the order they are unpacked.
NO2_HIGH, SO2_HIGH, PM25_HIGH, CO_HIGH = (
    df[pollutant].quantile(0.75)
    for pollutant in ('no2', 'so2', 'pm25', 'co')
)

# ------------------------------------------------------------
# STEP 3: DEFINE HEURISTIC SOURCE LABELING FUNCTION
# ------------------------------------------------------------
def assign_source(row):
    """Return a heuristic pollution-source label for one row.

    The rules are tried in order and the first match wins:

    1. 'Vehicular'    - road distance < 100 and no2 >= NO2_HIGH
    2. 'Industrial'   - industry distance < 200 and so2 >= SO2_HIGH
    3. 'Agricultural' - agriculture distance < 500, season is 'Summer' or
                        'Post-Monsoon', and pm25 >= PM25_HIGH
    4. 'Burning'      - pm25 >= PM25_HIGH and co >= CO_HIGH
    5. 'Natural'      - nothing above matched

    Parameters
    ----------
    row : mapping-like record indexed by column name (e.g. a DataFrame row).

    Returns
    -------
    str
        One of the five labels listed above.

    Notes
    -----
    Relies on the module-level thresholds NO2_HIGH, SO2_HIGH, PM25_HIGH and
    CO_HIGH being defined before the function is called.
    """
    # Rule 1: close to a road with high NO2.
    if row['dist_nearest_road_m'] < 100 and row['no2'] >= NO2_HIGH:
        return 'Vehicular'

    # Rule 2: close to an industrial site with high SO2.
    if row['dist_nearest_industry_m'] < 200 and row['so2'] >= SO2_HIGH:
        return 'Industrial'

    # Rule 3: close to agricultural land, in one of the two listed seasons,
    # with high PM2.5. Nested checks keep the original short-circuit order.
    if row['dist_nearest_agriculture_m'] < 500:
        if row['season'] in ('Summer', 'Post-Monsoon'):
            if row['pm25'] >= PM25_HIGH:
                return 'Agricultural'

    # Rule 4: high PM2.5 together with high CO.
    if row['pm25'] >= PM25_HIGH and row['co'] >= CO_HIGH:
        return 'Burning'

    # Default when no rule fired.
    return 'Natural'

# ------------------------------------------------------------
# STEP 4: APPLY HEURISTICS TO LABEL SOURCES
# ------------------------------------------------------------
# Run the heuristic once per row and store the resulting label column.
df['pollution_source'] = df.apply(assign_source, axis='columns')

# ------------------------------------------------------------
# STEP 5: VALIDATE AND PRINT SUMMARY
# ------------------------------------------------------------
print("\n================ SOURCE LABELING SUMMARY ================\n")

# Thresholds used
print("Thresholds Used (75th Percentile):")
print(f"NO2  : {NO2_HIGH:.2f}")
print(f"SO2  : {SO2_HIGH:.2f}")
print(f"PM25 : {PM25_HIGH:.2f}")
print(f"CO   : {CO_HIGH:.2f}\n")

# Distribution of pollution sources: how many rows received each label.
source_counts = df['pollution_source'].value_counts()
print("Pollution Source Distribution:")
print(source_counts, "\n")

# Mean pollutant levels per source, to check each label against the
# pollutant its rule is keyed on.
pollutant_means = df.groupby('pollution_source')[['no2','so2','pm25','co']].mean()
print("Mean Pollutant Levels by Source (Validation):")
print(pollutant_means, "\n")

# Median distances to sources, to check each label against its distance rule.
distance_medians = df.groupby('pollution_source')[['dist_nearest_road_m','dist_nearest_industry_m','dist_nearest_agriculture_m']].median()
print("Median Distance to Emission Sources (Validation):")
print(distance_medians, "\n")

# ------------------------------------------------------------
# STEP 6: SAVE DATASET WITH NEW LABELS
# ------------------------------------------------------------
# Write the file BEFORE printing the confirmation below, so that a failed
# write (bad path, permissions, full disk) raises here and is never
# reported as a successful save.
df.to_csv("Final_clean_dataset_with_source.csv", index=False)

print("✅ All existing columns kept intact. Only 'pollution_source' added.\n")
print("✅ Dataset saved as: Final_clean_dataset_with_source.csv")
print("========================================================\n")




Thresholds Used (75th Percentile):
NO2  : 25.70
SO2  : 18.98
PM25 : 74.00
CO   : 0.94

Pollution Source Distribution:
pollution_source
Natural       53794
Vehicular     17914
Burning        4019
Industrial     2267
Name: count, dtype: int64 

Mean Pollutant Levels by Source (Validation):
                        no2        so2        pm25        co
pollution_source                                            
Burning           16.830919  19.237686  105.514169  1.402658
Industrial        10.172312  27.103658    5.990609  0.308474
Natural           12.928720  10.700804   44.333442  0.609525
Vehicular         44.149823  19.400965   79.678152  0.968191 

Median Distance to Emission Sources (Validation):
                  dist_nearest_road_m  dist_nearest_industry_m  \
pollution_source                                                 
Burning                     22.940665                  50000.0   
Industrial                  23.750911                      0.0   
Natural                    

In [5]:
# Preview the first five rows of the labelled frame.
df.head(n=5)

Unnamed: 0,state,district,location_id,location_name,datetime_utc,latitude,longitude,pm25,pm10,no2,...,so2_scaled,o3_scaled,temperature_scaled,humidity_scaled,wind_speed_scaled,wind_direction_scaled,datetime_local,near_dump,near_agriculture,pollution_source
0,Haryana,Faridabad,17,"R K Puram, Delhi - DPCC",2025-11-11 15:00:00+00:00,28.563262,77.186937,141.5,266.53125,51.393125,...,0.738325,-0.005949,-0.229326,0.25668,-0.199909,0.894188,2025-11-11 20:30:00,0,0,Vehicular
1,Andhra Pradesh,Vijayawada,5408,"Secretariat, Amaravati - APPCB",2025-11-11 15:00:00+00:00,16.515083,80.518167,56.0,169.0,43.2,...,-0.419886,-0.005949,0.256974,0.417283,-0.177697,-1.331553,2025-11-11 20:30:00,0,0,Natural
2,Gujarat,Surat,3409371,"Science Center, Surat - SMC",2025-11-11 15:00:00+00:00,21.170046,72.795405,67.27,90.8,6.48,...,0.353337,-0.005949,0.492559,-0.150848,-0.153263,1.308387,2025-11-11 20:30:00,0,0,Natural
3,Chhattisgarh,Durg,3409367,"Hathkhoj, Bhilai - CECB",2025-11-11 15:00:00+00:00,21.224231,81.40835,55.38,136.86,51.393125,...,-0.112473,-0.005949,-0.503815,-0.04204,-0.20935,-1.312793,2025-11-11 20:30:00,0,0,Vehicular
4,Jharkhand,Dhanbad,5546,Tata Stadium - Jorapokhar - JSPCB,2025-11-11 15:00:00+00:00,23.707909,86.41467,0.0,0.0,0.0,...,-0.56782,-0.005949,-2.206944,-2.433409,-0.182139,0.018094,2025-11-11 20:30:00,0,0,Natural
