In [1]:
#STEP-1 — Load OSM-enriched dataset
import pandas as pd

df = pd.read_csv("data/india_states/osm_enriched_dataset.csv")
print("Rows:", len(df))
df.head()

Rows: 953755


Unnamed: 0,location_id,location_name,sensor_id,parameter_original,parameter_display,value,unit,datetime_utc,datetime_local,latitude,...,parameter_pm25,parameter_so2,parameter_temperature,parameter_wind_direction,parameter_wind_speed,near_road,near_industry,near_farmland,near_landfill,near_dumpyard
0,17,"R K Puram, Delhi - DPCC",12234784,no2,NO₂,110.2,ppb,2025-11-24 13:00:00+00:00,2025-11-24T18:30:00+05:30,28.563262,...,False,False,False,False,False,1,0,0,0,1
1,17,"R K Puram, Delhi - DPCC",12234782,co,CO,3.85,ppb,2025-11-24 13:00:00+00:00,2025-11-24T18:30:00+05:30,28.563262,...,False,False,False,False,False,1,0,0,0,1
2,17,"R K Puram, Delhi - DPCC",12234790,temperature,Temperature (C),17.5,c,2025-11-24 13:00:00+00:00,2025-11-24T18:30:00+05:30,28.563262,...,False,False,True,False,False,1,0,0,0,1
3,17,"R K Puram, Delhi - DPCC",12234788,relativehumidity,RH,78.0,%,2025-11-24 13:00:00+00:00,2025-11-24T18:30:00+05:30,28.563262,...,False,False,False,False,False,1,0,0,0,1
4,17,"R K Puram, Delhi - DPCC",12234789,so2,SO₂,3.5,ppb,2025-11-24 13:00:00+00:00,2025-11-24T18:30:00+05:30,28.563262,...,False,True,False,False,False,1,0,0,0,1


In [5]:
#STEP-2 — Create helper pollution features
high_pollution = df['value'] > df['value'].median()
very_high_pollution = df['value'] > df['value'].quantile(0.75)

#STEP-3 — Rule-Based Labeling Function
def assign_source(row):

    pollutant = str(row['parameter_original']).lower()

    # Vehicular / Traffic
    if row['near_road'] == 1 and row['hour'] in range(7,11) or row['hour'] in range(17,21):
        if pollutant in ['pm25','pm10','no2']:
            return "Vehicular"
    
    # Industrial activity
    if row['near_industry'] == 1 and pollutant in ['so2','no2','pm25']:
        return "Industrial"
    
    # Agricultural burning / crop fields
    if row['near_farmland'] == 1 and pollutant in ['pm25','pm10'] and row['month'] in [3,4,10,11]:
        return "Agricultural Burning"
    
    # Waste burning / landfill emissions
    if row['near_dumpyard'] == 1 or row['near_landfill'] == 1:
        if pollutant in ['co','pm25','pm10']:
            return "Waste-Burning"
    
    # Default class
    return "Natural / Background"


#apply the rule
df['pollution_source'] = df.apply(assign_source, axis=1)
#check distribution
df['pollution_source'].value_counts()

pollution_source
Natural / Background    649358
Industrial              185039
Vehicular                92226
Waste-Burning            21764
Agricultural Burning      5368
Name: count, dtype: int64

In [7]:
#STEP-4 — Balance & sanity-check labels
df['pollution_source'].value_counts(normalize=True)*100

#STEP-5 — Save labeled dataset
output_file = "data/india_states/labeled_dataset.csv"
df.to_csv(output_file, index=False)

print("Saved labeled dataset to:", output_file)

Saved labeled dataset to: data/india_states/labeled_dataset.csv


In [9]:
import pandas as pd

pd.read_csv("data/india_states/labeled_dataset.csv").head()

Unnamed: 0,location_id,location_name,sensor_id,parameter_original,parameter_display,value,unit,datetime_utc,datetime_local,latitude,...,parameter_so2,parameter_temperature,parameter_wind_direction,parameter_wind_speed,near_road,near_industry,near_farmland,near_landfill,near_dumpyard,pollution_source
0,17,"R K Puram, Delhi - DPCC",12234784,no2,NO₂,110.2,ppb,2025-11-24 13:00:00+00:00,2025-11-24T18:30:00+05:30,28.563262,...,False,False,False,False,1,0,0,0,1,Natural / Background
1,17,"R K Puram, Delhi - DPCC",12234782,co,CO,3.85,ppb,2025-11-24 13:00:00+00:00,2025-11-24T18:30:00+05:30,28.563262,...,False,False,False,False,1,0,0,0,1,Waste-Burning
2,17,"R K Puram, Delhi - DPCC",12234790,temperature,Temperature (C),17.5,c,2025-11-24 13:00:00+00:00,2025-11-24T18:30:00+05:30,28.563262,...,False,True,False,False,1,0,0,0,1,Natural / Background
3,17,"R K Puram, Delhi - DPCC",12234788,relativehumidity,RH,78.0,%,2025-11-24 13:00:00+00:00,2025-11-24T18:30:00+05:30,28.563262,...,False,False,False,False,1,0,0,0,1,Natural / Background
4,17,"R K Puram, Delhi - DPCC",12234789,so2,SO₂,3.5,ppb,2025-11-24 13:00:00+00:00,2025-11-24T18:30:00+05:30,28.563262,...,True,False,False,False,1,0,0,0,1,Natural / Background
