In [3]:
#STEP-1 — Install & import libraries
!pip install osmnx geopandas shapely



In [5]:
import warnings

import geopandas as gpd
import osmnx as ox
import pandas as pd
from shapely.geometry import Point

In [7]:
# OSMnx session configuration.
# Keep OSMnx's own log output off the console.
ox.settings.log_console = False
# Turn on OSMnx's cache setting.
ox.settings.use_cache = True

In [9]:
#STEP-2 — Load your cleaned dataset
# Read the feature-engineered CSV into the working frame `df`.
df = pd.read_csv(
    filepath_or_buffer="data/india_states/feature_engineered_dataset.csv"
)

# Report the row count, then show the first rows as the cell's output.
print("Rows:", df.shape[0])
df.head()

Rows: 953755


Unnamed: 0,location_id,location_name,sensor_id,parameter_original,parameter_display,value,unit,datetime_utc,datetime_local,latitude,...,district_Visakhapatnam,parameter_humidity,parameter_no2,parameter_o3,parameter_pm10,parameter_pm25,parameter_so2,parameter_temperature,parameter_wind_direction,parameter_wind_speed
0,17,"R K Puram, Delhi - DPCC",12234784,no2,NO₂,110.2,ppb,2025-11-24 13:00:00+00:00,2025-11-24T18:30:00+05:30,28.563262,...,False,False,True,False,False,False,False,False,False,False
1,17,"R K Puram, Delhi - DPCC",12234782,co,CO,3.85,ppb,2025-11-24 13:00:00+00:00,2025-11-24T18:30:00+05:30,28.563262,...,False,False,False,False,False,False,False,False,False,False
2,17,"R K Puram, Delhi - DPCC",12234790,temperature,Temperature (C),17.5,c,2025-11-24 13:00:00+00:00,2025-11-24T18:30:00+05:30,28.563262,...,False,False,False,False,False,False,False,True,False,False
3,17,"R K Puram, Delhi - DPCC",12234788,relativehumidity,RH,78.0,%,2025-11-24 13:00:00+00:00,2025-11-24T18:30:00+05:30,28.563262,...,False,True,False,False,False,False,False,False,False,False
4,17,"R K Puram, Delhi - DPCC",12234789,so2,SO₂,3.5,ppb,2025-11-24 13:00:00+00:00,2025-11-24T18:30:00+05:30,28.563262,...,False,False,False,False,False,False,True,False,False,False


In [11]:
# Build one row per monitoring station.
# De-duplicate on location_id alone (keeping the first coordinates seen): if a
# station ever appears with more than one latitude/longitude pair, de-duplicating
# on all three columns would leave several rows for the same location_id, which
# repeats OSM lookups and multiplies rows when merging back on location_id.
stations = df[['location_id', 'latitude', 'longitude']].drop_duplicates(
    subset='location_id'
)

# Vectorized point construction; note the (x, y) = (longitude, latitude) order.
stations['geometry'] = gpd.points_from_xy(
    stations['longitude'], stations['latitude']
)

# EPSG:4326 = plain latitude/longitude coordinates.
gdf_stations = gpd.GeoDataFrame(stations, geometry='geometry', crs="EPSG:4326")
gdf_stations.head()

Unnamed: 0,location_id,latitude,longitude,geometry
0,17,28.563262,77.186937,POINT (77.18694 28.56326)
18286,5408,16.515083,80.518167,POINT (80.51817 16.51508)
41355,5542,31.321907,75.578914,POINT (75.57891 31.32191)
62929,5544,30.349388,76.366642,POINT (76.36664 30.34939)
84885,5546,23.707909,86.41467,POINT (86.41467 23.70791)


In [13]:
#STEP-3 — Define OSM feature tags
# Tag filter handed to the OSM feature query: a key mapped to True matches any
# value of that key; a key mapped to a list matches only the listed values.
TAGS = dict(
    highway=True,  # any road type
    landuse=['industrial', 'farmland', 'landfill'],
    amenity=['waste_disposal', 'recycling'],
)

In [15]:
# Default search distance around each station, in meters (2 km).
SEARCH_RADIUS = 2_000

In [17]:
#STEP-4 — Function to fetch nearby OSM features
def _flags_from_features(gdf):
    """Translate a table of OSM features into the five near_* 0/1 flags."""

    def column_has(column, wanted):
        # A tag column is only present when at least one returned feature
        # carries that tag, so a missing column means "not found".
        return column in gdf and bool(gdf[column].isin(wanted).any())

    return {
        "near_road": int('highway' in gdf and bool(gdf['highway'].notna().any())),
        "near_industry": int(column_has('landuse', ['industrial'])),
        "near_farmland": int(column_has('landuse', ['farmland'])),
        "near_landfill": int(column_has('landuse', ['landfill'])),
        "near_dumpyard": int(column_has('amenity', ['waste_disposal', 'recycling'])),
    }


def get_osm_features(lat, lon, radius=SEARCH_RADIUS):
    """Return 0/1 flags for the kinds of OSM features found around a point.

    Calls ``ox.features_from_point`` with the module-level ``TAGS`` filter and
    reports which feature categories appear in the result.

    Parameters
    ----------
    lat, lon : float
        Coordinates of the point, passed to OSMnx as ``(lat, lon)``.
    radius : int, optional
        Search distance in meters, forwarded to OSMnx as ``dist``.
        Defaults to ``SEARCH_RADIUS``.

    Returns
    -------
    pandas.Series
        Integer flags, in this order: ``near_road``, ``near_industry``,
        ``near_farmland``, ``near_landfill``, ``near_dumpyard``.

    Notes
    -----
    This is deliberately best-effort: if the lookup or its processing raises,
    all flags are returned as 0 so one failed station does not abort a batch
    run. A warning is emitted in that case, because an all-zero row produced
    by an error is otherwise indistinguishable from "nothing nearby".
    """
    try:
        gdf = ox.features_from_point((lat, lon), tags=TAGS, dist=radius)
        return pd.Series(_flags_from_features(gdf))
    except Exception as e:
        warnings.warn(
            f"OSM lookup failed for ({lat}, {lon}): "
            f"{type(e).__name__}: {e}; returning all-zero flags",
            stacklevel=2,
        )
        return pd.Series({
            "near_road": 0,
            "near_industry": 0,
            "near_farmland": 0,
            "near_landfill": 0,
            "near_dumpyard": 0
        })

In [21]:
#STEP-5 — Apply to all monitoring locations (with progress)
from tqdm import tqdm
tqdm.pandas()  # registers DataFrame.progress_apply

# One OSM lookup per station row. get_osm_features returns a Series, so the
# result is a frame of near_* flag columns aligned on the station index.
osm_features = gdf_stations.progress_apply(
    lambda r: get_osm_features(r.latitude, r.longitude), axis=1
)

# gdf_stations is overwritten below, so on a re-run of this cell it already
# carries near_* columns. Drop those first; otherwise the concat would append
# a second set of identically named columns.
gdf_stations = pd.concat(
    [
        gdf_stations.drop(columns=osm_features.columns, errors='ignore'),
        osm_features,
    ],
    axis=1,
)
gdf_stations.head()

100%|██████████████████████████████████████████████████████████████████████████████████| 51/51 [05:44<00:00,  6.76s/it]


Unnamed: 0,location_id,latitude,longitude,geometry,near_road,near_industry,near_farmland,near_landfill,near_dumpyard
0,17,28.563262,77.186937,POINT (77.18694 28.56326),1,0,0,0,1
18286,5408,16.515083,80.518167,POINT (80.51817 16.51508),1,0,1,0,0
41355,5542,31.321907,75.578914,POINT (75.57891 31.32191),1,0,0,0,0
62929,5544,30.349388,76.366642,POINT (76.36664 30.34939),1,1,0,0,0
84885,5546,23.707909,86.41467,POINT (86.41467 23.70791),1,1,0,0,0


In [23]:
#STEP-6 — Merge OSM features back into main dataset
osm_cols = ['near_road', 'near_industry', 'near_farmland', 'near_landfill', 'near_dumpyard']

# Drop flag columns left by a previous run of this cell; merging again on top
# of them would produce near_*_x / near_*_y columns instead of replacing them.
# validate='many_to_one' makes pandas raise if location_id is not unique in the
# station table, instead of silently multiplying measurement rows.
df = df.drop(columns=osm_cols, errors='ignore').merge(
    gdf_stations[['location_id'] + osm_cols],
    on='location_id',
    how='left',
    validate='many_to_one',
)

In [25]:
#fill missing values
# Only the OSM flag columns added by the merge are filled. A frame-wide
# fillna(0) would also turn missing measurements, coordinates and any other
# NaN in the dataset into a literal 0.
osm_flag_cols = ['near_road', 'near_industry', 'near_farmland', 'near_landfill', 'near_dumpyard']
df[osm_flag_cols] = df[osm_flag_cols].fillna(0).astype(int)

In [27]:
#STEP-7 — Save OSM-enriched dataset
# Destination of the enriched CSV; the row index is not written out.
output_file = "data/india_states/osm_enriched_dataset.csv"
df.to_csv(path_or_buf=output_file, index=False)

# Confirm where the file went.
print("Saved:", output_file)

Saved: data/india_states/osm_enriched_dataset.csv
