In [1]:
import pandas as pd
import numpy as np
import osmnx as ox
from shapely.geometry import Point
from concurrent.futures import ThreadPoolExecutor, as_completed
import warnings
import time

In [2]:
# Global runtime configuration for the notebook.

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# OSMnx: abort HTTP requests after 30 (seconds, per the OSMnx settings docs),
# do not reuse cached responses, and keep OSMnx log messages off the console.
ox.settings.requests_timeout = 30
ox.settings.use_cache = False
ox.settings.log_console = False

In [3]:
# OpenStreetMap tag filter passed to the feature download: each key is an OSM
# tag name and each list holds the tag values that should be fetched.
TAGS = dict(
    highway=['motorway', 'trunk', 'primary', 'secondary'],
    landuse=['industrial', 'commercial', 'farmland', 'farm', 'landfill', 'residential'],
    amenity=['waste_disposal'],
)

In [4]:
def generate_satellite_points(city, country, lat=None, lon=None, num_points=5, radius_km=8, seed=None):
    """
    Build the sampling locations for one city: its centre plus random satellites.

    If lat/lon is missing (None or NaN), the centre is geocoded from
    "<city>, <country>". Then 'num_points - 1' extra points are scattered
    around that centre at a random bearing and a random distance.

    Parameters
    ----------
    city, country : str
        Copied into every returned record and used to build the geocoding query.
    lat, lon : float, optional
        Centre coordinates in decimal degrees. None or NaN triggers geocoding.
    num_points : int
        Total number of records to return, centre included.
    radius_km : float
        Maximum distance of a satellite point from the centre, in kilometres.
        The minimum distance is 2 km, or 'radius_km' itself when that is smaller.
    seed : int, optional
        When given, satellites are drawn from a private generator seeded with
        this value so the result is reproducible. When None (the default) the
        global numpy random state is used, exactly as before.

    Returns
    -------
    list of dict
        Records with keys 'City', 'Country', 'Location_Type', 'latitude' and
        'longitude'. Empty when the city could not be geocoded.
    """
    # 1. Get Center Coordinates if missing.
    # pandas yields NaN (not None) for empty CSV cells, so check for both.
    if pd.isna(lat) or pd.isna(lon):
        try:
            # Geocode the city (e.g., "Delhi, India")
            lat, lon = ox.geocode(f"{city}, {country}")
            # Pause to be nice to the geocoding API
            time.sleep(1)
        except Exception:
            # Skip if city not found. Exception (not a bare except) so that
            # Ctrl-C / kernel interrupt still stops the run.
            return []

    # 2. Add the Center Point
    points = [{
        'City': city, 'Country': country,
        'Location_Type': 'City Center',
        'latitude': lat, 'longitude': lon
    }]

    # 3. Generate Satellite Points (Math logic)
    rng = np.random if seed is None else np.random.RandomState(seed)
    # Keep the lower bound <= upper bound even for radius_km < 2.
    min_dist_km = min(2, radius_km)
    # Approximate conversions: ~111 km per degree of latitude; a degree of
    # longitude shrinks with the cosine of the latitude.
    km_per_deg_lat = 111.0
    km_per_deg_lon = 111.0 * np.cos(np.radians(lat))

    for i in range(num_points - 1):
        # Random angle (0 to 360 degrees), then random distance from the centre.
        theta = rng.uniform(0, 2 * np.pi)
        dist = rng.uniform(min_dist_km, radius_km)

        delta_lat = (dist / km_per_deg_lat) * np.cos(theta)
        delta_lon = (dist / km_per_deg_lon) * np.sin(theta)

        points.append({
            'City': city, 'Country': country,
            'Location_Type': f'Sector_{i+1}',
            'latitude': round(lat + delta_lat, 5),
            'longitude': round(lon + delta_lon, 5)
        })

    return points

In [5]:
def fetch_features_safe(row):
    """
    Download nearby OSM features for one location and summarise them.

    Best-effort: any failure (network error, no features, unexpected data)
    leaves the default values in place instead of raising. A RuntimeWarning
    describing the failure is emitted; it is invisible when warnings are
    globally filtered.

    Parameters
    ----------
    row : pandas.Series or dict
        Must provide 'latitude' and 'longitude' in decimal degrees. It is
        copied, never modified.

    Returns
    -------
    Same type as 'row'
        A copy of 'row' with six extra entries: 'dist_road', 'dist_industry',
        'dist_farm' (distance in metres to the nearest matching feature,
        default 2000) and 'road_count', 'industry_count', 'farm_count'
        (number of matching features, default 0).
    """
    lat, lon = row['latitude'], row['longitude']
    data = row.copy()

    # Default values
    defaults = {
        'dist_road': 2000, 'road_count': 0,
        'dist_industry': 2000, 'industry_count': 0,
        'dist_farm': 2000, 'farm_count': 0
    }
    # Assign key by key: Series.update() only overwrites labels that already
    # exist, so it would silently drop every default for a pandas row.
    for key, value in defaults.items():
        data[key] = value

    # (output prefix) -> (OSM tag column, accepted values or None for "any value")
    groups = {
        'road': ('highway', None),
        'industry': ('landuse', ['industrial', 'commercial']),
        'farm': ('landuse', ['farmland', 'farm']),
    }

    try:
        # TIMEOUT PROTECTION: relies on the OSMnx request timeout configured for the session
        gdf = ox.features_from_point((lat, lon), tags=TAGS, dist=2000)

        if not gdf.empty:
            # Project to the local UTM zone so distances are true metres.
            # Web Mercator (EPSG:3857) stretches lengths by ~1/cos(latitude).
            gdf = ox.projection.project_gdf(gdf)
            center = ox.projection.project_geometry(Point(lon, lat), to_crs=gdf.crs)[0]

            for name, (column, values) in groups.items():
                # A tag column only exists if at least one feature carries it.
                if column not in gdf.columns:
                    continue
                if values is None:
                    subset = gdf[gdf[column].notna()]
                else:
                    subset = gdf[gdf[column].isin(values)]
                if not subset.empty:
                    data[f'dist_{name}'] = round(subset.distance(center).min(), 2)
                    data[f'{name}_count'] = len(subset)

    except Exception as exc:
        # Deliberately non-fatal: keep whatever was filled in so far.
        warnings.warn(
            f"Feature lookup failed for ({lat}, {lon}): {exc!r}; using defaults",
            RuntimeWarning,
            stacklevel=2,
        )

    return data

In [None]:
def process_cities_robust(input_csv, output_csv="site_data_robust.csv", max_cities=100, max_workers=5):
    """
    Run the full pipeline: read cities, expand them to points, fetch features, save.

    Parameters
    ----------
    input_csv : str or path-like
        CSV with at least 'City' and 'Country' columns. Optional 'Latitude' /
        'Longitude' columns supply the city centre; missing or empty values
        are passed on as None.
    output_csv : str or path-like
        Where the result table is written (without the index).
    max_cities : int
        Only the first 'max_cities' rows of the input are processed.
    max_workers : int
        Number of threads used for the feature downloads.

    Returns
    -------
    pandas.DataFrame
        One row per successfully processed point, in the order the points were
        generated (not in download-completion order).
    """
    print("Loading data...")
    df = pd.read_csv(input_csv)

    # Take the first N cities
    target_cities = df.iloc[:max_cities]

    print("Expanding locations...")
    all_locations = []
    for _, row in target_cities.iterrows():
        lat = row.get('Latitude')
        lon = row.get('Longitude')
        # Empty CSV cells arrive as NaN; normalise to None.
        if pd.isna(lat) or pd.isna(lon):
            lat = lon = None
        all_locations.extend(
            generate_satellite_points(row['City'], row['Country'], lat, lon)
        )

    expansion_df = pd.DataFrame(all_locations)
    total = len(expansion_df)
    print(f"Created {total} points.")

    print("Starting feature download...")
    results_by_position = {}

    # Use few workers to reduce blocking risk
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember each task's position so the output order is deterministic.
        futures = {
            executor.submit(fetch_features_safe, row): position
            for position, (_, row) in enumerate(expansion_df.iterrows())
        }

        for done, future in enumerate(as_completed(futures), start=1):
            try:
                results_by_position[futures[future]] = future.result()
            except Exception as exc:
                # Skip this point but say why; Ctrl-C is not swallowed.
                print(f"A thread failed (skipping): {exc!r}")

            # Print periodically for progress visibility
            if done % 10 == 0:
                print(f"{done}/{total} completed...")

    final_results = [results_by_position[pos] for pos in sorted(results_by_position)]
    final_df = pd.DataFrame(final_results)
    final_df.to_csv(output_csv, index=False)
    print("Done.")
    return final_df

# Entry point: run the whole pipeline on the project's input CSV.
# NOTE(review): in a notebook kernel __name__ is "__main__", so this call
# fires every time the cell is executed, not only when run as a script.
if __name__ == "__main__":
    process_cities_robust("Pollution_Weather_datset.csv")


Loading data...
Expanding locations...
Created 500 points.
Starting feature download...
10/500 completed...
20/500 completed...
30/500 completed...
40/500 completed...
50/500 completed...
60/500 completed...
70/500 completed...
80/500 completed...
90/500 completed...
100/500 completed...
110/500 completed...
120/500 completed...
130/500 completed...
140/500 completed...
150/500 completed...
160/500 completed...
170/500 completed...
180/500 completed...
190/500 completed...
200/500 completed...
210/500 completed...
220/500 completed...
230/500 completed...
240/500 completed...
250/500 completed...
260/500 completed...
270/500 completed...
280/500 completed...
290/500 completed...
300/500 completed...
310/500 completed...
320/500 completed...
330/500 completed...
340/500 completed...
350/500 completed...
360/500 completed...
370/500 completed...
380/500 completed...
390/500 completed...
400/500 completed...
410/500 completed...
