In [1]:
import os
import time
from datetime import datetime

import numpy as np
import osmnx as ox
import pandas as pd
import requests

# ==========================================
# 1. CONFIGURATION
# ==========================================
# The OpenWeatherMap key is read from the environment so the secret never lives
# in the notebook or its version history. Set it before starting Jupyter, e.g.
#   export OWM_API_KEY=<your key>
# An empty string is used when the variable is unset; the API calls will then
# report an authentication error instead of silently using a shared key.
# NOTE(review): any key that was previously pasted here should be treated as
# leaked and rotated in the OpenWeatherMap dashboard.
OWM_API_KEY = os.environ.get("OWM_API_KEY", "")
SATELLITES_PER_CITY = 5   # Points sampled per city: the center + 4 random points
RADIUS_KM = 5             # Maximum distance (km) of a random point from the center

# ==========================================
# 2. CITY DATA (Your List)
# ==========================================
# Maps a country name to the list of city names to sample in that country
# (ten countries, 50 entries each). Dict and list order is the processing order
# used by main(). Each city is geocoded with the query "<city>, <country>", so
# the same city name may legitimately appear under two countries
# (e.g. "Hyderabad" under both India and Pakistan).
TARGET_COUNTRIES = {
    "India": [
        "New Delhi", "Mumbai", "Bangalore", "Hyderabad", "Ahmedabad",
        "Chennai", "Kolkata", "Surat", "Pune", "Jaipur",
        "Lucknow", "Kanpur", "Nagpur", "Indore", "Thane",
        "Bhopal", "Visakhapatnam", "Pimpri-Chinchwad", "Patna", "Vadodara",
        "Ghaziabad", "Ludhiana", "Agra", "Nashik", "Faridabad",
        "Meerut", "Rajkot", "Kalyan-Dombivli", "Vasai-Virar", "Varanasi",
        "Srinagar", "Aurangabad", "Dhanbad", "Amritsar", "Navi Mumbai",
        "Allahabad", "Howrah", "Ranchi", "Gwalior", "Jabalpur",
        "Coimbatore", "Vijayawada", "Jodhpur", "Madurai", "Raipur",
        "Kota", "Chandigarh", "Guwahati", "Solapur", "Hubli-Dharwad"
    ],
    "USA": [
        "New York", "Los Angeles", "Chicago", "Houston", "Phoenix",
        "Philadelphia", "San Antonio", "San Diego", "Dallas", "San Jose",
        "Austin", "Jacksonville", "Fort Worth", "Columbus", "Indianapolis",
        "Charlotte", "San Francisco", "Seattle", "Denver", "Oklahoma City",
        "Nashville", "El Paso", "Washington", "Boston", "Las Vegas",
        "Detroit", "Portland", "Memphis", "Louisville", "Baltimore",
        "Milwaukee", "Albuquerque", "Tucson", "Fresno", "Mesa",
        "Sacramento", "Atlanta", "Kansas City", "Colorado Springs", "Omaha",
        "Raleigh", "Miami", "Long Beach", "Virginia Beach", "Oakland",
        "Minneapolis", "Tulsa", "Tampa", "Arlington", "New Orleans"
    ],
    "China": [
        "Chongqing", "Shanghai", "Beijing", "Chengdu", "Guangzhou",
        "Shenzhen", "Tianjin", "Wuhan", "Dongguan", "Xi'an",
        "Hangzhou", "Foshan", "Nanjing", "Shenyang", "Qingdao",
        "Zhengzhou", "Changsha", "Kunming", "Jinan", "Shantou",
        "Hefei", "Harbin", "Suzhou", "Dalian", "Ningbo",
        "Xiamen", "Changchun", "Taiyuan", "Nanning", "Wuxi",
        "Guiyang", "Ürümqi", "Fuzhou", "Zibo", "Tangshan",
        "Zhongshan", "Yantai", "Baotou", "Lanzhou", "Huai'an",
        "Nanchang", "Daqing", "Weifang", "Handan", "Luoyang",
        "Linyi", "Xuzhou", "Quanzhou", "Changzhou", "Nantong"
    ],
    "Germany": [
        "Berlin", "Hamburg", "Munich", "Cologne", "Frankfurt",
        "Stuttgart", "Düsseldorf", "Leipzig", "Dortmund", "Essen",
        "Bremen", "Dresden", "Hanover", "Nuremberg", "Duisburg",
        "Bochum", "Wuppertal", "Bielefeld", "Bonn", "Münster",
        "Karlsruhe", "Mannheim", "Augsburg", "Wiesbaden", "Gelsenkirchen",
        "Mönchengladbach", "Braunschweig", "Chemnitz", "Kiel", "Aachen",
        "Halle", "Magdeburg", "Freiburg", "Krefeld", "Lübeck",
        "Oberhausen", "Erfurt", "Mainz", "Rostock", "Kassel",
        "Hagen", "Saarbrücken", "Hamm", "Potsdam", "Mülheim",
        "Ludwigshafen", "Oldenburg", "Leverkusen", "Osnabrück", "Solingen"
    ],
    "Brazil": [
        "São Paulo", "Rio de Janeiro", "Brasília", "Salvador", "Fortaleza",
        "Belo Horizonte", "Manaus", "Curitiba", "Recife", "Goiânia",
        "Belém", "Porto Alegre", "Guarulhos", "Campinas", "São Luís",
        "São Gonçalo", "Maceió", "Duque de Caxias", "Natal", "Teresina",
        "Campo Grande", "Nova Iguaçu", "João Pessoa", "Santo André", "São Bernardo do Campo",
        "Osasco", "Jaboatão dos Guararapes", "Uberlândia", "Contagem", "Sorocaba",
        "Aracaju", "Feira de Santana", "Cuiabá", "Joinville", "Juiz de Fora",
        "Londrina", "Niterói", "Aparecida de Goiânia", "Ananindeua", "Caxias do Sul",
        "Porto Velho", "Serra", "Vila Velha", "Florianópolis", "Mauá",
        "São Vicente", "Mogi das Cruzes", "Betim", "Santos", "Diadema"
    ],
    "Indonesia": [
        "Jakarta", "Surabaya", "Bekasi", "Bandung", "Medan",
        "Depok", "Tangerang", "Palembang", "Semarang", "Makassar",
        "South Tangerang", "Batam", "Bogor", "Pekanbaru", "Bandar Lampung",
        "Padang", "Malang", "Denpasar", "Samarinda", "Tasikmalaya",
        "Pontianak", "Banjarmasin", "Jambi", "Cimahi", "Balikpapan",
        "Manado", "Yogyakarta", "Surakarta", "Serang", "Mataram",
        "Kupang", "Palu", "Ambon", "Kendari", "Bengkulu",
        "Pangkal Pinang", "Jayapura", "Cilegon", "Tegal", "Probolinggo",
        "Binjai", "Sukabumi", "Pematangsiantar", "Cirebon", "Banda Aceh",
        "Tanjungpinang", "Gorontalo", "Dumai", "Metro", "Bontang"
    ],
    "Pakistan": [
        "Karachi", "Lahore", "Faisalabad", "Rawalpindi", "Gujranwala",
        "Peshawar", "Multan", "Hyderabad", "Islamabad", "Quetta",
        "Bahawalpur", "Sargodha", "Sialkot", "Sukkur", "Larkana",
        "Sheikhupura", "Rahim Yar Khan", "Jhang", "Dera Ghazi Khan", "Gujrat",
        "Sahiwal", "Wah Cantonment", "Mardan", "Kasur", "Okara",
        "Mingora", "Nawabshah", "Chiniot", "Kotri", "Kamoke",
        "Hafizabad", "Sadiqabad", "Mirpur Khas", "Burewala", "Kohat",
        "Khanewal", "Dadu", "Muzaffargarh", "Jacobabad", "Shikarpur",
        "Jaranwala", "Khanpur", "Tando Allahyar", "Khairpur", "Chishtian",
        "Abbottabad", "Daska", "Pakpattan", "Bahawalnagar", "Tando Adam"
    ],
    "Nigeria": [
        "Lagos", "Kano", "Ibadan", "Abuja", "Port Harcourt",
        "Benin City", "Jos", "Ilora", "Kaduna", "Enugu",
        "Zaria", "Warri", "Maiduguri", "Aba", "Onitsha",
        "Ogbomosho", "Ikare", "Ado Ekiti", "Bauchi", "Ilorin",
        "Akure", "Owerri", "Sokoto", "Ebute Ikorodu", "Calabar",
        # NOTE(review): "Ifẹ" on the next line and "Ife" two lines below look
        # like the same city spelled two ways — confirm, and replace one of
        # them if so, otherwise that city is sampled twice.
        "Uyo", "Minna", "Abeokuta", "Osogbo", "Ifẹ",
        "Makurdi", "Ondo", "Jimeta", "Gusau", "Mubi",
        "Umuahia", "Ife", "Oyo", "Awka", "Damaturu",
        "Iwo", "Ilesa", "Katsina", "Nnewi", "Bida",
        "Ikot Ekpene", "Lafia", "Okene", "Suleja", "Nguru"
    ],
    "Bangladesh": [
        "Dhaka", "Chittagong", "Khulna", "Rajshahi", "Comilla",
        "Sylhet", "Barisal", "Rangpur", "Narsingdi", "Narayanganj",
        "Gazipur", "Mymensingh", "Bogra", "Jessore", "Dinajpur",
        "Saidpur", "Cox's Bazar", "Tangail", "Pabna", "Sirajganj",
        "Nawabganj", "Kushtia", "Faridpur", "Jamalpur", "Naogaon",
        "Feni", "Brahmanbaria", "Tongī", "Shibganj", "Savār",
        "Kishoreganj", "Habiganj", "Chandpur", "Patiya", "Natore",
        "Lakshmipur", "Bhola", "Noakhali", "Manikganj", "Chuadanga",
        "Satkhira", "Madaripur", "Meherpur", "Sherpur", "Panchagarh",
        "Kurigram", "Gaibandha", "Nilphamari", "Lalmonirhat", "Magura"
    ],
    "Russia": [
        "Moscow", "Saint Petersburg", "Novosibirsk", "Yekaterinburg", "Kazan",
        "Nizhny Novgorod", "Chelyabinsk", "Omsk", "Samara", "Rostov-on-Don",
        "Ufa", "Krasnoyarsk", "Voronezh", "Perm", "Volgograd",
        "Saratov", "Krasnodar", "Tolyatti", "Izhevsk", "Ulyanovsk",
        "Barnaul", "Tyumen", "Irkutsk", "Khabarovsk", "Makhachkala",
        "Vladivostok", "Yaroslavl", "Tomsk", "Orenburg", "Novokuznetsk",
        "Kemerovo", "Ryazan", "Astrakhan", "Penza", "Naberezhnye Chelny",
        "Lipetsk", "Tula", "Kirov", "Cheboksary", "Kaliningrad",
        "Bryansk", "Kursk", "Ivanovo", "Magnitogorsk", "Ulan-Ude",
        "Tver", "Stavropol", "Nizhny Tagil", "Belgorod", "Arkhangelsk"
    ]
}

# ==========================================
# 3. HELPER FUNCTIONS
# ==========================================

def get_coordinates(city, country):
    """
    Geocode "<city>, <country>" through osmnx and return the result.

    Returns:
        (lat, lon) as produced by ox.geocode, or (None, None) when the lookup
        raises; the failure is printed rather than propagated so the caller
        can skip the city and carry on.
    """
    try:
        latitude, longitude = ox.geocode(f"{city}, {country}")
    except Exception as err:
        # Best-effort: report the problem and signal "no coordinates".
        print(f"  [Error] Geocoding failed for {city}: {err}")
        return None, None
    else:
        return latitude, longitude

def fetch_weather_pollution(lat, lon, timeout=10):
    """
    Calls OpenWeatherMap APIs for Weather and Air Pollution.

    Both requests are made over HTTPS (the API key travels in the query
    string) and with a timeout, so a stalled connection cannot hang the
    whole collection run.

    Args:
        lat: Latitude of the point to query.
        lon: Longitude of the point to query.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        dict containing whichever of these keys could be fetched:
        Temperature, Humidity, Wind_Speed, Wind_Direction, Weather_Condition
        (weather endpoint, requested with units=metric) and
        AQI, CO, NO, NO2, O3, SO2, PM2_5, PM10, NH3 (air-pollution endpoint).
        Failures are printed, never raised, so the dict may be partial or
        empty.
    """
    data = {}
    # Let requests build and URL-encode the query string.
    base_params = {'lat': lat, 'lon': lon, 'appid': OWM_API_KEY}

    # --- A. Current Weather ---
    try:
        res_w = requests.get(
            "https://api.openweathermap.org/data/2.5/weather",
            params={**base_params, 'units': 'metric'},
            timeout=timeout,
        ).json()

        if res_w.get('cod') == 200:
            data['Temperature'] = res_w['main']['temp']
            data['Humidity'] = res_w['main']['humidity']
            data['Wind_Speed'] = res_w['wind']['speed']
            # 'deg' can be absent from the wind block; 0 is stored in that case.
            data['Wind_Direction'] = res_w['wind'].get('deg', 0)
            data['Weather_Condition'] = res_w['weather'][0]['main']
        else:
            print(f"    [API Error] Weather: {res_w.get('message')}")
    except Exception as e:
        # Covers network failures, timeouts, bad JSON and unexpected payloads.
        print(f"    [Net Error] Weather: {e}")

    # --- B. Air Pollution ---
    try:
        res_p = requests.get(
            "https://api.openweathermap.org/data/2.5/air_pollution",
            params=base_params,
            timeout=timeout,
        ).json()

        # A missing OR empty 'list' both mean "no reading for this point".
        readings = res_p.get('list')
        if readings:
            comp = readings[0]['components']
            main = readings[0]['main']

            data['AQI'] = main['aqi']
            data['CO'] = comp['co']
            data['NO'] = comp['no']
            data['NO2'] = comp['no2']
            data['O3'] = comp['o3']
            data['SO2'] = comp['so2']
            data['PM2_5'] = comp['pm2_5']
            data['PM10'] = comp['pm10']
            data['NH3'] = comp['nh3']
        else:
            print(f"    [API Error] Pollution: No list found")
    except Exception as e:
        print(f"    [Net Error] Pollution: {e}")

    return data

# ==========================================
# 4. MAIN LOOP
# ==========================================

def _satellite_points(center_lat, center_lon, count, radius_km, rng):
    """Return `count` random points scattered 1..radius_km km around the center."""
    points = []
    for i in range(count):
        theta = rng.uniform(0, 2 * np.pi)   # bearing
        dist = rng.uniform(1, radius_km)    # distance from center, km

        # ~111 km per degree of latitude; a degree of longitude shrinks by
        # cos(latitude), hence the extra factor in the denominator.
        delta_lat = (dist / 111.0) * np.cos(theta)
        delta_lon = (dist / (111.0 * np.cos(np.radians(center_lat)))) * np.sin(theta)

        points.append({
            'Location_Type': f'Sector_{i+1}',
            'lat': round(center_lat + delta_lat, 5),
            'lon': round(center_lon + delta_lon, 5)
        })
    return points


def _order_columns(df):
    """Put the preferred columns first and keep every other collected column after them."""
    preferred = ['Country', 'City', 'Location_Type', 'Latitude', 'Longitude',
                 'AQI', 'PM2_5', 'NO2', 'SO2', 'O3', 'CO', 'PM10',
                 'Temperature', 'Humidity', 'Wind_Speed', 'Weather_Condition']

    # Only reference columns that exist (API errors can leave some out) ...
    leading = [c for c in preferred if c in df.columns]
    # ... and append the rest so collected fields such as Timestamp, NO, NH3
    # and Wind_Direction are reordered rather than silently dropped.
    trailing = [c for c in df.columns if c not in preferred]
    return df[leading + trailing]


def main(seed=None):
    """
    Collect weather and air-pollution readings for every configured city and
    write them to "Pollution_Weather_ten_countries.csv".

    For each city in TARGET_COUNTRIES the city center is geocoded, then
    SATELLITES_PER_CITY points (the center plus random points within
    RADIUS_KM) are queried via fetch_weather_pollution. Cities that cannot be
    geocoded are skipped. One CSV row is written per queried point.

    Args:
        seed: Optional seed for the random satellite points. With the default
            (None) the global numpy random state is used, as before; pass an
            integer to make the sampled points reproducible.
    """
    print("--- Starting Master Data Collection ---")
    print(f"Target: {len(TARGET_COUNTRIES)} countries.")

    rng = np.random if seed is None else np.random.default_rng(seed)
    all_data = []

    for country, cities in TARGET_COUNTRIES.items():
        print(f"\nProcessing Country: {country.upper()}")

        for city in cities:
            print(f"  > Processing {city}...", end=" ", flush=True)

            # 1. Get City Center
            center_lat, center_lon = get_coordinates(city, country)

            if center_lat is None:
                continue # Skip if geocoding failed

            # 2. Define Points (Center + Satellites)
            points_to_check = [{
                'Location_Type': 'City Center',
                'lat': center_lat, 'lon': center_lon
            }]
            points_to_check.extend(
                _satellite_points(center_lat, center_lon,
                                  SATELLITES_PER_CITY - 1, RADIUS_KM, rng)
            )

            # 3. Fetch Data for these Points
            for pt in points_to_check:
                env_data = fetch_weather_pollution(pt['lat'], pt['lon'])

                row = {
                    'Country': country,
                    'City': city,
                    'Location_Type': pt['Location_Type'],
                    'Latitude': pt['lat'],
                    'Longitude': pt['lon'],
                    'Timestamp': time.time()
                }
                # Readings may be partial or empty when an API call failed.
                row.update(env_data)
                all_data.append(row)

            print(f"Done ({SATELLITES_PER_CITY} pts)")
            # Sleep slightly to avoid rate limits (60 calls/min free tier)
            # NOTE(review): each city issues 2 API calls per point before this
            # pause, so the stated limit may still be exceeded — confirm.
            time.sleep(0.5)

    # 5. Save Master Dataset
    df = _order_columns(pd.DataFrame(all_data))

    filename = "Pollution_Weather_ten_countries.csv"
    df.to_csv(filename, index=False)
    print(f"\nSUCCESS! Collected {len(df)} rows.")
    print(f"Data saved to: {filename}")

# Start the collection only when this code is executed directly (as a script
# or a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

--- Starting Master Data Collection ---
Target: 10 countries.

Processing Country: INDIA
  > Processing New Delhi... Done (5 pts)
  > Processing Mumbai... Done (5 pts)
  > Processing Bangalore... Done (5 pts)
  > Processing Hyderabad... Done (5 pts)
  > Processing Ahmedabad... Done (5 pts)
  > Processing Chennai... Done (5 pts)
  > Processing Kolkata... Done (5 pts)
  > Processing Surat... Done (5 pts)
  > Processing Pune... Done (5 pts)
  > Processing Jaipur... Done (5 pts)
  > Processing Lucknow... Done (5 pts)
  > Processing Kanpur... Done (5 pts)
  > Processing Nagpur... Done (5 pts)
  > Processing Indore... Done (5 pts)
  > Processing Thane... Done (5 pts)
  > Processing Bhopal... Done (5 pts)
  > Processing Visakhapatnam... Done (5 pts)
  > Processing Pimpri-Chinchwad... Done (5 pts)
  > Processing Patna... Done (5 pts)
  > Processing Vadodara... Done (5 pts)
  > Processing Ghaziabad... Done (5 pts)
  > Processing Ludhiana... Done (5 pts)
  > Processing Agra... Done (5 pts)
  > P