In [None]:
import time
from datetime import datetime, timezone

import pandas as pd
import requests

# OpenWeatherMap API key; replace the placeholder before running.
API_KEY = "your API key"

# ==========================
# 1. Load CSV File
# ==========================
# The CSV must provide a "City" column: the fetch loop reads row["City"].
file_path = r"Update with your CSV file path"
df_cities = pd.read_csv(file_path)

print(f"Total cities found: {len(df_cities)}")

# ==========================
# 2. Storage for results
# ==========================
# One dict per successfully fetched city is appended here by the fetch loop.
weather_data = []

# ==========================
# 3. Fetch weather for each city
# ==========================
# For every row of df_cities, query the OpenWeatherMap current-weather endpoint
# and append one record to weather_data when the response carries a "main"
# section. Failures are printed and skipped so one bad city cannot abort the run.
weather_url = "https://api.openweathermap.org/data/2.5/weather"
request_timeout_s = 10  # seconds; without a timeout a stalled connection blocks forever

for index, row in df_cities.iterrows():
    raw_city = row["City"]
    if pd.isna(raw_city):
        # str(NaN) is "nan"; querying it would spend a request on a bogus city.
        print(f"Skipping row {index}: City is missing")
        continue
    city = str(raw_city).strip()

    try:
        # Passing the query via `params` lets requests percent-encode the city
        # name, so names containing spaces, '&', '#' or accents stay intact.
        response = requests.get(
            weather_url,
            params={"q": city, "appid": API_KEY, "units": "metric"},
            timeout=request_timeout_s,
        )
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Connection error, timeout, or a non-JSON body. Only the exception
        # type is reported: the full message may embed the request URL, which
        # contains the API key.
        data = {"message": f"request failed ({type(exc).__name__})"}

    if isinstance(data, dict) and "main" in data:
        # NOTE(review): '¬∞' in the column names looks like a mis-decoded
        # degree sign; left unchanged because the names are the CSV header.
        weather_data.append({
            "City": city,
            "Latitude": data["coord"]["lat"],
            "Longitude": data["coord"]["lon"],
            "Temperature (¬∞C)": data["main"]["temp"],
            "Humidity (%)": data["main"]["humidity"],
            "Wind Speed (m/s)": data["wind"]["speed"],
            "Wind Direction (¬∞)": data["wind"].get("deg"),
            "Weather Description": data["weather"][0]["description"],
            # Naive UTC timestamp, same value/format as the deprecated
            # datetime.utcnow().
            "Timestamp": datetime.now(timezone.utc).replace(tzinfo=None),
        })

        print(f"‚úî Collected: {city}")

    else:
        message = data.get("message") if isinstance(data, dict) else data
        print(f"‚ùå Not Found: {city} ‚Üí {message}")

    time.sleep(1)  # avoid API rate limit (60 req/min)

# ==========================
# 4. Save output
# ==========================
# Write every collected record to a CSV in the working directory.
output_file = "city_weather_data.csv"
df_weather = pd.DataFrame(weather_data)
df_weather.to_csv(output_file, index=False)

# Completion banner, emitted with a single print call (one item per line).
print(
    "\n==============================",
    "‚úî Weather data collection DONE!",
    f"‚úî Saved File: {output_file}",
    "==============================",
    sep="\n",
)


In [None]:
import pandas as pd
import numpy as np
import os
import osmnx as ox
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# ============================================================
#          1. LOAD CITY FILE AND SPLIT INTO NUM_CHUNKS CHUNKS
# ============================================================

INPUT_FILE = "Update with your CSV file path"
NUM_CHUNKS = 10   # <-- Updated for 10 machines

print("\nüìå Loading city file...")
df = pd.read_csv(INPUT_FILE)

print(f"Total cities loaded: {len(df)}")

# Split the rows into NUM_CHUNKS consecutive, nearly equal slices: the first
# `remainder` chunks get one extra row (the same sizing rule np.array_split
# uses). Plain iloc slicing is used because np.array_split on a DataFrame
# relies on DataFrame.swapaxes, which pandas has deprecated.
base_size, remainder = divmod(len(df), NUM_CHUNKS)
chunks = []
start = 0
for position in range(NUM_CHUNKS):
    stop = start + base_size + (1 if position < remainder else 0)
    chunks.append(df.iloc[start:stop])
    start = stop

# Persist each chunk so it can be processed (or re-processed) on its own.
chunk_files = []

for number, chunk in enumerate(chunks, start=1):
    chunk_file = f"City_chunk_{number}.csv"
    chunk.to_csv(chunk_file, index=False)
    chunk_files.append(chunk_file)
    print(f"‚úî Saved chunk {number}: {chunk_file} ({len(chunk)} cities)")

# Report the configured chunk count rather than a hard-coded "10".
print(f"\nüéâ City split into {NUM_CHUNKS} chunks! Starting processing...\n")

# ============================================================
#                2. OSM FEATURE EXTRACTION FUNCTION
# ============================================================

# Search radius in metres (2 km); passed as `dist` to the OSMnx lookup
# in process_city.
distance_m = 2000

# OSM tag filter handed to the OSMnx lookup: key -> accepted values.
tags = {
    "highway": ["primary", "secondary", "tertiary", "residential"],
    "landuse": ["industrial", "farmland", "landfill", "forest", "meadow"],
    "amenity": ["waste_disposal", "recycling"],
    "natural": ["wood", "grassland"],
}

# OSMnx settings: cache responses on disk in OSM_CACHE, no console logging.
ox.settings.use_cache = True
ox.settings.cache_folder = "OSM_CACHE"
ox.settings.log_console = False
# NOTE(review): rate limiting is switched off while chunks are processed with
# many worker threads; a public Overpass server may throttle or reject such
# traffic — confirm this is intended (e.g. a private instance).
ox.settings.overpass_rate_limit = False

def process_city(city, max_retries=3, retry_delay=1.0):
    """Count selected OpenStreetMap features around one city.

    Calls ``ox.features_from_address`` with the module-level ``tags`` filter
    and ``distance_m`` radius, then tallies the returned rows per category.

    Args:
        city: Address / place name handed to the OSMnx lookup.
        max_retries: Total number of lookup attempts before giving up.
        retry_delay: Seconds to wait between two consecutive attempts.

    Returns:
        A dict with the key ``"City"`` and seven ``*_Count`` keys. When every
        attempt fails, all counts are 0, so a failed lookup cannot be told
        apart from a place that genuinely has none of the features.
    """
    print(f"Processing: {city}")

    # Retry mechanism
    for attempt in range(1, max_retries + 1):
        try:
            gdf = ox.features_from_address(city, tags=tags, dist=distance_m)
            break
        except Exception as e:  # deliberate best-effort: any failure is retried
            print(f"Retry {attempt}/{max_retries} for {city} due to {e}")
            # Only pause when another attempt follows; sleeping after the
            # last failure would just delay the worker thread.
            if attempt < max_retries:
                time.sleep(retry_delay)
    else:
        # Loop finished without `break`: every attempt failed.
        print(f"‚ùå Failed for {city}")
        return {
            "City": city,
            "Road_Count": 0,
            "Industrial_Count": 0,
            "Farmland_Count": 0,
            "Landfill_Count": 0,
            "Dump_Site_Count": 0,
            "Recycling_Count": 0,
            "Green_Area_Count": 0
        }

    def count_feature(col, value=None):
        """Count rows whose `col` equals `value`, or is non-null if no value is given."""
        if col not in gdf.columns:
            return 0
        if value is not None:
            return (gdf[col] == value).sum()
        return gdf[col].notna().sum()

    result = {
        "City": city,
        # Any row carrying a highway tag counts as a road.
        "Road_Count": count_feature("highway"),
        "Industrial_Count": count_feature("landuse", "industrial"),
        "Farmland_Count": count_feature("landuse", "farmland"),
        "Landfill_Count": count_feature("landuse", "landfill"),
        "Dump_Site_Count": count_feature("amenity", "waste_disposal"),
        "Recycling_Count": count_feature("amenity", "recycling"),
        # Green area = forest + wood + meadow + grassland.
        "Green_Area_Count":
            count_feature("landuse", "forest") +
            count_feature("natural", "wood") +
            count_feature("landuse", "meadow") +
            count_feature("natural", "grassland")
    }

    print(f"‚úî Done: {city}")
    return result

# ============================================================
#                  3. PROCESS ALL 10 CHUNKS
# ============================================================
# For each chunk file: run process_city over its cities on a thread pool and
# write the per-city counts to "Processed_<chunk file>".

chunk_outputs = []
max_threads = 20  # worker threads per chunk

for file in chunk_files:
    print(f"\nüöÄ Starting processing for: {file}\n")
    df_chunk = pd.read_csv(file)
    cities = df_chunk['City'].dropna().tolist()

    if not cities:
        # An empty result list would be written as a header-less CSV that the
        # merge step cannot read back, so skip the chunk entirely.
        print(f"No cities in {file}; skipping")
        continue

    # executor.map yields results in input order, so the output rows follow
    # the chunk's city order instead of thread completion order.
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        results = list(executor.map(process_city, cities))

    output_file = f"Processed_{file}"
    pd.DataFrame(results).to_csv(output_file, index=False)
    chunk_outputs.append(output_file)
    print(f"\n‚úî Saved output for {file} ‚Üí {output_file}")

# ============================================================
#                   4. MERGE ALL FINAL OUTPUTS
# ============================================================
# Read every per-chunk output back from disk, concatenate, and save one file.

print("\nüìå Merging all processed chunks...\n")

dfs = []
for chunk_output in chunk_outputs:
    try:
        dfs.append(pd.read_csv(chunk_output))
    except pd.errors.EmptyDataError:
        # A file without a header row (e.g. written from an empty result
        # list) cannot be parsed; skip it instead of aborting the merge.
        print(f"Skipping empty chunk output: {chunk_output}")

if not dfs:
    # pd.concat([]) would fail with an opaque "No objects to concatenate".
    raise ValueError("No processed chunk outputs available to merge")

final_df = pd.concat(dfs, ignore_index=True)

final_output = "OSM_final_merged_output.csv"
final_df.to_csv(final_output, index=False)

print(f"üéâ ALL DONE! Final merged file saved as: {final_output}")
print(f"Total rows collected: {len(final_df)}")

# Failed lookups are stored as rows of zeros, so only claim "no data loss"
# when no all-zero row is present.
zero_rows = int((final_df.select_dtypes(include="number").sum(axis=1) == 0).sum())
if zero_rows:
    print(
        f"\nWarning: {zero_rows} rows have all-zero counts "
        "(failed lookups are stored as zeros); verify or re-run them."
    )
else:
    print("\nNo data loss. Full pipeline complete.")


In [None]:
import pandas as pd
import os

# Folder that holds the per-chunk CSVs; the trailing separator is required
# because file names are appended by plain string concatenation.
BASE_PATH = "D:\\codes\\brain tumor project yz\\infosys\\"

# One input file per chunk 1..10. Chunks 1, 3, 4 and 5 are read from
# "Recovered_" files, all others from "Processed_" files.
INPUT_FILES = [
    f"{'Recovered' if chunk_no in (1, 3, 4, 5) else 'Processed'}_City_chunk_{chunk_no}.csv"
    for chunk_no in range(1, 11)
]

OUTPUT_FILE = f"{BASE_PATH}OSM_FINAL_NO_DUPLICATES.csv"

# Count columns that are summed to rank duplicate rows of the same city.
FEATURE_COLS = [
    f"{stem}_Count"
    for stem in (
        "Road",
        "Industrial",
        "Farmland",
        "Landfill",
        "Dump_Site",
        "Recycling",
        "Green_Area",
    )
]

# ============================================================
#               LOAD & COMBINE ALL FILES
# ============================================================
# Read every listed file that exists under BASE_PATH and stack them into one
# frame; missing files are reported and skipped.

dfs = []

for file in INPUT_FILES:
    path = BASE_PATH + file
    if os.path.exists(path):
        df = pd.read_csv(path)
        dfs.append(df)
        print(f"‚úî Loaded {file} ({len(df)})")
    else:
        print(f"‚ö† Missing file: {file}")

if not dfs:
    # pd.concat([]) would fail with an opaque "No objects to concatenate";
    # say what is actually wrong instead.
    raise FileNotFoundError(
        f"None of the {len(INPUT_FILES)} input files were found under {BASE_PATH}"
    )

combined_df = pd.concat(dfs, ignore_index=True)

print(f"\nTotal rows before deduplication: {len(combined_df)}")

# ============================================================
#        REMOVE DUPLICATES - KEEP BEST ROW PER CITY
# ============================================================

# Score each row by the total of its feature counts, then order by city name
# (ascending) with the highest score first inside each city.
combined_df = (
    combined_df
    .assign(feature_sum=combined_df[FEATURE_COLS].sum(axis=1))
    .sort_values(by=["City", "feature_sum"], ascending=[True, False])
)

# Keep the first (highest-scoring) row of every city and drop the helper column.
final_df = (
    combined_df
    .drop_duplicates(subset="City", keep="first")
    .drop(columns=["feature_sum"])
)

# ============================================================
#                   SAVE FINAL FILE
# ============================================================

# Write the de-duplicated table, then print a three-line summary.
final_df.to_csv(OUTPUT_FILE, index=False)

print(
    "\nüéâ FINAL MERGE COMPLETE",
    f"Final unique cities: {len(final_df)}",
    f"Saved as: {OUTPUT_FILE}",
    sep="\n",
)
