WEEK-2

Import Lib

In [1]:
import pandas as pd
import numpy as np

import File

In [2]:
input_file = "D:\\codes\\brain tumor project yz\\infosys\\Data_set\\Main_data_set\\FINAL_MERGED_WEATHER_OSM_NO_NULL.csv"
output_file = "D:\\codes\\brain tumor project yz\\infosys\\Data_set\\Main_data_set\\FINAL_CLEANED_FEATURE_ENGINEERED_DATASET.csv"

In [3]:
df = pd.read_csv(input_file)
print("Original Dataset Shape:", df.shape)

Original Dataset Shape: (22264, 30)


Remove Empty / Artifact Columns

In [4]:
df = df.loc[:, ~df.columns.str.contains("^Unnamed")]

Fix Encoding Issues

In [5]:
df.rename(columns={
    "Temperature (√Ç¬∞C)": "Temperature (¬∞C)",
    "Wind Direction (√Ç¬∞)": "Wind Direction (¬∞)"
}, inplace=True)


Standardize Timestamp

In [7]:
df["Timestamp"] = pd.to_datetime(df["Timestamp"], utc=True, errors="coerce")

Remove Invalid Records

In [8]:
df.dropna(subset=["Latitude", "Longitude", "Timestamp"], inplace=True)

Remove Duplicate Entries

In [9]:
df.drop_duplicates(
    subset=["Latitude", "Longitude", "Timestamp"],
    inplace=True
)


Handle Missing Values

In [10]:
# Weather data ‚Üí mean imputation
weather_cols = [
    "Temperature (¬∞C)", "Humidity (%)",
    "Wind Speed (m/s)", "Wind Direction (¬∞)"
]

for col in weather_cols:
    if col in df.columns:
        df[col].fillna(df[col].mean(), inplace=True)

# OSM features ‚Üí fill with 0
osm_cols = [
    "Road_Count", "Industrial_Count", "Farmland_Count",
    "Dump_Site_Count", "Recycling_Count", "Green_Area_Count"
]

for col in osm_cols:
    if col in df.columns:
        df[col].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(0, inplace=True)


Temporal Feature Engineering

In [11]:
df["Hour"] = df["Timestamp"].dt.hour
df["Day_of_Week"] = df["Timestamp"].dt.dayofweek
df["Month"] = df["Timestamp"].dt.month

def get_season(month):
    if month in [12, 1, 2]:
        return "Winter"
    elif month in [3, 4, 5]:
        return "Summer"
    elif month in [6, 7, 8]:
        return "Monsoon"
    else:
        return "Post-Monsoon"

df["Season"] = df["Month"].apply(get_season)

Normalize Pollutant & Weather Values

In [12]:
scale_cols = [
    "Temperature (¬∞C)", "Humidity (%)", "Wind Speed (m/s)",
    "CO AQI Value", "NO2 AQI Value", "Ozone AQI Value",
    "PM2.5 AQI Value", "Overall AQI Value"
]

for col in scale_cols:
    if col in df.columns:
        min_val = df[col].min()
        max_val = df[col].max()
        if max_val > min_val:
            df[col + "_Normalized"] = (df[col] - min_val) / (max_val - min_val)

Save Final Dataset

In [13]:
print("Final Dataset Shape:", df.shape)
df.to_csv(output_file, index=False)

print("Preprocessing completed successfully.")
print(f"Saved as: {output_file}")

Final Dataset Shape: (22264, 37)
Preprocessing completed successfully.
Saved as: D:\codes\brain tumor project yz\infosys\Data_set\Main_data_set\FINAL_CLEANED_FEATURE_ENGINEERED_DATASET.csv


spatial proximity features

In [None]:
import pandas as pd
import geopandas as gpd
import osmnx as ox
from shapely.geometry import Point
import numpy as np

# -------------------------------
# CONFIG
# -------------------------------
INPUT_CSV = "D:\\codes\\brain tumor project yz\\infosys\Data_set\\Main_data_set\\FINAL_MERGED_WEATHER_OSM_NO_NULL.csv"
OUTPUT_CSV = "D:\\codes\\brain tumor project yz\\infosys\Data_set\\Main_data_set\\FINAL_DATASET_WITH_SPATIAL_FEATURES_COMPLETE1"

LAT_COL = "latitude"
LON_COL = "longitude"

CLUSTER_PRECISION = 1       # ~2 km clustering
BBOX_BUFFER_DEG = 0.02       # ~2 km bbox
DIST_THRESHOLD_KM = 2.0

# -------------------------------
# 1. Load dataset
# -------------------------------
print("üì• Loading dataset...")
df = pd.read_csv(INPUT_CSV)
df.columns = df.columns.str.strip().str.lower()

df = df.dropna(subset=[LAT_COL, LON_COL]).reset_index(drop=True)

print(f"‚úÖ Total records: {len(df)}")

# -------------------------------
# 2. Create GeoDataFrame
# -------------------------------
print("üåç Creating GeoDataFrame...")
geometry = [Point(xy) for xy in zip(df[LON_COL], df[LAT_COL])]
gdf = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")
gdf = gdf.to_crs(epsg=3857)

# -------------------------------
# 3. Create LOCATION CLUSTERS
# -------------------------------
gdf["lat_round"] = gdf[LAT_COL].round(CLUSTER_PRECISION)
gdf["lon_round"] = gdf[LON_COL].round(CLUSTER_PRECISION)
gdf["location_cluster"] = (
    gdf["lat_round"].astype(str) + "_" + gdf["lon_round"].astype(str)
)

clusters = gdf["location_cluster"].unique()
print(f"üìç Total location clusters: {len(clusters)}")

# Prepare result columns
gdf["dist_road_km"] = np.nan
gdf["dist_industry_km"] = np.nan
gdf["dist_dump_km"] = np.nan

# -------------------------------
# 4. Process each cluster
# -------------------------------
for i, cluster in enumerate(clusters, 1):

    cluster_df = gdf[gdf["location_cluster"] == cluster]

    lat = cluster_df[LAT_COL].mean()
    lon = cluster_df[LON_COL].mean()

    north = lat + BBOX_BUFFER_DEG
    south = lat - BBOX_BUFFER_DEG
    east  = lon + BBOX_BUFFER_DEG
    west  = lon - BBOX_BUFFER_DEG

    print(f"\nüöÄ [{i}/{len(clusters)}] Processing cluster: {cluster}")
    print(f"üì¶ BBOX: {north:.4f}, {south:.4f}, {east:.4f}, {west:.4f}")

    try:
        # -------------------------------
        # Fetch OSM features (ONCE)
        # -------------------------------
        print("üõ£ Downloading major roads...")
        roads = ox.features_from_bbox(
            bbox=(north, south, east, west),
            tags={"highway": ["motorway", "trunk", "primary", "secondary"]}
        ).to_crs(epsg=3857)

        print("üè≠ Downloading industrial areas...")
        industries = ox.features_from_bbox(
            bbox=(north, south, east, west),
            tags={"landuse": "industrial"}
        ).to_crs(epsg=3857)

        print("üóë Downloading dump / landfill sites...")
        dumps = ox.features_from_bbox(
            bbox=(north, south, east, west),
            tags={"landuse": "landfill"}
        ).to_crs(epsg=3857)

        # -------------------------------
        # Distance calculations
        # -------------------------------
        pts = cluster_df.copy()

        if not roads.empty:
            pts = gpd.sjoin_nearest(
                pts, roads[["geometry"]],
                how="left", distance_col="dist_road_m"
            )
            gdf.loc[pts.index, "dist_road_km"] = pts["dist_road_m"] / 1000

        if not industries.empty:
            pts = gpd.sjoin_nearest(
                pts, industries[["geometry"]],
                how="left", distance_col="dist_industry_m"
            )
            gdf.loc[pts.index, "dist_industry_km"] = pts["dist_industry_m"] / 1000

        if not dumps.empty:
            pts = gpd.sjoin_nearest(
                pts, dumps[["geometry"]],
                how="left", distance_col="dist_dump_m"
            )
            gdf.loc[pts.index, "dist_dump_km"] = pts["dist_dump_m"] / 1000

    except Exception as e:
        print(f"‚ö†Ô∏è Skipping cluster {cluster} due to error: {e}")
        continue

# -------------------------------
# 5. Binary proximity flags
# -------------------------------
print("\nüßÆ Creating binary 2km proximity flags...")

gdf["near_road_2km"] = (gdf["dist_road_km"] <= DIST_THRESHOLD_KM).astype(int)
gdf["near_industry_2km"] = (gdf["dist_industry_km"] <= DIST_THRESHOLD_KM).astype(int)
gdf["near_dump_2km"] = (gdf["dist_dump_km"] <= DIST_THRESHOLD_KM).astype(int)

# -------------------------------
# 6. Save final dataset
# -------------------------------
final_df = gdf.drop(
    columns=["geometry", "lat_round", "lon_round", "location_cluster"],
    errors="ignore"
)

final_df.to_csv(OUTPUT_CSV, index=False)

print("\n‚úÖ FINAL DATASET SAVED SUCCESSFULLY")
print(f"üìÅ Output file: {OUTPUT_CSV}")


In [5]:
import pandas as pd

# -------------------------------
# 1. Load dataset
# -------------------------------
print("üì• Loading dataset...")
df = pd.read_csv("D:\\codes\\brain tumor project yz\\infosys\Data_set\\Main_data_set\\FINAL_MERGED_WEATHER_OSM_NO_NULL.csv")

print(f"‚úÖ Total rows: {len(df)}")

# -------------------------------
# 2. Check required columns
# -------------------------------
CITY_COL = "City"        # üîÅ change if column name differs
LAT_COL = "Latitude"
LON_COL = "Longitude"

required_cols = [CITY_COL, LAT_COL, LON_COL]
missing = [c for c in required_cols if c not in df.columns]

if missing:
    raise ValueError(f"‚ùå Missing columns: {missing}")

# -------------------------------
# 3. Basic city stats (RAW)
# -------------------------------
raw_unique_cities = df[CITY_COL].nunique()
print(f"üèô Unique cities (RAW): {raw_unique_cities}")

# -------------------------------
# 4. Clean city names
# -------------------------------
df["city_clean"] = (
    df[CITY_COL]
    .astype(str)
    .str.lower()
    .str.strip()
    .str.replace(r"[^a-z\s]", "", regex=True)
)

clean_unique_cities = df["city_clean"].nunique()
print(f"üßπ Unique cities (CLEANED): {clean_unique_cities}")

# -------------------------------
# 5. Top duplicate cities
# -------------------------------
print("\nüîÅ Top 15 most duplicated cities:")
print(
    df["city_clean"]
    .value_counts()
    .head(15)
)

# -------------------------------
# 6. Save cleaned preview (optional)
# -------------------------------
df[["city", "city_clean"]].drop_duplicates().to_csv(
    "CITY_CLEAN_MAPPING.csv", index=False
)

print("\n‚úÖ City duplicate analysis completed")
print("üìÑ Saved: CITY_CLEAN_MAPPING.csv")


üì• Loading dataset...
‚úÖ Total rows: 22264
üèô Unique cities (RAW): 22264
üßπ Unique cities (CLEANED): 22264

üîÅ Top 15 most duplicated cities:
city_clean
gursahaiganj            1
dar es salaam           1
puurs                   1
praskoveya              1
post falls              1
radovis                 1
gyanpur                 1
puttlingen              1
viterbo                 1
tonala                  1
tres pontas             1
villa de cura           1
vitoria da conquista    1
reston                  1
sand springs            1
Name: count, dtype: int64


KeyError: "['city'] not in index"

In [4]:
df = pd.read_csv("D:\\codes\\brain tumor project yz\\infosys\Data_set\\Main_data_set\\FINAL_MERGED_WEATHER_OSM_NO_NULL.csv")
print(df["Country"].value_counts())


Country
United States of America    2856
India                       2400
Brazil                      1543
Germany                     1281
Russian Federation          1172
                            ... 
Lebanon                        1
Seychelles                     1
State of Palestine             1
Saint Kitts and Nevis          1
Monaco                         1
Name: count, Length: 176, dtype: int64


Module 3 ‚Äì Source Labeling Code

In [5]:
import pandas as pd
import numpy as np

# -----------------------------
# LOAD DATA
# -----------------------------
INPUT_CSV = "D:\\codes\\brain tumor project yz\\infosys\\FINAL_DATASET_WITH_SPATIAL_FEATURES_COMPLETE1.csv"
df = pd.read_csv(INPUT_CSV)

print("‚úÖ Dataset loaded:", df.shape)

# -----------------------------
# CLEAN COLUMN NAMES
# -----------------------------
df.columns = df.columns.str.strip().str.lower()

# -----------------------------
# DEFINE THRESHOLDS (DATA-DRIVEN)
# -----------------------------
NO2_HIGH = df["no2 aqi value"].quantile(0.75)
PM_HIGH  = df["pm2.5 aqi value"].quantile(0.75)
CO_HIGH  = df["co aqi value"].quantile(0.75)

print("üìä Thresholds:")
print("NO2_HIGH:", NO2_HIGH)
print("PM_HIGH:", PM_HIGH)
print("CO_HIGH:", CO_HIGH)

# -----------------------------
# SOURCE LABELING LOGIC
# -----------------------------
def label_source(row):

    # Vehicular Pollution
    if row["near_road_2km"] == 1 and row["no2 aqi value"] >= NO2_HIGH:
        return "Vehicular"

    # Industrial Pollution
    if row["near_industry_2km"] == 1 and row["pm2.5 aqi value"] >= PM_HIGH:
        return "Industrial"

    # Waste / Dump related
    if row["near_dump_2km"] == 1 and row["pm2.5 aqi value"] >= PM_HIGH:
        return "Waste Burning"

    # Agricultural / Dust
    if row["pm2.5 aqi value"] >= PM_HIGH and row["wind speed (m/s)"] < 2:
        return "Agricultural/Dust"

    # Natural / Background
    return "Natural"

# -----------------------------
# APPLY LABELING
# -----------------------------
df["pollution_source"] = df.apply(label_source, axis=1)

print("‚úÖ Source labeling completed")

# -----------------------------
# CHECK DISTRIBUTION
# -----------------------------
print("\nüìå Pollution Source Distribution:")
print(df["pollution_source"].value_counts())

# -----------------------------
# SAVE FINAL DATASET
# -----------------------------
OUTPUT_CSV = "FINAL_LABELED_DATASET_MODULE3.csv"
df.to_csv(OUTPUT_CSV, index=False)

print("\nüìÅ Saved:", OUTPUT_CSV)


‚úÖ Dataset loaded: (22264, 33)
üìä Thresholds:
NO2_HIGH: 4.0
PM_HIGH: 78.0
CO_HIGH: 1.0
‚úÖ Source labeling completed

üìå Pollution Source Distribution:
pollution_source
Natural       13129
Vehicular      5671
Industrial     3464
Name: count, dtype: int64

üìÅ Saved: FINAL_LABELED_DATASET_MODULE3.csv
