In [7]:
import pandas as pd
import numpy as np



In [8]:
# Load the week-2 master dataset into the working frame `df`.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
week2_dataset_path = r"C:\Users\mothe\OneDrive\Documents\Batch-6\enviroscan_week2_master_dataset.csv"
df = pd.read_csv(week2_dataset_path)


In [9]:
# Parse the 'Timestamp' column with pandas and overwrite it with the parsed values.
parsed_timestamps = pd.to_datetime(df['Timestamp'])
df['Timestamp'] = parsed_timestamps


In [10]:
# Display the (rows, columns) shape of the loaded frame.
df.shape


(21882, 26)

In [11]:
# Display the frame's column labels.
df.columns


Index(['Country', 'City', 'AQI Value', 'AQI Category', 'CO AQI Value',
       'CO AQI Category', 'Ozone AQI Value', 'Ozone AQI Category',
       'NO2 AQI Value', 'NO2 AQI Category', 'PM2.5 AQI Value',
       'PM2.5 AQI Category', 'Temperature (C)', 'Humidity (%)',
       'Wind Speed (m/s)', 'Wind Direction (deg)', 'Latitude', 'Longitude',
       'Timestamp', 'Source_API', 'pm_ratio', 'aqi_severity', 'wind_effect',
       'hour', 'month', 'season'],
      dtype='object')

In [12]:
# Every label that label_pollution_source() can return.
# 'Residential' is appended last so the positions of the original four
# entries are unchanged; the labelling rules emit it, so leaving it out
# would make this list disagree with the 'pollution_source' column.
SOURCE_TYPES = [
    'Vehicular',
    'Industrial',
    'Agricultural',
    'Natural',
    'Residential',
]


In [13]:
# Reference points per source type, each a (latitude, longitude) pair in
# decimal degrees. There is no 'Natural' entry in this mapping.
reference_sources = {
    'Industrial': [(28.62, 77.21), (19.08, 72.88), (13.08, 80.27)],
    'Vehicular': [(28.61, 77.23), (19.07, 72.87), (12.97, 77.59)],
    'Agricultural': [(29.00, 76.00), (26.50, 80.50), (23.50, 85.00)],
}


In [14]:
# NOTE(review): this cell re-declares reference_sources with the same contents
# as the preceding cell; one of the two definitions is redundant.
# Each entry is a (latitude, longitude) pair in decimal degrees.
reference_sources = {
    'Industrial': [(28.62, 77.21), (19.08, 72.88), (13.08, 80.27)],
    'Vehicular': [(28.61, 77.23), (19.07, 72.87), (12.97, 77.59)],
    'Agricultural': [(29.00, 76.00), (26.50, 80.50), (23.50, 85.00)],
}


In [15]:
from math import radians, sin, cos, sqrt, atan2


In [16]:
def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float
        Haversine distance in km on a sphere of radius 6371 km. A NaN
        coordinate propagates to a NaN result.
    """
    R = 6371  # Earth radius in km

    lat1, lon1, lat2, lon2 = map(
        radians, [lat1, lon1, lat2, lon2]
    )

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2

    # Floating-point rounding can push `a` a hair above 1 for (near-)antipodal
    # points, which would make sqrt(1 - a) raise "math domain error".
    # The comparison is False for NaN, so missing coordinates still yield NaN.
    if a > 1.0:
        a = 1.0

    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return R * c


In [17]:
def min_distance_to_sources(lat, lon, source_points):
    """Return the smallest haversine distance (km) from a point to a set of points.

    Parameters
    ----------
    lat, lon : float
        Coordinates of the query point, in decimal degrees.
    source_points : iterable of (latitude, longitude) pairs
        Candidate reference points. An empty iterable raises ValueError
        (from `min`).
    """
    return min(
        haversine_distance(lat, lon, ref_lat, ref_lon)
        for ref_lat, ref_lon in source_points
    )


In [18]:
# Add one distance column per source category: for every row, the distance (km)
# from its (Latitude, Longitude) to the closest reference point of that category.
# Columns are created in the order listed below.
for distance_column, source_type in [
    ('dist_to_industry_km', 'Industrial'),
    ('dist_to_vehicular_km', 'Vehicular'),
    ('dist_to_agriculture_km', 'Agricultural'),
]:
    category_points = reference_sources[source_type]
    df[distance_column] = df.apply(
        lambda row: min_distance_to_sources(
            row['Latitude'],
            row['Longitude'],
            category_points
        ),
        axis=1
    )


In [19]:
# Preview the three distance columns for the first five rows.
df[
    [
        'dist_to_industry_km',
        'dist_to_vehicular_km',
        'dist_to_agriculture_km'
    ]
].head()


Unnamed: 0,dist_to_industry_km,dist_to_vehicular_km,dist_to_agriculture_km
0,3411.395442,3413.600712,3294.495823
1,13072.713099,13071.743139,13245.094452
2,5767.750961,5769.994302,5643.335088
3,5283.416078,5285.557847,5172.307211
4,14658.287311,14939.624364,14325.685808


In [20]:
def label_pollution_source(
    row,
    *,
    vehicular_no2_min=15,
    vehicular_max_km=1500,
    industrial_max_km=2000,
    agricultural_pm25_min=40,
    agricultural_max_km=3000,
    residential_aqi_max=80,
    residential_min_industry_km=3000,
):
    """Assign a rule-based pollution-source label to one observation.

    Rules are checked in order and the first match wins:

    1. "Vehicular"    - NO2 AQI >= ``vehicular_no2_min`` and
                        ``dist_to_vehicular_km`` <= ``vehicular_max_km``.
    2. "Industrial"   - ``dist_to_industry_km`` <= ``industrial_max_km``.
    3. "Agricultural" - PM2.5 AQI >= ``agricultural_pm25_min`` and
                        ``dist_to_agriculture_km`` <= ``agricultural_max_km``.
    4. "Residential"  - AQI < ``residential_aqi_max`` and
                        ``dist_to_industry_km`` > ``residential_min_industry_km``.
    5. "Natural"      - everything else.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the keys "NO2 AQI Value", "PM2.5 AQI Value",
        "AQI Value", "dist_to_vehicular_km", "dist_to_industry_km" and
        "dist_to_agriculture_km" (e.g. a DataFrame row or a dict).
    vehicular_no2_min, vehicular_max_km, industrial_max_km,
    agricultural_pm25_min, agricultural_max_km, residential_aqi_max,
    residential_min_industry_km : number, keyword-only
        Rule thresholds. The defaults reproduce the original hard-coded
        values, so ``df.apply(label_pollution_source, axis=1)`` is unchanged.

    Returns
    -------
    str
        One of "Vehicular", "Industrial", "Agricultural", "Residential",
        "Natural".

    Raises
    ------
    KeyError
        If ``row`` lacks a key that a rule needs to read.
    """
    # 1. Vehicular: elevated NO2 close to a vehicular reference point.
    if (
        row["NO2 AQI Value"] >= vehicular_no2_min and
        row["dist_to_vehicular_km"] <= vehicular_max_km
    ):
        return "Vehicular"

    # 2. Industrial: decided on distance alone, no pollutant condition.
    elif row["dist_to_industry_km"] <= industrial_max_km:
        return "Industrial"

    # 3. Agricultural: elevated PM2.5 within range of an agricultural point.
    elif (
        row["PM2.5 AQI Value"] >= agricultural_pm25_min and
        row["dist_to_agriculture_km"] <= agricultural_max_km
    ):
        return "Agricultural"

    # 4. Residential / urban: low overall AQI, far from industrial points.
    elif (
        row["AQI Value"] < residential_aqi_max and
        row["dist_to_industry_km"] > residential_min_industry_km
    ):
        return "Residential"

    # 5. Natural: default. Rows with NaN inputs also end up here, because
    #    every comparison against NaN is False.
    else:
        return "Natural"


In [21]:
# Label every row by applying the rule-based classifier row-wise (axis=1),
# then store the result as the 'pollution_source' column.
row_labels = df.apply(label_pollution_source, axis=1)
df['pollution_source'] = row_labels


In [22]:
# Class balance: number of rows assigned to each label.
df['pollution_source'].value_counts()


pollution_source
Residential     15137
Industrial       2986
Natural          2917
Agricultural      793
Vehicular          49
Name: count, dtype: int64

In [23]:
# NOTE(review): same class-balance check as the previous cell, with identical output.
df['pollution_source'].value_counts()


pollution_source
Residential     15137
Industrial       2986
Natural          2917
Agricultural      793
Vehicular          49
Name: count, dtype: int64

In [24]:
# NOTE(review): third repeat of the class-balance check; the output is unchanged.
df['pollution_source'].value_counts()


pollution_source
Residential     15137
Industrial       2986
Natural          2917
Agricultural      793
Vehicular          49
Name: count, dtype: int64

In [25]:
# Spot-check the assigned labels next to the distance columns for the first ten rows.
df[
    ['pollution_source',
     'dist_to_industry_km',
     'dist_to_vehicular_km',
     'dist_to_agriculture_km']
].head(10)


Unnamed: 0,pollution_source,dist_to_industry_km,dist_to_vehicular_km,dist_to_agriculture_km
0,Residential,3411.395442,3413.600712,3294.495823
1,Residential,13072.713099,13071.743139,13245.094452
2,Residential,5767.750961,5769.994302,5643.335088
3,Residential,5283.416078,5285.557847,5172.307211
4,Residential,14658.287311,14939.624364,14325.685808
5,Residential,13453.938108,13455.690845,13370.342425
6,Residential,6252.424499,6254.616226,6136.723946
7,Residential,6413.816131,6415.985818,6300.177951
8,Residential,3471.692518,3473.910196,3353.413149
9,Natural,4362.816731,4362.32607,4305.030513


In [26]:
# Count missing values in the label column.
df[['pollution_source']].isna().sum()


pollution_source    0
dtype: int64

In [27]:
# Write the labeled frame to the week-3 CSV, omitting the index.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
week3_output_path = r"C:\Users\mothe\OneDrive\Documents\Batch-6\enviroscan_week3_labeled_dataset.csv"
df.to_csv(week3_output_path, index=False)
