In [14]:
# Milestone 0: Load libs & datasets
import pandas as pd, numpy as np
import plotly.express as px, plotly.graph_objects as go
import seaborn as sns, matplotlib.pyplot as plt
import folium
from folium.plugins import MarkerCluster



In [15]:
!pip install kaleido==0.2.1





In [17]:
# Milestone 0 - import the two raw datasets
p1 = '/kaggle/input/delhihel/MLDelhi2022.csv'
p2 = '/kaggle/input/delhihel/Delhi_AQI_2018-2024.csv'

# Read both CSVs, then coerce each 'Timestamp' column to datetime
# (errors='coerce' turns unparseable values into NaT instead of raising).
df, df_aqi = (pd.read_csv(csv_path, low_memory=False) for csv_path in (p1, p2))
for frame in (df, df_aqi):
    frame['Timestamp'] = pd.to_datetime(frame['Timestamp'], errors='coerce')

# Quick shape report for both frames
for label, frame in (('MLDelhi2022:', df), ('Delhi_AQI_2018-2024:', df_aqi)):
    print(label, frame.shape)

# Save per-column null counts, largest first (top 20 only for the 2022 file)
null_counts_2022 = df.isna().sum().sort_values(ascending=False)
null_counts_2022.head(20).to_csv('/kaggle/working/missing2022_top20.csv')
null_counts_aqi = df_aqi.isna().sum().sort_values(ascending=False)
null_counts_aqi.to_csv('/kaggle/working/missing_aqi.csv')

df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/delhihel/MLDelhi2022.csv'

In [None]:
# Milestone 1 - preprocessing of the 2022 pollutant data.
# Works on `df` as produced by the Milestone 0 cell and writes
# /kaggle/working/MLDelhi2022_preprocessed.csv, which later cells reload.

# ------------------------------------------------------------------
# 1. Normalise column names (lowercase, underscores, no punctuation)
# ------------------------------------------------------------------
# regex=False makes every replacement literal regardless of the pandas
# version's default; '(' , ')' and '.' are regex metacharacters.
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
    .str.replace('/', '_', regex=False)
    .str.replace('.', '', regex=False)
)

# ------------------------------------------------------------------
# 2. Rename pollutant columns to short names
# ------------------------------------------------------------------
rename_map = {
    'pm25_Âµg_mÂ³': 'pm25',
    'pm10_Âµg_mÂ³': 'pm10',
    'no_Âµg_mÂ³': 'no',
    'no2_Âµg_mÂ³': 'no2',
    'nh3_Âµg_mÂ³': 'nh3',
    'so2_Âµg_mÂ³': 'so2',
    'co_mg_mÂ³': 'co',
    'ozone_Âµg_mÂ³': 'ozone',
}

df = df.rename(columns=rename_map)

# The exact keys above depend on how the unit symbols were encoded in the
# CSV header. As a fallback, any pollutant still missing is matched by its
# "<name>_" prefix plus a "g_m" unit fragment, but only when the match is
# unambiguous and the short name is not already taken.
expected_pollutants = ['pm25', 'pm10', 'no', 'no2', 'nh3', 'so2', 'co', 'ozone']
for name in expected_pollutants:
    if name in df.columns:
        continue
    candidates = [c for c in df.columns if c.startswith(name + '_') and 'g_m' in c]
    if len(candidates) == 1:
        df = df.rename(columns={candidates[0]: name})

# ------------------------------------------------------------------
# 3. Extract datetime features
# ------------------------------------------------------------------
df['year'] = df['timestamp'].dt.year
df['month'] = df['timestamp'].dt.month
df['day'] = df['timestamp'].dt.day
df['hour'] = df['timestamp'].dt.hour
df['dayofweek'] = df['timestamp'].dt.dayofweek
df['is_weekend'] = df['dayofweek'].isin([5,6])   # 5 = Saturday, 6 = Sunday

# Season (month number -> label)
season_map = {
    12:'Winter',1:'Winter',2:'Winter',
    3:'Spring',4:'Spring',
    5:'Summer',6:'Summer',
    7:'Monsoon',8:'Monsoon',
    9:'Autumn',10:'Autumn',11:'Autumn'
}
df['season'] = df['month'].map(season_map)

# ------------------------------------------------------------------
# 4. SAFE pollutant selection (only columns that actually exist)
# ------------------------------------------------------------------
pollutants = [p for p in expected_pollutants if p in df.columns]
missing_pollutants = [p for p in expected_pollutants if p not in df.columns]

print("Detected pollutant columns:", pollutants)
if missing_pollutants:
    # Make a silent header mismatch visible instead of quietly skipping columns.
    print("WARNING: pollutant columns not found:", missing_pollutants)

# ------------------------------------------------------------------
# 5. Impute pollutants safely
# ------------------------------------------------------------------
df = df.sort_values('timestamp')

for col in pollutants:
    # force numeric; non-numeric entries become NaN
    df[col] = pd.to_numeric(df[col], errors='coerce')

    missing_ratio = df[col].isna().mean()

    if missing_ratio <= 0.20:
        # Reference series: one value per timestamp, interpolated over time.
        # Rows with NaT timestamps are excluded because time interpolation
        # cannot run with missing values in the index.
        temp = (
            df.loc[df['timestamp'].notna(), ['timestamp', col]]
            .drop_duplicates(subset=['timestamp'])
            .set_index('timestamp')[col]
            .interpolate('time')
        )
        # Fill ONLY the gaps. Assigning the mapped series directly would
        # replace every observed reading with the first row's value for that
        # timestamp (e.g. all stations would get one station's value).
        df[col] = df[col].fillna(df['timestamp'].map(temp))
        # Anything still missing falls back to the column median.
        df[col] = df[col].fillna(df[col].median())
    else:
        # Too sparse to interpolate; flag the column for later handling.
        df[col + '_needs_impute'] = True

# ------------------------------------------------------------------
# 6. Export cleaned version
# ------------------------------------------------------------------
df.to_csv('/kaggle/working/MLDelhi2022_preprocessed.csv', index=False)

df.head()

In [None]:
# Milestone 2 - reload the inputs used by the EDA cells

# Input locations
clean_path = "/kaggle/working/MLDelhi2022_preprocessed.csv"
aqi_path   = "/kaggle/input/delhihel/Delhi_AQI_2018-2024.csv"

# Preprocessed 2022 data, with 'timestamp' parsed as datetime
df = pd.read_csv(clean_path, parse_dates=["timestamp"])

# Multi-year AQI data; headers are lowercased and stripped after loading
df_aqi = pd.read_csv(aqi_path, parse_dates=["Timestamp"])
df_aqi.columns = [header.lower().strip() for header in df_aqi.columns]

for frame in (df, df_aqi):
    print(frame.shape)

df.head()


In [None]:
# =======================================
# BLOCK 2: Annual AQI Trend (2018-2024)
# =======================================

# Tag every reading with its calendar year, then average AQI per year.
df_aqi['year'] = df_aqi['timestamp'].dt.year
annual = df_aqi.groupby('year', as_index=False)['aqi'].mean()

# Line chart with enlarged, outlined markers.
fig = px.line(
    annual,
    x='year',
    y='aqi',
    markers=True,
    template='plotly_white',
    title='Mean Annual AQI (2018â€“2024)',
)
fig.update_traces(marker={'size': 10, 'line': {'width': 2}})
fig.show()


Insights to add in slides:

AQI peaks every winter

Clear dip in 2020 (COVID lockdown)

Rising trend post-2021

In [None]:
# ================================
# BLOCK 4: Weekday vs Weekend AQI
# ================================

# dayofweek runs 0 (Monday) .. 6 (Sunday); 5 and 6 are the weekend.
weekend_days = [5, 6]
df_aqi['dayofweek'] = df_aqi['timestamp'].dt.dayofweek
df_aqi['is_weekend'] = df_aqi['dayofweek'].isin(weekend_days)

# Mean AQI for each of the two groups.
week = df_aqi.groupby('is_weekend', as_index=False)['aqi'].mean()

fig = px.bar(
    week,
    x='is_weekend',
    y='aqi',
    title='Average AQI: Weekend vs Weekday',
    labels={'is_weekend': 'Weekend (True/False)'},
    template='plotly_white',
)
fig.show()


In [None]:
# =======================================
# BLOCK 3: Monthly AQI Seasonal Pattern
# =======================================

# Relies on df_aqi['year'], which the annual-trend cell adds.
df_aqi['month'] = df_aqi['timestamp'].dt.month
monthly = df_aqi.groupby(['year', 'month'], as_index=False)['aqi'].mean()

# One line per year, overlaid on a shared January-December axis.
fig = px.line(
    monthly,
    x='month',
    y='aqi',
    color='year',
    template='plotly_white',
    title='Monthly AQI Cycles (Overlay by Year)',
)
fig.update_layout(legend_title="Year")
fig.show()


In [None]:
# ============================================
# BLOCK 3 - Beautiful AQI Map with Hover Labels
# ============================================

import folium
import pandas as pd

# Fresh copy of the AQI data with lowercased, stripped headers.
df_aqi = pd.read_csv("/kaggle/input/delhihel/Delhi_AQI_2018-2024.csv", parse_dates=["Timestamp"])
df_aqi.columns = [header.lower().strip() for header in df_aqi.columns]

# One row per station: mean AQI plus the first recorded lat/lon.
site_avg = df_aqi.groupby('site_clean', as_index=False).agg(
    aqi=('aqi', 'mean'),
    lat=('lat', 'first'),
    lon=('lon', 'first'),
)

# AQI color function
def aqi_color(aqi):
    """Return the hex colour used to draw a station with the given AQI.

    Bands are inclusive of their upper bound: <=50 green, <=100 yellow,
    <=200 orange, <=300 red, <=400 purple, anything above maroon.

    A missing value (None / NaN) returns a neutral grey. Without this guard
    every comparison against NaN is False, so a station with no data would be
    drawn in the maroon "worst" colour, and None would raise a TypeError.
    """
    if pd.isna(aqi):
        return "#9e9e9e"            # grey: no data
    if aqi <= 50:   return "#00e400"  # green
    elif aqi <= 100:return "#ffff00"  # yellow
    elif aqi <= 200:return "#ff7e00"  # orange
    elif aqi <= 300:return "#ff0000"  # red
    elif aqi <= 400:return "#8f3f97"  # purple
    else:           return "#7e0023"  # maroon

# Base map centred on the mean station coordinates.
map_centre = [site_avg['lat'].mean(), site_avg['lon'].mean()]
m = folium.Map(location=map_centre, zoom_start=11, tiles="CartoDB Positron")

# One circle per station: colour from its mean AQI, station name on hover,
# name plus mean AQI (two decimals) in the click popup.
for station in site_avg.itertuples(index=False):
    hover_label = folium.Tooltip(
        text=f"{station.site_clean}",
        sticky=True,
        direction="top",
        opacity=0.9,
    )
    marker = folium.CircleMarker(
        location=[station.lat, station.lon],
        radius=9,
        color=aqi_color(station.aqi),
        fill=True,
        fill_opacity=0.85,
        weight=2,
        tooltip=hover_label,
        popup=f"<b>{station.site_clean}</b><br>AQI: {station.aqi:.2f}",
    )
    marker.add_to(m)

m.save("/kaggle/working/Delhi_AQI_Map_Clean_Hover.html")
m


In [None]:
# ==================================
# BLOCK 6: Pollutant Distributions
# ==================================

# One 30-bin histogram per pollutant column of the preprocessed frame.
polls = ['pm25', 'pm10', 'no', 'no2', 'nh3', 'so2', 'co', 'ozone']
df.hist(column=polls, bins=30, figsize=(16, 14), color='teal')
plt.suptitle("Pollutant Value Distributions", fontsize=18)
plt.show()


In [None]:
# ==================================
# BLOCK 7: Correlation Heatmap
# ==================================

# Pairwise correlation between the pollutant columns (uses `polls` from the
# distributions cell).
corr = df.loc[:, polls].corr()

# Annotated heatmap on a diverging red/blue scale.
fig = px.imshow(
    corr,
    text_auto=".2f",
    color_continuous_scale='RdBu_r',
    template="plotly_white",
    title="Pollutant Correlation Matrix",
)
fig.show()


In [None]:
# =============================================
# BLOCK A: Correlation of AQI vs Pollutants
# =============================================

# Reload the preprocessed frame so the cell does not depend on earlier edits.
df = pd.read_csv('/kaggle/working/MLDelhi2022_preprocessed.csv', parse_dates=['timestamp'])

polls = ['pm25', 'pm10', 'no', 'no2', 'nh3', 'so2', 'co', 'ozone']

# Take the 'aqi' column of the full correlation matrix, strongest first.
full_corr = df[[*polls, 'aqi']].corr()
corr_aqi = full_corr.loc[:, 'aqi'].sort_values(ascending=False)
display(corr_aqi)


In [None]:
# =======================================================
# BLOCK B: Plotly Heatmap - AQI vs Pollutants Only
# =======================================================

import plotly.express as px

# Single-column frame: correlation of each pollutant with AQI
# (the AQI-vs-AQI row is left out by selecting only the pollutant rows).
corr_matrix = df[[*polls, 'aqi']].corr().loc[polls, ['aqi']]

fig = px.imshow(
    corr_matrix,
    aspect="auto",
    text_auto=".2f",
    template="plotly_white",
    color_continuous_scale='RdBu_r',
    title="Correlation of AQI with Individual Pollutants",
)
fig.update_layout(coloraxis_colorbar={'title': "Correlation"})
fig.show()


In [None]:
# ============================================
# BLOCK 1 - Event Impact (November + Diwali)
# ============================================

import pandas as pd
import plotly.express as px

# Load fresh so this cell does not depend on columns added by earlier cells.
df_aqi = pd.read_csv("/kaggle/input/delhihel/Delhi_AQI_2018-2024.csv", parse_dates=["Timestamp"])
df_aqi.columns = df_aqi.columns.str.lower().str.strip()

# Calendar parts used below
df_aqi['year']  = df_aqi['timestamp'].dt.year
df_aqi['month'] = df_aqi['timestamp'].dt.month
df_aqi['day']   = df_aqi['timestamp'].dt.day

# -----------------------------
# Crop Burning Impact (November)
# -----------------------------
# Mean AQI per day-of-month, pooled over every November in the data.
df_nov = df_aqi[df_aqi['month'] == 11]

nov_daily = df_nov.groupby('day')['aqi'].mean().reset_index()

fig = px.line(
    nov_daily, x='day', y='aqi',
    title='Average AQI in November (Crop Burning Season)',
    template='plotly_white', markers=True
)
fig.show()

# -----------------------------
# Diwali Impact (+/-7 days)
# -----------------------------
# NOTE(review): the list stops at 2022 although the file name suggests data
# through 2024 - confirm whether later Diwali dates should be added.
diwali_dates = ['2018-11-07','2019-10-27','2020-11-14','2021-11-04','2022-10-24']
WINDOW_DAYS = 7
diwali_ts = pd.to_datetime(diwali_dates)
d_start = diwali_ts - pd.Timedelta(days=WINDOW_DAYS)
d_end   = diwali_ts + pd.Timedelta(days=WINDOW_DAYS)

# Select every reading inside any window. The upper bound is the midnight
# AFTER the last day (exclusive); comparing against d_end itself would keep
# only the 00:00 reading of day +7 and skew that day's mean.
mask = pd.Series(False, index=df_aqi.index)
for s, e in zip(d_start, d_end):
    mask |= (df_aqi['timestamp'] >= s) & (df_aqi['timestamp'] < e + pd.Timedelta(days=1))

df_diwali = df_aqi[mask]

# Daily means. resample('D') also emits the (empty) days between the yearly
# windows, so each day is re-expressed as an offset from that year's Diwali
# and only offsets inside the window are kept.
diwali_daily = df_diwali.set_index('timestamp').resample('D')['aqi'].mean().reset_index()
diwali_by_year = {d.year: d for d in diwali_ts}
diwali_daily['diwali_year'] = diwali_daily['timestamp'].dt.year
diwali_daily['days_from_diwali'] = (
    diwali_daily['timestamp']
    - pd.to_datetime(diwali_daily['diwali_year'].map(diwali_by_year))
).dt.days
diwali_daily = diwali_daily[
    diwali_daily['days_from_diwali'].abs() <= WINDOW_DAYS
].reset_index(drop=True)

# One line per year on a shared -7..+7 axis, so the years can be compared
# directly instead of being spread along a multi-year date axis.
fig2 = px.line(
    diwali_daily, x='days_from_diwali', y='aqi', color='diwali_year',
    labels={'days_from_diwali': 'Days from Diwali', 'diwali_year': 'Year'},
    title="AQI Around Diwali (Â±7 Days)",
    template='plotly_white', markers=True
)
fig2.show()


In [None]:
# ============================================
# BLOCK 2 â€” AQI Category Distribution (Pie)
# ============================================

def categorize(aqi):
    """Map an AQI value to its category label.

    Bands are inclusive of their upper bound: <=50 "Good", <=100
    "Satisfactory", <=200 "Moderate", <=300 "Poor", <=400 "Very Poor",
    anything above "Severe".

    Returns None for a missing value (None / NaN). Without this guard every
    comparison against NaN is False, so missing readings would be counted as
    "Severe"; returning None lets value_counts() skip them.
    """
    if pd.isna(aqi):
        return None
    if aqi <= 50:   return "Good"
    elif aqi <= 100:return "Satisfactory"
    elif aqi <= 200:return "Moderate"
    elif aqi <= 300:return "Poor"
    elif aqi <= 400:return "Very Poor"
    else:           return "Severe"

# Label every reading, then count readings per label.
df_aqi['aqi_cat'] = df_aqi['aqi'].map(categorize)

# Naming the axis and the counts explicitly yields the same
# ['category', 'count'] columns whatever names value_counts() produces.
cat_count = (
    df_aqi['aqi_cat']
    .value_counts()
    .rename_axis('category')
    .reset_index(name='count')
)

# Donut chart with percentage and label drawn inside each slice.
fig = px.pie(
    cat_count,
    values='count',
    names='category',
    hole=0.35,
    color_discrete_sequence=px.colors.sequential.OrRd,
    title="AQI Category Distribution (2018â€“2024)",
)
fig.update_traces(textinfo='percent+label', textposition='inside')
fig.show()


In [None]:
# ================================
# BLOCK 8: PCA on Pollutants
# ================================

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Keep complete rows only, then standardise each pollutant column.
X = df[polls].dropna()
scaler = StandardScaler()
Xs = scaler.fit_transform(X)

# Project onto the first two principal components.
pca = PCA(n_components=2)
components = pca.fit_transform(Xs)
pca_df = pd.DataFrame(components, columns=['PC1', 'PC2'])

fig = px.scatter(
    pca_df,
    x='PC1',
    y='PC2',
    opacity=0.5,
    template='plotly_white',
    title="PCA: Pollution Pattern Clusters",
)
fig.show()

# Share of variance captured by each of the two components (cell output).
explained = pca.explained_variance_ratio_
explained


In [None]:
# knn

In [None]:
# ===============================================
# BLOCK 1 - KNN end-to-end (split -> tune k -> train -> predict)
# ===============================================

import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# LOAD DATA
df = pd.read_csv('/kaggle/working/MLDelhi2022_preprocessed.csv', parse_dates=['timestamp'])

# Rows without a target cannot be trained or scored on. The stable sort
# guarantees chronological order for the positional split below without
# reshuffling rows that share a timestamp.
df = (
    df.dropna(subset=['pm25'])
    .sort_values('timestamp', kind='stable')
    .reset_index(drop=True)
)

# TARGET: PM2.5
y = df['pm25']

# FEATURES:
# NOTE(review): 'aqi' is presumably derived from pollutant concentrations
# including PM2.5; if so it leaks the target - confirm before trusting scores.
num_features = [
    'pm10','no','no2','nh3','so2','co','ozone',
    'aqi',
    'year','month','day','dayofweek','hour',
    'lat','lon',
    'is_weekend'
]

cat_features = ['dayname','monthname','season','site_clean']

# Numerical features as float (also turns the boolean is_weekend into 0/1)
X_num = df[num_features].astype(float)

# One-hot encode categorical features
X_cat = pd.get_dummies(df[cat_features], drop_first=True)

# Final Features
X = pd.concat([X_num, X_cat], axis=1)

# TRAIN-TEST SPLIT (time-based: first 80% train, last 20% test)
split = int(len(X) * 0.8)
X_train, X_test = X.iloc[:split].copy(), X.iloc[split:].copy()
y_train, y_test = y.iloc[:split], y.iloc[split:]

# KNN cannot handle NaN inputs. Fill numeric gaps with medians computed on
# the training rows only, so nothing from the test period leaks in.
train_medians = X_train[num_features].median()
X_train[num_features] = X_train[num_features].fillna(train_medians)
X_test[num_features] = X_test[num_features].fillna(train_medians)

# FIND BEST K on a validation slice carved from the END of the training
# period. Tuning k against the test set would make the reported test
# metrics optimistic.
val_split = int(len(X_train) * 0.8)
X_fit, X_val = X_train.iloc[:val_split], X_train.iloc[val_split:]
y_fit, y_val = y_train.iloc[:val_split], y_train.iloc[val_split:]

tune_scaler = StandardScaler().fit(X_fit)
X_fit_s = tune_scaler.transform(X_fit)
X_val_s = tune_scaler.transform(X_val)

errors = []
for k in range(1, 21):
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(X_fit_s, y_fit)
    pred = model.predict(X_val_s)
    # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated and no
    # longer accepted by recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_val, pred))
    errors.append(rmse)

best_k = int(np.argmin(errors)) + 1   # errors[0] corresponds to k=1
print("BEST K =", best_k)

# SCALE FEATURES (statistics from the full training set only)
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# FINAL MODEL
knn = KNeighborsRegressor(n_neighbors=best_k)
knn.fit(X_train_s, y_train)
pred_knn = knn.predict(X_test_s)

# METRICS (test set, untouched during tuning)
mae  = mean_absolute_error(y_test, pred_knn)
rmse = np.sqrt(mean_squared_error(y_test, pred_knn))
r2   = r2_score(y_test, pred_knn)

print("\nMODEL PERFORMANCE")
print("MAE :", mae)
print("RMSE:", rmse)
print("RÂ²  :", r2)

# Save predictions for next block (index = row position in the sorted frame)
pred_df = pd.DataFrame({
    "actual": y_test.values,
    "predicted": pred_knn
}, index=y_test.index)

pred_df.to_csv("/kaggle/working/knn_predictions.csv")
print("Saved -> /kaggle/working/knn_predictions.csv")


In [None]:
# ===============================================
# BLOCK 2 - KNN Failure Analysis
# ===============================================

import pandas as pd
import numpy as np
import plotly.graph_objects as go

# Predictions written by the KNN training cell (first CSV column = index).
pred_df = pd.read_csv("/kaggle/working/knn_predictions.csv", index_col=0)

# Absolute error per prediction
pred_df["error"] = pred_df["actual"].sub(pred_df["predicted"]).abs()

# Summary statistics of the absolute error
abs_error = pred_df["error"]
print("Mean Error:", abs_error.mean())
print("Median Error:", abs_error.median())
print("Max Error:", abs_error.max())

# Failure rate: share of predictions whose error exceeds the threshold
# (tunable; same units as the target).
threshold = 30
fail_rate = abs_error.gt(threshold).mean() * 100
print(f"\nFAILURE RATE (> {threshold}): {fail_rate:.2f}%")

# Ten largest errors
worst = pred_df.sort_values("error", ascending=False).head(10)
print("\nTOP 10 WORST FAILURES")
print(worst)

# Bar chart of the single worst prediction (wd is its row index label)
wd = worst.index[0]
actual = pred_df.loc[wd, "actual"]
pred   = pred_df.loc[wd, "predicted"]

fig = go.Figure()
for bar_label, bar_value, legend_name in (
    ("Actual PM2.5", actual, "Actual"),
    ("Predicted PM2.5", pred, "KNN Prediction"),
):
    fig.add_trace(go.Bar(x=[bar_label], y=[bar_value], name=legend_name))
fig.update_layout(
    template="plotly_white",
    title=f"KNN Failure Example â€” Index {wd} (Error: {abs(actual-pred):.1f})",
)
fig.show()


KNN, decision tree, and linear regression will fail due to non-linear data.

In [None]:
# station wise EDA

In [None]:
import pandas as pd

# Fresh copy of the AQI data with lowercased, stripped headers.
df_aqi = pd.read_csv('/kaggle/input/delhihel/Delhi_AQI_2018-2024.csv', parse_dates=['Timestamp'])
df_aqi.columns = [c.lower().strip() for c in df_aqi.columns]

# Fall back to the raw 'site' column only when there is no 'site_clean'.
# Renaming unconditionally would create two columns called 'site_clean'
# whenever both exist, and the groupby below would then fail.
if 'site_clean' not in df_aqi.columns and 'site' in df_aqi.columns:
    df_aqi = df_aqi.rename(columns={'site': 'site_clean'})

# Create date parts
df_aqi['year'] = df_aqi['timestamp'].dt.year

# Daily mean AQI per station. The day key is a midnight-normalised
# timestamp named 'date', so the result needs no rename afterwards.
day_key = df_aqi['timestamp'].dt.normalize().rename('date')
daily_station = (
    df_aqi.groupby(['site_clean', day_key])
    .agg(aqi=('aqi', 'mean'))
    .reset_index()
)

daily_station['date'] = pd.to_datetime(daily_station['date'])
daily_station['year'] = daily_station['date'].dt.year

daily_station.head()


In [None]:
import plotly.express as px

# Yearly mean of the daily station averages.
yr_station = daily_station.groupby(['site_clean', 'year'], as_index=False)['aqi'].mean()

# Restrict the chart to the stations with the most raw records.
topN = 12
top_sites = df_aqi['site_clean'].value_counts().head(topN).index
is_top_site = yr_station['site_clean'].isin(top_sites)
plot_df = yr_station.loc[is_top_site]

fig = px.line(
    plot_df,
    x='year',
    y='aqi',
    color='site_clean',
    markers=True,
    template='plotly_white',
    title='Yearly AQI Trend by Station (2018â€“2024)',
)

fig.show()


In [None]:
# Mean of the daily averages per station, highest (most polluted) first.
station_rank = (
    daily_station.groupby('site_clean', as_index=False)['aqi']
    .mean()
    .sort_values('aqi', ascending=False)
)

top10 = station_rank.iloc[:10]

# Bar chart of the ten highest stations, labelled with one decimal place.
fig = px.bar(
    top10,
    x='site_clean',
    y='aqi',
    text='aqi',
    template='plotly_white',
    title='Top 10 Most Polluted Stations (2018â€“2024)',
)
fig.update_traces(texttemplate='%{text:.1f}')
fig.show()

station_rank.head(20)


In [None]:
# Preprocessed pollutant data with lowercased, stripped headers.
df_poll = pd.read_csv('/kaggle/working/MLDelhi2022_preprocessed.csv', parse_dates=['timestamp'])
df_poll = df_poll.rename(columns=lambda header: header.lower().strip())

# Daily mean PM2.5 per station; the calendar-day key is named 'date' up front.
calendar_day = df_poll['timestamp'].dt.date.rename('date')
daily_pm = (
    df_poll.groupby(['site_clean', calendar_day])['pm25']
    .mean()
    .reset_index()
)

# Convert the day key back to datetime and derive the year from it.
daily_pm['date'] = pd.to_datetime(daily_pm['date'])
daily_pm['year'] = daily_pm['date'].dt.year

daily_pm.head()


In [None]:
# Daily PM2.5 threshold used for the exceedance rate.
# NOTE(review): 25 looks like the older WHO 24-hour guideline value; confirm
# whether a newer, lower guideline should be used instead.
WHO_LIMIT = 25

# Percentage of station-days above the limit, per station, highest first.
exceed = (
    daily_pm['pm25'].gt(WHO_LIMIT)
    .groupby(daily_pm['site_clean'])
    .mean()
    .mul(100)
    .rename('who_exceed_pct')
    .reset_index()
    .sort_values('who_exceed_pct', ascending=False)
)

# Bar chart of the 15 stations with the highest exceedance rate.
fig = px.bar(
    exceed.head(15),
    x='site_clean',
    y='who_exceed_pct',
    text='who_exceed_pct',
    template='plotly_white',
    title='WHO PM2.5 Exceedance Rate by Station (Daily > 25 Âµg/mÂ³)',
)
fig.update_traces(marker_color='crimson', texttemplate='%{text:.1f}%')
fig.show()

exceed.head(10)


In [None]:
# Season lookup (month number -> label), built from a season -> months table.
months_by_season = {
    'Winter': (12, 1, 2),
    'Spring': (3, 4),
    'Summer': (5, 6),
    'Monsoon': (7, 8),
    'Autumn': (9, 10, 11),
}
season_map = {
    month_no: season_name
    for season_name, month_nos in months_by_season.items()
    for month_no in month_nos
}
daily_pm['season'] = daily_pm['date'].dt.month.map(season_map)

# Keep the nine stations with the most daily rows.
top9 = daily_pm['site_clean'].value_counts().head(9).index
plot_df = daily_pm.loc[daily_pm['site_clean'].isin(top9)]

# 3-wide grid of box plots: one panel per station, one box per season.
fig = px.box(
    plot_df,
    x='season',
    y='pm25',
    color='season',
    facet_col='site_clean',
    facet_col_wrap=3,
    template='plotly_white',
    title='Seasonal PM2.5 Distribution (Top 9 Stations)',
)
fig.show()


In [None]:
# Daily-mean PM2.5 level above which a day counts as "extreme".
EXTREME = 300

# Number of extreme days per station, highest first.
extreme_rank = (
    daily_pm['pm25'].gt(EXTREME)
    .groupby(daily_pm['site_clean'])
    .sum()
    .rename('extreme_days')
    .reset_index()
    .sort_values('extreme_days', ascending=False)
)

# Bar chart of the 12 stations with the most extreme days.
fig = px.bar(
    extreme_rank.head(12),
    x='site_clean',
    y='extreme_days',
    text='extreme_days',
    template='plotly_white',
    title='Stations With Most Extreme Pollution Days (PM2.5 > 300)',
)
fig.update_traces(texttemplate='%{text}')
fig.show()

extreme_rank.head(15)


In [None]:
# Decision tree

In [None]:
# BLOCK 1 - Train a compact Decision Tree (single runnable cell)
# Reads the preprocessed CSV, fits a depth-4 regressor on the first 80% of
# rows (chronological) and saves the model plus its feature list.

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import joblib
import os

# --- Load dataset (your preprocessed file) ---
path = "/kaggle/working/MLDelhi2022_preprocessed.csv"
df = pd.read_csv(path, parse_dates=["timestamp"], low_memory=False)
print("Loaded rows:", len(df))

# --- Choose target and features (interpretable set) ---
target = "pm25"   # predicting PM2.5 (meaningful)
candidate_num = ["pm10","no","no2","nh3","so2","co","ozone","lat","lon","hour","month","dayofweek"]
# Keep only those present
features = [c for c in candidate_num if c in df.columns]
print("Using features:", features)

# --- Drop rows where target is missing, keep time ordering ---
df = df.dropna(subset=[target]).sort_values("timestamp").reset_index(drop=True)

# --- Median impute numeric features (safe) ---
for c in features:
    if df[c].dtype == 'O':
        # object column: coerce to numeric, non-numeric entries become NaN
        df[c] = pd.to_numeric(df[c], errors='coerce')
    med = df[c].median()
    df[c] = df[c].fillna(med)

# Add the weekend flag as a 0/1 feature when the column exists
if "is_weekend" in df.columns:
    df["is_weekend"] = df["is_weekend"].astype(int)
    if "is_weekend" not in features:
        features = features + ["is_weekend"]

# --- Create X, y and a time-based split (80/20 chronological) ---
X = df[features]
y = df[target]

split = int(len(X) * 0.8)
X_train, X_test = X.iloc[:split].copy(), X.iloc[split:].copy()
y_train, y_test = y.iloc[:split].copy(), y.iloc[split:].copy()

print("Train / Test sizes:", X_train.shape, X_test.shape)

# --- Train small Decision Tree (interpretable) ---
dt = DecisionTreeRegressor(max_depth=4, min_samples_leaf=30, random_state=42)
dt.fit(X_train, y_train)

# --- Save model & feature list ---
os.makedirs("/kaggle/working/models", exist_ok=True)
joblib.dump(dt, "/kaggle/working/models/decision_tree_pm25_depth4.pkl")
# header=False: the evaluation cell reads this file with header=None, so the
# default header row ("0") would otherwise be loaded as a bogus feature name.
pd.Series(features).to_csv("/kaggle/working/models/dt_features_pm25.csv", index=False, header=False)

print("Model trained and saved â†’ /kaggle/working/models/decision_tree_pm25_depth4.pkl")


In [None]:
# FIXED BLOCK 2 - Evaluate, visualize tree, feature importance, extract rules & failure stats (robust)
# Reloads the model and feature list saved by the training cell, rebuilds the
# same chronological 80/20 split and writes metrics/plots to /kaggle/working/outputs.
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.tree import plot_tree, export_text
import plotly.graph_objects as go
import os, warnings

# NOTE(review): this silences ALL warnings for the rest of the session,
# including deprecation notices - consider narrowing it.
warnings.filterwarnings("ignore")

# --- reload model & features ---
model_path = "/kaggle/working/models/decision_tree_pm25_depth4.pkl"
feat_path  = "/kaggle/working/models/dt_features_pm25.csv"
dt = joblib.load(model_path)

# Read feature file robustly and filter to actual df columns later
features_raw = pd.read_csv(feat_path, header=None).iloc[:,0].astype(str).str.strip().tolist()
print("Raw features loaded from file:", features_raw[:50])

# --- reload data (same preprocessing as Block 1) ---
df = pd.read_csv("/kaggle/working/MLDelhi2022_preprocessed.csv", parse_dates=["timestamp"], low_memory=False)
df = df.dropna(subset=["pm25"]).sort_values("timestamp").reset_index(drop=True)

# Clean column names (defensive)
df.columns = df.columns.map(lambda c: c.strip() if isinstance(c, str) else c)

# A feature file written with pandas' default header starts with the literal
# "0"; drop it so it is not reported as a missing feature.
if features_raw[:1] == ["0"] and "0" not in df.columns:
    features_raw = features_raw[1:]

# Filter features to those actually present in df
features = [f for f in features_raw if f in df.columns]
missed = [f for f in features_raw if f not in features]
if missed:
    print("Warning: the following saved features were NOT found in the dataframe and will be ignored:", missed)
print("Final features used:", features)

# If no features left, stop with clear message
if len(features) == 0:
    raise RuntimeError("No valid features found. Check /kaggle/working/models/dt_features_pm25.csv and dataset columns.")

# Impute numeric features safely (coerce to numeric, then median-fill)
for c in features:
    df[c] = pd.to_numeric(df[c], errors='coerce')
    df[c] = df[c].fillna(df[c].median())

if "is_weekend" in df.columns and "is_weekend" in features:
    df["is_weekend"] = df["is_weekend"].astype(int)

# Build X, y and time splits (first 80% train, last 20% test)
X = df[features]
y = df["pm25"]

split = int(len(X) * 0.8)
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

# --- predict ---
y_pred = dt.predict(X_test)
# make a series aligned with y_test index for easy lookup
y_pred_series = pd.Series(y_pred, index=y_test.index)

# --- metrics ---
mae = mean_absolute_error(y_test, y_pred_series)
# RMSE as sqrt(MSE): the `squared=False` keyword is deprecated and no longer
# accepted by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred_series))
r2 = r2_score(y_test, y_pred_series)
print(f"MAE: {mae:.2f}  RMSE: {rmse:.2f}  R2: {r2:.3f}")

# --- Failure stats ---
abs_err = (y_test - y_pred_series).abs()
pct_fail_30 = (abs_err > 30).mean() * 100   # share of predictions off by more than 30
pct_fail_50 = (abs_err > 50).mean() * 100
print(f"Percent predictions with abs error>30: {pct_fail_30:.2f}%")
print(f"Percent predictions with abs error>50: {pct_fail_50:.2f}%")

# Top-10 worst failures (using aligned series)
worst = abs_err.sort_values(ascending=False).head(10)
worst_idx = worst.index.tolist()

worst_df = pd.DataFrame({
    "timestamp": df.loc[worst_idx, "timestamp"].values,
    "actual_pm25": y_test.loc[worst_idx].values,
    "predicted_pm25": y_pred_series.loc[worst_idx].values,
    "abs_error": worst.values
})
worst_df = worst_df.reset_index(drop=True)
print("Top 10 worst failures:")
display(worst_df)

# --- Save failure table ---
os.makedirs("/kaggle/working/outputs", exist_ok=True)
worst_df.to_csv("/kaggle/working/outputs/dt_top10_failures.csv", index=False)
print("Saved -> /kaggle/working/outputs/dt_top10_failures.csv")

# --- Plot Actual vs Predicted (last 120 ROWS of the test set for clarity) ---
# The slice is by row count, not by calendar day, so the title says "rows".
n_plot = min(120, len(y_test))
idxs = y_test.index[-n_plot:]
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.loc[idxs,"timestamp"], y=y_test.loc[idxs], mode='lines', name='Actual PM2.5', line=dict(color='black')))
fig.add_trace(go.Scatter(x=df.loc[idxs,"timestamp"], y=y_pred_series.loc[idxs], mode='lines', name='DT Predicted PM2.5', line=dict(color='orange', dash='dash')))
fig.update_layout(title="Decision Tree â€” Actual vs Predicted (last {} test rows)".format(n_plot),
                  xaxis_title="Date", yaxis_title="PM2.5 (Âµg/mÂ³)", template='plotly_white')
fig.show()

# --- Plot residuals histogram ---
plt.figure(figsize=(6,4))
plt.hist((y_test - y_pred_series).dropna(), bins=60)
plt.title("Residuals (y_test - y_pred)")
plt.xlabel("Error (Âµg/mÂ³)")
plt.ylabel("Count")
plt.tight_layout()
plt.savefig("/kaggle/working/outputs/dt_residuals_hist.png", dpi=150)
plt.show()
print("Saved -> /kaggle/working/outputs/dt_residuals_hist.png")

# --- Plot and save tree (matplotlib) ---
plt.figure(figsize=(20,10))
plot_tree(dt, feature_names=features, fontsize=10, filled=True, rounded=True, max_depth=4)
plt.title("Decision Tree (max_depth=4)")
plt.savefig("/kaggle/working/outputs/dt_tree.png", dpi=200, bbox_inches='tight')
plt.show()
print("Saved tree image -> /kaggle/working/outputs/dt_tree.png")

# --- Feature importance bar plot ---
imp = pd.Series(dt.feature_importances_, index=features).sort_values(ascending=True)
plt.figure(figsize=(8,6))
imp.plot(kind='barh')
plt.title("Decision Tree Feature Importances")
plt.xlabel("Importance")
plt.tight_layout()
plt.savefig("/kaggle/working/outputs/dt_feature_importance.png", dpi=180)
plt.show()
print("Saved feature importance -> /kaggle/working/outputs/dt_feature_importance.png")

# --- Extract readable rules (text) ---
rules = export_text(dt, feature_names=features, max_depth=4)
with open("/kaggle/working/outputs/dt_rules.txt","w") as f:
    f.write(rules)
print("Saved textual rules -> /kaggle/working/outputs/dt_rules.txt")
print("\n--- Sample rules (top) ---\n")
print("\n".join(rules.splitlines()[:40]))


In [None]:
# Association rule mining


In [7]:

# Reload the preprocessed frame and list the dtype of every column.
df = pd.read_csv(
    "/kaggle/working/MLDelhi2022_preprocessed.csv",
    low_memory=False,
    parse_dates=["timestamp"],
)

print(df.dtypes)




FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/working/MLDelhi2022_preprocessed.csv'