In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import pickle

# -----------------------------
# 1. LOAD DATASET
# -----------------------------
# The path is relative, so city_day.csv is resolved against the current
# working directory (normally the notebook's folder).
#
# One chained expression does all the cleaning:
#   * row filter    -> only records whose City is 'Delhi'
#   * column filter -> the three model inputs plus the AQI target
#   * dropna        -> discard any row missing one of those four values
df = (
    pd.read_csv("city_day.csv")
    .loc[
        lambda frame: frame['City'] == 'Delhi',
        ['PM2.5', 'CO', 'NO2', 'AQI'],
    ]
    .dropna()
)

# Quick sanity check: row/column count and the first few records.
print("Dataset shape:", df.shape)
print(df.head())

# -----------------------------
# 2. SPLIT DATA (80% train, 20% test)
# -----------------------------
# Inputs are the three predictor columns; the target is the AQI column.
X = df.loc[:, ['PM2.5', 'CO', 'NO2']]
y = df.loc[:, 'AQI']

# Hold out 20% of the rows for evaluation. The fixed random_state keeps the
# shuffle, and therefore the split, identical from run to run.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# -----------------------------
# 3. TRAIN MODEL
# -----------------------------
# fit() hands back the estimator it was called on, so building the model and
# fitting it to the training split can be a single expression.
model = LinearRegression().fit(X_train, y_train)

# -----------------------------
# 4. CHECK ACCURACY
# -----------------------------
# Predict on the held-out rows, then score with mean absolute error: the
# average absolute gap between predicted and actual AQI (lower is better).
pred = model.predict(X_test)
error = mean_absolute_error(y_true=y_test, y_pred=pred)

print("Model MAE:", error)

# -----------------------------
# 5. SAVE MODEL
# -----------------------------
# Serialise the fitted estimator to aqi_model.pkl in the working directory.
# The context manager flushes and closes the file handle as soon as the dump
# finishes (or raises), instead of leaving an anonymous open() object for
# the garbage collector to close at some later point.
#
# NOTE: only unpickle this file when it comes from a trusted source;
# pickle.load can execute arbitrary code.
with open("aqi_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
print("Model saved as aqi_model.pkl")


Dataset shape: (1998, 4)
        PM2.5     CO    NO2    AQI
10229  313.22  15.20  36.39  472.0
10230  186.18   9.54  32.87  454.0
10231   87.18  10.61  30.31  143.0
10232  151.84  11.54  36.91  319.0
10233  146.60   9.20  34.92  325.0
Model MAE: 37.71812618605193
Model saved as aqi_model.pkl
