In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score
#import seaborn as sns
#import matplotlib.pyplot as plt
import joblib

In [2]:
df = pd.read_csv("datasets/transformed_data.csv")

In [3]:
df.head()

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI
0,Ahmedabad,2015-01-01,67.450578,118.127103,0.92,18.22,17.15,23.483476,0.92,27.64,133.36,0.0,0.02,0.0,166.463581
1,Ahmedabad,2015-01-02,67.450578,118.127103,0.97,15.69,16.46,23.483476,0.97,24.55,34.06,3.68,5.5,3.77,166.463581
2,Ahmedabad,2015-01-03,67.450578,118.127103,17.4,19.3,29.7,23.483476,17.4,29.07,30.7,6.8,16.4,2.25,166.463581
3,Ahmedabad,2015-01-04,67.450578,118.127103,1.7,18.48,17.97,23.483476,1.7,18.59,36.08,4.43,10.14,1.0,166.463581
4,Ahmedabad,2015-01-05,67.450578,118.127103,22.1,21.42,37.76,23.483476,22.1,39.33,39.31,7.01,18.89,2.78,166.463581


In [4]:
# Split the data into features and target variable
X = df.drop(['City', 'Date', 'AQI','Toluene','Xylene','Benzene'], axis=1)  # Features
y = df['AQI']  # Target variable

In [5]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [6]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Create LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

In [8]:
# Train the model with default parameters
bst = lgb.LGBMRegressor()
bst.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000747 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 22851, number of used features: 9
[LightGBM] [Info] Start training from score 157.974617


In [9]:
# Make predictions
y_pred = bst.predict(X_test)

In [10]:
# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Print MSE
print("Mean Squared Error:", mse)


Mean Squared Error: 1354.935110330556


In [11]:
# Calculate the R-squared value
r2 = r2_score(y_test, y_pred)
print("R-squared value:", r2)

R-squared value: 0.8824164105705051


In [14]:
# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.05, 0.1, 0.3],
    'max_depth': [3, 4, 5, 6, 7],
    'num_leaves': [15, 31, 63, 127],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1, 0.5, 1.0],
    'reg_lambda': [0, 0.1, 0.5, 1.0]
}

# Initialize LightGBM regressor
lgb_regressor = lgb.LGBMRegressor()

# Randomized Search Cross Validation
lgb_random = RandomizedSearchCV(estimator=lgb_regressor, param_distributions=param_grid,
                                n_iter=200, cv=3, verbose=2, random_state=42, n_jobs=-1)

# Perform the search
lgb_random.fit(X_train, y_train)

# Best parameters found
print("Best parameters found:", lgb_random.best_params_)

# Refit the model with best parameters
best_lgb_regressor = lgb_random.best_estimator_
best_lgb_regressor.fit(X_train, y_train)

# Make predictions
y_pred = best_lgb_regressor.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R-squared value:", r2)

Fitting 4 folds for each of 200 candidates, totalling 800 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001111 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 22851, number of used features: 9
[LightGBM] [Info] Start training from score 157.974617
Best parameters found: {'subsample': 0.8, 'reg_lambda': 0.5, 'reg_alpha': 0.5, 'num_leaves': 127, 'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.8}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001165 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 22851, number of used features: 9
[LightGBM] [Info] Start training from score 157.974617
Mean Squared Error: 1368.6476854473688
R-squared value: 0.88122640981

In [13]:
joblib.dump(lgb_random, 'lgb_reg.pkl')

['lgb_reg.pkl']