In [1]:
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import joblib

In [2]:
df = pd.read_csv("datasets/transformed_data.csv")

In [3]:
# Split the data into features and target variable
X = df.drop(['City', 'Date', 'AQI','Toluene','Xylene','Benzene'], axis=1)  # Features
y = df['AQI']  # Target variable

In [4]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Initialize XGBoost regressor
xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Fit the model
xgb_regressor.fit(X_train, y_train)

# Make predictions
y_pred = xgb_regressor.predict(X_test)

In [6]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R-squared value:", r2)

Mean Squared Error: 1493.7613809549568
R-squared value: 0.8703688290423055


In [7]:
# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100, 150, 200],  
    'max_depth': [3, 4, 5, 6, 7],       
    'learning_rate': [0.01, 0.05, 0.1, 0.3],       
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],         
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],  
    'gamma': [0, 0.1, 0.2, 0.3, 0.4] 
}

In [8]:
# Initialize XGBoost regressor
xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Randomized Search Cross Validation
xgb_random = RandomizedSearchCV(estimator=xgb_regressor, param_distributions=param_grid,
                                n_iter=200, cv=3, verbose=2, random_state=42, n_jobs=-1)

# Perform the search
xgb_random.fit(X, y)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


In [9]:
y_pred = xgb_random.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R-squared value:", r2)

Mean Squared Error: 771.4846903483331
R-squared value: 0.9330492372738585


In [10]:
joblib.dump(xgb_random, 'xgboost_reg.pkl')

['xgboost_reg.pkl']