Import necessary modules

In [23]:
# Import necessary modules
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Read the preprocessed training table from disk.
data = pd.read_csv('Processed_Train.csv')

# Target label: the last column, taken via `.values`.
y = data.iloc[:, -1].values

# Descriptive features: every column between the first and the last,
# keeping only the numeric-dtype columns.
X = data.iloc[:, 1:-1].select_dtypes(include=np.number)

print("Descriptive Features (X) Dimensions: ", X.shape)
print("Target Label (y) dimensions: ", y.shape)

# Hold out 25% of the rows for validation; the fixed random_state keeps the
# train/validation split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1234,
)

# 1. Linear Regression
# Least-squares baseline: fit on the training split, score on the held-out
# split.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred_lr = lin_reg.predict(X_test)
# RMSE is computed as sqrt(MSE): the `squared=False` flag of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form gives the same value on every version.
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
print("RMSE of Linear Regression: ", rmse_lr)

# 2. Support Vector Regressor (SVR)
# Linear-kernel SVR with default hyperparameters, fit on the training split.
# NOTE(review): the features are passed in unscaled; SVR is sensitive to
# feature scale, so wrapping it in a StandardScaler pipeline may improve both
# fit time and RMSE — confirm before changing the reported numbers.
svr_reg = SVR(kernel="linear")
svr_reg.fit(X_train, y_train)
y_pred_svr = svr_reg.predict(X_test)
# sqrt(MSE) instead of the `squared=False` flag, which is deprecated in
# scikit-learn 1.4 and removed in 1.6.
rmse_svr = np.sqrt(mean_squared_error(y_test, y_pred_svr))
print("RMSE of SVR: ", rmse_svr)

# 3. Random Forest Regressor
# Base estimator; the fixed random_state makes the forest reproducible.
rf_reg = RandomForestRegressor(random_state=1234)

# Hyperparameter tuning for Random Forest
# 3 x 3 x 3 x 3 = 81 candidate settings.
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive search with 3-fold cross-validation on the training split only,
# ranked by negative MSE and run on all available cores (n_jobs=-1).
grid_search_rf = GridSearchCV(rf_reg, param_grid_rf, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

# Score the best estimator found by the search on the held-out split.
rf_reg_best = grid_search_rf.best_estimator_
y_pred_rf = rf_reg_best.predict(X_test)
# sqrt(MSE) instead of the `squared=False` flag, which is deprecated in
# scikit-learn 1.4 and removed in 1.6.
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
print("Best hyperparameters for Random Forest: ", grid_search_rf.best_params_)
print("RMSE of Random Forest Regressor: ", rmse_rf)

# 4. XGBoost Regressor
# Base estimator with a fixed random_state.
xgb_reg = xgb.XGBRegressor(random_state=1234)

# Hyperparameter tuning for XGBoost
# 3 x 3 x 3 = 27 candidate settings.
param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

# Exhaustive search with 3-fold cross-validation on the training split only,
# ranked by negative MSE and run on all available cores (n_jobs=-1).
grid_search_xgb = GridSearchCV(xgb_reg, param_grid_xgb, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_xgb.fit(X_train, y_train)

# Score the best estimator found by the search on the held-out split.
xgb_reg_best = grid_search_xgb.best_estimator_
y_pred_xgb = xgb_reg_best.predict(X_test)
# sqrt(MSE) instead of the `squared=False` flag, which is deprecated in
# scikit-learn 1.4 and removed in 1.6.
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
print("Best hyperparameters for XGBoost: ", grid_search_xgb.best_params_)
print("RMSE of XGBoost Regressor: ", rmse_xgb)

# Ensemble Model (Simple Averaging)
# Unweighted mean of the four held-out prediction vectors (linear regression,
# SVR, tuned random forest, tuned XGBoost).
ensemble_preds = (y_pred_lr + y_pred_svr + y_pred_rf + y_pred_xgb) / 4
# sqrt(MSE) instead of the `squared=False` flag, which is deprecated in
# scikit-learn 1.4 and removed in 1.6.
rmse_ensemble = np.sqrt(mean_squared_error(y_test, ensemble_preds))
print("RMSE of Ensemble Model: ", rmse_ensemble)
# 5. Neural Network
# Small fully connected regressor: 64 -> 32 -> 1 units with ReLU hidden layers
# and a linear output, trained with Adam on mean squared error.
# NOTE(review): no random seed is set for Keras in this section, so this RMSE
# is expected to vary between runs, unlike the estimators that are given
# random_state=1234 — confirm whether reproducibility matters here.
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))
model.compile(optimizer='adam', loss='mean_squared_error')

model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)
# predict() returns shape (n_samples, 1) for the single-unit output layer.
# Flatten it so y_pred_nn is 1-D like y_test and the other y_pred_* arrays;
# combining a (n, 1) array with a (n,) array would silently broadcast to
# (n, n).
y_pred_nn = model.predict(X_test).ravel()
# sqrt(MSE) instead of the `squared=False` flag, which is deprecated in
# scikit-learn 1.4 and removed in 1.6.
rmse_nn = np.sqrt(mean_squared_error(y_test, y_pred_nn))
print("RMSE of Neural Network: ", rmse_nn)


Descriptive Features (X) Dimensions:  (3870, 42)
Target Label (y) dimensions:  (3870,)
RMSE of Linear Regression:  404.10701021694484
RMSE of SVR:  439.1328948995218
Best hyperparameters for Random Forest:  {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
RMSE of Random Forest Regressor:  394.38572871038645
Best hyperparameters for XGBoost:  {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
RMSE of XGBoost Regressor:  442.99860011495986
RMSE of Ensemble Model:  395.0555121137333
RMSE of Neural Network:  423.07984961395965
