In [None]:
# Import necessary modules
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Read the preprocessed training table from disk
data = pd.read_csv('Processed_Train.csv')

# Features: everything between the first and the last column, restricted
# to numeric dtypes so the estimators below only receive numbers
X = data.iloc[:, 1:-1].select_dtypes(include=np.number)

# Target: the last column, taken as the array returned by `.values`
y = data.iloc[:, -1].values

print("Descriptive Features (X) Dimensions: ", X.shape)
print("Target Label (y) dimensions: ", y.shape)

# Hold out 25% of the rows for validation; the fixed seed makes the
# split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1234,
    test_size=0.25,
)

# 1. Linear Regression
# Fit a linear model on the training split and score it on the
# held-out validation split.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred_lr = lin_reg.predict(X_test)
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, whereas
# this form gives the same value on every release.
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
print("RMSE of Linear Regression: ", rmse_lr)

# 2. Support Vector Regressor (SVR)
# Fit a linear-kernel SVR on the training split and score it on the
# held-out validation split.
svr_reg = SVR(kernel="linear")
svr_reg.fit(X_train, y_train)
y_pred_svr = svr_reg.predict(X_test)
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, whereas
# this form gives the same value on every release.
rmse_svr = np.sqrt(mean_squared_error(y_test, y_pred_svr))
print("RMSE of SVR: ", rmse_svr)

# 3. Random Forest Regressor
# The seed fixes the forest's internal randomness so results are repeatable.
rf_reg = RandomForestRegressor(random_state=1234)

# Hyperparameter tuning for Random Forest
# 3 x 3 x 3 x 3 = 81 candidate settings; with cv=3 that is 243 fits
# during the search.
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Candidates are ranked by negated MSE (scikit-learn maximises scores);
# n_jobs=-1 asks for all available cores.
grid_search_rf = GridSearchCV(rf_reg, param_grid_rf, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

# Score the best configuration found by the search on the validation split.
rf_reg_best = grid_search_rf.best_estimator_
y_pred_rf = rf_reg_best.predict(X_test)
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, whereas
# this form gives the same value on every release.
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
print("Best hyperparameters for Random Forest: ", grid_search_rf.best_params_)
print("RMSE of Random Forest Regressor: ", rmse_rf)

# 4. XGBoost Regressor
# The seed fixes the booster's internal randomness so results are repeatable.
xgb_reg = xgb.XGBRegressor(random_state=1234)

# Hyperparameter tuning for XGBoost
# 3 x 3 x 3 = 27 candidate settings; with cv=3 that is 81 fits during
# the search.
param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

# Candidates are ranked by negated MSE (scikit-learn maximises scores);
# n_jobs=-1 asks for all available cores.
grid_search_xgb = GridSearchCV(xgb_reg, param_grid_xgb, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_xgb.fit(X_train, y_train)

# Score the best configuration found by the search on the validation split.
xgb_reg_best = grid_search_xgb.best_estimator_
y_pred_xgb = xgb_reg_best.predict(X_test)
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, whereas
# this form gives the same value on every release.
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
print("Best hyperparameters for XGBoost: ", grid_search_xgb.best_params_)
print("RMSE of XGBoost Regressor: ", rmse_xgb)

# Ensemble Model (Simple Averaging)
# Unweighted element-wise mean of the four models' validation predictions.
ensemble_preds = (y_pred_lr + y_pred_svr + y_pred_rf + y_pred_xgb) / 4
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, whereas
# this form gives the same value on every release.
rmse_ensemble = np.sqrt(mean_squared_error(y_test, ensemble_preds))
print("RMSE of Ensemble Model: ", rmse_ensemble)


In [None]:
conda install xgboost

Collecting package metadata (current_repodata.json): done
Solving environment: \ 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/osx-64::holoviews==1.15.0=py39hecd8cb5_0
  - conda-forge/noarch::fslpy==3.11.2=pyhd8ed1ab_0
  - defaults/noarch::tifffile==2021.7.2=pyhd3eb1b0_2
  - conda-forge/osx-64::dipy==1.6.0=py39h7cc1f47_0
  - defaults/osx-64::imageio==2.19.3=py39hecd8cb5_0
  - defaults/osx-64::scikit-learn==1.0.2=py39hae1ba45_1
  - conda-forge/noarch::imagehash==4.3.1=pyhd8ed1ab_0
  - conda-forge/osx-64::phik==0.12.2=py39h407aec2_2
  - conda-forge/osx-64::nipype==1.8.5=py39h6e9494a_1
  - defaults/osx-64::bottleneck==1.3.5=py39h67323c0_0
  - conda-forge/osx-64::aesara-base==2.8.7=py39h6e9494a_1
  - conda-forge/noarch::arviz==0.13.0=pyhd8ed1ab_0
  - defaults/osx-64::scikit-image==0.19.2=py39hae1ba45_0
  - conda-forge/osx-64::netcdf4==1.5.7=nompi_py39h8cf34ae_101
  - defaults/osx-64::datashape==