# First ML Project

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [None]:
# Load the California housing dataset bundled with scikit-learn.
data = fetch_california_housing()

# Feature matrix as a DataFrame labelled with the dataset's own feature names;
# the regression target is kept exactly as the loader returns it.
X, y = pd.DataFrame(data.data, columns=data.feature_names), data.target

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardize the features: the scaler learns its statistics from the training
# split only, and that same fitted transformation is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Shared evaluation helper used for every model in this notebook
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted
            in place, so the object passed in is trained when this returns.
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets.
        y_test: Test targets.

    Returns:
        A ``(rmse, r2)`` tuple: the square root of the mean squared error and
        the R2 score, both computed on the test-split predictions.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    return np.sqrt(mse), r2_score(y_test, predictions)

In [None]:
# Candidate regressors keyed by display name, all left at their default
# hyperparameters. The tree-based estimators are seeded with random_state=42.
# NOTE(review): Lasso scores R2 ~ 0 in the recorded results; presumably its default
# regularisation strength is too strong for this data -- confirm before comparing.
models = dict([
    ("Linear Regression", LinearRegression()),
    ("Ridge Regression", Ridge()),
    ("Lasso Regression", Lasso()),
    ("Decision Tree", DecisionTreeRegressor(random_state=42)),
    ("Random Forest", RandomForestRegressor(random_state=42)),
    ("Gradient Boosting", GradientBoostingRegressor(random_state=42)),
    ("Support Vector Regressor", SVR()),
    ("XGBoost", XGBRegressor(random_state=42, objective='reg:squarederror')),
])


In [None]:
# Fit and score every candidate, storing its metrics under the model's display name
results = {}
for name, model in models.items():
    scores = evaluate_model(model, X_train, X_test, y_train, y_test)
    # evaluate_model returns (rmse, r2), in that order
    results[name] = dict(zip(("RMSE", "R2 Score"), scores))

In [None]:
# One row per model, one column per metric
results_df = pd.DataFrame(results).transpose()
# Lowest RMSE first, so the best-scoring model is at the top
print(results_df.sort_values("RMSE", ascending=True))

                              RMSE  R2 Score
XGBoost                   0.471794  0.830137
Random Forest             0.505143  0.805275
Gradient Boosting         0.542217  0.775643
Support Vector Regressor  0.597498  0.727563
Decision Tree             0.702829  0.623042
Ridge Regression          0.745557  0.575816
Linear Regression         0.745581  0.575788
Lasso Regression          1.144856 -0.000219


In [None]:
# Hyperparameter tuning for the best models (optional example for Random Forest).
# Search space: 3 values for each of 3 parameters, i.e. 27 candidate combinations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

In [None]:
# Exhaustive 3-fold cross-validated search over param_grid for a seeded random forest.
# Scoring is negated MSE, so the best candidate is the one with the lowest MSE.
# n_jobs=-1 runs the independent candidate/fold fits in parallel on all available
# cores instead of one after another; every forest keeps random_state=42, so the
# selected parameters and scores are unchanged -- only the wall-clock time drops.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [None]:
# Score the winning configuration from the grid search on the held-out test split.
# evaluate_model calls fit on the estimator it receives, so best_rf is trained on
# X_train again before being scored.
best_rf = grid_search.best_estimator_
rmse, r2 = evaluate_model(best_rf, X_train, X_test, y_train, y_test)
print(
    "Best Random Forest after tuning:\nRMSE: {:.4f}, R2 Score: {:.4f}".format(
        rmse,
        r2,
    )
)

Best Random Forest after tuning:
RMSE: 0.5038, R2 Score: 0.8063
