In [161]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Seed NumPy's global RNG so any step that draws from it is repeatable.
np.random.seed(42)

# Load the dataset; the relative path is resolved against the current
# working directory.
data = pd.read_csv('Clean_Dataset.csv')

# NOTE(review): if the CSV was saved together with its index (commonly
# showing up as an 'Unnamed: 0' column), that column stays in X and would be
# treated as a numeric feature -- inspect data.columns and confirm.

# 'price' is the regression target; all remaining columns are candidate
# features.
X = data.drop(columns='price')
y = data['price']

# Partition the feature columns by dtype: numeric columns are standardized,
# object-dtype columns are one-hot encoded.
numerical_cols = X.select_dtypes(include='number').columns.to_list()
categorical_cols = X.select_dtypes(include=['object']).columns.to_list()

# Per-group transformers.
numerical_transformer = StandardScaler()
# NOTE(review): with drop='first' together with handle_unknown='ignore', a
# category not seen during fit is encoded as all zeros, which is the same
# encoding as the dropped first category -- confirm this is acceptable.
categorical_transformer = OneHotEncoder(handle_unknown='ignore', drop='first')

# Route each column group to its transformer. Columns belonging to neither
# list are not passed to any transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Candidate regressors as (display name, unfitted estimator) pairs, in the
# order they are evaluated. LinearRegression takes no random_state; every
# other estimator is created with random_state=42.
models = [('Linear Regression', LinearRegression())]
models += [
    (label, estimator_cls(random_state=42))
    for label, estimator_cls in (
        ('Ridge Regression', Ridge),
        ('Lasso Regression', Lasso),
        ('Decision Tree Regression', DecisionTreeRegressor),
        ('Random Forest Regression', RandomForestRegressor),
        ('Gradient Boosting Regression', GradientBoostingRegressor),
    )
]


# Hold out 20% of the rows for testing. The split uses a fixed random_state,
# so it is the same for every model; compute it once instead of once per
# loop iteration.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

for name, model in models:
    # Chain preprocessing and the estimator so the preprocessing is fitted on
    # the training rows only and then applied unchanged to the test rows.
    # The same `preprocessor` object is reused by every pipeline and is refit
    # on each iteration.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Fit on the training split, then predict the held-out rows.
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Score the held-out predictions.
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Model: {name}")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print(f"R-squared (R2): {r2:.2f}")
    print("-----------------------------")




Model: Linear Regression
Mean Absolute Error (MAE): 4230.67
Mean Squared Error (MSE): 38040507.92
R-squared (R2): 0.93
-----------------------------




Model: Ridge Regression
Mean Absolute Error (MAE): 4232.89
Mean Squared Error (MSE): 38069573.64
R-squared (R2): 0.93
-----------------------------




Model: Lasso Regression
Mean Absolute Error (MAE): 4316.53
Mean Squared Error (MSE): 39344050.20
R-squared (R2): 0.92
-----------------------------




Model: Decision Tree Regression
Mean Absolute Error (MAE): 736.59
Mean Squared Error (MSE): 7199867.98
R-squared (R2): 0.99
-----------------------------




Model: Random Forest Regression
Mean Absolute Error (MAE): 763.24
Mean Squared Error (MSE): 4719069.56
R-squared (R2): 0.99
-----------------------------
Model: Gradient Boosting Regression
Mean Absolute Error (MAE): 2848.63
Mean Squared Error (MSE): 22647366.15
R-squared (R2): 0.96
-----------------------------


