In [1]:
import pandas as pd

# Load the datasets
train_path = 'housing_train.csv'
test_path = 'housing_test.csv'

# Read the datasets to inspect their structure
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

# Display the first few rows of both datasets to understand the structure
train_data.head(), test_data.head()

(   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
 0   1          60       RL         65.0     8450   Pave   NaN      Reg   
 1   2          20       RL         80.0     9600   Pave   NaN      Reg   
 2   3          60       RL         68.0    11250   Pave   NaN      IR1   
 3   4          70       RL         60.0     9550   Pave   NaN      IR1   
 4   5          60       RL         84.0    14260   Pave   NaN      IR1   
 
   LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
 0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
 1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
 2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
 3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
 4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   
 
   YrSold  SaleType  SaleCondition  SalePrice  
 0   2

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Select numeric features and target
X = train_data.select_dtypes(include=np.number).drop(columns=["SalePrice", "Id"])
y = train_data["SalePrice"]

# Handle missing values by filling them with the median
X = X.fillna(X.median())

# Split the training data for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Models to evaluate
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "KNN": KNeighborsRegressor(),
    "SVR": SVR(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    #"Bagging (Decision Tree)": BaggingRegressor(base_estimator=DecisionTreeRegressor(), random_state=42),
    "XGB": XGBRegressor(eval_metric="rmse", random_state=42)
}

# Dictionary to store results
results = {
    "Model": [],
    "R Square": [],
    "MAE": [],
    "MSE": [],
    "RMSE": []
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    
    r2 = r2_score(y_val, y_pred)
    mae = mean_absolute_error(y_val, y_pred)
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    
    results["Model"].append(name)
    results["R Square"].append(r2)
    results["MAE"].append(mae)
    results["MSE"].append(mse)
    results["RMSE"].append(rmse)

# Create a DataFrame for the results
results_df = pd.DataFrame(results)
results_df.sort_values(by="R Square", ascending=False, inplace=True)
results_df.reset_index(drop=True, inplace=True)

results_df

Unnamed: 0,Model,R Square,MAE,MSE,RMSE
0,Random Forest,0.886616,18116.168929,869689300.0,29490.49592
1,XGB,0.881936,18458.336807,905587800.0,30092.985992
2,Ridge Regression,0.823097,22973.061649,1356902000.0,36836.152764
3,Lasso Regression,0.823093,22976.081763,1356937000.0,36836.621227
4,Linear Regression,0.82309,22975.856509,1356958000.0,36836.908846
5,Decision Tree,0.807413,27153.027397,1477203000.0,38434.403278
6,KNN,0.703596,28153.173288,2273512000.0,47681.361296
7,SVR,-0.02463,59556.254128,7859249000.0,88652.403257
