In [61]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
test_ids = test_df["Id"]

X_train = train_df.drop("SalePrice", axis=1)
y_train = train_df["SalePrice"]
X_test = test_df.copy()

num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X_train.select_dtypes(include=["object"]).columns

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())  
])

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols)
])

models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42, n_estimators=200),
    "SVR": SVR()
}
results = []

for name, model in models.items():
    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", model)
    ])
    
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_train) 
    
    mae = mean_absolute_error(y_train, preds)
    rmse = mean_squared_error(y_train, preds)
    r2 = r2_score(y_train, preds)
    
    results.append([name, mae, rmse, r2*100])

results_df = pd.DataFrame(results, columns=["Model", "MAE", "RMSE", "R2 (%)"])
best_model_name = results_df.loc[results_df["R2 (%)"].idxmax(), "Model"]

print(" Model Comparison:\n", results_df)
print("\n Best Model:", best_model_name)

best_pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", models[best_model_name])
])
best_pipe.fit(X_train, y_train)
test_preds = best_pipe.predict(X_test)

submission = pd.DataFrame({
    "Id": test_ids,
    "SalePrice": test_preds
})
submission.to_csv("submission.csv", index=False)
print("\n Submission file created: submission.csv")

 Model Comparison:
                Model           MAE          RMSE     R2 (%)
0  Linear Regression  13325.423645  4.283798e+08  93.207640
1      Random Forest   6464.888606  1.224299e+08  98.058761
2                SVR  55473.536382  6.618366e+09  -4.940348

 Best Model: Random Forest

 Submission file created: submission.csv
