In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 
from scipy.stats import skew 
from sklearn.preprocessing import PowerTransformer 
from sklearn.model_selection import train_test_split 
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler,OneHotEncoder 
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.compose import ColumnTransformer 
from sklearn.metrics import make_scorer,r2_score, mean_absolute_error 
from xgboost import XGBRegressor 
from sklearn.model_selection import cross_val_score 
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
import joblib

In [2]:
# Load both CSV files from the working directory, train first, then test.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Name of the regression target column.
target = 'SalePrice'

# A column is dropped when its null fraction in train_df exceeds this value.
threshold = 0.9
# A numeric column is dropped when |corr with target| is below this value.
threshold_relation = 0.05

# Filled in by the pruning cell: names of the columns removed by each rule.
dropped_empty, dropped_relation = [], []

In [3]:
# Prune columns in two passes, applying every removal to both frames so that
# train_df and test_df keep the same feature set.
#
# NOTE(review): both rules are evaluated on the full training frame, i.e.
# before the train/validation split performed later in the notebook.

# Pass 1: columns that are almost entirely missing in the training data.
# The target is never a candidate for removal.
null_fraction = train_df.isnull().mean()
mostly_empty = [
    col for col in train_df.columns
    if col != target and null_fraction[col] > threshold
]
dropped_empty.extend(mostly_empty)
train_df = train_df.drop(columns=mostly_empty)
# errors="ignore": the test frame is not guaranteed to contain every column.
test_df = test_df.drop(columns=mostly_empty, errors="ignore")

# Pass 2: numeric features whose correlation with the target is negligible.
# A NaN correlation (e.g. a constant column) compares False and is kept.
numeric_cols = [
    col for col in train_df.columns
    if col != target and pd.api.types.is_numeric_dtype(train_df[col])
]
weakly_related = [
    col for col in numeric_cols
    if abs(train_df[col].corr(train_df[target])) < threshold_relation
]
dropped_relation.extend(weakly_related)
train_df = train_df.drop(columns=weakly_related)
test_df = test_df.drop(columns=weakly_related, errors="ignore")

print("Dropped columns:", dropped_empty)
print("Dropped columns:", dropped_relation)

Dropped columns: ['Alley', 'PoolQC', 'MiscFeature']
Dropped columns: ['Id', 'BsmtFinSF2', 'LowQualFinSF', 'BsmtHalfBath', '3SsnPorch', 'MiscVal', 'MoSold', 'YrSold']


In [4]:
# Numeric branch: median imputation followed by standardisation.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transform = Pipeline(steps=numeric_steps)

# Categorical branch: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transform = Pipeline(steps=categorical_steps)

In [5]:
# Separate the feature matrix from the target column.
X = train_df.drop('SalePrice', axis=1)
# Persist the ordered list of feature names next to the notebook.
joblib.dump(X.columns.to_list(), "features.pkl")
y = train_df['SalePrice']

# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Column groups routed to each preprocessing branch (target excluded).
numeric_feat = (
    train_df
    .select_dtypes(include=['int64', 'float64'])
    .columns
    .drop('SalePrice')
)
categorical_feat = train_df.select_dtypes(include=['object']).columns

# Apply the numeric and categorical sub-pipelines to their column groups.
preprocessor = ColumnTransformer([
    ('nums', numeric_transform, numeric_feat),
    ('cate', categorical_transform, categorical_feat),
])


def _with_preprocessing(model):
    """Return a 'preprocess' -> 'model' pipeline built on the shared preprocessor."""
    return Pipeline([('preprocess', preprocessor), ('model', model)])


# One pipeline per candidate regressor.
pipeline_lr = _with_preprocessing(LinearRegression())
pipeline_rf = _with_preprocessing(RandomForestRegressor(random_state=42))
pipeline_xgb = _with_preprocessing(XGBRegressor(random_state=42, n_estimators=100))

In [7]:
# Candidate pipelines, keyed by the display name used in the search output.
pipelines = dict(
    LinearRegression=pipeline_lr,
    RandomForest=pipeline_rf,
    XGBoost=pipeline_xgb,
)

# Hyper-parameter grids; the "model__" prefix targets the pipeline's 'model' step.
lr_grid = {
    "model__fit_intercept": [True, False],
}
rf_grid = {
    "model__n_estimators": [200, 400, 600],
    "model__max_depth": [None, 10, 20, 30],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
}
xgb_grid = {
    "model__n_estimators": [300, 500, 800],
    "model__max_depth": [3, 6, 9],
    "model__learning_rate": [0.05, 0.1],
    "model__subsample": [0.8, 1],
    "model__colsample_bytree": [0.8, 1],
    "model__min_child_weight": [1, 3],
    "model__reg_lambda": [1, 2],
}
param_grids = {
    'LinearRegression': lr_grid,
    'RandomForest': rf_grid,
    'XGBoost': xgb_grid,
}

# Filled by the grid-search cell with the best estimator per model name.
best_models = {}

In [8]:
# Best refitted pipeline per model name, populated by the search below.
best_models = {}

# Exhaustive 5-fold grid search per pipeline, scored by R² on all cores.
for name in pipelines:
    pipeline = pipelines[name]
    print(f"Running search for {name}...")
    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grids[name],
        cv=5,
        scoring='r2',
        n_jobs=-1,
    )
    grid.fit(X_train, y_train)
    best_models[name] = grid.best_estimator_
    print(f"Best params for {name}: {grid.best_params_}")
    print(f"Best CV R² for {name}: {grid.best_score_:.4f}\n")


# Score every tuned pipeline on the held-out validation rows.
for name in best_models:
    model = best_models[name]
    y_pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    print(f"{name} - RMSE: {rmse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")

Running search for LinearRegression...


Best params for LinearRegression: {'model__fit_intercept': False}
Best CV R² for LinearRegression: 0.7557

Running search for RandomForest...
Best params for RandomForest: {'model__max_depth': None, 'model__min_samples_leaf': 2, 'model__min_samples_split': 2, 'model__n_estimators': 400}
Best CV R² for RandomForest: 0.8411

Running search for XGBoost...
Best params for XGBoost: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.05, 'model__max_depth': 3, 'model__min_child_weight': 1, 'model__n_estimators': 800, 'model__reg_lambda': 1, 'model__subsample': 0.8}
Best CV R² for XGBoost: 0.8772

LinearRegression - RMSE: 29594.6644, MAE: 18158.1097, R²: 0.8858
RandomForest - RMSE: 29039.9431, MAE: 17548.0883, R²: 0.8901
XGBoost - RMSE: 24021.1480, MAE: 15134.0449, R²: 0.9248


In [14]:
def _regression_metrics(y_true, y_pred):
    """Return ``(mse, rmse, mae, r2)`` for one set of predictions.

    Keeping the computation in one place guarantees that every model is
    scored with exactly the same metrics.
    """
    mse = mean_squared_error(y_true, y_pred)
    return mse, np.sqrt(mse), mean_absolute_error(y_true, y_pred), r2_score(y_true, y_pred)


# Validation-set predictions from each tuned pipeline.
y_predict_l = best_models['LinearRegression'].predict(X_val)
y_predict_t = best_models['RandomForest'].predict(X_val)
y_predict_xgb = best_models['XGBoost'].predict(X_val)

# Same four metrics for every model (the XGBoost MSE was previously missing).
mse_r, rmse_r, mae_r, r2_linear = _regression_metrics(y_val, y_predict_l)
mse_t, rmse_t, mae_t, r2_random = _regression_metrics(y_val, y_predict_t)
mse_xgb, rmse_xgb, mae_xgb, r2_xgb = _regression_metrics(y_val, y_predict_xgb)

In [13]:
# Validation metrics per model. The R² labels now use the same model names as
# the other lines (previously misspelled "Linaer" and abbreviated "Random").
print(f"Linear Regression RMSE: {rmse_r:.4f}")
print(f"Linear Regression MSE: {mse_r:.4f}")
print(f"Linear Regression MAE: {mae_r:.4f}")
print(f"Linear Regression R²: {r2_linear:.4f}")
print(f"Random Forest RMSE: {rmse_t:.4f}")
print(f"Random Forest MSE: {mse_t:.4f}")
print(f"Random Forest MAE: {mae_t:.4f}")
print(f"Random Forest R²: {r2_random:.4f}")
print(f"XGBoost RMSE: {rmse_xgb:.4f}")
print(f"XGBoost MAE: {mae_xgb:.4f}")
print(f"XGBoost R²: {r2_xgb:.4f}")

Linear Regression RMSE: 29594.6644
Linear Regression MSE: 875844162.1447
Linear Regression MAE: 18158.1097
Linaer R²: 0.8858
Random Forest RMSE: 29039.9431
Random Forest MSE: 843318293.9934
Random Forest MAE: 17548.0883
Random R²: 0.8901
XGBoost RMSE: 24021.1480
XGBoost MAE: 15134.0449
XGBoost R²: 0.9248


In [15]:
# Predict on the test frame with each tuned pipeline, in a fixed order:
# linear regression, random forest, XGBoost.
test_predict_lr, test_predict_rf, test_predict_xgb = (
    best_models[model_key].predict(test_df)
    for model_key in ('LinearRegression', 'RandomForest', "XGBoost")
)

# NumPy abbreviates the printed arrays, so only the ends are shown.
print(f"Linear result :{test_predict_lr}")
print(f"Tree result :{test_predict_rf}")
print(f"XGB result{test_predict_xgb}")

Linear result :[117284.68046892 165773.64923134 187019.04493292 ... 184074.20946058
 109284.45779602 223852.76664729]
Tree result :[128040.47492063 153604.30427579 179569.84242262 ... 155650.142
 118563.96333333 223221.017625  ]
XGB result[122112.62 167339.   183017.11 ... 173105.92 122493.59 217460.97]


In [16]:
# Persist the tuned XGBoost pipeline (preprocessing + model) to disk.
joblib.dump(value=best_models["XGBoost"], filename="house_price.pkl")


['house_price.pkl']