In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error

In [None]:
# Read the training table from the project's clean_data folder and preview
# its first rows (presumably already cleaned/encoded, judging by the path).
df = pd.read_csv(filepath_or_buffer='../../data/clean_data/train_df_ready.csv')
df.head(n=5)

In [None]:
def find_best_model_cv(x_train, y_train, models, cv=5):
    """Return the candidate estimator with the lowest cross-validated RMSE.

    Every estimator in ``models`` is scored with ``cross_val_score`` using the
    ``neg_mean_squared_error`` scorer. The per-fold scores are negated,
    square-rooted into per-fold RMSEs and averaged. One report line is printed
    per candidate, followed by the name of the winner.

    Parameters
    ----------
    x_train, y_train
        Training features and target; forwarded unchanged to
        ``cross_val_score``.
    models : dict
        Maps a display name (used in the printed report) to an estimator.
    cv : optional
        Cross-validation setting forwarded to ``cross_val_score``. Defaults
        to 5, the value that used to be hard-coded.

    Returns
    -------
    The estimator object stored in ``models`` whose mean RMSE is smallest.
    This function never calls ``fit`` on it directly, so the caller is
    expected to fit it before predicting. ``None`` is returned when
    ``models`` is empty or when no candidate's mean RMSE compares below
    infinity (for example, a NaN mean).

    Notes
    -----
    Ties are resolved in favour of the candidate that appears first in
    ``models`` because the comparison is strict.
    """
    best_model_name = None
    best_model = None
    best_score = float('inf')

    for model_name, model in models.items():
        # The scorer reports negated MSE (higher is better), so flip the sign
        # before taking the root of each fold's value.
        neg_mse_per_fold = cross_val_score(
            model, x_train, y_train, cv=cv, scoring='neg_mean_squared_error'
        )
        avg_rmse = np.sqrt(-neg_mse_per_fold).mean()

        print(f"{model_name}: RMSE = {avg_rmse:.4f}")

        # Strict comparison: a NaN average is never selected and the first of
        # several equal averages wins.
        if avg_rmse < best_score:
            best_score = avg_rmse
            best_model_name = model_name
            best_model = model

    print(f'Best model is: {best_model_name}')

    return best_model

In [None]:
# Candidate regressors compared by find_best_model_cv, keyed by the display
# name that appears in its printed report.
# NOTE(review): max_iter=10000 on the linear models is presumably there to
# avoid convergence warnings — confirm it is still needed.
models = {
    'Ridge Regression': Ridge(max_iter=10000),
    'Lasso Regression': Lasso(max_iter=10000),
    # Seeded so the cross-validated comparison gives the same numbers on every
    # Restart & Run All; 0 is the seed already used for train_test_split.
    'GradientBoosting Regression': GradientBoostingRegressor(random_state=0)
}

In [None]:
# Separate the regression target (SalePrice) from the remaining columns,
# which are all used as predictors.
y = df['SalePrice']
x = df.drop(columns=['SalePrice'])

# Hold out 20% of the rows for the final test; the fixed seed keeps the
# partition identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Cross-validate every candidate on the training split only and keep the
# estimator with the lowest mean RMSE.
best_model = find_best_model_cv(x_train=x_train, y_train=y_train, models=models)

In [None]:
# Fit the selected estimator on the full training split, then measure its
# error on the held-out test split.
y_pred = best_model.fit(x_train, y_train).predict(x_test)
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
test_rmse = np.sqrt(test_mse)
print(f'Test RMSE: {test_rmse:.4f}')

In [None]:
# Hyper-parameter grid for the gradient-boosting regressor:
# 3 * 3 * 3 * 2 = 54 combinations, each evaluated with 5-fold CV (270 fits).
params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0]
}

grid_search = GridSearchCV(
    # Seeded: the grid includes subsample=0.8, which trains each tree on a
    # random subset of rows, so an unseeded search is not reproducible.
    GradientBoostingRegressor(random_state=0),
    param_grid=params,
    cv=5,
    scoring='neg_mean_squared_error',
    # The fits are independent of one another; run them on all available cores.
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)

best_params = grid_search.best_params_
# best_score_ is the mean negated MSE across folds, so this is the root of the
# mean MSE. find_best_model_cv instead averages per-fold RMSEs, so the two
# figures are close but not computed the same way.
best_score = np.sqrt(-grid_search.best_score_)

print(best_params, best_score)

In [None]:
# Rebuild the regressor from the winning grid-search settings. Unpacking
# best_params keeps this cell in sync with whatever keys the grid defines, so
# a parameter added to the grid cannot be silently dropped here. The seed makes
# the reported test score repeatable when the chosen subsample is below 1.0.
model = GradientBoostingRegressor(**best_params, random_state=0)
model.fit(x_train, y_train)

# Score the tuned model on the held-out test split.
y_pred = model.predict(x_test)
test_mse = mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(test_mse)
print(f'Test RMSE: {test_rmse:.4f}')