In [None]:
import src.dataPipeline as dataPipeline
import importlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
# Evaluating the model
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error,r2_score ,make_scorer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Re-execute the already-imported local `dataPipeline` module so that
# changes made to its source file are picked up without restarting the kernel.
importlib.reload(dataPipeline)


In [None]:
def manual_cv_score(X, y, cv, model):
    """Cross-validate `model` by hand and summarise its MAPE per fold.

    For every (train, test) index pair yielded by ``cv.split(X)`` the model
    is re-fitted on the training rows and scored on both the training rows
    and the held-out rows with ``mean_absolute_percentage_error``.

    Parameters
    ----------
    X, y : objects supporting positional ``.iloc`` indexing
        Features and target; rows are selected by position, so the two
        only need to be in the same order.
    cv : object with a ``split(X)`` method yielding index pairs
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        The same object is re-fitted in place on every fold.

    Returns
    -------
    tuple
        ``(train_mean, train_std, test_mean, test_std)`` of the per-fold
        MAPE values, each multiplied by 100 (i.e. in percent).
    """
    train_errors, test_errors = [], []

    for fit_rows, held_out_rows in cv.split(X):
        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_held, y_held = X.iloc[held_out_rows], y.iloc[held_out_rows]

        model.fit(X_fit, y_fit)

        # Score the rows the model was fitted on, then the unseen rows.
        train_errors.append(
            mean_absolute_percentage_error(y_fit, model.predict(X_fit))
        )
        test_errors.append(
            mean_absolute_percentage_error(y_held, model.predict(X_held))
        )

    def _percent_stats(fold_errors):
        # Mean and standard deviation across folds, scaled to percent.
        return np.mean(fold_errors) * 100, np.std(fold_errors) * 100

    return (*_percent_stats(train_errors), *_percent_stats(test_errors))

In [None]:
# Load the training data through the project pipeline with imputation,
# scaling and dummy encoding switched off; those steps are done explicitly
# further down in this notebook.
dp = dataPipeline.DataPipeline()
pipeline_options = dict(
    filePath="../data/immo_data_202208_v2.csv",
    imputer=None,
    normalizeAndStandardize=False,
    get_dummies=False,
)
df = dp.runPipeline(**pipeline_options)

In [None]:
# Remove the "Availability" column and store the property type as a pandas
# categorical in a single chained expression.
df = df.drop(columns=["Availability"]).astype({"type_unified": "category"})

In [None]:
# Blank out values outside the accepted range: living space below 5 and
# floor numbers of 41 or more are replaced by NaN.
too_small_space = df['Space extracted'] < 5
df.loc[too_small_space, 'Space extracted'] = np.nan

too_high_floor = df["Floor"] >= 41
df.loc[too_high_floor, "Floor"] = np.nan

# House-like property types with an unknown floor are assigned floor 0.
house_types = [
    'detached-house', 'villa', 'semi-detached-house', 'terrace-house',
    'chalet', 'farmhouse', 'rustico', 'castle', 'detached-secondary-suite'
]
house_without_floor = df['type_unified'].isin(house_types) & df['Floor'].isna()
df.loc[house_without_floor, 'Floor'] = 0

# Column-specific replacement for the remaining missing values.
fill_defaults = {
    "detail_responsive#surface_usable": 0,
    "Number of floors:": 1,
    "Plot_area_unified": 0,
}
for column, default in fill_defaults.items():
    df[column] = df[column].fillna(default)

In [None]:
# Columns that must not be treated as continuous features.
cat_col = ["type_unified"] + [f"region_group_{i}" for i in range(50)]

house_type = ["type_unified"]
num_col = [col for col in df.columns if col not in cat_col + ["price_cleaned"]]

# Region indicator columns that actually exist in the frame. They are left
# out of `num_col` so they are not imputed/scaled, but they still have to be
# handed to the ColumnTransformer explicitly: its default remainder='drop'
# would otherwise silently discard them.
# NOTE(review): assumes the region_group_* columns are already numeric
# indicator columns produced by the data pipeline — TODO confirm.
region_col = [col for col in cat_col if col not in house_type and col in df.columns]

# Numerical features: KNN imputation followed by standardisation.
numerical_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler())
])

# Property type: one-hot encoding; categories unseen during fit encode as all zeros.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine the per-column-group preprocessing steps.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_col),
        ('cat', categorical_transformer, house_type),
        ('region', 'passthrough', region_col),
    ],
    # Always return a dense array: the result is wrapped in pd.DataFrame
    # later, which expects a regular 2-D array rather than a sparse matrix.
    sparse_threshold=0,
)

In [None]:
# Baseline gradient-boosting regressor used for the first end-to-end fit.
hist_gradient_boosting = HistGradientBoostingRegressor(
    max_iter=100,
    max_depth=10,
    random_state=42,  # fixed seed so repeated runs give the same model
)


In [None]:
# Target is the cleaned price; every other column is a feature.
y = df["price_cleaned"]
X = df.drop(columns=["price_cleaned"])

# Shuffled 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

# Preprocessing and baseline model chained into one estimator, fitted on
# the training split only.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', hist_gradient_boosting),
])
pipeline.fit(X_train, y_train)

In [None]:
# Fit the preprocessing on the training split and reuse the fitted
# transform for the test split. Wrapping the arrays in pd.DataFrame gives
# them positional `.iloc` access, which manual_cv_score relies on.
# NOTE(review): the preprocessor is fitted on the whole training split
# before cross-validation, so imputation/scaling statistics are shared
# across the CV folds below.
X_train_preprocessed = pd.DataFrame(preprocessor.fit_transform(X_train))
X_test_preprocessed = pd.DataFrame(preprocessor.transform(X_test))

# Shuffled 5-fold splitter with a fixed seed, shared by all CV experiments.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
learnrates = [0.1, 0.01]

# Per-learning-rate CV summaries, in the same order as `learnrates`.
# The plotting cell that follows reads these four lists.
train_mape_scores_mean = []
train_mape_scores_std = []
test_mape_scores_mean = []
test_mape_scores_std = []

# Settings shared by every run of this sweep; only `learning_rate` varies.
# `categorical_features` is intentionally left at the library default
# instead of passing 'warn' explicitly: 'warn' is a placeholder default
# rather than a documented option.
# NOTE(review): newer scikit-learn releases appear to reject 'warn' —
# confirm against the installed version.
fixed_params = dict(
    loss='squared_error',
    quantile=None,
    max_iter=5000,
    max_leaf_nodes=31,
    max_depth=None,
    min_samples_leaf=20,
    l2_regularization=0.0,
    max_features=1.0,
    max_bins=255,
    monotonic_cst=None,
    interaction_cst=None,
    warm_start=False,
    early_stopping='auto',
    scoring='loss',
    validation_fraction=0.1,
    n_iter_no_change=10,
    tol=1e-07,
    verbose=0,
    random_state=42,
)

# Loop over the candidate learning rates
for lr in tqdm(learnrates):
    # Estimator with the current learning rate
    hist_gradient_boosting = HistGradientBoostingRegressor(
        learning_rate=lr, **fixed_params
    )

    # Single-step pipeline: the features passed in are already preprocessed
    pipeline = Pipeline(steps=[('model', hist_gradient_boosting)])

    # Manual CV on the preprocessed training split
    train_mape_mean, train_mape_std, test_mape_mean, test_mape_std = manual_cv_score(
        X=X_train_preprocessed,
        y=y_train,
        cv=cv,
        model=pipeline
    )

    # Store results
    train_mape_scores_mean.append(train_mape_mean)
    train_mape_scores_std.append(train_mape_std)
    test_mape_scores_mean.append(test_mape_mean)
    test_mape_scores_std.append(test_mape_std)

    # Print progress
    print(
        f"learnrate: {lr}, "
        f"Train MAPE: {train_mape_mean:.2f}% (±{train_mape_std:.2f}), "
        f"Test MAPE: {test_mape_mean:.2f}% (±{test_mape_std:.2f})"
    )

In [None]:
# Plot the results
plt.figure(figsize=(12, 6))
plt.errorbar(learnrates, train_mape_scores_mean, yerr=train_mape_scores_std, label='Train MAPE')
plt.errorbar(learnrates, test_mape_scores_mean, yerr=test_mape_scores_std, label='Test MAPE')
plt.xlabel('Learning Rate')
plt.ylabel('MAPE (%)')
plt.title('Train and Test MAPE for different Learning Rates')
plt.legend()
plt.show()

In [None]:
list_max_bins = [80, 128, 200]

# Per-max_bins CV summaries, in the same order as `list_max_bins`.
# The plotting cell that follows reads these four lists.
train_mape_scores_mean = []
train_mape_scores_std = []
test_mape_scores_mean = []
test_mape_scores_std = []

# Settings shared by every run of this sweep; only `max_bins` varies.
# `categorical_features` is intentionally left at the library default
# instead of passing 'warn' explicitly: 'warn' is a placeholder default
# rather than a documented option.
# NOTE(review): newer scikit-learn releases appear to reject 'warn' —
# confirm against the installed version.
fixed_params = dict(
    loss='squared_error',
    quantile=None,
    learning_rate=0.1,
    max_iter=5000,
    max_leaf_nodes=31,
    max_depth=None,
    min_samples_leaf=20,
    l2_regularization=0.0,
    max_features=1.0,
    monotonic_cst=None,
    interaction_cst=None,
    warm_start=False,
    early_stopping='auto',
    scoring='loss',
    validation_fraction=0.1,
    n_iter_no_change=10,
    tol=1e-07,
    verbose=0,
    random_state=42,
)

# Loop over the candidate histogram bin counts
for mb in tqdm(list_max_bins):
    # Estimator with the current max_bins
    hist_gradient_boosting = HistGradientBoostingRegressor(
        max_bins=mb, **fixed_params
    )

    # Single-step pipeline: the features passed in are already preprocessed
    pipeline = Pipeline(steps=[('model', hist_gradient_boosting)])

    # Manual CV on the preprocessed training split
    train_mape_mean, train_mape_std, test_mape_mean, test_mape_std = manual_cv_score(
        X=X_train_preprocessed,
        y=y_train,
        cv=cv,
        model=pipeline
    )

    # Store results
    train_mape_scores_mean.append(train_mape_mean)
    train_mape_scores_std.append(train_mape_std)
    test_mape_scores_mean.append(test_mape_mean)
    test_mape_scores_std.append(test_mape_std)

    # Print progress
    print(
        f"Max binning: {mb}, "
        f"Train MAPE: {train_mape_mean:.2f}% (±{train_mape_std:.2f}), "
        f"Test MAPE: {test_mape_mean:.2f}% (±{test_mape_std:.2f})"
    )

In [None]:
# Plot the results
plt.figure(figsize=(12, 6))
plt.errorbar(list_max_bins, train_mape_scores_mean, yerr=train_mape_scores_std, label='Train MAPE')
plt.errorbar(list_max_bins, test_mape_scores_mean, yerr=test_mape_scores_std, label='Test MAPE')
plt.xlabel('Learning Rate')
plt.ylabel('MAPE (%)')
plt.title('Train and Test MAPE for different Learning Rates')
plt.legend()
plt.show()

In [None]:
import optuna

In [None]:
# Display the held-out feature frame as it is before the dummy encoding
# and scaling applied in the following cells.
X_test

In [None]:
# One-hot encode the training features first, then force the test frame
# onto exactly the same columns in the same order. Encoding the two splits
# independently can yield different column sets (a category present in only
# one split), which would misalign the features seen by scaler and model.
# Dummy columns missing from the test split are filled with 0.
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Learn the standardisation statistics from the training features only and
# apply that same fitted transform to both splits. `scaler` is reused
# further down for the Kaggle data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Carve a validation split out of the training data for the search, so the
# test split is never used to choose hyper-parameters and stays untouched
# for the final evaluation.
X_tune_fit, X_tune_val, y_tune_fit, y_tune_val = train_test_split(
    X_train, y_train, test_size=0.2, shuffle=True, random_state=42
)


def objective(trial):
    """Optuna objective: validation MAPE of one sampled configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        MAPE (as a fraction, not percent) on the validation split; the
        study minimises this value.
    """
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'max_iter': trial.suggest_int('max_iter', 100, 500),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 10, 50),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 5, 50),
        'l2_regularization': trial.suggest_float('l2_regularization', 1e-3, 1e-1, log=True),
        'max_bins': trial.suggest_int('max_bins', 50, 255)
    }
    model = HistGradientBoostingRegressor(
        **params,
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=10,
        # Fixed seed: with early stopping the internal validation split is
        # random, so without it the same parameters can score differently.
        random_state=42,
    )
    model.fit(X_tune_fit, y_tune_fit)
    y_pred = model.predict(X_tune_val)
    return mean_absolute_percentage_error(y_tune_val, y_pred)


# Seeded sampler so the sequence of sampled configurations is reproducible.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

# The reported MAPE is measured on the validation split defined above.
print("Best parameters: ", study.best_params)
print("Best MAPE: ", study.best_value)


In [None]:
# Final model built from the best Optuna trial. The non-searched settings
# mirror the ones used inside `objective` (early stopping always on, 10 %
# internal validation, patience of 10 iterations), so the model that is
# evaluated and submitted is configured like the one that was tuned.
# `study.best_params` holds exactly the keys sampled in `objective`
# (learning_rate, max_iter, max_leaf_nodes, max_depth, min_samples_leaf,
# l2_regularization, max_bins); all remaining arguments are left at the
# library defaults rather than passing placeholder values such as
# categorical_features='warn'.
best_hist_gradient_boosting = HistGradientBoostingRegressor(
    **study.best_params,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=10,
    random_state=42,
)

In [None]:
# Fit the tuned model on the training split and score it on the test split.
best_hist_gradient_boosting.fit(X_train, y_train)
y_pred = best_hist_gradient_boosting.predict(X_test)
mape = mean_absolute_percentage_error(y_test, y_pred)
# `mape` is a fraction (the CV helper above multiplies the same metric by
# 100), so it has to be scaled before it is printed with a percent sign.
print(f"MAPE: {mape * 100:.2f}%")

# Kaggle

In [None]:
# Load the Kaggle evaluation data through the project pipeline with the
# same switches used for the training data.
df_kaggle = dp.prepare_kaggle_dataset(
    filePath="../data/test_data-Kaggle-v0.11.csv",
    imputer=None,
    normalizeAndStandardize=False,
    get_dummies=False
)

# Same out-of-range rules as applied to the training frame.
df_kaggle.loc[df_kaggle['Space extracted'] < 5, 'Space extracted'] = np.nan
df_kaggle.loc[df_kaggle["Floor"] >= 41, "Floor"] = np.nan

# Apply the same missing-value rules that were applied to the training
# frame; otherwise the model receives NaNs in columns where it never saw
# any during training. Each rule is guarded in case the Kaggle frame lacks
# the column.
if "type_unified" in df_kaggle.columns:
    kaggle_house_without_floor = (
        df_kaggle["type_unified"].isin(house_types) & df_kaggle["Floor"].isna()
    )
    df_kaggle.loc[kaggle_house_without_floor, "Floor"] = 0

kaggle_fill_defaults = {
    "detail_responsive#surface_usable": 0,
    "Number of floors:": 1,
    "Plot_area_unified": 0,
}
for column, default in kaggle_fill_defaults.items():
    if column in df_kaggle.columns:
        df_kaggle[column] = df_kaggle[column].fillna(default)

df_kaggle = df_kaggle.drop(columns=["Availability"])

cat_col = ["type_unified"] + [f"region_group_{i}" for i in range(50)]

house_type = ["type_unified"]
num_col = [col for col in df_kaggle.columns if col not in cat_col + ["price_cleaned"]]


In [None]:
# Preview the first rows of the Kaggle frame after the cleaning above.
df_kaggle.head()

In [None]:
# Remove the 'Type:' and 'Hall height:' columns from the Kaggle frame.
df_kaggle = df_kaggle.drop(columns=['Type:', 'Hall height:'])

In [None]:
# Preview the Kaggle frame again after the two columns were dropped.
df_kaggle.head()

In [None]:
# Cast the property type to a categorical and one-hot encode the frame in
# one expression.
df_kaggle = pd.get_dummies(df_kaggle.astype({"type_unified": "category"}))

In [None]:
# Recompute the list of columns to scale now that the frame has been dummy
# encoded: everything except the names in `cat_col` and the target.
excluded_columns = set(cat_col) | {"price_cleaned"}
num_col = [col for col in df_kaggle.columns if col not in excluded_columns]

In [None]:
# Scale the Kaggle features with the scaler that was fitted on the training
# features. Re-fitting it here would standardise with the Kaggle data's own
# mean/std, and on a different set of columns than the model was trained on.
train_feature_names = getattr(scaler, "feature_names_in_", None)
if train_feature_names is None:
    raise RuntimeError(
        "scaler has no recorded feature names; re-run the notebook from the "
        "top so it is fitted on the dummy-encoded training DataFrame."
    )
train_feature_names = list(train_feature_names)

# Put the Kaggle frame on exactly the training columns, in training order:
# dummy columns missing from the Kaggle data become 0, and columns the
# model never saw are dropped.
df_kaggle = df_kaggle.reindex(columns=train_feature_names, fill_value=0)
df_kaggle = pd.DataFrame(
    scaler.transform(df_kaggle),
    columns=train_feature_names,
    index=df_kaggle.index,
)

In [None]:
# NOTE(review): this estimator is already fitted on (X_train, y_train) in
# the evaluation cell above and is fitted once more in the next cell, so
# this refit looks redundant — confirm and consider removing the cell.
best_hist_gradient_boosting.fit(X_train, y_train)

In [None]:
from src.utils.helperFunctions import create_kaggle_results

# The model is already fitted on (X_train, y_train) by the preceding cells,
# so it is used directly here instead of being fitted a third time.
results = best_hist_gradient_boosting.predict(df_kaggle)

# Hand the predictions to the project helper together with the path of the
# raw Kaggle file and the name for the results CSV.
create_kaggle_results(results, path_to_kaggledata="../data/test_data-Kaggle-v0.11.csv", csv_name='hist_gradient_v1')