In [7]:
import src.dataPipeline as dataPipeline
import importlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

import optuna

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
# Evaluating the model
from sklearn.metrics import mean_absolute_percentage_error,r2_score ,make_scorer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

importlib.reload(dataPipeline)


<module 'src.dataPipeline' from 'C:\\Users\\wartm\\Documents\\FHNW\\immo_challenge\\src\\dataPipeline.py'>

In [8]:
dp = dataPipeline.DataPipeline()
df = dp.runPipeline(
    filePath="../data/immo_data_202208_v2.csv",
    imputer=None,
    normalizeAndStandardize= False,
    basic_house_imputer = True,
    get_dummies = False
)

  self.data = pd.read_csv(filePath)
  ]].bfill(axis=1)['Space extracted']
  ]].bfill(axis=1)['Plot_area_unified']
  ]].bfill(axis=1)['Availability']


In [9]:
df.head()

Unnamed: 0,Availability,Floor,detail_responsive#surface_usable,Floor_space_merged,ForestDensityL,ForestDensityM,ForestDensityS,NoisePollutionRailwayL,NoisePollutionRailwayM,NoisePollutionRailwayS,...,gde_workers_total,price_cleaned,Space extracted,type_unified,Plot_area_unified,No. of rooms:,Last refurbishment:,Year built:,Number of floors:,region_group
0,On request,4.0,,,0.511176,0.286451,0.090908,0.0,0.0,0.0,...,331.0,1150000.0,100.0,penthouse,,5.0,,,,11.0
1,On request,,,242.0,0.511176,0.286451,0.090908,0.0,0.0,0.0,...,331.0,1420000.0,156.0,terrace-house,222.0,5.0,,,,11.0
2,Immediately,2.0,,,0.163362,0.095877,0.001911,0.0,0.0,0.0,...,33493.0,720000.0,93.0,penthouse,,5.0,,,,11.0
3,On request,,,257.0,0.511176,0.286451,0.090908,0.0,0.0,0.0,...,331.0,1430000.0,154.0,detached-house,370.0,5.0,,,,11.0
4,On request,0.0,,,0.333865,0.279276,0.145835,0.0,0.0,0.0,...,1355.0,995000.0,142.0,flat,,5.0,,,,11.0


In [10]:
df["region_group"] = df["region_group"].astype("category")
df["type_unified"] = df["type_unified"].astype("category")
df["Availability"] = df["Availability"].astype("category")

In [11]:
X = df.drop("price_cleaned", axis=1)
y = df["price_cleaned"]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Kategorische und numerische Spalten definieren
cat_col = ['region_group', 'type_unified', 'Availability']
numerical_features = [col for col in df.columns if col not in cat_col + ["price_cleaned"]]

# Preprocessing für numerische Daten
numerical_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),  # Fehlwerte auffüllen
    ('scaler', StandardScaler())  # Standardisieren
])

# Preprocessing für kategorische Daten
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-Hot-Encoding
])

# ColumnTransformer erstellen
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, cat_col)
    ]
)

# Pipeline nur für das Preprocessing erstellen
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])


X_train_transformed = pipeline.fit_transform(X_train)
X_test_transformed = pipeline.transform(X_test)

In [14]:
# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 1.0, log=True)
    max_depth = trial.suggest_int("max_depth", 1, 10)

    # Create the base estimator
    base_estimator = DecisionTreeRegressor(max_depth=max_depth)

    # Create AdaBoostRegressor with the suggested hyperparameters
    model = AdaBoostRegressor(
        estimator=base_estimator,  # Changed from base_estimator to estimator
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=42
    )


    # Use K-Fold Cross Validation if dataset is large enough, otherwise use train-test split
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    mape_scores = []

    for train_index, val_index in kf.split(X_train_transformed):
        X_train_fold, X_val_fold = X_train_transformed[train_index], X_train_transformed[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        # Train the model
        model.fit(X_train_fold, y_train_fold)

        # Predict on the validation fold
        y_pred = model.predict(X_val_fold)

        # Evaluate the model using Mean Absolute Percentage Error
        mape = mean_absolute_percentage_error(y_val_fold, y_pred)
        mape_scores.append(mape)

    return sum(mape_scores) / len(mape_scores)

# Create a study and optimize the objective function
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10, timeout=120)

# Output the best hyperparameters and score
print("Best parameters:", study.best_params)
print("Best MAPE:", study.best_value)

[I 2024-12-29 17:51:15,296] A new study created in memory with name: no-name-af8f11f4-1105-4e33-af38-751b1519132c
[I 2024-12-29 17:53:42,413] Trial 0 finished with value: 1.5386483105586333 and parameters: {'n_estimators': 241, 'learning_rate': 0.035909310460729746, 'max_depth': 2}. Best is trial 0 with value: 1.5386483105586333.
[I 2024-12-29 17:59:52,986] Trial 1 finished with value: 0.5504078064945181 and parameters: {'n_estimators': 248, 'learning_rate': 0.5187635838081097, 'max_depth': 10}. Best is trial 1 with value: 0.5504078064945181.


Best parameters: {'n_estimators': 248, 'learning_rate': 0.5187635838081097, 'max_depth': 10}
Best MAPE: 0.5504078064945181


In [28]:
# Train the best model with the optimal parameters
best_params = study.best_params
best_model = AdaBoostRegressor(
    base_estimator=DecisionTreeRegressor(max_depth=best_params["max_depth"]),
    n_estimators=best_params["n_estimators"],
    learning_rate=best_params["learning_rate"],
    random_state=42
)
best_model.fit(X_train_transformed, y_train)

# Evaluate the final model
final_mape = mean_absolute_percentage_error(y_test, best_model.predict(X_test_transformed))
print("Final MAPE with the best model:", final_mape)

[I 2024-12-29 17:12:21,065] A new study created in memory with name: no-name-c5d7ded9-1765-4eb3-b24d-b429d63a09c7
[I 2024-12-29 17:13:19,050] Trial 0 finished with value: 5.4784900723243855 and parameters: {'n_estimators': 461, 'learning_rate': 0.5639378541596162, 'max_depth': 1}. Best is trial 0 with value: 5.4784900723243855.
[I 2024-12-29 17:29:06,341] Trial 1 finished with value: 0.569172827491238 and parameters: {'n_estimators': 498, 'learning_rate': 0.38558802221495375, 'max_depth': 10}. Best is trial 1 with value: 0.569172827491238.


Best parameters: {'n_estimators': 498, 'learning_rate': 0.38558802221495375, 'max_depth': 10}
Best MAPE: 0.569172827491238


TypeError: AdaBoostRegressor.__init__() got an unexpected keyword argument 'base_estimator'

# Price logaritmiert

In [15]:
y_test_log = np.log(y_test)
y_train_log = np.log(y_train)

In [17]:
def objective(trial):
    # Suggest hyperparameters
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 1.0, log=True)
    max_depth = trial.suggest_int("max_depth", 1, 10)

    # Create the base estimator
    base_estimator = DecisionTreeRegressor(max_depth=max_depth)

    # Create AdaBoostRegressor with the suggested hyperparameters
    model = AdaBoostRegressor(
        estimator=base_estimator,  # Changed from base_estimator to estimator
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=42
    )


    # Use K-Fold Cross Validation if dataset is large enough, otherwise use train-test split
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    mape_scores = []

    for train_index, val_index in kf.split(X_train_transformed):
        X_train_fold, X_val_fold = X_train_transformed[train_index], X_train_transformed[val_index]
        y_train_fold, y_val_fold = y_train_log.iloc[train_index], y_train_log.iloc[val_index]

        # Train the model
        model.fit(X_train_fold, y_train_fold)

        # Predict on the validation fold
        y_pred = model.predict(X_val_fold)

        # Evaluate the model using Mean Absolute Percentage Error
        mape = mean_absolute_percentage_error(np.exp(y_val_fold), np.exp(y_pred))
        mape_scores.append(mape)

    return sum(mape_scores) / len(mape_scores)

# Create a study and optimize the objective function
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20, timeout=120)

# Output the best hyperparameters and score
print("Best parameters:", study.best_params)
print("Best MAPE:", study.best_value)


[I 2024-12-29 18:00:32,645] A new study created in memory with name: no-name-56aa84c8-0840-4c29-9d5e-335e813f03e3
[I 2024-12-29 18:01:16,822] Trial 0 finished with value: 0.46127282240777784 and parameters: {'n_estimators': 74, 'learning_rate': 0.05019823607946922, 'max_depth': 2}. Best is trial 0 with value: 0.46127282240777784.
[I 2024-12-29 18:02:47,663] Trial 1 finished with value: 0.4742881910186676 and parameters: {'n_estimators': 300, 'learning_rate': 0.803613511388423, 'max_depth': 2}. Best is trial 0 with value: 0.46127282240777784.


Best parameters: {'n_estimators': 74, 'learning_rate': 0.05019823607946922, 'max_depth': 2}
Best MAPE: 0.46127282240777784


In [19]:
# Train the best model with the optimal parameters
best_params = study.best_params
best_model = AdaBoostRegressor(
    estimator=DecisionTreeRegressor(max_depth=best_params["max_depth"]),
    n_estimators=best_params["n_estimators"],
    learning_rate=best_params["learning_rate"],
    random_state=42
)
best_model.fit(X_train_transformed, y_train_log)

# Evaluate the final model
final_mape = mean_absolute_percentage_error(y_test, np.exp(best_model.predict(X_test_transformed)))
print("Final MAPE with the best model:", final_mape)


Final MAPE with the best model: 0.45984305946082693


# Mehr Hyperparameter

In [27]:
def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 1000)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 1.0, log=True)
    max_depth = trial.suggest_int("max_depth", 2, 20)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 20)
    max_features = trial.suggest_categorical("max_features", ["sqrt", "log2", None])

    # Create the base estimator
    base_estimator = DecisionTreeRegressor(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features
    )

    # Create AdaBoostRegressor
    model = AdaBoostRegressor(
        estimator=base_estimator,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=42
    )


    # Use K-Fold Cross Validation if dataset is large enough, otherwise use train-test split
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    mape_scores = []

    for train_index, val_index in kf.split(X_train_transformed):
        X_train_fold, X_val_fold = X_train_transformed[train_index], X_train_transformed[val_index]
        y_train_fold, y_val_fold = y_train_log.iloc[train_index], y_train_log.iloc[val_index]

        # Train the model
        model.fit(X_train_fold, y_train_fold)

        # Predict on the validation fold
        y_pred = model.predict(X_val_fold)

        # Evaluate the model using Mean Absolute Percentage Error
        mape = mean_absolute_percentage_error(np.exp(y_val_fold), np.exp(y_pred))
        mape_scores.append(mape)

    return sum(mape_scores) / len(mape_scores)

# Create a study and optimize the objective function
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20, timeout=180)

# Output the best hyperparameters and score
print("Best parameters:", study.best_params)
print("Best MAPE:", study.best_value)


[I 2024-12-29 18:37:13,361] A new study created in memory with name: no-name-b057ba57-bafb-4b70-b7e2-423ec6736390
[I 2024-12-29 18:40:03,554] Trial 0 finished with value: 0.5118593385987135 and parameters: {'n_estimators': 258, 'learning_rate': 0.0019831691403489203, 'max_depth': 2, 'min_samples_split': 16, 'min_samples_leaf': 13, 'max_features': None}. Best is trial 0 with value: 0.5118593385987135.
[I 2024-12-29 18:43:01,444] Trial 1 finished with value: 0.4119995156537217 and parameters: {'n_estimators': 780, 'learning_rate': 0.00287603911878279, 'max_depth': 6, 'min_samples_split': 10, 'min_samples_leaf': 9, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.4119995156537217.


Best parameters: {'n_estimators': 780, 'learning_rate': 0.00287603911878279, 'max_depth': 6, 'min_samples_split': 10, 'min_samples_leaf': 9, 'max_features': 'sqrt'}
Best MAPE: 0.4119995156537217


# Kaggle Wetbewerb

In [20]:
df_kaggle = dp.prepare_kaggle_dataset(
    filePath="../data/test_data-Kaggle-v0.11.csv",
    imputer=None,
    normalizeAndStandardize=False,
    basic_house_imputer=True,
    get_dummies=False
)
df_kaggle.head()

  self.data = pd.read_csv(filePath)
  ]].bfill(axis=1)['Space extracted']
  ]].bfill(axis=1)['Plot_area_unified']
  ]].bfill(axis=1)['Availability']
  df.loc[mask, 'plz_parsed'] = df.loc[mask, 'address'].apply(extractPlz)


Error in column: Type:


Unnamed: 0,Availability,Floor,detail_responsive#surface_usable,Floor_space_merged,ForestDensityL,ForestDensityM,ForestDensityS,NoisePollutionRailwayL,NoisePollutionRailwayM,NoisePollutionRailwayS,...,Space extracted,type_unified,Plot_area_unified,No. of rooms:,Last refurbishment:,Year built:,Number of floors:,Type:,Hall height:,region_group
0,On request,0.0,0.0,,0.164382,0.10003,0.063548,0.003811,0.0,0.0,...,220.0,villa,733.0,5.0,,,1.0,,,11
1,On request,0.0,0.0,,0.260855,0.170434,0.083253,0.002623,0.0,0.0,...,230.0,detached-house,702.0,5.0,,,1.0,,,11
2,On request,,0.0,,0.434114,0.357984,0.125505,0.0,0.0,0.0,...,131.0,stepped-house,0.0,5.0,,,1.0,,,11
3,Immediately,0.0,0.0,140.0,0.14819,0.07661,0.0,0.005193,0.0,0.0,...,140.0,terrace-house,206.0,5.0,,,1.0,,,11
4,On request,0.0,0.0,242.0,0.511176,0.286451,0.090908,0.0,0.0,0.0,...,156.0,terrace-house,222.0,5.0,,,1.0,,,11


In [21]:
df_kaggle = df_kaggle.drop(['Type:', 'Hall height:'], axis=1)

In [22]:
df_kaggle["region_group"] = df_kaggle["region_group"].astype("category")
df_kaggle["type_unified"] = df_kaggle["type_unified"].astype("category")
df_kaggle["Availability"] = df_kaggle["Availability"].astype("category")

In [23]:
X_kaggle = df_kaggle
X_kaggle_transformed = pipeline.transform(X_kaggle)

In [24]:
from src.utils.helperFunctions import create_kaggle_results

results = best_model.predict(X_kaggle_transformed)
results = np.exp(results)
create_kaggle_results(results, path_to_kaggledata="../data/test_data-Kaggle-v0.11.csv", csv_name='adaboost_log_price.csv')

File adaboost_log_price.csv_2024-12-29_18-10-37.csv created successfully.


  indexes = pd.read_csv(path_to_kaggledata)['Unnamed: 0']
