In [2]:
import src.dataPipeline as dataPipeline
import importlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
# Evaluating the model
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error ,make_scorer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.ensemble import StackingRegressor

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Lasso

import numpy as np

In [3]:
dp = dataPipeline.DataPipeline()
df = dp.runPipeline(
    filePath="../data/immo_data_202208_v2.csv",
    imputer=None,
    normalizeAndStandardize= False,
    basic_house_imputer = False,
    get_dummies = False
)

df.head()

  self.data = pd.read_csv(filePath)
  ]].bfill(axis=1)['Space extracted']
  ]].bfill(axis=1)['Plot_area_unified']
  ]].bfill(axis=1)['Availability']


Unnamed: 0,Availability,Floor,detail_responsive#surface_usable,Floor_space_merged,ForestDensityL,ForestDensityM,ForestDensityS,NoisePollutionRailwayL,NoisePollutionRailwayM,NoisePollutionRailwayS,...,gde_workers_total,price_cleaned,Space extracted,type_unified,Plot_area_unified,No. of rooms:,Last refurbishment:,Year built:,Number of floors:,region_group
0,On request,4.0,,,0.511176,0.286451,0.090908,0.0,0.0,0.0,...,331.0,1150000.0,100.0,penthouse,,5.0,,,,81.0
1,On request,,,242.0,0.511176,0.286451,0.090908,0.0,0.0,0.0,...,331.0,1420000.0,156.0,terrace-house,222.0,5.0,,,,81.0
2,Immediately,2.0,,,0.163362,0.095877,0.001911,0.0,0.0,0.0,...,33493.0,720000.0,93.0,penthouse,,5.0,,,,81.0
3,On request,,,257.0,0.511176,0.286451,0.090908,0.0,0.0,0.0,...,331.0,1430000.0,154.0,detached-house,370.0,5.0,,,,81.0
4,On request,0.0,,,0.333865,0.279276,0.145835,0.0,0.0,0.0,...,1355.0,995000.0,142.0,flat,,5.0,,,,81.0


In [4]:
df["region_group"] = df["region_group"].astype("category")
df["type_unified"] = df["type_unified"].astype("category")
df["Availability"] = df["Availability"].astype("category")

In [5]:
X = df.drop("price_cleaned", axis=1)
y = df["price_cleaned"]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Kategorische und numerische Spalten definieren
cat_col = ['region_group', 'type_unified', 'Availability']
numerical_features = [col for col in df.columns if col not in cat_col + ["price_cleaned"]]

# Preprocessing für numerische Daten
numerical_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),  # Fehlwerte auffüllen
    ('scaler', StandardScaler())  # Standardisieren
])

# Preprocessing für kategorische Daten
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-Hot-Encoding
])

# ColumnTransformer erstellen
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, cat_col)
    ]
)

# Pipeline nur für das Preprocessing erstellen
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])


X_train_transformed = pipeline.fit_transform(X_train)
X_test_transformed = pipeline.transform(X_test)

In [13]:
y_train_log = np.log(y_train)
y_test_log = np.log(y_test)

In [14]:
# Define base models
xgboost_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
random_forest_model = RandomForestRegressor(n_estimators=100, random_state=42)
linear_model = LinearRegression()

# Define the meta-model (Lasso)
lasso_meta_model = Lasso(alpha=0.1)  # Adjust alpha as needed

# Create the Stacking Regressor
stacking_model = StackingRegressor(
    estimators=[
        ('xgboost', xgboost_model),
        ('random_forest', random_forest_model),
        ('linear', linear_model)
    ],
    n_jobs=-1,
    final_estimator=lasso_meta_model
)

In [15]:
# Define cross-validation strategy
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define MAPE as a scoring metric for cross-validation
#mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)
#cv_scores = cross_val_score(stacking_model, X_train_transformed, y_train, scoring=mape_scorer, cv=kf)

# Train the stacking model on the full training data
stacking_model.fit(X_train_transformed, y_train_log)

# Evaluate on the test set
y_pred = stacking_model.predict(X_test_transformed)

# Calculate metrics on test data
test_mape = mean_absolute_percentage_error(np.exp(y_test_log), np.exp(y_pred))

# Print results
#print(f"Cross-Validated MAPE (mean): {np.mean(-cv_scores):.4f}")
#print(f"Cross-Validated MAPE (std): {np.std(-cv_scores):.4f}")
print(f"Test MAPE: {test_mape:.4f}")


Test MAPE: 0.2823
