In [None]:
import importlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

import optuna

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, KFold,StratifiedKFold
from sklearn.metrics import accuracy_score, make_scorer, f1_score, recall_score, precision_score
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import src.dataPipeline as dataPipeline

# Re-import the local pipeline module so that edits made to its source file
# are picked up without restarting the kernel.
importlib.reload(dataPipeline)


In [None]:
# Instantiate the project's data pipeline and run it on the CSV export.
dp = dataPipeline.DataPipeline()

# NOTE(review): the flags are passed through unchanged; presumably they leave
# imputation, scaling and dummy encoding to the sklearn preprocessing defined
# later in this notebook — confirm against DataPipeline.runPipeline.
df = dp.runPipeline(
    filePath="../../data/immo_data_202208_v2.csv",
    imputer=None,
    normalizeAndStandardize=False,
    basic_house_imputer=True,
    get_dummies=False,
)

In [None]:
# Quick look at the first rows returned by the pipeline.
df.head()

In [None]:
# Collapse selected property-type labels into a related label.
# Each pair reads (label to absorb, label that replaces it).
merge_mapping = dict([
    ("attic-room", "attic-flat"),
    ("castle", "farmhouse"),
    ("detached-secondary-suite", "detached-house"),
    ("single-room", "studio"),
    ("secondary-suite", "duplex-maisonette"),
])

# Labels that are not keys of the mapping are left unchanged.
df["type_unified"] = df["type_unified"].replace(to_replace=merge_mapping)

In [None]:
# Store both columns with pandas' categorical dtype.
for categorical_name in ("region_group", "Availability"):
    df[categorical_name] = df[categorical_name].astype("category")

In [None]:
# The label is the unified property type; the features are every other column.
y = df["type_unified"]
X = df.drop(columns=["type_unified"])

# Hold out 20 % of the rows as a test set, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Column groups: the two categorical columns are listed explicitly; every other
# column except the target goes through the numerical branch.
cat_col = ['region_group', 'Availability']
numerical_features = [
    name
    for name in df.columns
    if name != "type_unified" and name not in cat_col
]

# Numerical branch: fill missing values with a 5-neighbour KNN imputer, then
# standardise.
numerical_transformer = Pipeline(
    steps=[
        ("imputer", KNNImputer(n_neighbors=5)),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent level, then
# one-hot encode. Levels not seen during fitting are ignored instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group to its branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, cat_col),
    ]
)

# Pipeline that contains the preprocessing step only (no estimator).
pipeline = Pipeline(steps=[("preprocessor", preprocessor)])

# Fit on the training split only; the test split is transformed with the
# parameters fitted on the training data.
X_train_transformed = pipeline.fit_transform(X_train)
X_test_transformed = pipeline.transform(X_test)

In [None]:
# Stratified 5-fold cross-validation of a random forest on the training split.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold scores. The lists are re-created on every run of this cell, so
# re-executing it does not append to stale results.
accuracy_scores = []
f1_scores = []
recall_scores = []
precision_scores = []

for train_index, val_index in kf.split(X_train, y_train):
    # Positional split of the untransformed training data into fold train/val.
    X_fold_train, X_fold_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]

    # Fit a fresh, unfitted copy of the preprocessing on this fold's training
    # rows only. Cloning keeps the shared `pipeline` object from being refitted
    # on a fold subset as a side effect of running this loop.
    fold_pipeline = clone(pipeline)
    X_fold_train_transformed = fold_pipeline.fit_transform(X_fold_train)
    X_fold_val_transformed = fold_pipeline.transform(X_fold_val)

    # Instantiate the estimator. Binding the bare class and calling
    # `.fit(X, y)` on it raises a TypeError because no instance is passed.
    # The seed matches the one used for the train/test split and the folds.
    RandomForrest = RandomForestClassifier(random_state=42)
    RandomForrest.fit(X_fold_train_transformed, y_fold_train)

    # Predict on the held-out part of the fold.
    y_val_pred = RandomForrest.predict(X_fold_val_transformed)

    # Weighted averages account for class imbalance in the multi-class target.
    accuracy_scores.append(accuracy_score(y_fold_val, y_val_pred))
    f1_scores.append(f1_score(y_fold_val, y_val_pred, average='weighted'))
    recall_scores.append(recall_score(y_fold_val, y_val_pred, average='weighted'))
    precision_scores.append(precision_score(y_fold_val, y_val_pred, average='weighted'))

In [None]:
# Average every metric over the cross-validation folds, then report them.
mean_accuracy, mean_f1, mean_recall, mean_precision = (
    np.mean(fold_values)
    for fold_values in (accuracy_scores, f1_scores, recall_scores, precision_scores)
)

print(f"Accuracy: {mean_accuracy}")
print(f"F1: {mean_f1}")
print(f"Recall: {mean_recall}")
print(f"Precision: {mean_precision}")
