In [2]:
import src.dataPipeline as dataPipeline
import importlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

import optuna

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, KFold,StratifiedKFold
# Evaluating the model
from sklearn.metrics import accuracy_score, make_scorer, f1_score, recall_score, precision_score
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Reload the local module so that edits to src/dataPipeline.py take effect
# without having to restart the kernel.
importlib.reload(dataPipeline)


<module 'src.dataPipeline' from 'C:\\Users\\wartm\\Documents\\FHNW\\immo_challenge\\src\\dataPipeline.py'>

In [None]:
# Instantiate the project's data pipeline and load the listing data into `df`.
# NOTE(review): the meaning of each flag is defined in src/dataPipeline.py, which
# is not visible from this notebook -- the inline notes are assumptions drawn
# from the keyword names and should be confirmed against that module.
dp = dataPipeline.DataPipeline()

pipeline_options = {
    "filePath": "../../data/immo_data_202208_v2.csv",
    "imputer": None,                   # presumably: no imputation inside the pipeline
    "normalizeAndStandardize": False,  # presumably: no scaling inside the pipeline
    "basic_house_imputer": True,
    "get_dummies": False,              # presumably: categorical columns stay un-encoded
}
df = dp.runPipeline(**pipeline_options)

In [None]:
# Preview the first five rows of the pipeline output as a quick sanity check.
df.head(n=5)

In [None]:
# Store the two categorical columns with pandas' "category" dtype.
for _cat_column in ("region_group", "Availability"):
    df[_cat_column] = df[_cat_column].astype("category")

In [None]:
# Separate the feature columns from the classification target.
X = df.drop(columns="type_unified")
y = df["type_unified"]

# Hold out 20% of the rows for testing. Stratifying on the target keeps the class
# proportions of `type_unified` the same in the train and test parts, so rare
# classes are not under-represented (or missing) in the test set by chance.
# Stratification requires labels without missing values and at least two rows
# per class; if that does not hold, fall back to a plain shuffled split.
can_stratify = bool(y.notna().all() and y.value_counts().min() >= 2)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y if can_stratify else None,
)

In [None]:
# Column groups: the two categorical columns are one-hot encoded; every other
# column except the target is handled by the numerical branch.
cat_col = ['region_group', 'Availability']
numerical_features = [col for col in df.columns if col not in cat_col + ["type_unified"]]

# Preprocessing for numerical data.
# Scale BEFORE imputing: KNNImputer selects neighbours by a NaN-aware Euclidean
# distance, so on raw values the columns with the largest magnitude would
# dominate the neighbour search. StandardScaler ignores NaNs while fitting and
# keeps them in place on transform, so it can safely run first.
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),  # standardize (NaNs are passed through)
    ('imputer', KNNImputer(n_neighbors=5)),  # fill missing values on the scaled features
])

# Preprocessing for categorical data.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # fill missing values with the mode
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # one-hot encode; unseen categories become all-zero
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, cat_col)
    ]
)

# Pipeline that contains only the preprocessing step.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Fit the preprocessing on the training split only, then apply the fitted
# transformation to the test split so no test-set statistics leak into it.
X_train_transformed = pipeline.fit_transform(X_train)
X_test_transformed = pipeline.transform(X_test)