In [None]:
import src.dataPipeline as dataPipeline
import importlib
importlib.reload(dataPipeline)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, r2_score
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

importlib.reload(dataPipeline)


In [None]:
def plot_residuals(y_test, y_pred):
    """Show two residual diagnostics side by side.

    Left panel: residuals (``y_test - y_pred``) against the predicted values
    with a horizontal reference line at zero. Right panel: histogram of the
    residuals using 30 bins.

    Parameters
    ----------
    y_test : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned element-wise with ``y_test``.
    """
    residuals = y_test - y_pred
    fig, (ax_scatter, ax_hist) = plt.subplots(1, 2, figsize=(15, 5))

    # Residuals vs. predicted values
    sns.scatterplot(x=y_pred, y=residuals, color='blue', ax=ax_scatter)
    ax_scatter.axhline(y=0, color='r', linestyle='-')
    ax_scatter.set_xlabel('Predicted values')
    ax_scatter.set_ylabel('Residuals')
    ax_scatter.set_title('Residuals vs. Predicted values')

    # Histogram of residuals
    sns.histplot(residuals, bins=30, ax=ax_hist)
    ax_hist.set_title('Histogram of Residuals')

    plt.show()

In [None]:
def calculate_metrics(X_train, y_test, y_pred):
    """Compute R², adjusted R² and MAPE (in percent), each rounded to 4 decimals.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Only ``X_train.shape[1]`` is used, as the number of predictors ``k``.
    y_test : array-like
        Observed target values; its length is the number of observations ``n``.
    y_pred : array-like
        Predicted target values aligned with ``y_test``.

    Returns
    -------
    tuple
        ``(r2, R2_adjusted, mape)``. ``R2_adjusted`` is ``nan`` when
        ``n - k - 1 <= 0`` because the adjustment is undefined there.
    """
    n = len(y_test)  # Number of observations
    k = X_train.shape[1]  # Number of predictors

    r2 = r2_score(y_test, y_pred)

    # Derive the adjusted value from the unrounded R² so rounding error is not
    # amplified by the (n - 1) / (n - k - 1) factor.
    dof = n - k - 1
    if dof > 0:
        r2_adjusted = 1 - (1 - r2) * (n - 1) / dof
    else:
        # Too few observations for the number of predictors.
        r2_adjusted = float("nan")

    mape = mean_absolute_percentage_error(y_test, y_pred) * 100
    return round(r2, 4), round(r2_adjusted, 4), round(mape, 4)

In [None]:
# Load the data set through the project's DataPipeline. Judging by the keyword
# names, imputation, normalisation/standardisation and dummy encoding are
# presumably switched off here (TODO confirm in src/dataPipeline); the cells
# below perform those steps themselves.
dp = dataPipeline.DataPipeline()
df = dp.runPipeline(filePath="../data/immo_data_202208_v2.csv", imputer=None, normalizeAndStandardize=False, get_dummies=False)

In [None]:
# Number of rows whose 'price_cleaned' value is missing.
df['price_cleaned'].isna().sum()

In [None]:
# Preview the first rows of the pipeline output.
df.head()

In [None]:
# 'price_cleaned' against 'Space extracted', coloured by 'region_group'.
sns.scatterplot(data=df, x="Space extracted", y = "price_cleaned",hue="region_group")

# Lineare Modelle mit Space extracted

In [None]:
# Number of missing values in 'Space extracted'.
df[['Space extracted']].isna().sum()

In [None]:
# Make sure 'Space extracted' is a float column before imputing and taking logs.
df['Space extracted'] = df['Space extracted'].astype(float)

In [None]:
# Build one copy of the frame per imputation strategy for 'Space extracted':
# mean fill, row deletion, median fill and KNN. Of these, only impute_knn is
# used by the modelling cells visible in this notebook section.
impute_mean = df.copy()
impute_mean['Space extracted'] = impute_mean['Space extracted'].fillna(impute_mean['Space extracted'].mean())
impute_delete = df.copy()
impute_delete = impute_delete.dropna(subset=['Space extracted'])
impute_median = df.copy()
impute_median['Space extracted'] = impute_median['Space extracted'].fillna(impute_median['Space extracted'].median())
impute_knn = df.copy()
# Experiment: treat areas below 5 as missing so that they get imputed too
# (original note: "try imputing area 1").
impute_knn.loc[impute_knn['Space extracted'] < 5, 'Space extracted'] = np.nan
# NOTE(review): the imputer only sees this one column, so there is no other
# feature to measure neighbour distance with; per the scikit-learn docs this
# presumably falls back to the column mean - confirm.
imputer = KNNImputer(n_neighbors=5)
impute_knn['Space extracted'] = imputer.fit_transform(impute_knn[['Space extracted']])

In [None]:
# Log-log regression of price on living space, using the KNN-imputed frame.
X = np.log(impute_knn[['Space extracted']].to_numpy())
y = np.log(impute_knn[['price_cleaned']].to_numpy())

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

# fit() returns the estimator itself, so it can be chained.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Test points with the fitted regression line on top; both axes are on the
# log scale because X and y were log-transformed before fitting.
plt.scatter(X_test, y_test)
plt.plot(X_test, model.predict(X_test), color='red')
plt.show()

In [None]:
# Residual diagnostics for the single-feature model (still on the log scale
# when the notebook is run top to bottom).
plot_residuals(y_test.flatten(), y_pred.flatten())

In [None]:
# Put test inputs, observed targets and predictions side by side, then add
# the per-row difference between observed and predicted value.
df_pred = pd.DataFrame(
    {
        'Space extracted': X_test.flatten(),
        'price_cleaned': y_test.flatten(),
        'price_pred': y_pred.flatten(),
    }
)
df_pred["Diff"] = df_pred["price_cleaned"].sub(df_pred["price_pred"])
df_pred

In [None]:
# Back-transform predictions and targets from the log scale to the original
# price scale. The results go into new variables so that re-running this cell
# does not exponentiate the same arrays a second time.
y_pred_orig = np.exp(y_pred)
y_test_orig = np.exp(y_test)

r2, R2_adjusted, mape = calculate_metrics(X_train, y_test_orig, y_pred_orig)
print(f"R2 {r2},Adjusted R^2:{R2_adjusted}, MAPE:{mape}%")

# Lineare Modelle mit Space extracted und No. of rooms

In [None]:
# Work on a copy holding only the two predictors and the target.
df_two_features = df[['Space extracted', 'No. of rooms:', 'price_cleaned']].copy()
# Treat areas below 5 as missing so they are imputed in the next cell.
df_two_features.loc[df_two_features['Space extracted'] < 5, 'Space extracted'] = np.nan

# Earlier constant-fill approach, disabled in favour of KNN imputation:
#df_two_features['No. of rooms:'] = df_two_features['No. of rooms:'].fillna(1)
#df_two_features['Space extracted'] = df_two_features['Space extracted'].fillna(0)

In [None]:
# Impute both predictors in ONE KNNImputer call. When a single column is
# passed on its own, the imputer has no other feature to measure neighbour
# distance with and falls back to the column mean (see the scikit-learn
# KNNImputer docs), which is plain mean imputation. Imputing jointly lets a
# missing area be estimated from rows with a similar room count and vice versa.
knn_cols = ['Space extracted', 'No. of rooms:']
imputer = KNNImputer(n_neighbors=5)
df_two_features[knn_cols] = imputer.fit_transform(df_two_features[knn_cols])


In [None]:
# Log-transform both predictors in place.
# Caution: re-running this cell applies the logarithm again.
df_two_features["Space extracted"] = np.log(df_two_features["Space extracted"])
df_two_features["No. of rooms:"] = np.log(df_two_features["No. of rooms:"])

In [None]:
# Design matrix: every column except the target; target: log price.
y = np.log(df_two_features[['price_cleaned']].to_numpy())
X = df_two_features.drop(columns=["price_cleaned"]).to_numpy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

# Standardise with statistics learned from the training split only, then
# apply the same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)


In [None]:
# Residual diagnostics for the two-feature model (log scale when run in order).
plot_residuals(y_test.flatten(), y_pred.flatten())

In [None]:
# Back-transform predictions and targets from the log scale to the original
# price scale. The results go into new variables so that re-running this cell
# does not exponentiate the same arrays a second time.
y_pred_orig = np.exp(y_pred)
y_test_orig = np.exp(y_test)

r2, R2_adjusted, mape = calculate_metrics(X_train, y_test_orig, y_pred_orig)
print(f"R2 {r2},Adjusted R^2:{R2_adjusted}, MAPE:{mape}%")

# Modelle mit einfachen Features


In [None]:
# Columns used by the "simple features" model (target included).
easy_features = [
    "Floor",
    "price_cleaned",
    "detail_responsive#surface_usable",
    "Number of floors:",
    "Plot_area_unified",
    "Space extracted",
    "No. of rooms:",
    "type_unified",
]
df_small = df.loc[:, easy_features].copy()
# Treat areas below 5 as missing so they are imputed later.
df_small.loc[df_small['Space extracted'] < 5, 'Space extracted'] = np.nan
df_small.head()

In [None]:
# Property types treated as houses: a missing 'Floor' is set to 0 for them.
house_types = [
    'detached-house',
    'villa',
    'semi-detached-house',
    'terrace-house',
    'chalet',
    'farmhouse',
    'rustico',
    'castle',
    'detached-secondary-suite',
]
df_small.loc[
    df_small['Floor'].isna() & df_small['type_unified'].isin(house_types),
    'Floor',
] = 0

In [None]:
# Constant fills: usable surface and plot area default to 0, number of floors
# defaults to 1. Columns not listed in the mapping are left untouched.
df_small = df_small.fillna(
    {
        "detail_responsive#surface_usable": 0,
        "Number of floors:": 1,
        "Plot_area_unified": 0,
    }
)

In [None]:
# Fill missing Floor, Space extracted and No. of rooms: with KNN, imputing
# the three columns in ONE call. When a single column is passed on its own,
# KNNImputer has no other feature to measure neighbour distance with and
# falls back to the column mean (see the scikit-learn KNNImputer docs).
# Imputing jointly lets each column borrow information from the other two.
knn_cols = ['Floor', 'Space extracted', 'No. of rooms:']
imputer = KNNImputer(n_neighbors=5)
df_small[knn_cols] = imputer.fit_transform(df_small[knn_cols])

In [None]:
# Remaining missing values per column after the fills and imputation above.
df_small.isna().sum()

In [None]:
# Log-transform the living space, then one-hot encode the property type
# (the first category is dropped as the reference level).
# Caution: re-running this cell fails or double-transforms, because
# 'type_unified' no longer exists and the log is applied again.
df_small['Space extracted'] = np.log(df_small['Space extracted'])
df_small = pd.get_dummies(df_small, columns=['type_unified'], drop_first=True)

In [None]:
# Build the design matrix as a float array. With recent pandas versions
# get_dummies produces bool columns, and `.values` on a mixed float/bool
# frame yields an object array; converting explicitly keeps everything numeric.
X = df_small.drop(columns=["price_cleaned"]).to_numpy(dtype=float)
y = np.log(df_small[['price_cleaned']].values)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

# Only the leading numeric columns are standardised: Floor, usable surface,
# number of floors, plot area, space and number of rooms. The dummy columns
# that get_dummies appended after them stay as 0/1.
N_NUMERIC_COLS = 6
scaler = StandardScaler()
X_train[:, :N_NUMERIC_COLS] = scaler.fit_transform(X_train[:, :N_NUMERIC_COLS])
X_test[:, :N_NUMERIC_COLS] = scaler.transform(X_test[:, :N_NUMERIC_COLS])

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Residual diagnostics for the simple-features model (log scale when run in order).
plot_residuals(y_test.flatten(), y_pred.flatten())

In [None]:
# Back-transform predictions and targets from the log scale to the original
# price scale. The results go into new variables so that re-running this cell
# does not exponentiate the same arrays a second time.
y_pred_orig = np.exp(y_pred)
y_test_orig = np.exp(y_test)

r2, R2_adjusted, mape = calculate_metrics(X_train, y_test_orig, y_pred_orig)
print(f"R2 {r2},Adjusted R^2:{R2_adjusted}, MAPE:{mape}%")

# Modelle mit bestehenden Features + Standort

In [None]:
# Preview the full frame again before selecting the location experiment's columns.
df.head()

In [None]:
# Same columns as the simple-features model plus the location ('region_group').
experiment_standort = [
    "Floor",
    "price_cleaned",
    "detail_responsive#surface_usable",
    "Number of floors:",
    "Plot_area_unified",
    "Space extracted",
    "No. of rooms:",
    "type_unified",
    "region_group",
]
df_3 = df.loc[:, experiment_standort].copy()
# Treat areas below 5 as missing so they are imputed later.
df_3.loc[df_3['Space extracted'] < 5, 'Space extracted'] = np.nan
df_3.head()

In [None]:
# Property types treated as houses: a missing 'Floor' is set to 0 for them.
house_types = [
    'detached-house', 'villa', 'semi-detached-house', 'terrace-house',
    'chalet', 'farmhouse', 'rustico', 'castle', 'detached-secondary-suite'
]
df_3.loc[
    (df_3['type_unified'].isin(house_types)) & (df_3['Floor'].isna()),
    'Floor'
] = 0

# Constant fills: usable surface and plot area default to 0, number of floors to 1.
df_3["detail_responsive#surface_usable"] = df_3["detail_responsive#surface_usable"].fillna(0)
df_3["Number of floors:"] = df_3["Number of floors:"].fillna(1)
df_3["Plot_area_unified"] = df_3["Plot_area_unified"].fillna(0)

# Fill missing Floor, Space extracted and No. of rooms: with KNN, imputing
# the three columns in ONE call. When a single column is passed on its own,
# KNNImputer has no other feature to measure neighbour distance with and
# falls back to the column mean (see the scikit-learn KNNImputer docs).
# Imputing jointly lets each column borrow information from the other two.
knn_cols = ['Floor', 'Space extracted', 'No. of rooms:']
imputer = KNNImputer(n_neighbors=5)
df_3[knn_cols] = imputer.fit_transform(df_3[knn_cols])

In [None]:
# Log-transform the living space, then one-hot encode property type and
# region (the first category of each is dropped as the reference level).
df_3['Space extracted'] = np.log(df_3[['Space extracted']])
df_3 = pd.get_dummies(df_3, columns=['type_unified'], drop_first=True)
df_3 = pd.get_dummies(df_3, columns=['region_group'], drop_first=True)

# Build the design matrix as a float array. With recent pandas versions
# get_dummies produces bool columns, and `.values` on a mixed float/bool
# frame yields an object array; converting explicitly keeps everything numeric.
X = df_3.drop(columns=["price_cleaned"]).to_numpy(dtype=float)
print(X.shape)
y = np.log(df_3[['price_cleaned']].values)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

# Only the leading numeric columns are standardised: Floor, usable surface,
# number of floors, plot area, space and number of rooms. The dummy columns
# that get_dummies appended after them stay as 0/1.
N_NUMERIC_COLS = 6
scaler = StandardScaler()
X_train[:, :N_NUMERIC_COLS] = scaler.fit_transform(X_train[:, :N_NUMERIC_COLS])
X_test[:, :N_NUMERIC_COLS] = scaler.transform(X_test[:, :N_NUMERIC_COLS])

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Residual diagnostics for the location model (log scale when run in order).
plot_residuals(y_test.flatten(), y_pred.flatten())

In [None]:
# Predicted against actual target values for the test split. When the notebook
# is run in order, both are still log prices here because the back-transform
# happens in the next cell.
plt.scatter(y_test, y_pred)
plt.xlabel('Actual values')
plt.ylabel('Predicted values')
plt.title('Actual vs. Predicted values')
plt.show()

In [None]:
# Back-transform predictions and targets from the log scale to the original
# price scale. The results go into new variables so that re-running this cell
# does not exponentiate the same arrays a second time.
y_pred_orig = np.exp(y_pred)
y_test_orig = np.exp(y_test)

r2, R2_adjusted, mape = calculate_metrics(X_train, y_test_orig, y_pred_orig)
print(f"R2 {r2},Adjusted R^2:{R2_adjusted}, MAPE:{mape}%")

In [None]:
# Training metrics. The model is fitted on log prices, so targets and
# predictions are back-transformed with exp() before scoring. That puts these
# numbers on the same original price scale as the test metrics, which makes
# the two directly comparable.
y_pred_train = model.predict(X_train)
r2, R2_adjusted, mape = calculate_metrics(X_train, np.exp(y_train), np.exp(y_pred_train))
print(f"R2 {r2},Adjusted R^2:{R2_adjusted}, MAPE:{mape}%")