In [None]:
!pip install pytorch_tabnet scikit-learn pandas

# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer

# Read the cleaned Funda export, discard the two date columns
# ('Aangeboden sinds', 'Verkoopdatum') and remove exact duplicate rows.
data = (
    pd.read_csv('Funda_dataset_cleaned.csv')
    .drop(columns=['Aangeboden sinds', 'Verkoopdatum'])
    .drop_duplicates()
)

# Pairwise correlations between the numeric columns.
# numeric_only=True is passed explicitly because the frame still contains
# object-dtype columns at this point: relying on the implicit default emits a
# FutureWarning on pandas 1.5 and raises on pandas >= 2.0.
corr_matrix = data.corr(numeric_only=True)

# Absolute correlation of every numeric column with the target.
target_corr = corr_matrix['Laatste vraagprijs'].abs()

# Columns whose |correlation| with the target is below 0.1 are treated as
# uninformative. Non-numeric columns never appear in corr_matrix, so they are
# never dropped here; NaN correlations compare False and are kept as well.
drop_list = target_corr[target_corr < 0.1].index.to_list()

# Remove the weakly correlated columns from the working frame.
data = data.drop(columns=drop_list)


# Numeric features: fill missing values with the column mean, then standardise.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill missing values with the most frequent value,
# then one-hot encode. handle_unknown='ignore' keeps transform() from raising
# on categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Object-dtype columns with fewer than 1000 distinct values are treated as
# categorical; higher-cardinality text columns are left out of this list.
categorical_col = [
    name
    for name, series in data.items()
    if series.dtype == 'O' and series.nunique() < 1000
]

# Columns stored as float64 or int64 are treated as numeric.
# NOTE(review): the target column is still part of `data` here, so it will be
# picked up by this list if it is numeric — confirm that is intended.
numerical_col = [
    name
    for name, dtype in data.dtypes.items()
    if dtype in ['float64', 'int64']
]

# All columns covered by either list, categoricals first.
my_cols = categorical_col + numerical_col

# Route each column group through its own pipeline: numeric block first,
# categorical block second.
column_steps = [
    ('num', numerical_transformer, numerical_col),
    ('cat', categorical_transformer, categorical_col),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# Single-step pipeline that wraps only the preprocessing stage.
eval_set_pipe = Pipeline(steps=[('preprocessor', preprocessor)])

# Explicit list of the columns chosen for training (the target,
# 'Laatste vraagprijs', is part of the list). Order is significant: it is the
# column order of `selected_data`.
selected_columns = [
    'Status',
    'Soort bouw',
    'Energielabel',
    'Aantal dagen tot verkoop',
    'Soort dak',
    'Soort woonhuis',
    'Laatste vraagprijs',
    'Aantal kamers',
    'Overige inpandige ruimte m2',
    'Perceel m2',
    'Achtertuin m2',
    'Vraagprijs per m2',
    'Woonruimte in m2',
    'Gebouwgebonden buitenruimte in m2',
    'Externe bergruimte in m2',
    'Inhoud in m3',
    'Aantal slaapkamers',
    'Bathrooms',
    'Toilets',
    'Kelder',
    'Voorzieningen_Domotica',
    'Voorzieningen_Zwembad',
    'Voorzieningen_Verwarming',
    'Voorzieningen_Stromendwater',
    'Voorzieningen_Elektrischedeur',
    'Voorzieningen_Alarminstallatie',
    'Isolatie_Vollediggeïsoleerd',
    'Verwarming_Gedeeltelijkevloerverwarming',
    'Verwarming_Warmtepomp',
    'Verwarming_Gehelevloerverwarming',
    'Verwarming_Blokverwarming',
    'Verwarming_Gashaard',
    'Verwarming_Openhaard',
    'Verwarming_Houtkachel',
    'Balkon/dakterras_Dakterrasaanwezig',
    'Tuin_Zijtuin',
    'Tuin_Tuinrondom',
    'Soort garage_Parkeerplaats',
    'Soort garage_Inpandig',
    'Soort garage_Parkeerkelder',
]

# Restrict the frame to the selected columns.
# NOTE(review): the feature/target split that follows is built from `data`,
# not from `selected_data` — confirm which frame is meant to feed the model.
selected_data = data.loc[:, selected_columns]

# Features: every remaining column except the target.
# NOTE(review): 'Vraagprijs per m2' looks like it is derived from the asking
# price (the target) — confirm it does not leak the label into the features.
X = data.drop('Laatste vraagprijs', axis=1)

# The target is modelled on the log1p scale. The evaluation code further down
# maps both the predictions and y_test back with np.expm1, which is only valid
# when training happened on log1p(price); applying expm1 to raw prices
# overflows to inf.
y = np.log1p(data['Laatste vraagprijs'])

# 80/20 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Partition the feature columns by dtype.
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# These rebind the transformer names used earlier in the cell.
# NOTE(review): unlike the earlier pipelines there is no imputation step
# here — confirm the data contains no missing values at this point.
numerical_transformer = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Scale the numeric block and one-hot encode the categorical block;
# numeric columns come first in the transformed output.
feature_steps = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=feature_steps)

# Fit on the training split only, then reuse the fitted transformer on the
# test split so no test-set statistics enter the scaler or the encoder.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# TabNet hyperparameters, collected in one mapping so they are easy to tweak.
# NOTE(review): scheduler_params is supplied without a scheduler_fn — confirm
# these learning-rate schedule settings actually take effect.
tabnet_hyperparams = {
    'n_d': 16,
    'n_a': 16,
    'n_steps': 5,
    'gamma': 1.3,
    'lambda_sparse': 0.001,
    'optimizer_params': {'lr': 2e-2},
    'mask_type': 'entmax',
    'scheduler_params': {'max_lr': 0.05, 'steps_per_epoch': 100, 'epochs': 20},
}
regressor = TabNetRegressor(**tabnet_hyperparams)

# The targets are passed as 2-D column vectors of shape (n_samples, 1).
train_targets_2d = y_train.values.reshape(-1, 1)
test_targets_2d = y_test.values.reshape(-1, 1)

# Train for at most 100 epochs, reporting MAE / MSE / RMSE on the evaluation
# set after every epoch.
# NOTE(review): the evaluation set is the held-out test split, so anything
# fit() selects based on eval_set has seen the test data — consider a
# separate validation split.
regressor.fit(
    X_train_preprocessed,
    train_targets_2d,
    eval_set=[(X_test_preprocessed, test_targets_2d)],
    max_epochs=100,
    eval_metric=['mae', 'mse', 'rmse'],
)

# Predict on the held-out split.
y_test_pred_tabnet = regressor.predict(X_test_preprocessed)

# Map targets and predictions back with expm1 before scoring. np.expm1 is the
# inverse of np.log1p, so this step is only meaningful when the target was
# log1p-transformed before training.
y_test_original_scale = np.expm1(y_test)
y_test_pred_original_scale = np.expm1(y_test_pred_tabnet)

# Score the back-transformed values with each metric in turn.
mae, mse, r2 = (
    metric(y_test_original_scale, y_test_pred_original_scale)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print("TabNet - MAE:", mae, "MSE:", mse, "R2 Score:", r2)

  corr_matrix = data.corr()


Categorical Columns:
['Status', 'Soort bouw', 'Energielabel', 'Aantal dagen tot verkoop', 'Soort dak', 'Soort woonhuis']
Numerical Columns:
['Laatste vraagprijs', 'Aantal kamers', 'Overige inpandige ruimte m2', 'Perceel m2', 'Achtertuin m2', 'Vraagprijs per m2', 'Woonruimte in m2', 'Gebouwgebonden buitenruimte in m2', 'Externe bergruimte in m2', 'Inhoud in m3', 'Aantal slaapkamers', 'Bathrooms', 'Toilets', 'Kelder', 'Voorzieningen_Domotica', 'Voorzieningen_Zwembad', 'Voorzieningen_Verwarming', 'Voorzieningen_Stromendwater', 'Voorzieningen_Elektrischedeur', 'Voorzieningen_Alarminstallatie', 'Isolatie_Vollediggeïsoleerd', 'Verwarming_Gedeeltelijkevloerverwarming', 'Verwarming_Warmtepomp', 'Verwarming_Gehelevloerverwarming', 'Verwarming_Blokverwarming', 'Verwarming_Gashaard', 'Verwarming_Openhaard', 'Verwarming_Houtkachel', 'Balkon/dakterras_Dakterrasaanwezig', 'Tuin_Zijtuin', 'Tuin_Tuinrondom', 'Soort garage_Parkeerplaats', 'Soort garage_Inpandig', 'Soort garage_Parkeerkelder']




epoch 0  | loss: 132.60093| val_0_mae: 11.03966| val_0_mse: 122.05075| val_0_rmse: 11.04766|  0:00:02s
epoch 1  | loss: 58.7985 | val_0_mae: 7.37748 | val_0_mse: 54.74439| val_0_rmse: 7.39894 |  0:00:05s
epoch 2  | loss: 20.18549| val_0_mae: 2.72553 | val_0_mse: 7.92119 | val_0_rmse: 2.81446 |  0:00:08s
epoch 3  | loss: 13.48406| val_0_mae: 3.26612 | val_0_mse: 11.49678| val_0_rmse: 3.39069 |  0:00:11s
epoch 4  | loss: 5.79684 | val_0_mae: 1.78896 | val_0_mse: 3.67781 | val_0_rmse: 1.91776 |  0:00:13s
epoch 5  | loss: 2.34952 | val_0_mae: 1.76231 | val_0_mse: 3.37132 | val_0_rmse: 1.83612 |  0:00:16s
epoch 6  | loss: 1.4596  | val_0_mae: 0.90305 | val_0_mse: 1.03117 | val_0_rmse: 1.01547 |  0:00:19s
epoch 7  | loss: 0.81547 | val_0_mae: 0.652   | val_0_mse: 0.63041 | val_0_rmse: 0.79398 |  0:00:21s
epoch 8  | loss: 0.5516  | val_0_mae: 0.54296 | val_0_mse: 0.43347 | val_0_rmse: 0.65839 |  0:00:24s
epoch 9  | loss: 0.446   | val_0_mae: 0.55376 | val_0_mse: 0.44358 | val_0_rmse: 0.66602 



TabNet - MAE: 43565.38967563293 MSE: 10849646437.527084 R2 Score: 0.78640797904675


In [None]:
# Root-mean-squared error: the square root of the `mse` value computed in the
# previous cell (this cell depends on that cell having been run first).
rmse = np.sqrt(mse)
print(rmse)

104161.63611199224
