In [140]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np

# Show up to 500 rows and 500 columns when a DataFrame is displayed.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

# Load the dataset
transactions = pd.read_csv('cleanedNewTest.csv')

# Test model performance when filtering to transactions less than 5 years old
is_recent = transactions['years_from_today'] < 5

# Test model performance on specific municipalities
municipalities = ['Brøndby']
in_selected_municipality = (
    transactions['props_pageProps_address_municipality_name'].isin(municipalities)
)

# Keep the rows passing both filters and add the squared transaction age.
# The squared age is meant to penalize old transactions as a correction for
# inflation and other economic factors.
data = (
    transactions
    .loc[is_recent & in_selected_municipality]
    .assign(years_from_today_weighted=lambda frame: frame['years_from_today'] ** 2)
)

# Sanity check: rows per municipality and overall shape of the filtered frame.
print(data['props_pageProps_address_municipality_name'].value_counts())
print(data.shape)

props_pageProps_address_municipality_name
Brøndby    80
Name: count, dtype: int64
(80, 22)


In [141]:
# Selecting the target variable (let's choose 'sqm_price' for this example)
target = 'sqm_price'

# Preparing the data for the model: everything except the target and the
# columns listed below is used as a feature.
# NOTE(review): 'lastPrice' is presumably dropped because it would leak the
# target; the remaining drops look like identifiers / raw metadata - confirm.
X = data.drop([target, 'lastPrice', 'props_pageProps_address_events_0_at', 'props_pageProps_address_events_0_label',
               'props_pageProps_dataLayer_virtualPagePath', 'props_pageProps_dataLayer_detailMetaData', 'years_from_today',
               'props_pageProps_address_coordinates_lat', 'props_pageProps_address_coordinates_lon'], axis=1)

y = data[target]

# Encoding categorical variables
# NOTE(review): columns that are neither object/category nor numeric (e.g. bool
# or datetime) are matched by neither selector and are silently left out.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numerical_cols = X.select_dtypes(include=[np.number]).columns

# The `sparse=False` keyword was deprecated in scikit-learn 1.2 and removed in
# 1.4 (renamed to `sparse_output`). Keeping the default sparse output and
# densifying it explicitly works on both old and new versions.
encoder = OneHotEncoder(handle_unknown='ignore')
X_encoded = encoder.fit_transform(X[categorical_cols]).toarray()

# Standardizing numerical features
# NOTE(review): encoder and scaler are fitted on all rows, i.e. before the
# train/test split, so test rows influence the preprocessing. Fit them on the
# training rows only if this evaluation needs to be strictly leak-free.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[numerical_cols])

# Combining encoded categorical and scaled numerical features.
# `data` was filtered, so its index is not 0..n-1; re-attach X.index so the
# feature rows carry the same labels as `y` instead of a fresh RangeIndex.
X_encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(categorical_cols), index=X.index)
X_scaled_df = pd.DataFrame(X_scaled, columns=numerical_cols, index=X.index)
X_combined = pd.concat([X_scaled_df, X_encoded_df], axis=1)

# Splitting the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Training a RandomForestRegressor model
model = RandomForestRegressor(n_estimators=300, random_state=42)
model.fit(X_train, y_train)

# Predicting and evaluating the model on the held-out rows
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

# Outputting the model's performance as (MSE, RMSE)
mse, rmse



(13133.199401142369, 114.60017190712398)

The machine learning model performs very well on, for example, Brøndby, which has almost no outliers, and very poorly on, for example, Frederiksberg, which has many diverse transactions. This could indicate that clustering within each municipality would be a good way to handle and label the outliers instead of removing them, and thereafter to use their labels as inputs to a machine learning model. Another approach could be to train a specific model for each municipality.