In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np

# Load the dataset
data = pd.read_csv('cleanedNewTest.csv')

data['years_from_today_weighted'] = data['years_from_today'] ** 2
data = data[data['years_from_today'] < 4]

data.shape

(1962, 22)

In [39]:
# Selecting the target variable (let's choose 'sqm_price' for this example)
target = 'sqm_price'

# Preparing the data for the model
X = data.drop([target, 'lastPrice', 'props_pageProps_address_events_0_at', 'props_pageProps_address_events_0_label',
               'props_pageProps_dataLayer_virtualPagePath', 'props_pageProps_dataLayer_detailMetaData', 'years_from_today',
               'props_pageProps_address_coordinates_lat', 'props_pageProps_address_coordinates_lon', 'years_from_today_weighted'], axis=1)

y = data[target]

# Encoding categorical variables
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numerical_cols = X.select_dtypes(include=[np.number]).columns

encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
X_encoded = encoder.fit_transform(X[categorical_cols])

# Standardizing numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[numerical_cols])

# Combining encoded categorical and scaled numerical features
X_encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(categorical_cols))
X_scaled_df = pd.DataFrame(X_scaled, columns=numerical_cols)
X_combined = pd.concat([X_scaled_df, X_encoded_df], axis=1)

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Training a RandomForestRegressor model
model = RandomForestRegressor(n_estimators=300, random_state=42)
model.fit(X_train, y_train)

# Predicting and evaluating the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

# Outputting the model's performance
mse, rmse



(6325181495.177461, 79531.00964515326)

In [51]:
y.sort_values().to_frame()

Unnamed: 0,sqm_price
4048,1.729730e+02
3111,1.729730e+02
7888,2.526316e+02
10565,3.506721e+02
11272,3.506721e+02
...,...
4387,2.706383e+06
4393,2.706383e+06
4390,2.706383e+06
4396,2.706383e+06
