In [None]:
import pandas as pd

# Load the CSV, then discard every row that has no cleaned total price; that
# column is used as the regression target further down.
listings_raw = pd.read_csv('adresowo_warszawa_wroclaw.csv')
df = listings_raw.dropna(subset=['price_total_zl_cleaned'])

In [13]:
# Separate the model inputs (X) from the value being predicted (y).
feature_columns = ['area_m2', 'rooms', 'photos', 'locality', 'owner_direct', 'date_posted']
target_column = 'price_total_zl_cleaned'

X = df[feature_columns]
y = df[target_column]

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [24]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import IterativeImputer


# Column groups, each routed to its own preprocessing branch.
numerical_features = ['area_m2', 'rooms', 'photos']
categorical_features = ['locality', 'owner_direct', 'date_posted']

# Numeric branch: fill in missing values with IterativeImputer, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer()),
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encode. handle_unknown='ignore' means a category
# that was not seen during fit does not raise at predict time.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# The step names below are load-bearing: the grid-search parameter grid
# addresses the forest through the 'regressor__<param>' prefix.
forest = RandomForestRegressor(
    max_depth=10,
    min_samples_split=5,
    random_state=42,
)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', forest),
])

In [25]:
# Fit the preprocessing steps and the random forest on the training split only;
# the test split stays untouched until evaluation.
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,estimator,
,missing_values,
,sample_posterior,False
,max_iter,10
,tol,0.001
,n_nearest_features,
,initial_strategy,'mean'
,fill_value,
,imputation_order,'ascending'
,skip_complete,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,10
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [26]:

from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score

# Predict prices for the held-out rows with the baseline (un-tuned) pipeline.
y_pred = pipeline.predict(X_test)

# Compute all three regression metrics against the true test prices.
mae, rmse, r2 = (
    mean_absolute_error(y_test, y_pred),
    root_mean_squared_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)

print(f'Mean Absolute Error: {mae}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R^2 Score: {r2}')

Mean Absolute Error: 147294.1295517709
Root Mean Squared Error: 225847.5158674029
R^2 Score: 0.7103800400170714


In [27]:
from sklearn.model_selection import GridSearchCV


# Hyper-parameter candidates for the random forest. The 'regressor__' prefix
# routes each value to the pipeline step named 'regressor'.
param_grid = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [10, 20, None],
    'regressor__min_samples_split': [2, 5, 10]
}

# 2 * 3 * 3 = 18 candidates x 5 folds = 90 pipeline fits, plus the final refit.
# The fits are independent of each other, so n_jobs=-1 spreads them over all
# available CPU cores instead of running them one after another. The forest's
# fixed random_state keeps the scores and the selected model the same as in a
# serial run.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

In [28]:
# Run the cross-validated search on the training split only. With refit left at
# its default (True), the best configuration is refit on all of X_train.
grid_search.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'regressor__max_depth': [10, 20, ...], 'regressor__min_samples_split': [2, 5, ...], 'regressor__n_estimators': [100, 200]}"
,scoring,'neg_mean_squared_error'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,estimator,
,missing_values,
,sample_posterior,False
,max_iter,10
,tol,0.001
,n_nearest_features,
,initial_strategy,'mean'
,fill_value,
,imputation_order,'ascending'
,skip_complete,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [29]:
# Take the pipeline that the grid search refit with the winning
# hyper-parameters and use it to predict prices for the held-out rows.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

In [30]:
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score

# Same three metrics as for the baseline, now for the tuned model's predictions
# (mae / rmse / r2 are overwritten with the new values).
mae, rmse, r2 = (
    mean_absolute_error(y_test, y_pred_best),
    root_mean_squared_error(y_test, y_pred_best),
    r2_score(y_test, y_pred_best),
)

print(f'Mean Absolute Error: {mae}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R^2 Score: {r2}')

Mean Absolute Error: 142014.06426231176
Root Mean Squared Error: 216178.12270821485
R^2 Score: 0.7346486293329667


In [31]:
import joblib

# Persist the whole fitted pipeline (preprocessing + forest), so it can later
# be reloaded and fed raw feature columns without refitting anything.
joblib.dump(best_model, 'best_random_forest_model.pkl')

['best_random_forest_model.pkl']

In [34]:
# Show the first held-out row as an example of the raw feature values the
# model takes as input.
X_test.iloc[0]

area_m2                         61
rooms                            3
photos                        17.0
locality        Warszawa Białołęka
owner_direct                  True
date_posted     ponad miesiąc temu
Name: 315, dtype: object

In [None]:
# Reload the pipeline saved earlier in this notebook. joblib files are pickles:
# only load ones you created yourself or otherwise trust.
model_loaded = joblib.load('best_random_forest_model.pkl')

# A single example listing, keyed by column name. The pipeline's
# ColumnTransformer picks columns by name, so the column order here does not
# have to match the order used for training.
X_new = pd.DataFrame({
    'area_m2': [61],
    'locality': ['Warszawa Białołęka'],
    'rooms': [3],
    'owner_direct': [True],
    'photos': [5],
    'date_posted': ['6 dni temu'],
})

predicted_price = model_loaded.predict(X_new)
print(f"Predykowana cena: {predicted_price[0]:,.2f} zł")

Predykowana cena: 829,191.99 zł
