# Contents

[1. The Data](#1)

[2. Imports](#2)

[3. Feature Engineering and Data Cleaning](#3)

&emsp; [3.1. Feature Engineering](#3.1) <br>
&emsp; [3.2. Outliers removal](#3.2)


[4. Exploratory Data Analysis](#4)

[5. Apartments Location Heatmap](#5)

[6. Regression Model](#6)

[7. Model Evaluation](#7)

<span id = "2"></span>
# Imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
import xgboost
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

import folium
from folium import plugins
from folium.plugins import HeatMap, FastMarkerCluster

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Notebook-wide seaborn look: white grid background, 'paper' context with
# fonts scaled by 1.5.
sns.set_style('whitegrid')
sns.set_context('paper', font_scale=1.5)

In [None]:
# Load the listings; the first CSV column becomes the row index.
df2 = pd.read_csv(
    '../input/house-prices-in-poland/Houses.csv',
    index_col=0,
    encoding='ISO-8859-2',
)

In [None]:
# Column names, dtypes and non-null counts of the raw data.
df2.info()

In [None]:
# Summary statistics, one row per numeric column.
df2.describe().T

<span id = "3"></span>
# Feature Engineering and Data Cleaning

<span id = "3.1"></span>
#### Feature Engineering

In [None]:
# New column: what is the price per square meter?
# assign() returns a new frame, so df2 itself is left untouched.
df3 = df2.assign(price_per_sqmeter=df2['price'] / df2['sq'])
df3.head()

In [None]:
# Work on an independent copy: a plain `df4 = df3` only creates a second name
# for the same object, so the new column would silently be added to df3 too.
df4 = df3.copy()

# New column: how many square meters per room?
# NOTE(review): a listing with rooms == 0 would give inf here — confirm the
# data has none, or that the later sqm_per_room filter is meant to remove them.
df4['sqm_per_room'] = df4['sq'] / df4['rooms']
df4.head()

<span id = "3.2"></span>
#### Outliers Removal 

In [None]:
## sq
# Inspect both extremes of the area column.
df4.sort_values(by='sq')

In [None]:
# Index labels of rows with an implausible 'sq' value (above 450 or below 10)
sq_is_outlier = (df4['sq'] > 450) | (df4['sq'] < 10)
out_sq = df4.index[sq_is_outlier].tolist()

# Drop those rows by label
df4 = df4.drop(index=out_sq)

In [None]:
## year
# Inspect both extremes of the construction year column.
df4.sort_values(by='year')

In [None]:
# Index labels of rows with an implausible 'year' value (after 2024 or before 1200)
year_is_outlier = (df4['year'] > 2024) | (df4['year'] < 1200)
out_year = df4.index[year_is_outlier].tolist()

# Drop those rows by label
df4 = df4.drop(index=out_year)

In [None]:
## sqm_per_room
# Smallest values first (ascending is the sort_values default).
df4.sort_values(by='sqm_per_room')

In [None]:
# Mean of sqm_per_room over the rows remaining in df4
df4['sqm_per_room'].mean()

In [None]:
# How many rows have more than 51 square meters per room?
int((df4['sqm_per_room'] > 51).sum())

In [None]:
# How many rows have fewer than 10 square meters per room?
int((df4['sqm_per_room'] < 10).sum())

In [None]:
# Index labels of rows with more than 51 or fewer than 10 square meters per room
room_size_is_outlier = (df4['sqm_per_room'] > 51) | (df4['sqm_per_room'] < 10)
out_sqm_per_room = df4.index[room_size_is_outlier].tolist()

# Drop those rows by label; the result starts a new frame, df5
df5 = df4.drop(index=out_sqm_per_room)

In [None]:
## price_per_sqmeter
# Inspect both extremes of the price per square meter.
df5.sort_values(by='price_per_sqmeter')

In [None]:
# Show listings priced above 58,000 or below 2,000 per square meter.
price_too_high = df5['price_per_sqmeter'] > 58000.0
price_too_low = df5['price_per_sqmeter'] < 2000.0
df5.loc[price_too_high | price_too_low]

In [None]:
# Index labels of rows where price_per_sqmeter > 58k or price_per_sqmeter < 2k
ppsm_is_outlier = (df5['price_per_sqmeter'] > 58000.0) | (df5['price_per_sqmeter'] < 2000.0)
out_price_per_sqmeter = df5.index[ppsm_is_outlier].tolist()
out_price_per_sqmeter

In [None]:
# Drop the rows collected in out_price_per_sqmeter, by index label
df5 = df5.drop(index=out_price_per_sqmeter)

In [None]:
# Summary statistics after outlier removal, one row per numeric column.
df5.describe().T

<span id = "4"></span>
# Exploratory Data Analysis

In [None]:
# Number of sale listings per city, in a fixed city order
city_order = ['Warszawa', 'Kraków', 'Poznań']

plt.figure(figsize=(12, 6))
ax = sns.countplot(data=df5, x='city', order=city_order)
ax.set_title("Count of sales announcement in a given city")

<p>In total, in the three cities we have a little less than 24,000 housing ads.
Warsaw and Krakow are on a similar level with slightly less than 10,000 ads per city. Poznań has around 4,000 ads.</p>

In [None]:
# Distribution of the price per square meter over all listings
plt.figure(figsize=(12, 6))
g = sns.histplot(
    data=df5,
    x='price_per_sqmeter',
    bins=25,
    kde=True,
    color='tomato',
)
g.set_xlim(0, None)  # start the x axis at zero, keep the upper limit automatic
g.set_title('Histogram of price per square meter')

<p>The average price per square meter across the three cities is PLN 10,735.</p>

In [None]:
# Distribution of apartment prices, restricted to listings below 2,000,000
below_2m = df5.loc[df5['price'] < 2000000]

plt.figure(figsize=(12, 6))
plt.ticklabel_format(axis='x', style='plain')  # no scientific notation on the price axis
g = sns.histplot(
    data=below_2m,
    x='price',
    bins=50,
    kde=True,
    color='tomato',
)
g.set_xlim(0, None)
g.set_title('Histogram of an apartments prices')

<p>The average price of an apartment across the three cities is PLN 641,818.</p>

<p>How do these prices look for individual cities?</p>

In [None]:
# Distribution of the price per square meter, Poznań listings only
poznan = df5.loc[df5['city'] == "Poznań"]

plt.figure(figsize=(12, 6))
g = sns.histplot(
    data=poznan,
    x='price_per_sqmeter',
    bins=25,
    kde=True,
    color='green',
)
g.set_xlim(0, None)
g.set_title('Histogram of price per square meter in Poznań')

<p>In Poznań, the average price per square meter is PLN 8,500.</p>
<p>However, most of the listings are priced at around PLN 7,500 per square meter.</p>

In [None]:
# Distribution of apartment prices, Poznań listings only
poznan = df5.loc[df5['city'] == "Poznań"]

plt.figure(figsize=(12, 6))
g = sns.histplot(
    data=poznan,
    x='price',
    bins=25,
    kde=True,
    color='green',
)
g.set_xlim(0, None)
g.set_title('Histogram of an apartments prices in Poznań')
plt.ticklabel_format(axis='x', style='plain')  # no scientific notation on the price axis

<p>In Poznań, an apartment costs PLN 467,000 on average.</p>

<p>What does it look like in Warsaw?</p>

In [None]:
# Distribution of the price per square meter, Warszawa listings only
warszawa = df5.loc[df5['city'] == "Warszawa"]

plt.figure(figsize=(12, 6))
g = sns.histplot(
    data=warszawa,
    x='price_per_sqmeter',
    bins=25,
    kde=True,
    color='blue',
)
g.set_xlim(0, None)
g.set_title('Histogram of price per square meter in Warszawa')

<p>In Warsaw, the average price per square meter is PLN 12,000.</p>

In [None]:
# Distribution of apartment prices, Warszawa listings only
warszawa = df5.loc[df5['city'] == "Warszawa"]

plt.figure(figsize=(12, 6))
g = sns.histplot(
    data=warszawa,
    x='price',
    bins=40,
    kde=True,
    color='blue',
)
g.set_xlim(0, None)
g.set_title('Histogram of an apartments prices in Warszawa')
plt.ticklabel_format(axis='x', style='plain')  # no scientific notation on the price axis

In [None]:
# Distribution of apartment prices in Warszawa, listings below 2,000,000 only
in_warszawa = df5['city'] == "Warszawa"
below_2m = df5['price'] < 2000000

plt.figure(figsize=(12, 6))
g = sns.histplot(
    data=df5.loc[in_warszawa & below_2m],
    x='price',
    bins=20,
    kde=True,
    color='blue',
)
g.set_xlim(0, None)
g.set_title('Histogram of an apartments prices in Warszawa')
plt.ticklabel_format(axis='x', style='plain')  # no scientific notation on the price axis

<p>In Warsaw, an apartment costs PLN 766,000 on average.</p>
<p>However, most of the listings are priced at around PLN 450,000 - 550,000 per apartment.</p>

<p>What does it look like in Krakow?</p>

In [None]:
# Distribution of the price per square meter, Kraków listings only
krakow = df5.loc[df5['city'] == "Kraków"]

plt.figure(figsize=(12, 6))
g = sns.histplot(
    data=krakow,
    x='price_per_sqmeter',
    bins=25,
    kde=True,
    color='orange',
)
g.set_xlim(0, None)
g.set_title('Count of price per square meter in Kraków')

<p>In Krakow, the average price per square meter is PLN 10,000.
However, most of the listings are priced at around PLN 8,000 per square meter.</p>

In [None]:
# Distribution of apartment prices, Kraków listings only
krakow = df5.loc[df5['city'] == "Kraków"]

plt.figure(figsize=(12, 6))
g = sns.histplot(
    data=krakow,
    x='price',
    bins=40,
    kde=True,
    color='orange',
)
g.set_xlim(0, None)
g.set_title('Histogram of an apartments prices in Kraków')
plt.ticklabel_format(axis='x', style='plain')  # no scientific notation on the price axis

In [None]:
# Distribution of apartment prices in Kraków, listings below 2,000,000 only
in_krakow = df5['city'] == "Kraków"
below_2m = df5['price'] < 2000000

plt.figure(figsize=(12, 6))
g = sns.histplot(
    data=df5.loc[in_krakow & below_2m],
    x='price',
    bins=20,
    kde=True,
    color='orange',
)
g.set_xlim(0, None)
g.set_title('Histogram of an apartments prices in Kraków')
plt.ticklabel_format(axis='x', style='plain')  # no scientific notation on the price axis

<p>In Krakow, an apartment costs PLN 590,000 on average.</p>

In [None]:
# Apartment size [sqm] against price, all cities, colored by city
plt.figure(figsize=(12, 6))
plt.ticklabel_format(axis='y', style='plain')  # no scientific notation on the price axis
g = sns.scatterplot(
    data=df5,
    x='sq',
    y='price',
    hue='city',
    hue_order=['Warszawa', 'Kraków', 'Poznań'],
)
g.set_title('The size of the apartment [sqm] vs price')

In [None]:
# Apartment size [sqm] against price, Poznań listings only
poznan = df5.loc[df5['city'] == "Poznań"]

plt.figure(figsize=(12, 6))
plt.ticklabel_format(axis='y', style='plain')  # no scientific notation on the price axis
g = sns.scatterplot(data=poznan, x='sq', y='price', color='green')
g.set_title('The size of the apartment [sqm] vs price Poznań')

In [None]:
# Apartment size [sqm] against price, Warszawa listings only
warszawa = df5.loc[df5['city'] == "Warszawa"]

plt.figure(figsize=(12, 6))
plt.ticklabel_format(axis='y', style='plain')  # no scientific notation on the price axis
g = sns.scatterplot(data=warszawa, x='sq', y='price', color='blue')
g.set_title('The size of the apartment [sqm] vs price Warszawa')

In [None]:
# Apartment size [sqm] against price, Kraków listings only
krakow = df5.loc[df5['city'] == "Kraków"]

plt.figure(figsize=(12, 6))
plt.ticklabel_format(axis='y', style='plain')  # no scientific notation on the price axis
g = sns.scatterplot(data=krakow, x='sq', y='price', color='orange')
g.set_title('The size of the apartment [sqm] vs price Kraków')

<p>The scatter plots above show a clear positive relationship: as the size of the apartment increases, so does its price.</p>

<p>We can still notice outliers in each of the cities. They are often more exclusive flats, where, despite similar parameters, the flat can be in a very convenient location with very good modern equipment.</p>

In [None]:
# Correlation heatmap
plt.figure(figsize=(14,12))
# NOTE: set_context is global, so the larger font scale also applies to every
# plot drawn after this cell.
sns.set_context('paper', font_scale=2)

# Correlate numeric columns only: df5 still holds string columns ('address',
# 'city'), and DataFrame.corr() raises on those in pandas >= 2.0 instead of
# silently skipping them.
corr = df5.select_dtypes(include='number').corr()
# Mask the upper triangle (diagonal included) so each pair is shown once
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, annot=True, cmap='Blues', mask=mask)

<p>Take a look at the correlation heatmap. The strongest positive correlation is between 
the number of square meters and the number of rooms in an apartment, 
which makes sense: the more rooms, the bigger the apartment.</p>

<p>The second variable that strongly correlates with the number of square meters is the price, 
which also makes sense, because the larger the apartment, the greater its price. 
This has also been shown to us by scatter plots.</p>

In [None]:
# Number of sale listings per room count, split by city
plt.figure(figsize=(12, 6))
ax = sns.countplot(
    data=df5,
    x='rooms',
    hue='city',
    hue_order=['Warszawa', 'Kraków', 'Poznań'],
)
ax.set_title('Count of sales announcements by number of rooms in the apartment')

<p>In the chart above, we can see that in each of the three cities, ads for two and three-room apartments have an advantage.</p>

In [None]:
# Price against number of rooms, one jittered strip per city
plt.figure(figsize=(12, 8))
plt.ticklabel_format(axis='y', style='plain')  # no scientific notation on the price axis
ax = sns.stripplot(
    data=df5,
    x='rooms',
    y='price',
    hue='city',
    hue_order=['Warszawa', 'Kraków', 'Poznań'],
    jitter=True,
    dodge=True,
)
ax.set_title('Price vs Rooms')

<p>The chart above shows the price of flats in relation to the number of rooms.</p>
<p>We know that most of the ads are for two- or three-room apartments.
Their price, like that of apartments with other room counts, is about PLN 500,000.</p>

In [None]:
# Box plots Price vs year of building
plt.figure(figsize=(24,12))
# Plain (non-scientific) tick labels on the price axis
plt.ticklabel_format(style='plain', axis='y')
# Small, vertical x tick labels (fontsize 8, rotated 90 degrees)
plt.xticks(fontsize=8, rotation=90)
# NOTE: g ends up holding the Text object returned by set_title(), not the Axes
g = sns.boxplot(x = 'year', data = df5, y = 'price', 
                palette = 'rocket_r').set_title('Price vs year of building')

<p>The graph shows the price against the construction year of the building / apartment.
The oldest apartment dates from 1390 and is probably in an old tenement house. The newest one is dated 2024, so it is probably still planned or under construction but can already be bought. The average construction year across all advertisements is 2001.</p>
<p>We can also notice that newer flats, in the 1995-2024 range, have many outliers. This is probably related to the previously mentioned exclusive apartments.</p>

In [None]:
# Summary statistics for listings with a construction year of 2021 or later
built_2021_or_later = df5['year'] >= 2021
df5.loc[built_2021_or_later].describe().T

<p>Finally, let's look at the average values for apartments from 2021 or newer only.</p>
<p>The average price of such a flat is PLN 546,000. It is a three-room apartment on the 3rd floor with an area of approximately 56 square meters.</p>

<span id = "5"></span>
# Apartments Location Heatmap

In [None]:
# Heatmap Poznań

# Coordinates of the Poznań listings, selected in one pass. Rows with a missing
# coordinate are dropped because folium's HeatMap rejects NaN values.
pzn_coords = df5.loc[df5['city'] == "Poznań", ['latitude', 'longitude']].dropna()
# list of all latitude
pzn_lat = pzn_coords['latitude'].tolist()
# list of all longitude
pzn_lng = pzn_coords['longitude'].tolist()
# matrix of all [[latitude,longitude], ...[]]
pzn_data = pzn_coords.to_numpy(dtype=float)

# "Stamen Terrain" is no longer a built-in tile set in recent folium releases
# and makes folium.Map raise; OpenStreetMap is always available.
m = folium.Map(location=[52.4127903, 16.9222533], zoom_start=12, tiles="OpenStreetMap")

# Put the heat layer in its own feature group so LayerControl can toggle it
heat_layer = folium.FeatureGroup(name='Heat Map').add_to(m)
HeatMap(pzn_data).add_to(heat_layer)
folium.LayerControl().add_to(m)

m

In [None]:
# Heatmap Warszawa

# Coordinates of the Warszawa listings, selected in one pass. Rows with a
# missing coordinate are dropped because folium's HeatMap rejects NaN values.
waw_coords = df5.loc[df5['city'] == "Warszawa", ['latitude', 'longitude']].dropna()
# list of all latitude
waw_lat = waw_coords['latitude'].tolist()
# list of all longitude
waw_lng = waw_coords['longitude'].tolist()
# matrix of all [[latitude,longitude], ...[]]
waw_data = waw_coords.to_numpy(dtype=float)

# "Stamen Terrain" is no longer a built-in tile set in recent folium releases
# and makes folium.Map raise; OpenStreetMap is always available.
m = folium.Map(location=[52.235, 21.035], zoom_start=11, tiles="OpenStreetMap")

# Put the heat layer in its own feature group so LayerControl can toggle it
heat_layer = folium.FeatureGroup(name='Heat Map').add_to(m)
HeatMap(waw_data).add_to(heat_layer)
folium.LayerControl().add_to(m)

m

In [None]:
# Heatmap Kraków

# Coordinates of the Kraków listings, selected in one pass. Rows with a missing
# coordinate are dropped because folium's HeatMap rejects NaN values.
krk_coords = df5.loc[df5['city'] == "Kraków", ['latitude', 'longitude']].dropna()
# list of all latitude
krk_lat = krk_coords['latitude'].tolist()
# list of all longitude
krk_lng = krk_coords['longitude'].tolist()
# matrix of all [[latitude,longitude], ...[]]
krk_data = krk_coords.to_numpy(dtype=float)

# "Stamen Terrain" is no longer a built-in tile set in recent folium releases
# and makes folium.Map raise; OpenStreetMap is always available.
m = folium.Map(location=[50.063367, 19.935186], zoom_start=12, tiles="OpenStreetMap")

# Put the heat layer in its own feature group so LayerControl can toggle it
heat_layer = folium.FeatureGroup(name='Heat Map').add_to(m)
HeatMap(krk_data).add_to(heat_layer)
folium.LayerControl().add_to(m)

m

<span id = "6"></span>
# Regression Model

In [None]:
# Columns, dtypes and non-null counts of the cleaned frame before modelling.
df5.info()

In [None]:
# One-hot encode 'city': the column is replaced by one indicator column per city
df6 = pd.get_dummies(data=df5, columns=['city'])
df6.info()

In [None]:
# Target is the listing price; features are every df6 column except the
# identifiers, coordinates, the target itself and the columns derived from it.
excluded_columns = [
    'address', 'id', 'latitude', 'longitude',
    'price', 'price_per_sqmeter', 'sqm_per_room',
]
y = df6['price']
X = df6.drop(columns=excluded_columns)

In [None]:
# Check which feature columns remain and their dtypes.
X.info()

In [None]:
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# xgboost
# Default-parameter regressor; it is the base estimator passed to
# RandomizedSearchCV below and is later rebound to a tuned XGBRegressor.
regressor=xgboost.XGBRegressor()

In [None]:
## Hyper Parameter Optimization

# Candidate values for each XGBoost hyperparameter
n_estimators = [100, 500, 900, 1100, 1500]
max_depth = [2, 3, 5, 10, 15]
booster = ['gbtree', 'gblinear']
learning_rate = [0.05, 0.1, 0.15, 0.20]
min_child_weight = [1, 2, 3, 4]
base_score = [0.25, 0.5, 0.75, 1]

# Search space: parameter name -> list of candidates to sample from
hyperparameter_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    min_child_weight=min_child_weight,
    booster=booster,
    base_score=base_score,
)

In [None]:
# Set up the random search with 5-fold cross validation:
# 50 sampled parameter combinations, scored by negative mean absolute error

random_cv = RandomizedSearchCV(
    estimator=regressor,
    param_distributions=hyperparameter_grid,
    n_iter=50,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=4,
    verbose=5,
    return_train_score=True,
    random_state=42,
)

In [None]:
# fit the train data
# NOTE(review): the search is left disabled; with n_iter=50 and cv=5 it would
# run 250 model fits. Uncomment the line below to re-run it.
# random_cv.fit(X_train, y_train)

In [None]:
# checking best estimator
# NOTE(review): only available after random_cv.fit(...) has been run, which is
# currently commented out.
# random_cv.best_estimator_

In [None]:
# Define the regressor with the best estimator's parameters.
# NOTE(review): these values are hard-coded, presumably copied from an earlier
# run of the (now commented-out) random search — confirm before changing data.
best_params = {
    'base_score': 1,
    'booster': 'gbtree',
    'colsample_bylevel': 1,
    'colsample_bynode': 1,
    'colsample_bytree': 1,
    'gamma': 0,
    'gpu_id': -1,
    'importance_type': 'gain',
    'interaction_constraints': '',
    'learning_rate': 0.1,
    'max_delta_step': 0,
    'max_depth': 15,
    'min_child_weight': 1,
    'missing': None,
    'monotone_constraints': '()',
    'n_estimators': 900,
    'n_jobs': 4,
    'num_parallel_tree': 1,
    'random_state': 0,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'scale_pos_weight': 1,
    'subsample': 1,
    'tree_method': 'exact',
    'validate_parameters': 1,
    'verbosity': None,
}
regressor = xgboost.XGBRegressor(**best_params)

In [None]:
# Fit the tuned regressor on the training split
regressor.fit(X_train, y_train)

<span id = "7"></span>
# Model Evaluation

In [None]:
# Predict the price for the held-out test rows
y_pred =regressor.predict(X_test)

In [None]:
# Print each evaluation metric under its own heading, separated by blank lines
print("\tModel Evaluation")

metric_report = [
    ('mean_absolute_error:', mean_absolute_error(y_test, y_pred)),
    ('sqrt mean_squared_error:', np.sqrt(mean_squared_error(y_test, y_pred))),
    ('r2_score:', r2_score(y_test, y_pred)),
    ('explained_variance_score:', explained_variance_score(y_test, y_pred)),
]
for heading, value in metric_report:
    print('\n')
    print(heading)
    print(value)

In [None]:
# True against predicted prices on the test split
plt.figure(figsize=(12, 8))

# One point per test listing
plt.scatter(y_test, y_pred)

# Reference line y = x: where perfect predictions would fall
line = plt.plot(y_test, y_test, 'r')

# Title, axis labels and legend
plt.title('True Price vs Predicted Price')
plt.xlabel('True Price')
plt.ylabel('Predicted Price')
plt.legend(line, ['Perfect Predictions']);

In [None]:
# Mean price of an apartment over the cleaned data (df5)
df5['price'].mean()

In [None]:
# Test-set MAE expressed as a percentage of the mean apartment price in df5
mae = mean_absolute_error(y_test, y_pred)
mean_price = df5['price'].mean()
mae * 100 / mean_price

# we are off by around 13.7 %