In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Read the cleaned CSV into `df`, then report its shape and preview the first six rows.
cleaned_file = "otodomScrapRAWData_cleaned.csv"
df = pd.read_csv(filepath_or_buffer=cleaned_file)
print(df.shape)
df.head(n=6)

(17010, 12)


Unnamed: 0,price,latitude,longitude,surface_area,num_of_room,floor,num_floors_in_building,finish_condition,form_of_property,balcony_garden_terrace,parking_space,heating
0,1109400.0,52.196401,20.95204,73.06,4,1.0,7.0,for finishing,full ownership,Balcony,garage/parking space,
1,,52.112868,20.973348,47.87,2,0.0,3.0,for finishing,full ownership,"Terrace, Garden",,
2,1255000.0,52.23313,21.019,42.0,3,4.0,6.0,for living,full ownership,Balcony,,municipal
3,980000.0,52.27225,20.928295,60.0,2,0.0,7.0,for living,full ownership,Garden,garage/parking space,municipal
4,880000.0,52.238722,20.961506,49.9,2,7.0,7.0,for living,,,,municipal
5,,52.164608,20.799947,130.46,4,2.0,4.0,for finishing,full ownership,Balcony,garage/parking space,


In [3]:
from sklearn.preprocessing import MultiLabelBinarizer


def _split_amenities(value):
    """Turn one 'balcony_garden_terrace' cell into a list of amenity labels.

    Parameters
    ----------
    value : str or missing
        A comma-separated string such as ``"Terrace, Garden"``, or a missing
        value (NaN/None) when the listing declares none of these amenities.

    Returns
    -------
    list of str
        The individual labels with surrounding whitespace removed. Missing or
        blank cells yield an empty list, which the binarizer encodes as an
        all-zero row.
    """
    if not isinstance(value, str):
        # NaN is a float and has no .split(); treat it as "no amenities".
        return []
    return [label.strip() for label in value.split(',') if label.strip()]


# Step 1: Split the strings into lists.
# The result is kept in a separate Series rather than written back into df,
# so the source column is not left holding Python lists.
amenity_lists = df['balcony_garden_terrace'].apply(_split_amenities)

# Step 2: Apply MultiLabelBinarizer (one indicator column per distinct label)
mlb = MultiLabelBinarizer()
one_hot_encoded = mlb.fit_transform(amenity_lists)

# Create a DataFrame with the one-hot encoded columns, aligned on df's index
one_hot_encoded_df = pd.DataFrame(one_hot_encoded, columns=mlb.classes_, index=df.index)

# Step 3: Combine the one-hot encoded columns with the original DataFrame
df = pd.concat([df, one_hot_encoded_df], axis=1)

# Drop the original 'balcony_garden_terrace' column
df = df.drop(["balcony_garden_terrace"], axis=1)

# Display the resulting DataFrame
df.head(6)

Unnamed: 0,price,latitude,longitude,surface_area,num_of_room,floor,num_floors_in_building,finish_condition,form_of_property,parking_space,heating,Balcony,Elevator,Garage,Laundry,Terrace
0,,52.172792,20.994102,35.41,5,3.0,8.0,for finishing,full ownership,garage/parking space,,0,1,0,0,1
1,970000.0,52.182325,21.025885,48.55,3,7.0,15.0,for living,full ownership,garage/parking space,municipal,0,0,0,1,0
2,629000.0,52.286913,20.940933,36.5,1,0.0,10.0,for living,full ownership,,municipal,0,0,1,0,0
3,2480000.0,52.23805,21.02935,99.69,1,3.0,4.0,for living,cooperative ownership right to the premises,,municipal,0,0,1,0,0
4,560000.0,52.446508,20.692522,50.0,4,1.0,5.0,,,garage/parking space,,0,0,1,0,0
5,819000.0,52.201297,20.88994,60.0,2,1.0,7.0,,full ownership,garage/parking space,municipal,1,1,0,1,0


In [4]:
# Show the dtype of the 'Balcony' indicator column.
print(df.dtypes['Balcony'])


int32


In [5]:
# Binary flag: 1 where the cell equals 'garage/parking space', 0 for anything else
# (missing values compare unequal and therefore become 0).
df['parking_space'] = df['parking_space'].eq('garage/parking space').astype('int8')

In [6]:
# Replace 'form_of_property' with one indicator column per distinct value.
df = pd.concat(
    [df.drop(columns='form_of_property'), pd.get_dummies(df['form_of_property'])],
    axis='columns',
)

In [7]:
# Replace 'heating' with one indicator column per distinct value.
df = pd.concat(
    [df.drop(columns='heating'), pd.get_dummies(df['heating'])],
    axis='columns',
)

In [8]:
# Preview the first six rows after the categorical columns were encoded.
df.head(n=6)

Unnamed: 0,price,latitude,longitude,surface_area,num_of_room,floor,num_floors_in_building,finish_condition,parking_space,Balcony,...,Garage,Laundry,Terrace,cooperative ownership right to the premises,full ownership,boiler room,electric,gas,municipal,other
0,,52.172792,20.994102,35.41,5,3.0,8.0,for finishing,1,0,...,0,0,1,False,True,False,False,False,False,False
1,970000.0,52.182325,21.025885,48.55,3,7.0,15.0,for living,1,0,...,0,1,0,False,True,False,False,False,True,False
2,629000.0,52.286913,20.940933,36.5,1,0.0,10.0,for living,0,0,...,1,0,0,False,True,False,False,False,True,False
3,2480000.0,52.23805,21.02935,99.69,1,3.0,4.0,for living,0,0,...,1,0,0,True,False,False,False,False,True,False
4,560000.0,52.446508,20.692522,50.0,4,1.0,5.0,,1,0,...,1,0,0,False,False,False,False,False,False,False
5,819000.0,52.201297,20.88994,60.0,2,1.0,7.0,,1,1,...,0,1,0,False,True,False,False,False,True,False


In [9]:
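# Draft (disabled): numeric encoding of 'finish_condition'.
# The column is left as strings for now.
# df['finish_condition'] = df['finish_condition'].apply(lambda x:
#                                                       1 if x == 'for living'
#                                                       else 0.8 if x == 'for renovation'
#                                                       else 0.5 if pd.isnull(x)
#                                                       else 0 if x == 'for finishing'
#                                                       else 0.5)
# Values of 'finish_condition' handled by the draft above:
#   'for living'
#   'for finishing'
#   'for renovation'
#   NaN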

In [10]:
# Partition the rows by whether 'price' is present: rows lacking a price are set
# aside in df_without_price, and df keeps only the rows that have one.
price_known = df['price'].notna()
df_without_price = df[~price_known]
df = df[price_known]

# Feature matrix (every column except the target) and target vector.
X, y = df.drop(columns='price'), df['price']

In [11]:
# Spot-check five random rows. The fixed random_state (arbitrary value) makes the
# same rows appear on every Restart & Run All, so the displayed output is reproducible.
df.sample(5, random_state=42)

Unnamed: 0,price,latitude,longitude,surface_area,num_of_room,floor,num_floors_in_building,finish_condition,parking_space,Balcony,...,Garage,Laundry,Terrace,cooperative ownership right to the premises,full ownership,boiler room,electric,gas,municipal,other
41,899000.0,52.220145,20.89631,43.91,3,0.0,4.0,for living,1,0,...,0,0,1,False,True,False,False,False,True,False
101,1185000.0,52.225769,21.026033,45.17,3,0.0,3.0,for renovation,0,0,...,0,0,1,False,True,False,False,False,True,False
79,899000.0,52.220145,20.897082,43.91,5,0.0,4.0,for living,1,1,...,0,1,0,False,True,False,False,False,True,False
56,967500.0,52.161609,21.12469,45.0,1,0.0,2.0,for living,1,0,...,0,0,1,False,True,False,False,True,False,False
29,850000.0,52.245573,21.188513,66.05,1,1.0,3.0,for living,1,1,...,0,0,1,False,True,False,False,False,True,False


In [12]:
# List the column labels of df at this point in the notebook.
df.columns

Index(['price', 'latitude', 'longitude', 'surface_area', 'num_of_room',
       'floor', 'num_floors_in_building', 'finish_condition', 'parking_space',
       'Balcony', 'Elevator', 'Garage', 'Laundry', 'Terrace',
       'cooperative ownership right to the premises', 'full ownership',
       'boiler room', 'electric', 'gas', 'municipal', 'other'],
      dtype='object')

In [15]:
# Prices of the listings in each finish-condition group compared below.
price_living = df.loc[df['finish_condition'].eq('for living'), 'price']
price_finishing = df.loc[df['finish_condition'].eq('for finishing'), 'price']

# Sanity check: group sizes first, then dtypes (one value per line).
print(price_living.shape, price_finishing.shape, sep='\n')
print(price_living.dtype, price_finishing.dtype, sep='\n')

(82,)
(14,)
float64
float64


In [14]:
import scipy as sp  # kept so that any other cell using the `sp` alias still works
from scipy import stats  # import the submodule explicitly instead of relying on `sp.stats` being loaded

SIGNIFICANCE_LEVEL = 0.05

# Independent two-sample t-test on the prices of the two finish-condition groups.
# With its default arguments ttest_ind is TWO-SIDED: the null hypothesis is that
# both groups have the same mean price, and a small p-value only says the means
# differ, not which one is larger. A directional claim would need the
# `alternative=` argument instead.
# NOTE(review): the default also assumes equal variances in both groups; consider
# equal_var=False (Welch's test) if that assumption is doubtful.
t_stat, p_value = stats.ttest_ind(price_living, price_finishing)
print(t_stat, p_value)

rejecting = p_value < SIGNIFICANCE_LEVEL
# The message states the hypothesis that was actually tested (equal means).
print("NULL Hypth. H0: A will-be-ready-to-live house and a ready-to-live house have the same mean price.")
print(("" if rejecting else "Not ") + "possible to reject the null hypothesis")

1.663090142066532 0.09962526245987119
NULL Hypth. H0: A will-be-ready-to-live house is cheaper than a ready-to-live house.
Not possible to reject the null hypothesis
