## Importing libraries

In [32]:
from scipy.stats import zscore
import numpy as np
import pandas as pd
import os

In [20]:
# Read the raw CSV into the working DataFrame used by every later cell.
raw_csv_path = '../Data/raw_data1.csv'
data = pd.read_csv(raw_csv_path)

## Feature Engineering

In [21]:
# Flag listings whose floor equals `total_from` (top floor) or equals 1
# (first floor). Both columns stay boolean.
data['is_top_floor'] = data['current_floor'].eq(data['total_from'])
data['is_first_floor'] = data['current_floor'].eq(1)

# Marking old buildings: 1 when building_type is exactly 'old', 0 for anything
# else (including missing values).
data['is_old_building'] = data['building_type'].eq('old').astype('int64')

# Converting 'Yes'/'No' to binary (1/0) for bill_of_sale column.
# This column is overwritten in place, so an already-encoded 1 is accepted as
# well as 'Yes'. Without that, running this cell a second time would compare
# the integers against 'Yes' and silently turn every row into 0.
data['bill_of_sale'] = data['bill_of_sale'].isin(['Yes', 1]).astype('int64')

# Encoding repair status as binary (1/0): 1 when repair_status is exactly 'Yes'.
data['repair_status_encoded'] = data['repair_status'].eq('Yes').astype('int64')

In [22]:
# Ratio features between room_size and area, in both directions.
data['room_area_ratio'] = data['room_size'].div(data['area'])
data['area_room_ratio'] = data['area'].div(data['room_size'])

# Floor density: current floor divided by (total_from + 1).
# Area-floor density: area scaled by that floor density.
data['floor_density'] = data['current_floor'].div(data['total_from'].add(1))
data["area_floor_density"] = data["area"].mul(data["floor_density"])

# Interaction term: non-zero only when a listing is both on the top floor
# and in an old building.
data["top_old_building"] = data["is_top_floor"].mul(data["is_old_building"])

In [23]:
# One-hot encode the categorical columns; drop_first=True drops the first
# level of each so the remaining indicator columns are not fully redundant.
categorical_cols = ['Location1', 'Location2', 'building_type', 'repair_status']
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

# log1p transforms to handle skewness (new column -> source column).
log_sources = {
    "log_total_floors": "total_from",
    "log_area_floor_density": "area_floor_density",
}
for log_col, source_col in log_sources.items():
    data[log_col] = np.log1p(data[source_col])

# Squared floor density to capture non-linear relationships.
data["floor_density_squared"] = data["floor_density"].pow(2)

## Handling Outliers

In [24]:
# Rows whose price z-score exceeds this magnitude are treated as outliers.
PRICE_Z_THRESHOLD = 3

# Compute Z-scores for the 'price' column.
# nan_policy='omit' stops a single missing price from turning every z-score
# into NaN, which would make the filter below silently drop all rows. Rows
# with a missing price still get a NaN z-score and fail the comparison, so
# they are removed.
data['z_score'] = zscore(data['price (AZN)'], nan_policy='omit')

# Keep only rows within the cutoff. The helper column stays on `data`.
within_cutoff = data['z_score'].abs() <= PRICE_Z_THRESHOLD

# Drop columns that are no longer needed. drop() returns a new frame, so
# `data` itself is not modified here.
data_no_outliers = data.loc[within_cutoff].drop(columns=['Item_id', 'z_score'])

## Final Dataset Overview

In [26]:
# Preview the first rows of the outlier-filtered frame (head() defaults to 5 rows).
data_no_outliers.head()

Unnamed: 0,room_size,area,current_floor,total_from,bill_of_sale,price (AZN),is_top_floor,is_first_floor,is_old_building,repair_status_encoded,...,Location2_Şıxov q.,Location2_Əhmədli m.,Location2_Əhmədli q.,Location2_Ələt q.,Location2_Əmircan q.,building_type_old,repair_status_Yes,log_total_floors,log_area_floor_density,floor_density_squared
0,2,46.0,2,5,0,52500,False,False,0,1,...,False,False,False,False,False,False,True,1.791759,2.793208,0.111111
1,3,130.0,7,11,1,158500,False,False,0,1,...,False,False,False,False,False,False,True,2.484907,4.341639,0.340278
2,2,44.0,1,6,1,58500,False,True,0,1,...,False,False,False,False,False,False,True,1.94591,1.985915,0.020408
3,3,92.0,5,8,1,123000,False,False,0,1,...,False,False,False,False,False,False,True,2.197225,3.953378,0.308642
4,3,75.0,5,8,1,105000,False,False,0,1,...,False,False,False,False,False,False,True,2.197225,3.753418,0.308642


In [31]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the outlier-filtered frame; boolean columns are not included.
data_no_outliers.describe()

Unnamed: 0,room_size,area,current_floor,total_from,bill_of_sale,price (AZN),is_old_building,repair_status_encoded,room_area_ratio,area_room_ratio,floor_density,area_floor_density,top_old_building,log_total_floors,log_area_floor_density,floor_density_squared
count,47705.0,47705.0,47705.0,47705.0,47705.0,47705.0,47705.0,47705.0,47705.0,47705.0,47705.0,47705.0,47705.0,47705.0,47705.0,47705.0
mean,2.75447,99.921884,7.828865,13.496132,0.796332,249646.939356,0.250728,0.84685,0.030156,36.024678,0.539726,54.216953,0.052007,2.58651,3.786489,0.349808
std,0.846849,46.85311,4.787762,5.342432,0.402729,136662.867746,0.433437,0.360135,0.008929,10.749893,0.241877,37.90143,0.222044,0.449613,0.704299,0.261727
min,1.0,10.0,1.0,1.0,0.0,8000.0,0.0,0.0,0.001688,5.333333,0.029412,1.666667,0.0,0.693147,0.980829,0.000865
25%,2.0,65.0,4.0,9.0,1.0,151000.0,0.0,1.0,0.023077,28.0,0.333333,27.5,0.0,2.302585,3.349904,0.111111
50%,3.0,90.0,7.0,16.0,1.0,216000.0,0.0,1.0,0.028846,34.666667,0.533333,46.647059,0.0,2.833213,3.863821,0.284444
75%,3.0,125.0,11.0,17.0,1.0,310000.0,1.0,1.0,0.035714,43.333333,0.75,72.0,0.0,2.890372,4.290459,0.5625
max,12.0,1185.0,27.0,34.0,1.0,860000.0,1.0,1.0,0.1875,592.5,0.964286,855.833333,1.0,3.555348,6.753243,0.929847


In [33]:
# `os` is also imported in the first cell; kept here so this cell runs on its own.
import os

output_dir = '../Data'
output_csv = '../Data/processed_data.csv'

# Make sure the target folder exists, then write the frame without its index.
os.makedirs(output_dir, exist_ok=True)
data_no_outliers.to_csv(output_csv, index=False)