# Preprocessing pipeline

In [38]:
import sys
import pandas as pd
sys.path.append('../model/src')

from add_district import add_district
from create_train_test import create_train_test
from data_preprocessing import preprocess_data
from util import get_filename_and_extension


# Enrich the raw listings file with district information.
source_filename = './lux/lux_3k.csv'
filename = add_district(source_filename)
print("✅ Plik z dzielnicami utworzony")

# Split the enriched file; the helper returns the paths of the two files.
train_filename, test_filename = create_train_test(filename, test_size=30)
print("✅ Pliki train/test utworzone")

train = pd.read_csv(train_filename)
test = pd.read_csv(test_filename)

# Preprocess both splits; they share one config file named after the source.
name, _ = get_filename_and_extension(source_filename)
config_filename = f"{name}.pkl"
train = preprocess_data(train, is_train=True, config_filename=config_filename)
test = preprocess_data(test, is_train=False, config_filename=config_filename)

# Give each split every column that only the other split has, filled with
# zeros (test is completed first, then train).
for frame, other in ((test, train), (train, test)):
    for col in [c for c in other.columns if c not in frame.columns]:
        frame[col] = 0

# Put the columns of both splits in the same (sorted) order.
train = train.loc[:, sorted(train.columns)]
test = test.loc[:, sorted(test.columns)]

# Overwrite the split files with the preprocessed, aligned frames.
train.to_csv(train_filename, index=False)
test.to_csv(test_filename, index=False)

print(f"✅ Dane przetworzone i zapisane jako {train_filename} oraz {test_filename}")

✅ Plik z dzielnicami utworzony
✅ Pliki train/test utworzone
✅ Dane przetworzone i zapisane jako ./lux/lux_3k_district_train.csv oraz ./lux/lux_3k_district_test.csv


# Load data
Drop unusable rows: those whose luxury level, price, or area is missing or negative.

In [39]:
# Load the training split from disk.
df = pd.read_csv(train_filename, sep=',')

# Keep rows with a non-negative 'luxury_level'; count the rest as dropped.
initial_count = df.shape[0]
df = df.loc[df['luxury_level'].ge(0)]
luxury_level_dropped = initial_count - df.shape[0]

# Same for 'price', counted against what survived the previous filter.
initial_count = df.shape[0]
df = df.loc[df['price'].ge(0)]
price_dropped = initial_count - df.shape[0]

# Same for 'area'.
initial_count = df.shape[0]
df = df.loc[df['area'].ge(0)]
area_dropped = initial_count - df.shape[0]

# Report how many rows each filter removed, then summarise what is left.
print(f"Rows dropped for 'luxury_level' < 0: {luxury_level_dropped}")
print(f"Rows dropped for 'price' < 0: {price_dropped}")
print(f"Rows dropped for 'area' < 0: {area_dropped}")
df.info()

Rows dropped for 'luxury_level' < 0: 762
Rows dropped for 'price' < 0: 0
Rows dropped for 'area' < 0: 84
<class 'pandas.core.frame.DataFrame'>
Index: 1475 entries, 0 to 2320
Data columns (total 25 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   ad_type                        1475 non-null   object 
 1   area                           1475 non-null   float64
 2   build_year                     1475 non-null   float64
 3   building_floors                1475 non-null   float64
 4   collage                        1475 non-null   object 
 5   distance_from_center           1475 non-null   float64
 6   distance_from_other_expensive  1475 non-null   float64
 7   floor                          1475 non-null   int64  
 8   heating                        1328 non-null   object 
 9   location_district              1475 non-null   object 
 10  location_lat                   1475 non-null   float64
 11  location

In [40]:
import sys
sys.path.append('../backend')

from model import model

# Convert CSV rows to the input format required by the model

In [41]:
import numpy as np
from pandas import Series

# Feature names as reported by the model's booster; `convert` fills its
# output array in this order.
features = model._model.get_booster().feature_names
n_features = len(features)

# First row of the filtered frame (not used by the code visible in this
# section of the notebook).
row = df.iloc[0]

def convert(row: Series) -> np.ndarray:
    """Turn one listing row into the 2-D feature array passed to the model.

    Missing ``heating``, ``ownership`` and ``state`` values are replaced with
    fixed defaults, categorical columns are encoded with ``model._map``, and
    the values are laid out in the order of the module-level ``features`` list.

    Args:
        row: One row of the listings frame. It is only read, never modified.

    Returns:
        Array of shape ``(1, len(features))``.

    Raises:
        ValueError: If ``features`` contains a name that this function does
            not produce.
    """
    # Categorical columns and the value substituted when the cell is missing
    # (None means "no substitution, pass the value through as-is").
    categorical_defaults = {
        "ad_type": None,
        "heating": "urban",
        "location_district": None,
        "market": None,
        "ownership": "full_ownership",
        "state": "ready_to_use",
    }
    # Columns copied through unchanged.
    numeric_columns = (
        "area",
        "build_year",
        "building_floors",
        "floor",
        "location_lat",
        "location_lon",
        "rooms",
        # TODO: available in frontend but not used: available from
        "distance_from_center",
        "distance_from_other_expensive",
    )
    # Flag columns cast to int. Each one is read from its OWN column: the
    # previous version filled "utilities_pom. użytkowe" from
    # "utilities_piwnica" and "utilities_taras" from "utilities_balkon".
    utility_columns = (
        "utilities_balkon",
        "utilities_oddzielna kuchnia",
        "utilities_piwnica",
        "utilities_pom. użytkowe",
        "utilities_taras",
        "utilities_winda",
    )

    form_data = {}
    for column, default in categorical_defaults.items():
        value = row[column]
        # Substitute into a local value instead of writing back into `row`,
        # so the caller's Series is left untouched.
        if default is not None and pd.isna(value):
            value = default
        form_data[column] = model._map(column, value)
    for column in numeric_columns:
        form_data[column] = row[column]
    for column in utility_columns:
        form_data[column] = int(row[column])

    # Fill data_array with values from form_data, following the features list.
    data_array = np.zeros([len(features)])
    for i, feature in enumerate(features):
        if feature not in form_data:
            raise ValueError(f"Feature '{feature}' not found in form_data")
        data_array[i] = form_data[feature]

    # Reshape the 1D array to a 2D (1 x n_features) array as required by XGBoost
    return np.reshape(data_array, (1, -1))

In [42]:
# Output file name is derived from the training file name.
name, _ = get_filename_and_extension(train_filename)
out_filename = f"{name}_predictions.csv"

# Work on a copy with a 'prediction' column that starts at 0.0; rows whose
# conversion fails keep that 0.0 and are removed by the filter below.
df_with_predictions = df.assign(prediction=0.0)

for index, row in df.iterrows():
    try:
        data_array = convert(row)
    except ValueError as e:
        print(f"Error processing row {index}: {e}")
    else:
        prediction = model._model.predict(data_array)
        df_with_predictions.at[index, 'prediction'] = prediction[0]

# Keep only rows with a strictly positive prediction, then save them.
has_prediction = df_with_predictions['prediction'].gt(0)
df_with_predictions = df_with_predictions.loc[has_prediction]
df_with_predictions.to_csv(out_filename, index=False)
print(f"✅ Predykcje zapisane do pliku {out_filename}")

✅ Predykcje zapisane do pliku ./lux/lux_3k_district_train_predictions.csv
