In [11]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Load the dataset
file_path = '../../data/cleaned_immo_data.csv'  # Update this path if needed
data = pd.read_csv(file_path)

# View the first few rows of the dataset
data.columns

Index(['regio1', 'serviceCharge', 'heatingType', 'telekomTvOffer',
       'newlyConst', 'balcony', 'picturecount', 'pricetrend',
       'telekomUploadSpeed', 'totalRent', 'yearConstructed', 'scoutId',
       'firingTypes', 'hasKitchen', 'geo_bln', 'cellar',
       'yearConstructedRange', 'baseRent', 'houseNumber', 'livingSpace',
       'geo_krs', 'condition', 'interiorQual', 'petsAllowed', 'street',
       'streetPlain', 'lift', 'baseRentRange', 'typeOfFlat', 'geo_plz',
       'noRooms', 'thermalChar', 'floor', 'numberOfFloors', 'noRoomsRange',
       'garden', 'livingSpaceRange', 'regio2', 'regio3', 'description',
       'facilities', 'date'],
      dtype='object')

#### Step 1: Calculate Property Age and Years Since Last Refurbishment


In [14]:
current_year = pd.Timestamp.now().year
if 'yearConstructed' in data.columns:
    data['property_age'] = current_year - data['yearConstructed']
    data['property_age'].fillna(data['property_age'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['property_age'].fillna(data['property_age'].median(), inplace=True)


#### Step 2: Encode Categorical Features
#### Step 3: Convert Boolean Columns to Numeric

In [16]:
# Choose categorical columns that need encoding
categorical_features = ['regio1', 'regio2', 'regio3', 'typeOfFlat']
data = pd.get_dummies(data, columns=[col for col in categorical_features if col in data.columns], drop_first=True)


# Identify boolean columns and convert to 0/1
bool_cols = ['newlyConst', 'balcony', 'hasKitchen', 'cellar', 'lift', 'garden']
for col in bool_cols:
    if col in data.columns:
        data[col] = data[col].astype(int)

#### Step 4: Scale Numerical Features

In [17]:
from sklearn.preprocessing import StandardScaler

# Define the numerical columns to scale
numerical_features = [col for col in ['livingSpace', 'baseRent', 'totalRent', 'property_age'] if col in data.columns]

# Initialize scaler and apply scaling
scaler = StandardScaler()
data[numerical_features] = scaler.fit_transform(data[numerical_features])


#### Step 5: Save the Processed Data


In [18]:
data.to_csv('../../data/processed_immo_data.csv', index=False)
print("Feature engineering complete and saved as 'processed_immo_data.csv'")

Feature engineering complete and saved as 'processed_immo_data.csv'
