In [8]:
# Feature Engineering Notebook - Efficient Encoding

import pandas as pd

# Read the cleaned dataset written by the data-preprocessing stage.
# The path is relative, so it is resolved against the kernel's working directory.
file_path = '../../data/cleaned_immo_data.csv'  # Ensure this is correct
data = pd.read_csv(filepath_or_buffer=file_path)

In [9]:
# Step 1: frequency-encode the high-cardinality region columns.
# Every value is replaced, in place, by the number of rows in which it occurs.
high_cardinality_cols = ['regio1', 'regio2', 'regio3']
for col in high_cardinality_cols:
    column = data[col]
    # value_counts() skips NaN by default, so missing entries have no key in
    # the lookup and remain NaN after the map.
    freq_encoding = column.value_counts().to_dict()
    data[col] = column.map(freq_encoding)

In [10]:
# Step 2: list the columns that still hold non-numeric (object-dtype) data.
# This cell only identifies them; nothing is encoded here.
non_numeric_cols = data.select_dtypes(include='object').columns
print("Remaining non-numeric columns to encode:", non_numeric_cols)

Remaining non-numeric columns to encode: Index(['heatingType', 'telekomTvOffer', 'firingTypes', 'geo_bln',
       'houseNumber', 'geo_krs', 'condition', 'interiorQual', 'petsAllowed',
       'street', 'streetPlain', 'typeOfFlat', 'description', 'facilities',
       'date'],
      dtype='object')


In [11]:
# Frequency-encode every column listed in `non_numeric_cols`: each value is
# replaced by its occurrence count (no label encoding is performed).
# NOTE(review): per the listing printed above, this includes free-text and date
# columns ('description', 'facilities', 'street', 'date') - confirm that a raw
# occurrence count is the intended feature for those.
for col in non_numeric_cols:
    # dropna=True is the pandas default, spelled out here: missing values get no
    # count and therefore stay NaN after the map.
    freq_encoding = data[col].value_counts(dropna=True).to_dict()
    data[col] = data[col].map(freq_encoding)

In [12]:
# Step 3: cast boolean columns to integers (True -> 1, False -> 0).
# If the frame has no bool columns the selection is empty and nothing changes.
bool_cols = data.select_dtypes(include=['bool']).columns
as_integers = data[bool_cols].astype(int)
data[bool_cols] = as_integers

In [13]:
# Sanity check: print a heading followed by the first five rows of the encoded frame.
print("Data after encoding all categorical columns:", data.head(), sep="\n")


Data after encoding all categorical columns:
   regio1  serviceCharge  heatingType  telekomTvOffer  newlyConst  balcony  \
0   58651         245.00     119057.0        205581.0           0        0   
1    7750         134.00      17905.0        205581.0           0        1   
2   55792         255.00      11792.0        205581.0           1        1   
3   55792          58.15      22216.0        205581.0           0        1   
4    2774         138.00      17905.0             NaN           0        1   

   picturecount  pricetrend  telekomUploadSpeed  totalRent  ...  floor  \
0             6        4.62                10.0      840.0  ...    1.0   
1             8        3.47                10.0      650.0  ...    2.0   
2             8        2.72                 2.4     1300.0  ...    3.0   
3             9        1.53                40.0      650.0  ...    3.0   
4            19        2.46                40.0      903.0  ...    1.0   

   numberOfFloors  noRoomsRange  garden  

In [16]:
# Step 4: Handle Missing Values with Median Imputation
# Fill every remaining NaN with the median of its own column. This also covers
# the NaNs that frequency encoding leaves behind for originally-missing values.
# numeric_only=True keeps the median computation well-defined even if a
# non-numeric column is still present; without it, recent pandas versions
# (2.x) raise a TypeError on such columns.
# Columns that have no usable median (non-numeric, or entirely NaN) are left
# unfilled by fillna.
column_medians = data.median(numeric_only=True)
data = data.fillna(column_medians)

In [17]:
# Step 5: write the feature-engineered frame to CSV, omitting the row index.
output_path = '../../data/feature_engineered_immo_data.csv'
data.to_csv(output_path, index=False)
print("Feature engineering complete. Saved as 'feature_engineered_immo_data.csv'")

Feature engineering complete. Saved as 'feature_engineered_immo_data.csv'
