In [3]:
import pandas as pd
import numpy as np

# Load the cleaned data
file_path = '../../data/cleaned_immo_data.csv'  # Adjust the path as needed
data = pd.read_csv(file_path)

# Check the data to understand existing columns
data.head()


Unnamed: 0,regio1,serviceCharge,heatingType,telekomTvOffer,newlyConst,balcony,picturecount,pricetrend,telekomUploadSpeed,totalRent,...,floor,numberOfFloors,noRoomsRange,garden,livingSpaceRange,regio2,regio3,description,facilities,date
0,Nordrhein_Westfalen,245.0,central_heating,ONE_YEAR_FREE,False,False,6,4.62,10.0,840.0,...,1.0,3.0,4,True,4,Dortmund,Schüren,Die ebenerdig zu erreichende Erdgeschosswohnun...,Die Wohnung ist mit Laminat ausgelegt. Das Bad...,May19
1,Rheinland_Pfalz,134.0,self_contained_central_heating,ONE_YEAR_FREE,False,True,8,3.47,10.0,650.0,...,2.0,3.0,3,False,4,Rhein_Pfalz_Kreis,Böhl_Iggelheim,Alles neu macht der Mai – so kann es auch für ...,,May19
2,Sachsen,255.0,floor_heating,ONE_YEAR_FREE,True,True,8,2.72,2.4,1300.0,...,3.0,4.0,3,False,4,Dresden,Äußere_Neustadt_Antonstadt,Der Neubau entsteht im Herzen der Dresdner Neu...,"* 9 m² Balkon\n* Bad mit bodengleicher Dusche,...",Oct19
3,Sachsen,58.15,district_heating,ONE_YEAR_FREE,False,True,9,1.53,40.0,650.0,...,3.0,3.0,3,False,2,Mittelsachsen_Kreis,Freiberg,Abseits von Lärm und Abgasen in Ihre neue Wohn...,,May19
4,Bremen,138.0,self_contained_central_heating,,False,True,19,2.46,40.0,903.0,...,1.0,3.0,3,False,4,Bremen,Neu_Schwachhausen,Es handelt sich hier um ein saniertes Mehrfami...,Diese Wohnung wurde neu saniert und ist wie fo...,Feb20


In [57]:
# Step 2: Handle Missing Values (Initial Fill)
# Fill missing values for numeric columns with the median
numeric_cols = data.select_dtypes(include=[np.number]).columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

# Fill missing values for non-numeric columns with 'Unknown'
non_numeric_cols = data.select_dtypes(include=['object']).columns
data[non_numeric_cols] = data[non_numeric_cols].fillna('Unknown')

# Step 3: Frequency Encoding for High Cardinality Categorical Variables
high_cardinality_cols = ['regio1', 'regio2', 'regio3', 'heatingType']
for col in high_cardinality_cols:
    freq_encoding = data[col].value_counts().to_dict()
    data[col] = data[col].map(freq_encoding)

# Step 4: Date-Based Features
# Extract month, year, and season information from the 'date' column
data['date'] = pd.to_datetime(data['date'], errors='coerce')
data['month'] = data['date'].dt.month
data['year'] = data['date'].dt.year
data['season'] = data['month'] % 12 // 3 + 1  # Maps months to seasons (1: Winter, 2: Spring, etc.)

# Step 5: Create Interaction Features
data['room_size'] = data['livingSpace'] / data['noRooms']  # Average room size
data['rent_per_sqm'] = data['totalRent'] / data['livingSpace']  # Rent per square meter
data['price_trend_adjusted_rent'] = data['totalRent'] * data['pricetrend']  # Adjust rent by price trend

# Step 6: Text Feature Engineering (Length Features)
# Measure length of 'description' and 'facilities' columns
data['description_length'] = data['description'].apply(lambda x: len(str(x)) if pd.notnull(x) else 0)
data['facilities_length'] = data['facilities'].apply(lambda x: len(str(x)) if pd.notnull(x) else 0)

# Step 7: Boolean Columns to Integers
boolean_columns = ['newlyConst', 'balcony', 'garden']
data[boolean_columns] = data[boolean_columns].astype(int)

# Step 8: Log Transformations for Skewed Features
data['livingSpace_log'] = np.log1p(data['livingSpace'])
data['totalRent_log'] = np.log1p(data['totalRent'])

# Step 9: Final Non-Numeric Check and Encoding
# Encode remaining non-numeric columns if any
remaining_non_numeric_cols = data.select_dtypes(include=['object']).columns
for col in remaining_non_numeric_cols:
    if data[col].nunique() < 10:
        data = pd.get_dummies(data, columns=[col], drop_first=True)
    else:
        # Frequency encoding as a fallback
        freq_encoding = data[col].value_counts().to_dict()
        data[col] = data[col].map(freq_encoding)

data.replace([np.inf, -np.inf], np.nan, inplace=True)
# Step 10: Final NaN Check and Fill (if any remain)
data = data.fillna(data.median())  # Final fill for any remaining NaN values
# Verify that no inf or NaN values remain
print("Remaining NaNs:", data.isna().sum().sum())
print("Max value in each column:", data.max(numeric_only=True))

  data['date'] = pd.to_datetime(data['date'], errors='coerce')


Remaining NaNs: 963152
Max value in each column: regio1                              58651
serviceCharge                    146118.0
heatingType                        119057
newlyConst                              1
balcony                                 1
picturecount                          121
pricetrend                          14.92
telekomUploadSpeed                  100.0
totalRent                          1397.5
yearConstructed                    2029.0
scoutId                         115711743
firingTypes                        101338
hasKitchen                           True
geo_bln                             58651
cellar                               True
yearConstructedRange                  9.0
baseRent                           1490.0
houseNumber                         59827
livingSpace                      111111.0
geo_krs                             12518
condition                           63380
street                              59822
streetPlain                

In [58]:
# Step 4: Save the feature-engineered data
data.to_csv('../../data/feature_engineered_immo_data.csv', index=False)
print("Feature engineering complete. Saved as 'feature_engineered_immo_data.csv'")

Feature engineering complete. Saved as 'feature_engineered_immo_data.csv'


#### Getting State, Cities, Streets => Regio1, Regio2, Regio3

In [15]:
import pandas as pd
import json
import os
from collections import defaultdict

# Load the dataset
data = pd.read_csv("../../data/cleaned_immo_data.csv")

# Define columns for the hierarchical structure
high_cardinality_cols = ['regio1', 'regio2', 'regio3']

# Initialize the hierarchical structure
hierarchical_mapping = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

# Build the hierarchical structure with frequencies
for _, row in data.iterrows():
    state = row['regio1']
    city = row['regio2']
    street = row['regio3']
    
    # Increase frequency count for each level
    hierarchical_mapping[state][city][street] += 1

# Transform the hierarchical structure to the required format
formatted_hierarchy = []
for state, cities in hierarchical_mapping.items():
    state_entry = {
        "name": state,
        "value": sum(city_freq for streets in cities.values() for city_freq in streets.values()),
        "cities": []
    }
    
    for city, streets in cities.items():
        city_entry = {
            "name": city,
            "value": sum(streets.values()),
            "streets": [{"name": street, "value": count} for street, count in streets.items()]
        }
        
        # Append each city entry to the state's cities list
        state_entry["cities"].append(city_entry)

    # Append each state entry to the main list
    formatted_hierarchy.append(state_entry)

# Define output path
output_dir = "../../data/regio/"
os.makedirs(output_dir, exist_ok=True)  # Create directory if it doesn't exist
file_path = os.path.join(output_dir, "hierarchical_frequency_mapping.json")

# Save the hierarchical mapping as a JSON file
with open(file_path, 'w', encoding='utf-8') as file:
    json.dump(formatted_hierarchy, file, ensure_ascii=False, indent=4)

print(f"Hierarchical frequency mapping saved to {file_path}")


Hierarchical frequency mapping saved to ../../data/regio/hierarchical_frequency_mapping.json
