# Data Cleaning & Preprocessing

### Setup

In [2]:
import pandas as pd
import numpy as np

### Raw Data Load & Overview

In [3]:
# Load the raw export: semicolon-delimited text, decoded with the 'unicode_escape' codec.
# low_memory=False lets pandas infer each column's dtype from the whole file instead of
# chunk by chunk, so a column cannot end up holding a mix of parsed numbers and strings.
# NOTE(review): the warning recorded under this cell is presumably pandas' mixed-type
# DtypeWarning, which this option addresses — confirm on the next run.
df_raw = pd.read_csv(
    '../data/raw/cologne_data.csv',
    sep=';',
    encoding='unicode_escape',
    low_memory=False,
)

# Transposed preview: the first ten columns for the first five rows.
df_raw.head().T.head(10)

  df_raw = pd.read_csv('../data/raw/cologne_data.csv', sep=';', encoding='unicode_escape')


Unnamed: 0,0,1,2,3,4
S_JAHR,2012,2012,2012,2012,2012
S_RAUM,0,1,2,3,4
RAUM,0 / Stadt Köln,1 / Innenstadt,2 / Rodenkirchen,3 / Lindenthal,4 / Ehrenfeld
S_RAUMEBENE,0,1,1,1,1
RAUMEBENE,Gesamtstadt,Stadtbezirke,Stadtbezirke,Stadtbezirke,Stadtbezirke
A0002A,180415.0,21712.0,14788.0,14132.0,19811.0
A0002P,17271948,16985457,14337793,9872231,18779445
A0022S,4190013762,4086903262,4345253054,4206031943,4054831047
A0025A,1044555.0,127827.0,103140.0,143149.0,105493.0
A0027A,46426,4428,5331,6787,3935


### Column Selection & Renaming

In [4]:
# Lookup from raw indicator code to readable snake_case column name.
# The keys double as the column selection: the next cell keeps only the codes listed here.
rename_columns = dict(
    # ---- meta ----
    S_JAHR='year',
    S_RAUM='area_code',
    RAUM='area',
    S_RAUMEBENE='area_level_code',
    RAUMEBENE='area_level',

    # ---- population & demography ----
    A0022S='avg_age_total',
    A0296S='avg_age_male',
    A0297S='avg_age_female',
    A0298S='avg_age_german',
    A0299S='avg_age_non_german',
    A0025A='population_total',
    A0002A='non_german_total',
    A0002P='non_german_share',
    A0153A='migration_background_total',
    A0153P='migration_background_share',
    A0213A='female_residents_total',
    A0109A='male_residents_total',
    A0300A='births',
    A0301A='deaths',

    # ---- age groups ----
    A0029A='population_00_03',
    A0035A='population_03_06',
    A0291A='population_06_15',
    A0043A='population_15_18',
    A0292A='population_18_21',
    A0293A='population_21_35',
    A0294A='population_35_60',
    A0060A='population_60_65',
    A0295A='population_65_75',
    A0065A='population_75_80',
    A0027A='population_80_plus',

    # ---- households & marital status ----
    A0267A='households_total',
    A0273A='households_single',
    A0273P='households_single_share',
    A0275A='households_with_children',
    A0275P='households_with_children_share',
    A0363A='married_residents',
    A0364A='single_residents',
    A0365A='divorced_residents',
    A0366A='widowed_residents',

    # ---- housing ----
    B0009A='new_flats_completed',
    B0022S='living_area_per_capita_sqm',
    B0023S='avg_flat_area_sqm',
    B0026P='supported_housing_share',

    # ---- socioeconomic ----
    D0001P='unemployed_rate_total',
    C0007A='social_benefit_recipients',
    C0007P='social_benefit_recipients_share',
    D0011A='employed_residents',
    D0011P='employed_residents_share',
    # currently not selected: E0002A -> 'students_by_residence'

    # ---- mobility ----
    H0001A='registered_cars_total',
    # currently not selected: H0004A -> 'registered_petrol_cars'
    H0007A='registered_electric_cars',
    H0041A='new_registered_electric_cars',
    H0003S='private_cars_per_1000',
)

In [5]:
# Keep only the mapped indicator codes (in mapping order), then switch to the readable names.
df = df_raw.loc[:, list(rename_columns)].rename(columns=rename_columns)

print(f"Renamed dataset shape: {df.shape}")

# Transposed preview of the first five rows, one renamed column per line.
df.head(5).transpose()

Renamed dataset shape: (8892, 52)


Unnamed: 0,0,1,2,3,4
year,2012,2012,2012,2012,2012
area_code,0,1,2,3,4
area,0 / Stadt Köln,1 / Innenstadt,2 / Rodenkirchen,3 / Lindenthal,4 / Ehrenfeld
area_level_code,0,1,1,1,1
area_level,Gesamtstadt,Stadtbezirke,Stadtbezirke,Stadtbezirke,Stadtbezirke
avg_age_total,4190013762,4086903262,4345253054,4206031943,4054831047
avg_age_male,4091336362,4055675276,4213189939,4096546036,3985724002
avg_age_female,4283753064,4118596412,4467740298,4303439468,4121577561
avg_age_german,4237034807,4090622752,4424319955,4237437637,4057239755
avg_age_non_german,3964795425,4068724668,3872862005,3919316091,4044413457


### Data Cleaning & Transformation

In [6]:
# Distinct dtypes present in the selected frame. Object ('O') columns other than the
# text labels are the ones that still need to be converted to numbers.
df.dtypes.unique()
# df.info()  # per-column alternative to the dtype summary above

array([dtype('int64'), dtype('O'), dtype('float64')], dtype=object)

In [7]:
# Convert numbers that were read as text into floats. Only object-typed columns are touched,
# and the two label columns ('area', 'area_level') are left alone.
for col in df.select_dtypes('object').columns.difference(['area', 'area_level']):
    cleaned = (
        df[col]
        .astype(str)                             # missing values become the text "nan" (handled below)
        .str.strip()
        .str.replace(",", ".", regex=False)      # decimal comma -> decimal point
        .str.replace("\xa0", "", regex=False)    # remove non-breaking spaces inside the value
        .replace(["*", "nan", "None"], np.nan)   # placeholder strings -> real missing values
    )
    try:
        df[col] = cleaned.astype(float)
    except (ValueError, TypeError) as exc:
        # Best effort, as before: a column with a value that is not a number stays as
        # cleaned text. The difference is that it is now reported, so a column that
        # silently remained non-numeric cannot slip through to the later steps unnoticed.
        print(f"Column {col!r} kept as text, not convertible to float: {exc}")
        df[col] = cleaned

In [8]:
# Float columns that must keep their decimals and are therefore skipped by the integer cast below.
exclude_cols = [
    'avg_age_total',
    'avg_age_male',
    'avg_age_female',
    'avg_age_german',
    'avg_age_non_german',
    'non_german_share',
    'migration_background_share',
    'households_single_share',
    'households_with_children_share',
    'living_area_per_capita_sqm',
    'avg_flat_area_sqm',
    'supported_housing_share',
    'unemployed_rate_total',
    'social_benefit_recipients_share',
    'employed_residents_share',
    'private_cars_per_1000',
]

# Every other float column is rounded and stored as an integer: pandas' nullable 'Int64'
# when the column contains missing values, NumPy 'int64' when it is complete.
for col in df.select_dtypes('float64').columns.difference(exclude_cols):
    df[col] = df[col].round().astype('Int64' if df[col].isna().any() else 'int64')

In [9]:
# Store the area level as an unordered categorical; categories are taken from the values present.
df['area_level'] = df['area_level'].astype(pd.CategoricalDtype())

In [10]:
# Strip the leading "<digits> / " prefix from the area label and trim surrounding whitespace.
# The pattern is anchored at the start of the string, so a slash further inside a name is kept
# (splitting on "/" and taking the last piece would cut such names).
df['area'] = (
    df['area']
    .str.replace(r'^\s*\d+\s*/\s*', '', regex=True)
    .str.strip()
)

# Spot check: codes next to the cleaned names.
df.loc[:, ["area_code", "area"]].head(20)

Unnamed: 0,area_code,area
0,0,Stadt Köln
1,1,Innenstadt
2,2,Rodenkirchen
3,3,Lindenthal
4,4,Ehrenfeld
5,5,Nippes
6,6,Chorweiler
7,7,Porz
8,8,Kalk
9,9,Mülheim


In [11]:
# Number of distinct cleaned area names.
# NOTE(review): the recorded result (658) is lower than the 684 areas per year shown in the
# year/level breakdown (1 + 18 + 9 + 86 + 570), so names presumably repeat across areas —
# area_code, not area, looks like the safe identifier; confirm.
df['area'].nunique()

658

In [12]:
# Number of rows per year (index) and area level (columns).
# area_level is categorical, so `observed` is passed explicitly: observed=False is the
# current default (all categories are shown), and stating it avoids depending on a default
# that pandas has announced will change.
# NOTE(review): the warning recorded under this cell is presumably that FutureWarning — confirm.
df.groupby(['year', 'area_level'], observed=False).size().unstack()

  df.groupby(['year', 'area_level']).size().unstack()


area_level,Gesamtstadt,Sozialräume,Stadtbezirke,Stadtteile,Statistische Quartiere
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012,1,18,9,86,570
2013,1,18,9,86,570
2014,1,18,9,86,570
2015,1,18,9,86,570
2016,1,18,9,86,570
2017,1,18,9,86,570
2018,1,18,9,86,570
2019,1,18,9,86,570
2020,1,18,9,86,570
2021,1,18,9,86,570


### Missing Values Summary

In [31]:
# The twenty columns with the most missing values, highest count first.
(
    df.isna()
    .sum()
    .sort_values(ascending=False)
    .head(20)
)

supported_housing_share            5892
avg_flat_area_sqm                  5883
living_area_per_capita_sqm         5883
new_registered_electric_cars       2409
registered_electric_cars           2340
new_flats_completed                1478
deaths                              347
births                               63
population_80_plus                   38
social_benefit_recipients_share      37
population_15_18                     28
social_benefit_recipients            24
population_75_80                     22
unemployed_rate_total                21
population_18_21                     18
population_65_75                     17
widowed_residents                    17
population_60_65                     16
population_00_03                     16
employed_residents_share             16
dtype: int64

In [32]:
# Share of missing values per column (rows after the transpose) within each area level (columns).
# area_level is categorical, so `observed` is passed explicitly: observed=False matches the
# current default and avoids depending on a default that pandas has announced will change.
# NOTE(review): the warning recorded under this cell is presumably that FutureWarning — confirm.
(
    df.groupby('area_level', observed=False)[df.columns]
    .apply(lambda level_rows: level_rows.isna().mean())
    .T
)

  df.groupby('area_level')[df.columns].apply(lambda x: x.isna().mean()).T


area_level,Gesamtstadt,Sozialräume,Stadtbezirke,Stadtteile,Statistische Quartiere
year,0.0,0.0,0.0,0.0,0.0
area_code,0.0,0.0,0.0,0.0,0.0
area,0.0,0.0,0.0,0.0,0.0
area_level_code,0.0,0.0,0.0,0.0,0.0
area_level,0.0,0.0,0.0,0.0,0.0
avg_age_total,0.0,0.055556,0.0,0.0,0.0
avg_age_male,0.0,0.055556,0.0,0.0,0.0
avg_age_female,0.0,0.055556,0.0,0.0,0.0
avg_age_german,0.0,0.055556,0.0,0.0,0.0
avg_age_non_german,0.0,0.055556,0.0,0.0,0.0


Most indicators are fully available at the city (Gesamtstadt), district (Stadtbezirke), and neighborhood (Stadtteile) levels, while statistical blocks (Statistische Quartiere) and social spaces (Sozialräume) show substantial structural missingness.


### Duplicates

In [15]:
# Count rows whose (year, area_code) pair already occurred in an earlier row; 0 means the pair is unique.
df.duplicated(['year', 'area_code']).sum()

np.int64(0)

In [16]:
# Sanity check on the name cleanup: list any area names that still begin with digits.
df.loc[df['area'].str.match(r'^\d+'), 'area'].unique()

array([], dtype=object)

### Summary

In [17]:
# Total number of rows per area level across all years, most frequent level first.
df["area_level"].value_counts()

area_level
Statistische Quartiere    7410
Stadtteile                1118
Sozialräume                234
Stadtbezirke               117
Gesamtstadt                 13
Name: count, dtype: int64

In [18]:
# Summary statistics for the numeric columns of the cleaned frame.
# NOTE(review): the recorded output shows minimum average ages of 0.0 and a
# supported_housing_share maximum of about 119.66 — these look like values worth
# checking against the source before analysis.
df.describe()

Unnamed: 0,year,area_code,area_level_code,avg_age_total,avg_age_male,avg_age_female,avg_age_german,avg_age_non_german,population_total,non_german_total,...,supported_housing_share,unemployed_rate_total,social_benefit_recipients,social_benefit_recipients_share,employed_residents,employed_residents_share,registered_cars_total,registered_electric_cars,new_registered_electric_cars,private_cars_per_1000
count,8892.0,8892.0,8892.0,8879.0,8879.0,8879.0,8879.0,8879.0,8879.0,8879.0,...,3000.0,8871.0,8868.0,8855.0,8889.0,8876.0,8892.0,6552.0,6483.0,8892.0
mean,2018.0,418977400.0,2.869883,42.079251,41.187192,42.895229,42.303935,40.705231,8330.546796,1645.604122,...,7.716536,8.552736,919.435837,12.849916,3171.643042,55.850254,3586.916217,45.845085,20.67222,367.665073
std,3.741868,305705000.0,0.44825,3.733417,3.328594,4.210409,4.310442,3.541292,53726.638512,10115.560509,...,12.745773,5.201709,5569.019829,10.298751,20780.986047,6.74448,23801.787941,483.857478,211.090226,104.211092
min,2012.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.477952,5.0,0.2871,15.0,19.270833,5.0,0.0,0.0,0.0
25%,2015.0,105010000.0,3.0,39.7779,39.231274,40.306416,39.620951,38.777182,1558.5,215.0,...,0.0,4.79783,81.0,5.25575,585.0,52.290921,584.0,0.0,0.0,287.50525
50%,2018.0,401040000.0,3.0,42.165948,41.296564,43.027754,42.559808,40.744231,1991.0,347.0,...,3.445449,7.062684,174.0,9.5108,760.0,56.602806,792.0,0.0,0.0,357.978
75%,2021.0,706297500.0,3.0,44.444485,43.294481,45.64098,45.188129,42.803563,2550.0,641.5,...,9.448258,11.063093,396.0,17.991188,1014.0,60.403631,1221.25,14.0,5.0,445.77325
max,2024.0,909030000.0,4.0,71.926288,65.994399,75.530879,74.761972,63.015918,1097519.0,232908.0,...,119.661016,51.920622,118092.0,68.5926,449967.0,89.332004,496526.0,20280.0,7601.0,828.658


In [19]:
# Write the cleaned frame to the processed-data folder; the row index is not written.
df.to_csv(
    path_or_buf="../data/processed/cologne_data_clean_v1.csv",
    index=False,
)