In [1]:
import os

import pandas as pd
import numpy as np
from sqlalchemy import create_engine

# Database connection used by every query and by the final upload.
# The URL can be supplied through the LAYEREDDB_URL environment variable so the
# credentials do not have to be stored in the notebook. When the variable is
# unset, the inline URL below is used exactly as before.
engine = create_engine(
    os.environ.get(
        "LAYEREDDB_URL",
        "postgresql+psycopg2://*****:*****@localhost:5433/layereddb",
    )
)

In [2]:
# --- Venue counts per district and venue type ---
# Each row of the result is (district_id, art_type, num_places), where num_places
# is the number of distinct club_id values in that group.
# The CASE tests the music criteria first, so a row matching both the music and
# the art criteria is counted as 'music'. The WHERE clause only lets through rows
# that match one of the two WHEN branches, so 'other' is not expected to appear.
art_df = pd.read_sql(
    sql="""
    SELECT 
        district_id,
        CASE 
            WHEN amenity IN ('music_venue', 'music_school', 'studio') 
                 OR club = 'music' THEN 'music'
            WHEN amenity IN ('dancing_school', 'events_venue', 'arts_centre') 
                 OR club IN ('art', 'dance') THEN 'art'
            ELSE 'other'
        END AS art_type,
        COUNT(DISTINCT club_id) AS num_places
    FROM berlin_source_data.social_clubs_activities
    WHERE 
        amenity IN (
            'arts_centre', 'events_venue', 
            'music_venue', 'music_school', 
            'dancing_school', 'studio'
        )
        OR club IN ('art', 'music', 'dance')
    GROUP BY district_id, art_type;
""",
    con=engine,
)

In [3]:
# Display the venue counts (one row per district_id / art_type) loaded by the query above.
art_df

Unnamed: 0,district_id,art_type,num_places
0,11001001,art,80
1,11001001,music,26
2,11002002,art,58
3,11002002,music,31
4,11003003,art,27
5,11003003,music,42
6,11004004,art,42
7,11004004,music,24
8,11005005,art,5
9,11005005,music,3


In [4]:
# Read every column of berlin_labels.district_attributes into a dataframe.
district_attrs = pd.read_sql(
    sql="""
    SELECT *
    FROM berlin_labels.district_attributes
""",
    con=engine,
)

# Show the first rows as a quick sanity check of the load.
district_attrs.head()

Unnamed: 0,district_id,area_sq_km,inhabitants,area_coefficient,population_coefficient
0,11004004,64.662978,343081,0.871208,1.061595
1,11002002,20.389118,293454,0.274704,0.908034
2,11011011,52.091363,311881,0.70183,0.965053
3,11010010,61.782422,291948,0.832398,0.903374
4,11001001,39.379173,397134,0.530558,1.228851


In [5]:
# Build the analysis table: exactly one row per (district, art_type) pair.
#
# A plain left merge followed by a blanket .fillna(0) has three problems:
#   * a district with no venues of one type gets no row for that type, so it is
#     left out of that type's median instead of counting as zero;
#   * a district with no venues at all gets art_type == 0, which makes the later
#     `art_type + "_district"` raise a TypeError;
#   * missing district attributes (e.g. area_sq_km) are turned into 0, which gives
#     an infinite density further down.
# So: cross-join the districts with the venue types first, left-join the counts
# onto that grid, and fill only num_places.

# Venue types actually present in the query result (normally 'art' and 'music').
venue_types = pd.DataFrame(
    {"art_type": sorted(art_df["art_type"].dropna().unique())}
)

df_art = (
    district_attrs
    .merge(venue_types, how="cross")  # every district x every venue type
    .merge(art_df, on=["district_id", "art_type"], how="left")
)

# A pair with no match in art_df has no venues of that type.
df_art["num_places"] = df_art["num_places"].fillna(0).astype(int)

df_art

Unnamed: 0,district_id,area_sq_km,inhabitants,area_coefficient,population_coefficient,art_type,num_places
0,11004004,64.662978,343081,0.871208,1.061595,art,42
1,11004004,64.662978,343081,0.871208,1.061595,music,24
2,11002002,20.389118,293454,0.274704,0.908034,art,58
3,11002002,20.389118,293454,0.274704,0.908034,music,31
4,11011011,52.091363,311881,0.70183,0.965053,art,9
5,11011011,52.091363,311881,0.70183,0.965053,music,9
6,11010010,61.782422,291948,0.832398,0.903374,art,2
7,11010010,61.782422,291948,0.832398,0.903374,music,5
8,11001001,39.379173,397134,0.530558,1.228851,art,80
9,11001001,39.379173,397134,0.530558,1.228851,music,26


In [6]:
# Label a (district, art_type) row when its venue supply is above the per-type
# 50th percentile on BOTH measures: venues per km² and venues per 1,000 inhabitants.

# Treat a zero area or population as missing, so the ratio becomes NaN (never
# labelled) rather than inf (always labelled, and distorting the quantiles).
area_km2 = df_art["area_sq_km"].replace(0, np.nan)
inhabitants_k = df_art["inhabitants"].replace(0, np.nan) / 1000

df_art["density"] = df_art["num_places"] / area_km2
df_art["per_1000"] = df_art["num_places"] / inhabitants_k

# 50th percentile thresholds, one row per art_type.
quantiles = (
    df_art.groupby("art_type")[["density", "per_1000"]]
    .quantile(0.50)
    .rename(columns={"density": "density_q50", "per_1000": "per_1000_q50"})
)

# Attach each row's thresholds by looking up its art_type in `quantiles`.
# Assigning the columns (instead of merging `quantiles` back in) keeps this cell
# re-runnable: a second run of the merge would create density_q50_x / density_q50_y
# and the comparison below would then fail with a KeyError.
for threshold_col in quantiles.columns:
    df_art[threshold_col] = df_art["art_type"].map(quantiles[threshold_col])

# Strictly above both thresholds -> "<art_type>_district", otherwise no label (None).
above_both = (
    (df_art["density"] > df_art["density_q50"])
    & (df_art["per_1000"] > df_art["per_1000_q50"])
)
df_art["label"] = np.where(above_both, df_art["art_type"] + "_district", None)

df_art

Unnamed: 0,district_id,area_sq_km,inhabitants,area_coefficient,population_coefficient,art_type,num_places,density,per_1000,density_q50,per_1000_q50,label
0,11004004,64.662978,343081,0.871208,1.061595,art,42,0.649522,0.12242,0.199545,0.054484,art_district
1,11004004,64.662978,343081,0.871208,1.061595,music,24,0.371155,0.069954,0.246694,0.046611,music_district
2,11002002,20.389118,293454,0.274704,0.908034,art,58,2.844655,0.197646,0.199545,0.054484,art_district
3,11002002,20.389118,293454,0.274704,0.908034,music,31,1.520419,0.105638,0.246694,0.046611,music_district
4,11011011,52.091363,311881,0.70183,0.965053,art,9,0.172773,0.028857,0.199545,0.054484,
5,11011011,52.091363,311881,0.70183,0.965053,music,9,0.172773,0.028857,0.246694,0.046611,
6,11010010,61.782422,291948,0.832398,0.903374,art,2,0.032372,0.006851,0.199545,0.054484,
7,11010010,61.782422,291948,0.832398,0.903374,music,5,0.080929,0.017126,0.246694,0.046611,
8,11001001,39.379173,397134,0.530558,1.228851,art,80,2.031531,0.201443,0.199545,0.054484,art_district
9,11001001,39.379173,397134,0.530558,1.228851,music,26,0.660247,0.065469,0.246694,0.046611,music_district


In [7]:
# --- Select relevant columns ---
art_labels_df = df_art[["district_id", "label"]].copy()

# --- Keep only rows that received a label ---
# A boolean mask replaces the per-row apply -> list -> explode -> dropna round trip:
# it yields the same rows with the same index, and also works on an empty frame.
final_art_df = art_labels_df.loc[art_labels_df["label"].notna()].copy()

# --- Add category name ---
final_art_df["category"] = "Community & Lifestyle"

# --- Add hashtags for consistency ---
final_art_df["label"] = "#" + final_art_df["label"]

# --- Reorder columns for DB upload ---
final_art_df = final_art_df[["district_id", "category", "label"]]

# --- Preview final table ---
print("✅ Preview of final data to upload:")

final_art_df

✅ Preview of final data to upload:


Unnamed: 0,district_id,category,label
0,11004004,Community & Lifestyle,#art_district
1,11004004,Community & Lifestyle,#music_district
2,11002002,Community & Lifestyle,#art_district
3,11002002,Community & Lifestyle,#music_district
8,11001001,Community & Lifestyle,#art_district
9,11001001,Community & Lifestyle,#music_district
10,11008008,Community & Lifestyle,#art_district
12,11003003,Community & Lifestyle,#art_district
13,11003003,Community & Lifestyle,#music_district
21,11007007,Community & Lifestyle,#music_district


In [8]:
# --- Upload to Database ---
# Appends the art/music labels to berlin_labels.district_labels_new.
# NOTE: with if_exists='append', running this cell again inserts the same rows a
# second time; clear the previous rows first if a re-upload is intended.

try:
    final_art_df.to_sql(
        'district_labels_new',
        engine,
        schema='berlin_labels',
        if_exists='append',
        index=False
    )
    # The message previously said "bike labels", a leftover from another notebook.
    print(f"✅ Successfully uploaded {len(final_art_df)} art & music labels to the database.")
except Exception as e:
    # Report the failure without raising, so the notebook run itself continues.
    print(f"❌ Upload error: {e}")

✅ Successfully uploaded 10 bike labels to the database.
