In [1]:
import os

import pandas as pd
from sqlalchemy import create_engine

# Database connection used by every query and by the final upload.
# The connection URL can be supplied through the LAYEREDDB_URL environment
# variable so that real credentials never have to be written into the
# notebook; the literal below is only the fallback when the variable is unset.
engine = create_engine(
    os.environ.get(
        "LAYEREDDB_URL",
        "postgresql+psycopg2://*****:*****@localhost:5433/layereddb",
    )
)

In [2]:
# Load cultural places per district (qualifying social clubs + theaters).
#
# Clubs and theaters are aggregated independently and then combined with a
# FULL OUTER JOIN, so a district that has theaters but no qualifying club
# (or clubs but no theater) still gets a row. Driving the query from the
# clubs table with a LEFT JOIN would drop theater-only districts entirely,
# and the later merge would then report them as having 0 culture places.

df_culture = pd.read_sql("""
    WITH clubs AS (
        SELECT district_id, COUNT(DISTINCT club_id) AS num_clubs
        FROM berlin_source_data.social_clubs_activities
        WHERE
            amenity IN ('community_centre', 'social_centre', 'social_club')
            OR club IN ('culture', 'history', 'academic', 'charity', 'politics', 'humanist')
        GROUP BY district_id
    ),
    theaters AS (
        SELECT district_id, COUNT(DISTINCT theater_id) AS num_theaters
        FROM berlin_source_data.theaters
        GROUP BY district_id
    )
    SELECT
        COALESCE(c.district_id, t.district_id) AS district_id,
        COALESCE(c.num_clubs, 0) + COALESCE(t.num_theaters, 0) AS num_culture_places
    FROM clubs c
    FULL OUTER JOIN theaters t ON c.district_id = t.district_id;
""", engine)

In [3]:
# Display the per-district culture-place counts returned by the query above
df_culture 

Unnamed: 0,district_id,num_culture_places
0,11001001,181
1,11002002,137
2,11003003,105
3,11004004,98
4,11005005,47
5,11006006,62
6,11007007,98
7,11008008,98
8,11009009,88
9,11010010,58


In [4]:
# Read the full district_attributes table; it is the base frame that the
# culture counts are joined onto in the next step.

DISTRICT_ATTRS_SQL = """
    SELECT *
    FROM berlin_labels.district_attributes
"""
district_attrs = pd.read_sql(sql=DISTRICT_ATTRS_SQL, con=engine)

# Peek at the first rows to confirm the columns that were loaded
district_attrs.head()

Unnamed: 0,district_id,area_sq_km,inhabitants,area_coefficient,population_coefficient
0,11004004,64.662978,343081,0.871208,1.061595
1,11002002,20.389118,293454,0.274704,0.908034
2,11011011,52.091363,311881,0.70183,0.965053
3,11010010,61.782422,291948,0.832398,0.903374
4,11001001,39.379173,397134,0.530558,1.228851


In [5]:
# Attach the culture counts to the district attributes (one row per district).
#
# * Only `district_id` and `num_culture_places` are taken from `df_culture`.
#   Because this cell re-binds `df_culture` to the merged frame, selecting the
#   two columns explicitly keeps the cell re-runnable: a second run produces
#   the same frame instead of `_x` / `_y` suffixed duplicate columns.
# * Only the count column is zero-filled. A blanket `.fillna(0)` would also
#   turn a missing `area_sq_km` / `inhabitants` into 0, which makes the
#   ratios computed from them infinite.
# * The left join turns the count into float when a district has no match,
#   so it is cast back to an integer after filling.

df_culture = (
    district_attrs
    .merge(
        df_culture[["district_id", "num_culture_places"]],
        on="district_id",
        how="left",
    )
    .fillna({"num_culture_places": 0})
    .astype({"num_culture_places": "int64"})
)

df_culture

Unnamed: 0,district_id,area_sq_km,inhabitants,area_coefficient,population_coefficient,num_culture_places
0,11004004,64.662978,343081,0.871208,1.061595,98
1,11002002,20.389118,293454,0.274704,0.908034,137
2,11011011,52.091363,311881,0.70183,0.965053,71
3,11010010,61.782422,291948,0.832398,0.903374,58
4,11001001,39.379173,397134,0.530558,1.228851,181
5,11008008,44.907902,330017,0.605047,1.021171,98
6,11003003,103.162091,424307,1.389909,1.312933,105
7,11012012,89.28078,268792,1.202885,0.831723,53
8,11005005,91.836013,257091,1.237312,0.795516,47
9,11006006,102.514181,310446,1.381179,0.960613,62


In [6]:
# Two intensity measures per district:
#   culture_density  - culture places divided by area_sq_km
#   culture_per_1000 - culture places divided by (inhabitants / 1000)
places = df_culture["num_culture_places"]
df_culture["culture_density"] = places.div(df_culture["area_sq_km"])
df_culture["culture_per_1000"] = places.div(df_culture["inhabitants"].div(1000))

# The 50th percentile of each measure serves as its cut-off
culture_density_50 = df_culture["culture_density"].quantile(0.50)
culture_per_1000_50 = df_culture["culture_per_1000"].quantile(0.50)

# A district is labelled only when it is strictly above BOTH cut-offs;
# every other district gets None
above_density = df_culture["culture_density"].gt(culture_density_50)
above_per_capita = df_culture["culture_per_1000"].gt(culture_per_1000_50)
df_culture["label_culture_hub"] = (above_density & above_per_capita).map(
    {True: "culture_hub", False: None}
)

# Ten districts with the highest number of culture places
preview_cols = [
    "district_id",
    "area_sq_km",
    "inhabitants",
    "num_culture_places",
    "culture_density",
    "culture_per_1000",
    "label_culture_hub",
]
df_culture[preview_cols].sort_values("num_culture_places", ascending=False).head(10)


Unnamed: 0,district_id,area_sq_km,inhabitants,num_culture_places,culture_density,culture_per_1000,label_culture_hub
4,11001001,39.379173,397134,181,4.596338,0.455766,culture_hub
1,11002002,20.389118,293454,137,6.719271,0.466853,culture_hub
6,11003003,103.162091,424307,105,1.017816,0.247462,
0,11004004,64.662978,343081,98,1.51555,0.285647,culture_hub
5,11008008,44.907902,330017,98,2.182244,0.296954,culture_hub
10,11007007,53.023264,355868,98,1.848245,0.275383,culture_hub
11,11009009,167.637176,294081,88,0.524943,0.299237,
2,11011011,52.091363,311881,71,1.36299,0.227651,
9,11006006,102.514181,310446,62,0.604794,0.199713,
3,11010010,61.782422,291948,58,0.938778,0.198666,


In [7]:
# --- Keep only the district key and its (possibly missing) label ---
culture_labels_df = df_culture.loc[:, ["district_id", "label_culture_hub"]].copy()

# --- Wrap each label in a list: one element when present, empty otherwise ---
culture_labels_df["labels_list"] = culture_labels_df["label_culture_hub"].map(
    lambda lbl: [lbl] if pd.notna(lbl) else []
)

# --- One row per (district, label); districts without a label drop out.
# --- The category column and the '#' prefix are added in the same chain,
# --- and the columns are put in the order used for the DB upload.
final_culture_df = (
    culture_labels_df[["district_id", "labels_list"]]
    .explode("labels_list")
    .dropna(subset=["labels_list"])
    .rename(columns={"labels_list": "label"})
    .assign(
        category="Community & Lifestyle",
        label=lambda frame: "#" + frame["label"],
    )
    .loc[:, ["district_id", "category", "label"]]
)

# --- Show the first rows of what will be uploaded ---
print("✅ Preview of final data to upload:")
print(final_culture_df.head())

✅ Preview of final data to upload:
   district_id               category         label
0     11004004  Community & Lifestyle  #culture_hub
1     11002002  Community & Lifestyle  #culture_hub
4     11001001  Community & Lifestyle  #culture_hub
5     11008008  Community & Lifestyle  #culture_hub
10    11007007  Community & Lifestyle  #culture_hub


In [8]:
# --- Upload to Database ---
#
# Appends the label rows to berlin_labels.district_labels_new.
#
# A failed upload is reported AND re-raised: only printing the error would
# let "Restart & Run All" finish as if the labels had been written.
#
# NOTE(review): if_exists='append' inserts the same rows again each time this
# cell is re-run; confirm whether existing '#culture_hub' rows should be
# removed first or whether the table enforces uniqueness.

try:
    final_culture_df.to_sql(
        'district_labels_new',
        engine,
        schema='berlin_labels',
        if_exists='append',
        index=False
    )
except Exception as e:
    print(f"❌ Upload error: {e}")
    raise
else:
    # Reached only when to_sql completed without raising
    print(f"✅ Successfully uploaded {len(final_culture_df)} culture labels to the database.")

✅ Successfully uploaded 5 culture labels to the database.
