In [14]:
import polars as pl

In [6]:
POPULATION_FILEPATH = '../../data/01_raw/popolazione_2023.csv'

In [19]:
df = (
    pl.read_csv(POPULATION_FILEPATH, separator=';')
    .filter(
        (pl.col('FREQ') == 'A') &
        (pl.col('INDICATOR') == 'RESPOP_AV') &
        (pl.col('AGE_NOCLASS') == 'TOTAL') &
        (pl.col('TIME_PERIOD') == 2023)
    )    .with_columns(
        pl.col("Osservazione").cast(pl.Float64)  # Ensure numeric
    )
)

agg_df = (
    df.group_by(["REF_AREA", "Territorio",'TIME_PERIOD'])
    .agg([
        pl.when((pl.col("GENDER") == "F") & (pl.col("CITIZENSHIP") == "ITL"))
          .then(pl.col("Osservazione"))
          .otherwise(0.0)
          .sum()
          .alias("total_f_itl"),

        pl.when((pl.col("GENDER") == "M") & (pl.col("CITIZENSHIP") == "ITL"))
          .then(pl.col("Osservazione"))
          .otherwise(0.0)
          .sum()
          .alias("total_m_itl"),

        pl.when((pl.col("GENDER") == "F") & (pl.col("CITIZENSHIP") == "FRGAPO"))
          .then(pl.col("Osservazione"))
          .otherwise(0.0)
          .sum()
          .alias("total_f_frgapo"),

        pl.when(pl.col("GENDER") == "M")
          .then(pl.col("Osservazione"))
          .otherwise(0.0)
          .sum()
          .alias("total_m"),

        pl.when(pl.col("GENDER") == "F")
          .then(pl.col("Osservazione"))
          .otherwise(0.0)
          .sum()
          .alias("total_f"),

        pl.col("Osservazione").sum().alias("total_all")
    ])
)
agg_df

REF_AREA,Territorio,TIME_PERIOD,total_f_itl,total_m_itl,total_f_frgapo,total_m,total_f,total_all
str,str,i64,f64,f64,f64,f64,f64,f64
"""015136""","""Masate""",2023,1723.0,1731.0,189.0,3822.0,3824.0,15292.0
"""097033""","""Ello""",2023,586.0,563.0,28.0,1142.0,1228.0,4740.0
"""030095""","""'Rive d""'Arcano'""",2023,1122.0,1108.0,92.0,2304.0,2428.0,9464.0
"""028098""","""'Vighizzolo d""'Este'""",2023,387.0,445.0,18.0,934.0,810.0,3488.0
"""022079""","""Dro""",2023,2357.0,2348.0,204.0,5006.0,5122.0,20256.0
…,…,…,…,…,…,…,…,…
"""081013""","""Paceco""",2023,5360.0,5080.0,119.0,10312.0,10958.0,42540.0
"""034019""","""'Lesignano de""' Bagni'""",2023,2316.0,2413.0,197.0,5222.0,5026.0,20496.0
"""025057""","""Soverzene""",2023,181.0,174.0,4.0,348.0,370.0,1436.0
"""004115""","""Mango""",2023,510.0,530.0,101.0,1290.0,1222.0,5024.0


In [21]:
OUTPUT_FILEPATH = '../../data/02_primary/italy_population.parquet'
agg_df.write_parquet(OUTPUT_FILEPATH)