In [None]:
import polars as pl

In [None]:
# Path (relative to this notebook) of the raw ';'-separated population CSV
# that the population load cell reads.
POPULATION_FILEPATH = '../../data/01_raw/popolazione_2023.csv'

In [None]:
# Read the raw population CSV and keep only the rows with FREQ == 'A',
# INDICATOR == 'RESPOP_AV', AGE_NOCLASS == 'TOTAL' and TIME_PERIOD == 2023.
df = (
    pl.read_csv(POPULATION_FILEPATH, separator=';')
    .filter(
        (pl.col('FREQ') == 'A')
        & (pl.col('INDICATOR') == 'RESPOP_AV')
        & (pl.col('AGE_NOCLASS') == 'TOTAL')
        & (pl.col('TIME_PERIOD') == 2023)
    )
    .with_columns(
        pl.col("Osservazione").cast(pl.Float64)  # Ensure numeric
    )
)

# Columns that identify one output row.
population_keys = ["REF_AREA", "Territorio", 'TIME_PERIOD']

# Output column name -> predicate selecting the rows that contribute to it.
# Insertion order is the column order of the aggregated frame.
# NOTE(review): total_m / total_f / total_all add up every CITIZENSHIP (and, for
# total_all, every GENDER) code present in the group. If the extract also ships
# pre-aggregated "total" rows they would be double-counted -- verify on the data.
population_segments = {
    "total_f_itl": (pl.col("GENDER") == "F") & (pl.col("CITIZENSHIP") == "ITL"),
    "total_m_itl": (pl.col("GENDER") == "M") & (pl.col("CITIZENSHIP") == "ITL"),
    "total_f_frgapo": (pl.col("GENDER") == "F") & (pl.col("CITIZENSHIP") == "FRGAPO"),
    "total_m_frgapo": (pl.col("GENDER") == "M") & (pl.col("CITIZENSHIP") == "FRGAPO"),
    "total_m": pl.col("GENDER") == "M",
    "total_f": pl.col("GENDER") == "F",
}

agg_df = (
    df.group_by(population_keys)
    .agg(
        [
            # Rows outside the segment contribute 0.0, so every group gets a value.
            pl.when(predicate)
            .then(pl.col("Osservazione"))
            .otherwise(0.0)
            .sum()
            .alias(column_name)
            for column_name, predicate in population_segments.items()
        ]
        + [pl.col("Osservazione").sum().alias("total_all")]
    )
    # group_by does not guarantee row order; sort so the persisted output is
    # identical from one run to the next.
    .sort(population_keys)
)
agg_df

In [None]:
# Persist the population aggregate (`agg_df`) as parquet.
# OUTPUT_FILEPATH is reassigned by the later write cells of this notebook.
OUTPUT_FILEPATH = '../../data/02_primary/italy_population.parquet'
agg_df.write_parquet(OUTPUT_FILEPATH)

---

In [None]:
# Path (relative to this notebook) of the raw ';'-separated CSV listing the
# territorial units; read by the municipality load cell.
MUNICIPALITY_FILEPATH = '../../data/01_raw/Elenco dei codici e delle denominazioni delle unit_ territoriali Data Indagine 18-05-2025 Stampa 18052025215700.csv'

In [None]:
import re

def clean_column(col_name: str) -> str:
    """Turn a raw CSV header into a lowercase, underscore-separated name.

    The name is lowercased, every character that is neither a word character
    nor whitespace is dropped, and each run of whitespace becomes a single
    underscore.

    Args:
        col_name: Header text as it appears in the file.

    Returns:
        The normalised column name.
    """
    col_name = col_name.lower()
    col_name = re.sub(r"[^\w\s]", "", col_name)
    col_name = re.sub(r"\s+", "_", col_name)
    return col_name

# infer_schema_length=0 turns type inference off, so every column is read as a
# string in a single pass (no separate header-only read to build a dtype map).
# Presumably the all-string schema is wanted to keep codes such as zero-padded
# identifiers intact -- confirm against the file.
df = pl.read_csv(MUNICIPALITY_FILEPATH, separator=';', infer_schema_length=0)
df = df.rename({col: clean_column(col) for col in df.columns})

In [None]:
OUTPUT_FILEPATH = '../../data/02_primary/italy_municipalities.parquet'
# Write the municipality table held in `df`. `agg_df` still holds the population
# aggregate at this point and must not be written to this path.
df.write_parquet(OUTPUT_FILEPATH)

In [None]:
# Preview the municipality table. The original referenced `mdf`, which is never
# defined in this notebook and raised NameError on a fresh run.
df.head()

---

In [None]:
# Path (relative to this notebook) of the raw ';'-separated employment CSV
# that the employment load cell reads.
EMPLOYMENT_FILEPATH = '../../data/01_raw/occupazione.csv'

In [None]:
# NOTE(review): on a top-to-bottom run `df` is still the frame assigned in the
# previous section here; the employment CSV is only loaded in the next cell.
df

In [None]:
import re

def clean_column(col_name: str) -> str:
    """Normalise a header: lowercase, punctuation removed, whitespace -> '_'.

    Characters that are neither word characters nor whitespace are deleted
    first; afterwards every run of whitespace is replaced by one underscore.
    """
    without_punctuation = re.sub(r"[^\w\s]", "", col_name.lower())
    return re.sub(r"\s+", "_", without_punctuation)

# Read the raw employment CSV (schema inferred from up to 1,000,000 rows) and
# keep only the rows with FREQ == 'A', INDICATOR == 'RESPOP_AV' and
# AGE_NOCLASS == 'Y_GE15'.
df = (
    pl.read_csv(EMPLOYMENT_FILEPATH, separator=';', infer_schema_length=1000000)
    .filter(
        (pl.col('FREQ') == 'A')
        & (pl.col('INDICATOR') == 'RESPOP_AV')
        & (pl.col('AGE_NOCLASS') == 'Y_GE15')
    )
    .with_columns(
        pl.col("Osservazione").cast(pl.Float64)  # Ensure numeric
    )
)

# Columns that identify one output row.
employment_keys = [
    "REF_AREA",
    "Territorio",
    'TIME_PERIOD',
    'Condizione professionale o non professionale',
]

# Output column name -> predicate selecting the rows that contribute to it.
# Insertion order is the column order of the aggregated frame.
# NOTE(review): total_m / total_f / total_all add up every CITIZENSHIP (and, for
# total_all, every GENDER) code present in the group. If the extract also ships
# pre-aggregated "total" rows they would be double-counted -- verify on the data.
employment_segments = {
    "total_f_itl": (pl.col("GENDER") == "F") & (pl.col("CITIZENSHIP") == "ITL"),
    "total_m_itl": (pl.col("GENDER") == "M") & (pl.col("CITIZENSHIP") == "ITL"),
    "total_f_frgapo": (pl.col("GENDER") == "F") & (pl.col("CITIZENSHIP") == "FRGAPO"),
    "total_m_frgapo": (pl.col("GENDER") == "M") & (pl.col("CITIZENSHIP") == "FRGAPO"),
    "total_m": pl.col("GENDER") == "M",
    "total_f": pl.col("GENDER") == "F",
}

agg_df = (
    df.group_by(employment_keys)
    .agg(
        [
            # Rows outside the segment contribute 0.0, so every group gets a value.
            pl.when(predicate)
            .then(pl.col("Osservazione"))
            .otherwise(0.0)
            .sum()
            .alias(column_name)
            for column_name, predicate in employment_segments.items()
        ]
        + [pl.col("Osservazione").sum().alias("total_all")]
    )
    # group_by does not guarantee row order; sort so the persisted output is
    # identical from one run to the next.
    .sort(employment_keys)
)
agg_df

In [None]:
# Persist the employment aggregate (`agg_df`) as parquet.
OUTPUT_FILEPATH = '../../data/02_primary/italy_employment.parquet'
agg_df.write_parquet(OUTPUT_FILEPATH)