In [1]:
import csv
from datetime import datetime
import os
import re
import logging

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, DateType, IntegerType
from pyspark.sql.functions import current_date, col, to_date, when, current_timestamp, monotonically_increasing_id, lit, upper, lower

In [2]:
class TabellaDipendenti:
    """Load a CSV feed into a Spark DataFrame and record the run in a log file.

    The CSV is read with its first row as header and without schema
    inference, so every source column is a string; conversions are left to
    later steps. A ``DINS`` column holding the current date is appended.
    """

    def __init__(self, path_csv, path_log, idrun):
        """Configure the run log, start Spark and load ``path_csv``.

        Args:
            path_csv: Path of the CSV file to load.
            path_log: Path of the log file. It is opened with ``filemode='w'``,
                so it is truncated on every construction; missing parent
                directories are created.
            idrun: Run identifier reported in every log line.

        Raises:
            Exception: Any error raised while creating the Spark session or
                loading the CSV is logged as ``Stato=KO`` and re-raised
                unchanged.
        """
        self.idrun = idrun
        self.path_log = path_log

        # basicConfig opens the log file immediately and fails if its
        # directory is missing, so create the directory first.
        log_dir = os.path.dirname(path_log)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

        # Logging is configured before any Spark work so that a failing load
        # still leaves a trace in the run log. force=True replaces handlers
        # left over from a previous instance in the same kernel.
        logging.basicConfig(
            filename=self.path_log,
            filemode='w',
            level=logging.INFO,
            format="%(message)s",
            force=True
        )

        try:
            self.spark = (
                SparkSession.builder
                .master("local[1]")
                .appName("FlussoGenerico")
                .getOrCreate()
            )

            # Generic read: header from the first row, everything as string.
            self.df = (
                self.spark.read
                .option("header", True)        # first row becomes the header
                .option("inferSchema", False)  # all strings, converted later
                .csv(path_csv)
            )

            # Add the insertion date.
            self.df = self.df.withColumn("DINS", current_date())
        except Exception as exc:
            # Record the failure in the same key=value layout as the success
            # line, then let the original exception propagate to the caller.
            logging.error(
                "IDRUN=%s, Operazione=Costruttore, Stato=KO, File=%s, Data=%s, Errore=%s",
                idrun, path_csv, datetime.now().strftime("%Y-%m-%d %H:%M:%S"), exc
            )
            raise

        # Initial log line: the load succeeded.
        logging.info(
            "IDRUN=%s, Operazione=Costruttore, Stato=OK, File=%s, Data=%s",
            idrun, path_csv, datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        )

In [6]:
# Build the flow object for run 1: loads the CSV and writes the run log.
flusso = TabellaDipendenti(
    path_csv="../data/FlussoDip.csv",
    path_log="../logs/tlog.txt",
    idrun=1,
)

In [7]:
# Print the column names and types of the loaded DataFrame; every source
# column is a string because the CSV is read without schema inference.
flusso.df.printSchema()

root
 |-- ID_CLIENTE: string (nullable = true)
 |-- NOME: string (nullable = true)
 |-- COGNOME: string (nullable = true)
 |-- EMAIL: string (nullable = true)
 |-- SPESA_TOTALE: string (nullable = true)
 |-- DATA_ISCRIZIONE: string (nullable = true)
 |-- STATO: string (nullable = true)
 |-- DINS: date (nullable = false)



In [8]:
# Print the loaded rows as a text table to eyeball the raw, unvalidated values.
flusso.df.show()

+----------+--------+----------+--------------------+------------+---------------+---------+----------+
|ID_CLIENTE|    NOME|   COGNOME|               EMAIL|SPESA_TOTALE|DATA_ISCRIZIONE|    STATO|      DINS|
+----------+--------+----------+--------------------+------------+---------------+---------+----------+
|         1|    Anna|     Rossi|anna.rossi@exampl...|     1200.50|     2020-05-15|   ATTIVO|2025-09-27|
|         2|    Luca|   Bianchi|luca.bianchi@example|         abc|     2019-13-40|   ATTIVO|2025-09-27|
|         3| Claudia|     Verdi|                NULL|      890.00|     2018-07-22|DISATTIVO|2025-09-27|
|         4|   Paolo|      Neri|paolo.neri@exampl...|      450.30|     2021-02-30|   ATTIVO|2025-09-27|
|         5| Giorgio|     Ferri|giorgio.ferri@exa...|        1000|     2017-11-05|     NULL|2025-09-27|
|         6| Martina|     Lanza|martina.lanza@exa...|        -300|     2016-08-12|   ATTIVO|2025-09-27|
|         7| Filippo|     Baldi|filippo.baldi@exa...|         Na