# Notebook TFM: ETL - Transformación de datos CURATED a ENRICH
<div style="background-color:#F2EDED;">
<br/>
<div>
<img src="https://uploads-ssl.webflow.com/614b1fe22fa8b90ef41aeffe/6265cb48f9496b1cefc9ab75_logotipo-mbit-39.png" width="200px" align="left" CLASS="TextWrap" style="background-color:#2a3f3f; margin-left: 10px;">
<img src="https://branding-guidelines.msf.es/esp/imgs/logo/Logo-01.jpg" width="100px" align="right" CLASS="TextWrap" style="background-color:#2a3f3f;">
</div>
<br/>
<br/>
<br/>
<div>
<h1><font color="#2a3f3f" size=4 style="margin-left: 10px;">MODELO DE PROBABILIDAD A TESTAR EN MÉDICOS SIN FRONTERAS</font></h1>
</div>
<br/>
<div style="text-align: right; margin-right: 10px; margin-bottom: 10px;">
<font color="#2a3f3f" size=3>Elio López Salamanca </font><br>
<font color="#2a3f3f" size=3>Sergio Israel Calleja Chimeno</font><br>
</div>
</div>

In [1]:
# Display the active Spark session object; it is presumably pre-created by the
# notebook runtime, since no cell shown here builds one — TODO confirm.
spark

Calculation started (calculation_id=04c5a956-1a08-e670-80c5-be8c9e7f40a6) in (session=60c5a956-0962-c349-1495-123530f39e0e). Checking calculation status...


Progress:   0%|          |elapsed time = 00:00s

Calculation completed.
<pyspark.sql.session.SparkSession object at 0x7fae4adbb0>



In [2]:
import logging
from pyspark.sql import DataFrame
from pyspark.sql.functions import col
from typing import Dict

Calculation started (calculation_id=3ac5a9a0-aa10-a37e-da6d-9b47cf84ec99) in (session=c2c5a996-5161-6f58-6dde-c9cff6427c43). Checking calculation status...


Progress:   0%|          |elapsed time = 00:00s

Calculation completed.


### Configuraciones iniciales

In [13]:
# Notebook-wide logger at INFO level.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Input locations of the curated Delta tables, keyed by table name.
curated_paths = dict(
    Contact="s3://curated-msf-mbit/Contact/",
    Opportunity="s3://curated-msf-mbit/Opportunity/",
    Campaign="s3://curated-msf-mbit/Campaign/",
    RecurringDonation="s3://curated-msf-mbit/RecurringDonation",
)

# Output location of the enriched dataset built from the aggregated opportunities.
# An earlier, non-aggregated output used "s3://enriched-msf-mbit/ContactsOpportunities/".
enrichment_path = "s3://enriched-msf-mbit/ContactsOpportunitiesAgg/"

Calculation started (calculation_id=1ac5a9a4-1817-fd99-8284-692d665a0318) in (session=c2c5a996-5161-6f58-6dde-c9cff6427c43). Checking calculation status...


Progress:   0%|          |elapsed time = 00:00s

Calculation completed.


### Lee los datos de las tablas Delta

In [5]:
def read_tables(table_paths: Dict[str, str]) -> Dict[str, DataFrame]:
    """Load every Delta table listed in ``table_paths``.

    Args:
        table_paths: Mapping of table name to the storage path of its Delta table.

    Returns:
        Mapping of table name to the DataFrame loaded from the matching path.
        An empty mapping produces an empty result (a warning is logged).

    Raises:
        Exception: Any error raised while loading a table is logged and re-raised.
    """
    try:
        tables = {
            table_name: spark.read.format("delta").load(path)
            for table_name, path in table_paths.items()
        }
    except Exception as e:
        logger.error(f"Error extracting data: {str(e)}")
        raise

    if not tables:
        # Nothing to load is not an error here; the caller decides how to react.
        logger.warning("No table paths were provided; no data was read.")

    logger.info("Data extracted")
    return tables

Calculation started (calculation_id=f0c5a9a2-7bc6-cbf7-cb25-7e3cef4e6f1f) in (session=c2c5a996-5161-6f58-6dde-c9cff6427c43). Checking calculation status...


Progress:   0%|          |elapsed time = 00:00s

Calculation completed.


In [8]:
# Load all curated tables into a dict of DataFrames for interactive inspection.
tables_ = read_tables(curated_paths)

Calculation started (calculation_id=dac5a9a3-2d3b-e112-7e54-40800246074c) in (session=c2c5a996-5161-6f58-6dde-c9cff6427c43). Checking calculation status...


Progress:   0%|          |elapsed time = 00:00s

Calculation completed.


In [9]:
# Show the loaded tables; each DataFrame repr lists its column names and types.
print(tables_)

Calculation started (calculation_id=56c5a9a3-8088-273e-b10a-12329a43ee2f) in (session=c2c5a996-5161-6f58-6dde-c9cff6427c43). Checking calculation status...


Progress:   0%|          |elapsed time = 00:00s

Calculation completed.
{'Contact': DataFrame[Id: string, Ltvscore: double, Program: string, Programaherencias: boolean, Programais: boolean, Legacyconfidentiality: boolean, Membertype: string, hash_key: string, processing_timestamp: timestamp], 'Opportunity': DataFrame[Id: string, Rating: string, Campaignid: string, PrimaryContact: string, RecurringDonation: string, Stagename: string, Program: string, Type: string, Typefundraisingcontribution: string, hash_key: string, processing_timestamp: timestamp], 'Campaign': DataFrame[Id: string, Campaigndonationreporting: string, Campaignentryreporting: string, Isemergency: boolean, Isonline: string, Objective: string, Objectivepublic: string, Segment: string, Status: string, hash_key: string, processing_timestamp: timestamp], 'RecurringDonation': DataFrame[Id: string, Isdeleted: boolean, Annualizedquota: double, Cancelationdate: date, Currentcampaign: string, Amount: double, Contact: string, InstallmentPeriod: string, PaidAmount: double, TotalP

### Agregación

In [20]:
# The opportunity data holds millions of records, so the Opportunity table is
# reduced to its distinct attribute combinations to shrink the resulting dataset.

def agg_oportunidades(df: DataFrame) -> DataFrame:
    """Collapse the Opportunity table to one row per distinct attribute combination.

    Only the columns listed in ``group_by_columns`` are kept; every other column
    of ``df`` (for example ``Id``) is dropped.

    Args:
        df: Opportunity DataFrame containing at least the grouping columns.

    Returns:
        DataFrame with the grouping columns and no duplicate rows.

    Raises:
        Exception: Any error raised while building the result is logged and re-raised.
    """
    group_by_columns = [
        "Rating", "Campaignid", "PrimaryContact", "RecurringDonation",
        "Stagename", "Program", "Type", "Typefundraisingcontribution"
    ]

    try:
        # Equivalent to groupBy(...).count().drop("count"), without computing
        # a count that is discarded immediately.
        aggregated_df = df.select(*group_by_columns).distinct()

        logger.info("DataFrame aggregated successfully.")
        return aggregated_df

    except Exception as e:
        logger.error(f"Error aggregating DataFrame: {str(e)}")
        raise

Calculation started (calculation_id=f6c5a9a9-6762-75b6-b0bb-a2f0a804a62d) in (session=c2c5a996-5161-6f58-6dde-c9cff6427c43). Checking calculation status...


Progress:   0%|          |elapsed time = 00:00s

Calculation completed.


### Enriquecer los datos de contactos con otras tablas

In [10]:
def create_enriched_dataset(opportunity: DataFrame, contact: DataFrame,
                            campaign: DataFrame, recurring_donation: DataFrame) -> DataFrame:
    """
    Build the enriched dataset by joining opportunities with their related tables.

    Joins performed, starting from ``opportunity``:
      * inner join with ``contact`` on ``PrimaryContact == Id``
      * inner join with ``campaign`` on ``Campaignid == Id``
      * left outer join with ``recurring_donation`` on
        ``RecurringDonation == Id`` and ``PrimaryContact == Contact``

    Every output column is prefixed with its source table (``Opp_``, ``Cnt_``,
    ``Cmp_``, ``Rcd_``). The ``hash_key`` and ``processing_timestamp`` columns
    are excluded, and fully duplicated rows are removed.

    Args:
        opportunity: Opportunity rows (raw or aggregated).
        contact: Contact rows.
        campaign: Campaign rows.
        recurring_donation: Recurring donation rows.

    Returns:
        The joined, prefixed and de-duplicated DataFrame.
    """
    excluded_columns = {"hash_key", "processing_timestamp"}

    enriched_data = (
        opportunity.alias("opp")
        .join(contact.alias("cnt"), col("opp.PrimaryContact") == col("cnt.Id"), "inner")
        # Column name spelled exactly as in the table schema so the join also
        # resolves when Spark runs with case-sensitive column resolution.
        .join(campaign.alias("cmp"), col("opp.Campaignid") == col("cmp.Id"), "inner")
        .join(recurring_donation.alias("rcd"),
              (col("opp.RecurringDonation") == col("rcd.Id")) &
              (col("opp.PrimaryContact") == col("rcd.Contact")), "left_outer")
    )

    # (join alias, output prefix, source DataFrame), in output column order.
    sources = [
        ("opp", "Opp_", opportunity),
        ("cnt", "Cnt_", contact),
        ("cmp", "Cmp_", campaign),
        ("rcd", "Rcd_", recurring_donation),
    ]

    # Select every remaining column through its alias and rename it with the prefix.
    selected_columns = [
        col(f"{alias}.{col_name}").alias(f"{prefix}{col_name}")
        for alias, prefix, source_df in sources
        for col_name in source_df.columns
        if col_name not in excluded_columns
    ]

    enriched_data = enriched_data.select(*selected_columns).dropDuplicates()

    print("Enriched columns generated: ")
    # printSchema() prints by itself and returns None; wrapping it in print()
    # would add a stray "None" line to the output.
    enriched_data.printSchema()

    return enriched_data

Calculation started (calculation_id=eec5a9a3-b7fc-ef0b-706a-f4c65021dc4f) in (session=c2c5a996-5161-6f58-6dde-c9cff6427c43). Checking calculation status...


Progress:   0%|          |elapsed time = 00:00s

Calculation completed.


### Carga los datos enriquecidos en el destino en formato Delta

In [11]:
def write_dataset(df: "DataFrame", output_path: str, partition_col: str = "Opp_Type",
                  num_partitions: int = 5, mode: str = "overwrite") -> None:
    """Write ``df`` to ``output_path`` as a partitioned Delta table.

    Args:
        df: DataFrame to persist.
        output_path: Destination path of the Delta table.
        partition_col: Column used to partition the output. Defaults to "Opp_Type".
        num_partitions: Target passed to ``coalesce`` before writing. Defaults to 5.
        mode: Save mode passed to the writer. Defaults to "overwrite".

    Raises:
        Exception: Any error raised by the write is logged and re-raised.
    """
    try:
        (
            df.coalesce(num_partitions)
            .write
            .format("delta")
            .mode(mode)
            .partitionBy(partition_col)
            .save(output_path)
        )

        logger.info(f"Data written to {output_path}")
    except Exception as e:
        logger.error(f"Error writing data to {output_path}: {str(e)}")
        raise

Calculation started (calculation_id=9ac5a9a3-c141-7dc5-385c-24041ddc4748) in (session=c2c5a996-5161-6f58-6dde-c9cff6427c43). Checking calculation status...


Progress:   0%|          |elapsed time = 00:00s

Calculation completed.


### Define ETL "Enrichment"

In [12]:
def run_enrichment(curated_paths: Dict[str, str], enrichment_path: str) -> None:
    """Run the curated-to-enriched ETL end to end.

    Steps: read the curated tables, aggregate the opportunities, join them with
    contacts, campaigns and recurring donations, and write the result.

    Args:
        curated_paths: Mapping of table name to curated table path. Must contain
            the keys "Opportunity", "Contact", "Campaign" and "RecurringDonation".
        enrichment_path: Destination path for the enriched dataset.

    Raises:
        Exception: Any failure in a step is logged and re-raised.
    """
    try:
        # Extract
        curated = read_tables(curated_paths)

        # Aggregate the opportunities before joining
        opportunity_agg = agg_oportunidades(curated["Opportunity"])

        # Transform / enrich
        result = create_enriched_dataset(
            opportunity=opportunity_agg,
            contact=curated["Contact"],
            campaign=curated["Campaign"],
            recurring_donation=curated["RecurringDonation"],
        )

        # Load
        write_dataset(result, enrichment_path)

        logger.info("ETL 'enrichment' executed successfully")
    except Exception as e:
        logger.error(f"Error in ETL pipeline: {str(e)}")
        raise

Calculation started (calculation_id=1ec5a9a3-c97d-fb65-ef90-6c86b5cb7e49) in (session=c2c5a996-5161-6f58-6dde-c9cff6427c43). Checking calculation status...


Progress:   0%|          |elapsed time = 00:00s

Calculation completed.


### Ejecuta ETL

In [14]:
# Run the whole pipeline: read the curated tables, aggregate, enrich and write
# the result to enrichment_path.
run_enrichment(curated_paths, enrichment_path)

Calculation started (calculation_id=c4c5a9a5-0bfe-2e2c-ac27-e4825f402e32) in (session=c2c5a996-5161-6f58-6dde-c9cff6427c43). Checking calculation status...


Progress:   0%|          |elapsed time = 00:00s

Calculation completed.
Enriched columns generated: 
root
 |-- Opp_Rating: string (nullable = true)
 |-- Opp_Campaignid: string (nullable = true)
 |-- Opp_PrimaryContact: string (nullable = true)
 |-- Opp_RecurringDonation: string (nullable = true)
 |-- Opp_Stagename: string (nullable = true)
 |-- Opp_Program: string (nullable = true)
 |-- Opp_Type: string (nullable = true)
 |-- Opp_Typefundraisingcontribution: string (nullable = true)
 |-- Cnt_Id: string (nullable = true)
 |-- Cnt_Ltvscore: double (nullable = true)
 |-- Cnt_Program: string (nullable = true)
 |-- Cnt_Programaherencias: boolean (nullable = true)
 |-- Cnt_Programais: boolean (nullable = true)
 |-- Cnt_Legacyconfidentiality: boolean (nullable = true)
 |-- Cnt_Membertype: string (nullable = true)
 |-- Cmp_Id: string (nullable = true)
 |-- Cmp_Campaigndonationreporting: string (nullable = true)
 |-- Cmp_Campaignentryreporting: string (nullable = true)
 |-- Cmp_Isemergency: boolean (nullable = true)
 |-- Cmp_Isonline: string 

In [19]:

# Read the earlier output location (presumably the non-aggregated dataset).
# NOTE(review): this cell failed in the recorded run. write_dataset() saves with
# format("delta"), so this location may hold a Delta table rather than plain
# Parquet — confirm how it was written and whether format("delta") is needed.
df_read_parquet = spark.read.format("parquet").load("s3://enriched-msf-mbit/ContactsOpportunities/")

Calculation started (calculation_id=82c5a9a7-0d1f-96c5-bd30-314894f554a6) in (session=c2c5a996-5161-6f58-6dde-c9cff6427c43). Checking calculation status...


Progress:   0%|          |elapsed time = 00:00s

Calculation 82c5a9a7-0d1f-96c5-bd30-314894f554a6 failed


  File "<stdin>", line 1, in <module>
  File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/readwriter.py", line 159, in load
    return self._df(self._jreader.load(path))
  File "/opt/amazon/spark/python/lib/py4j-0.10.9.3-src.zip/py4j/java_gateway.py", line 1321, in __call__
    return_value = get_return_value(
  File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 111, in deco
    return f(*a, **kw)
  File "/opt/amazon/spark/python/lib/py4j-0.10.9.3-src.zip/py4j/protocol.py", line 326, in get_return_value
    raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o249.load.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 62.0 failed 4 times, most recent failure: Lost task 0.3 in stage 62.0 (TID 742) ([2600:1f18:aa1:3a0e:20d7:173b:91be:2a5e] executor 25): org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.

In [None]:
# Row count of the earlier output; only works if the load that defines
# df_read_parquet succeeded.
df_read_parquet.count()

In [15]:
# After the aggregation: reload the enriched Delta output and count its rows.
df_read = spark.read.format("delta").load("s3://enriched-msf-mbit/ContactsOpportunitiesAgg/")
df_read.count()

Calculation started (calculation_id=14c5a9a6-0e38-67c9-1c40-45367979ae7a) in (session=c2c5a996-5161-6f58-6dde-c9cff6427c43). Checking calculation status...


Progress:   0%|          |elapsed time = 00:00s

Calculation completed.
2682548



In [23]:
# Collapse the dataset to a single partition before exporting it as CSV
# (';' separator, header row), replacing any previous export at that path.
df_single_partition = df_read.repartition(1)

(
    df_single_partition.write
    .mode("overwrite")
    .option("sep", ";")
    .option("header", True)
    .csv("s3://mbit-oct22-msf-data-ealopezs/Dataset-result/")
)

Calculation started (calculation_id=9cc5a9ab-d657-c13d-39e8-ccf9a34fedff) in (session=c2c5a996-5161-6f58-6dde-c9cff6427c43). Checking calculation status...


Progress:   0%|          |elapsed time = 00:00s

Calculation completed.
