In [33]:
import tables_configs 
from pyspark.sql import DataFrame as SparkDataFrame
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, IntegerType
from pyspark.sql.functions import regexp_replace, col

In [22]:
# Reuse the active Spark session if there is one; otherwise create it with default settings.
spark = SparkSession.builder.getOrCreate()

In [30]:
# Explicit schema for sales.csv.
# Key and quantity columns are read as integers. Amount and percentage columns
# are read as strings because the file stores them as formatted text
# (e.g. "$2.024,99", "0,00%"); they are converted to numbers further down.
sales_schema = (
    StructType()
    .add("SalesOrderLineKey", IntegerType())
    .add("ResellerKey", IntegerType())
    .add("CustomerKey", IntegerType())
    .add("ProductKey", IntegerType())
    .add("OrderDateKey", IntegerType())
    .add("DueDateKey", IntegerType())
    .add("ShipDateKey", IntegerType())
    .add("SalesTerritoryKey", IntegerType())
    .add("OrderQuantity", IntegerType())
    .add("UnitPrice", StringType())
    .add("ExtendedAmount", StringType())
    .add("UnitPriceDiscountPct", StringType())
    .add("ProductStandardCost", StringType())
    .add("TotalProductCost", StringType())
    .add("SalesAmount", StringType())
)

In [5]:
# Load the raw sales CSV with the explicit schema (no type inference).
# NOTE(review): absolute, machine-specific path; a later cell reloads the same
# file name from a different user directory — confirm which one is current.
sales_df = spark.read.csv(
    "C:\\Users\\Admin\\Desktop\\KafkaPipeline\\ETLocal2Kafka\\data\\sales.csv",
    schema=sales_schema,
    header=True,
    inferSchema=False,
)

In [24]:
def get_columns_2_transform(table_name:str, columns_2_transform_type:str) -> list:
    """Return the columns of a table that are configured with a given special type.

    The table is looked up in ``tables_configs.global_confs``; the result keeps the
    ``(column_name, column_type)`` pairs whose type equals ``columns_2_transform_type``.

    Args:
        table_name (str): Name of the table whose configuration is inspected.
        columns_2_transform_type (str): Special data type to select (e.g. "price").

    Returns:
        list: List of ``(column_name, column_type)`` tuples taken from the table's
              configuration. Empty when the table has no configuration at all or
              when none of its columns has the requested type.

    Raises:
        AssertionError: If the table is configured but ``columns_2_transform_type``
            is not listed in ``tables_configs.columns_types_2_transform``.

    Example:
    >>> get_columns_2_transform("sales","price")
            [('ExtendedAmount', 'price'),
             ('UnitPrice', 'price'),
             ('TotalProductCost', 'price'),
             ('SalesAmount', 'price')]
    >>> get_columns_2_transform("date","price")
            []
    """
    # Single lookup: a table without configuration simply has nothing to transform.
    try:
        table_conf = tables_configs.global_confs[table_name]
    except KeyError:
        return []

    # Raised explicitly (instead of using `assert`) so the check still runs when
    # Python is started with -O; the exception type and message are unchanged.
    if columns_2_transform_type not in tables_configs.columns_types_2_transform:
        raise AssertionError(f"{columns_2_transform_type} type not in tables_config allowed types")

    return [
        (column_name, column_type)
        for column_name, column_type in table_conf.items()
        if column_type == columns_2_transform_type
    ]

# COMMAND ----------

def transform_prices_2_numeric(df:SparkDataFrame, table_name:str) -> SparkDataFrame:
    """Convert the columns configured as 'price' for `table_name` from text to float.

    For every configured column the "$" sign and the "." thousands separators are
    removed, the "," decimal separator is replaced by ".", and the result is cast
    to float. All other columns are left untouched.

    Args:
        df (SparkDataFrame): Spark DataFrame holding the rows of `table_name`.
        table_name (str): Name of the table whose 'price' columns are converted.

    Returns:
        SparkDataFrame: `df` itself when the table has no 'price' columns configured,
        otherwise a DataFrame where those columns hold float values.

    Raises:
        AssertionError: If a configured 'price' column is missing from `df`.

    Example:
    >>> df_input.show()
        +-----------------+-----------+-----------+----------+------------+----------+-----------+-----------------+-------------+---------+--------------+--------------------+-------------------+----------------+-----------+
        |SalesOrderLineKey|ResellerKey|CustomerKey|ProductKey|OrderDateKey|DueDateKey|ShipDateKey|SalesTerritoryKey|OrderQuantity|UnitPrice|ExtendedAmount|UnitPriceDiscountPct|ProductStandardCost|TotalProductCost|SalesAmount|
        +-----------------+-----------+-----------+----------+------------+----------+-----------+-----------------+-------------+---------+--------------+--------------------+-------------------+----------------+-----------+
        |         43659001|        676|         -1|       349|    20170702|  20170712|   20170709|                5|            1|$2.024,99|     $2.024,99|               0,00%|          $1.898,09|       $1.898,09|  $2.024,99|
        +-----------------+-----------+-----------+----------+------------+----------+-----------+-----------------+-------------+---------+--------------+--------------------+-------------------+----------------+-----------+
    
    >>> transform_prices_2_numeric(df_input,"sales")
        +-----------------+-----------+-----------+----------+------------+----------+-----------+-----------------+-------------+---------+--------------+--------------------+-------------------+----------------+-----------+
        |SalesOrderLineKey|ResellerKey|CustomerKey|ProductKey|OrderDateKey|DueDateKey|ShipDateKey|SalesTerritoryKey|OrderQuantity|UnitPrice|ExtendedAmount|UnitPriceDiscountPct|ProductStandardCost|TotalProductCost|SalesAmount|
        +-----------------+-----------+-----------+----------+------------+----------+-----------+-----------------+-------------+---------+--------------+--------------------+-------------------+----------------+-----------+
        |         43659001|        676|         -1|       349|    20170702|  20170712|   20170709|                5|            1|  2024.99|       2024.99|               0,00%|            1898.09|         1898.09|    2024.99|
        +-----------------+-----------+-----------+----------+------------+----------+-----------+-----------------+-------------+---------+--------------+--------------------+-------------------+----------------+-----------+
    """
    # Resolve the configuration once; each entry is a (column_name, type) pair.
    price_columns = [pair[0] for pair in get_columns_2_transform(table_name, "price")]
    if not price_columns:
        return df

    # Every configured column must exist in the DataFrame. Raised explicitly
    # (instead of `assert`) so the check is not stripped under `python -O`.
    if not set(price_columns).issubset(df.columns):
        raise AssertionError(f"Mismatch between df columns and global {table_name} columns")

    for column in price_columns:
        # Inside a character class "$" and "." are literals: drop the currency
        # sign and the thousands separators first ...
        without_symbols = regexp_replace(col(column), "[$.]", "")
        # ... then turn the decimal comma into a dot so the text can be cast.
        # NOTE(review): "float" is Spark's 32-bit type; consider "double" or a
        # decimal type if downstream needs exact amounts.
        df = df.withColumn(
            column,
            regexp_replace(without_symbols, "[,]", ".").cast("float"),
        )
    return df


# COMMAND ----------

def transform_percentages_2_numeric(df:SparkDataFrame, table_name:str) -> SparkDataFrame:
    """Convert the columns configured as 'percentage' for `table_name` from text to float.

    For every configured column the "%" sign is removed, the "," decimal separator
    is replaced by ".", and the result is cast to float. The number is kept exactly
    as written in the text (it is not divided by 100). All other columns are left
    untouched.

    Args:
        df (SparkDataFrame): Spark DataFrame holding the rows of `table_name`.
        table_name (str): Name of the table whose 'percentage' columns are converted.

    Returns:
        SparkDataFrame: `df` itself when the table has no 'percentage' columns
        configured, otherwise a DataFrame where those columns hold float values.

    Raises:
        AssertionError: If a configured 'percentage' column is missing from `df`.

    Example:
    >>> df_input.show()
        +-----------------+-----------+-----------+----------+------------+----------+-----------+-----------------+-------------+---------+--------------+--------------------+-------------------+----------------+-----------+
        |SalesOrderLineKey|ResellerKey|CustomerKey|ProductKey|OrderDateKey|DueDateKey|ShipDateKey|SalesTerritoryKey|OrderQuantity|UnitPrice|ExtendedAmount|UnitPriceDiscountPct|ProductStandardCost|TotalProductCost|SalesAmount|
        +-----------------+-----------+-----------+----------+------------+----------+-----------+-----------------+-------------+---------+--------------+--------------------+-------------------+----------------+-----------+
        |         43659001|        676|         -1|       349|    20170702|  20170712|   20170709|                5|            1|$2.024,99|     $2.024,99|               0,00%|          $1.898,09|       $1.898,09|  $2.024,99|
        +-----------------+-----------+-----------+----------+------------+----------+-----------+-----------------+-------------+---------+--------------+--------------------+-------------------+----------------+-----------+
    
    >>> transform_percentages_2_numeric(df_input, "sales")
        +-----------------+-----------+-----------+----------+------------+----------+-----------+-----------------+-------------+---------+--------------+--------------------+-------------------+----------------+-----------+
        |SalesOrderLineKey|ResellerKey|CustomerKey|ProductKey|OrderDateKey|DueDateKey|ShipDateKey|SalesTerritoryKey|OrderQuantity|UnitPrice|ExtendedAmount|UnitPriceDiscountPct|ProductStandardCost|TotalProductCost|SalesAmount|
        +-----------------+-----------+-----------+----------+------------+----------+-----------+-----------------+-------------+---------+--------------+--------------------+-------------------+----------------+-----------+
        |         43659001|        676|         -1|       349|    20170702|  20170712|   20170709|                5|            1|$2.024,99|     $2.024,99|                 0.0|          $1.898,09|       $1.898,09|  $2.024,99|
        +-----------------+-----------+-----------+----------+------------+----------+-----------+-----------------+-------------+---------+--------------+--------------------+-------------------+----------------+-----------+
    """
    # Resolve the configuration once; each entry is a (column_name, type) pair.
    percentage_columns = [pair[0] for pair in get_columns_2_transform(table_name, "percentage")]
    if not percentage_columns:
        return df

    # Every configured column must exist in the DataFrame. Raised explicitly
    # (instead of `assert`) so the check is not stripped under `python -O`.
    if not set(percentage_columns).issubset(df.columns):
        raise AssertionError(f"Mismatch between df columns and global {table_name} columns")

    for column in percentage_columns:
        # Drop the percent sign, then turn the decimal comma into a dot so the
        # text can be cast to a number.
        without_sign = regexp_replace(col(column), "[%]", "")
        df = df.withColumn(
            column,
            regexp_replace(without_sign, "[,]", ".").cast("float"),
        )
    return df



def main(df_input:SparkDataFrame, table_name:str):
    """Apply every numeric clean-up step configured for `table_name` to `df_input`.

    Price columns are converted first and percentage columns second; the
    DataFrame produced by the second step is returned.

    Args:
        df_input (SparkDataFrame): DataFrame to clean.
        table_name (str): Table name used to look up which columns to convert.

    Returns:
        SparkDataFrame: Result of running both transformations in sequence.
    """
    return transform_percentages_2_numeric(
        transform_prices_2_numeric(df_input, table_name),
        table_name,
    )

In [25]:
# Load the raw sales CSV with the explicit schema (no type inference).
# NOTE(review): absolute, machine-specific path; an earlier cell loads the same
# file name from a different user directory — confirm which one is current.
sales_df = spark.read.csv(
    "C:\\Users\\puert\\OneDrive\\Escritorio\\KafkaDataBricksnAzure\\ETLocal2Kafka\\data\\sales.csv",
    schema=sales_schema,
    header=True,
    inferSchema=False,
)

# Convert the configured price and percentage columns of "sales" to numbers.
clean_sales = main(sales_df, "sales")

In [35]:
# Raw frame: amount and percentage columns are still formatted text.
sales_df.show(n=1)

+-----------------+-----------+-----------+----------+------------+----------+-----------+-----------------+-------------+---------+--------------+--------------------+-------------------+----------------+-----------+
|SalesOrderLineKey|ResellerKey|CustomerKey|ProductKey|OrderDateKey|DueDateKey|ShipDateKey|SalesTerritoryKey|OrderQuantity|UnitPrice|ExtendedAmount|UnitPriceDiscountPct|ProductStandardCost|TotalProductCost|SalesAmount|
+-----------------+-----------+-----------+----------+------------+----------+-----------+-----------------+-------------+---------+--------------+--------------------+-------------------+----------------+-----------+
|         43659001|        676|         -1|       349|    20170702|  20170712|   20170709|                5|            1|$2.024,99|     $2.024,99|               0,00%|          $1.898,09|       $1.898,09|  $2.024,99|
+-----------------+-----------+-----------+----------+------------+----------+-----------+-----------------+-------------+------

In [36]:
# Cleaned frame: the same row after the price and percentage conversions.
clean_sales.show(n=1)

+-----------------+-----------+-----------+----------+------------+----------+-----------+-----------------+-------------+---------+--------------+--------------------+-------------------+----------------+-----------+
|SalesOrderLineKey|ResellerKey|CustomerKey|ProductKey|OrderDateKey|DueDateKey|ShipDateKey|SalesTerritoryKey|OrderQuantity|UnitPrice|ExtendedAmount|UnitPriceDiscountPct|ProductStandardCost|TotalProductCost|SalesAmount|
+-----------------+-----------+-----------+----------+------------+----------+-----------+-----------------+-------------+---------+--------------+--------------------+-------------------+----------------+-----------+
|         43659001|        676|         -1|       349|    20170702|  20170712|   20170709|                5|            1|  2024.99|       2024.99|                 0.0|            1898.09|         1898.09|    2024.99|
+-----------------+-----------+-----------+----------+------------+----------+-----------+-----------------+-------------+------

In [31]:
# `schema_date` is not defined in any visible cell, so on a fresh kernel this
# cell would fail with NameError. It is defined here, next to its only use.
# Column names are the ones shown in the displayed output of `date_df`.
# NOTE(review): the column types are an assumption (keys as integers, like the
# integer *DateKey columns in `sales_schema`; everything else as text) —
# confirm against the intended schema.
schema_date = (
    StructType()
    .add("DateKey", IntegerType())
    .add("Date", StringType())
    .add("FiscalYear", StringType())
    .add("FiscalQuarter", StringType())
    .add("Month", StringType())
    .add("FullDate", StringType())
    .add("MonthKey", IntegerType())
)

# NOTE(review): absolute, machine-specific path.
date_df = spark.read.csv(
    "C:\\Users\\puert\\OneDrive\\Escritorio\\KafkaDataBricksnAzure\\ETLocal2Kafka\\data\\date.csv",
    schema=schema_date,
    header=True,
    inferSchema=False,
)

# Run the same clean-up pipeline on the "date" table; the output displayed
# below for `clean_date` is identical to `date_df`.
clean_date = main(date_df, "date")

In [32]:
# Raw date frame, first row only.
date_df.show(n=1)

+--------+----------+----------+-------------+--------+------------+--------+
| DateKey|      Date|FiscalYear|FiscalQuarter|   Month|    FullDate|MonthKey|
+--------+----------+----------+-------------+--------+------------+--------+
|20170701|01/07/2017|    FY2018|    FY2018 Q1|2017 Jul|2017 Jul, 01|  201707|
+--------+----------+----------+-------------+--------+------------+--------+
only showing top 1 row



In [34]:
# Date frame after the clean-up pipeline, first row only.
clean_date.show(n=1)

+--------+----------+----------+-------------+--------+------------+--------+
| DateKey|      Date|FiscalYear|FiscalQuarter|   Month|    FullDate|MonthKey|
+--------+----------+----------+-------------+--------+------------+--------+
|20170701|01/07/2017|    FY2018|    FY2018 Q1|2017 Jul|2017 Jul, 01|  201707|
+--------+----------+----------+-------------+--------+------------+--------+
only showing top 1 row

