In [6]:
import tables_configs 
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, IntegerType, DoubleType
from pyspark.sql.functions import regexp_replace, col

In [4]:
# Obtain the Spark session for this notebook: reuse the active one if it exists,
# otherwise create a new one with the builder's default settings.
spark = SparkSession.builder.getOrCreate()

In [9]:
# Explicit schema for sales.csv.
# Key and quantity columns are read as integers; the monetary and percentage
# columns are read as plain strings so they can be cleaned and cast afterwards.
sales_schema = (
    StructType()
    .add("SalesOrderLineKey", IntegerType())
    .add("ResellerKey", IntegerType())
    .add("CustomerKey", IntegerType())
    .add("ProductKey", IntegerType())
    .add("OrderDateKey", IntegerType())
    .add("DueDateKey", IntegerType())
    .add("ShipDateKey", IntegerType())
    .add("SalesTerritoryKey", IntegerType())
    .add("OrderQuantity", IntegerType())
    .add("UnitPrice", StringType())
    .add("ExtendedAmount", StringType())
    .add("UnitPriceDiscountPct", StringType())
    .add("ProductStandardCost", StringType())
    .add("TotalProductCost", StringType())
    .add("SalesAmount", StringType())
)

In [15]:
# Load the raw sales CSV using the explicit schema (the first line is a header,
# and schema inference is disabled).
# NOTE(review): the path is absolute and machine-specific; consider moving it to
# a configurable data directory.
sales_df = spark.read.csv(
    "C:\\Users\\Admin\\Desktop\\KafkaPipeline\\ETLocal2Kafka\\data\\sales.csv",
    header=True,
    inferSchema=False,
    schema=sales_schema,
)

In [44]:
# Convert UnitPrice from formatted text to a float, e.g. "$2.024,99" -> 2024.99:
#   1. the inner replace strips the "$" sign and the "." thousands separators,
#   2. the outer replace turns the "," decimal separator into ".",
#   3. the cleaned string is cast to float.
clean_df = sales_df.withColumn(
    "UnitPrice",
    regexp_replace(regexp_replace(col("UnitPrice"), "[$.]", ""), "[,]", ".")
    .alias("UnitPrice")
    .cast("float"),
)

In [8]:
def get_columns_2_transform(table_name:str, columns_2_transform_type:str) -> list:
    """Return the columns of a table that are configured for a given transform type.

    Reads ``tables_configs.global_confs[table_name]`` (a mapping of column name
    to transform type) and keeps the entries whose transform type equals
    ``columns_2_transform_type``. The comparison is exact and case-sensitive.

    Args:
        table_name (str): Key of the table in ``tables_configs.global_confs``.
        columns_2_transform_type (str): Transform type to select; it must be
            listed in ``tables_configs.columns_types_2_transform``.

    Returns:
        list: ``(column_name, transform_type)`` tuples, in the order in which
        they appear in the table configuration.

    Raises:
        AssertionError: If the table is missing from (or has an empty entry in)
            the global configuration, or if the transform type is not supported.

    Example:
    >>> get_columns_2_transform("sales","price")
            [('ExtendedAmount', 'price'),
             ('UnitPrice', 'price'),
             ('TotalProductCost', 'price'),
             ('SalesAmount', 'price')]
    """
    # Make sure the table has a non-empty global configuration. The membership
    # test runs first so that an unknown table reports the message below
    # instead of failing with a bare KeyError while the assert is evaluated.
    assert (
        table_name in tables_configs.global_confs
        and tables_configs.global_confs[table_name]
    ), f"{table_name} does not exists in global confs."
    assert columns_2_transform_type in tables_configs.columns_types_2_transform, (
        f"{columns_2_transform_type} is not a supported transform type."
    )
    return [
        item
        for item in tables_configs.global_confs[table_name].items()
        if item[1] == columns_2_transform_type
    ]
    


def transform_prices_2_numeric(df, table_name):
    """Convert every configured price column of a table from text to float.

    For each column that ``tables_configs.global_confs[table_name]`` maps to
    the transform type ``"price"``, the column is cleaned with the same steps
    used for ``UnitPrice`` elsewhere in this notebook: ``$`` and ``.`` are
    removed, ``,`` is replaced by ``.``, and the result is cast to ``float``
    (e.g. ``"$2.024,99"`` becomes ``2024.99``).

    Args:
        df: DataFrame that contains the table's price columns as strings.
        table_name (str): Key of the table in ``tables_configs.global_confs``.

    Returns:
        The DataFrame with every configured price column replaced by its
        numeric version; all other columns are left as they are.

    Raises:
        KeyError: If ``table_name`` is not present in the global configuration.
    """
    def get_price_columns(table_name:str) -> list:
        """List the ``(column_name, "price")`` entries configured for a table.

        Args:
            table_name (str): Key of the table in ``tables_configs.global_confs``.

        Returns:
            list: ``(column_name, transform_type)`` tuples whose transform type
            is exactly ``"price"``.

        Example:
            >>> get_price_columns("sales")
            [('ExtendedAmount', 'price'),
             ('UnitPrice', 'price'),
             ('TotalProductCost', 'price'),
             ('SalesAmount', 'price')]
        """
        # Filter on the configured type (the dict value), not on the column name.
        return [
            item
            for item in tables_configs.global_confs[table_name].items()
            if item[1] == "price"
        ]

    for column_name, _ in get_price_columns(table_name):
        # Drop the currency sign and the "." thousands separators first ...
        without_symbols = regexp_replace(col(column_name), "[$.]", "")
        # ... then switch the decimal comma to a dot so the cast can parse it.
        df = df.withColumn(
            column_name,
            regexp_replace(without_symbols, "[,]", ".").cast("float"),
        )
    return df

In [9]:
# Inspect the column -> transform-type mapping configured for the "sales" table.
# The mapping is accessed through the imported module so the cell also works on
# a fresh kernel, where no bare `global_confs` name is defined.
tables_configs.global_confs["sales"].items()

dict_items([('ExtendedAmount', 'price'), ('UnitPrice', 'price'), ('ProductStandardCost', 'Price'), ('TotalProductCost', 'price'), ('SalesAmount', 'price'), ('UnitPriceDiscountPct', 'percentage')])

In [17]:
# Keep only the "sales" columns whose configured transform type is exactly "price".
# The mapping is accessed through the imported module so the cell also works on
# a fresh kernel, where no bare `global_confs` name is defined.
[item for item in tables_configs.global_confs["sales"].items() if item[1] == "price"]

[('ExtendedAmount', 'price'),
 ('UnitPrice', 'price'),
 ('TotalProductCost', 'price'),
 ('SalesAmount', 'price')]

In [45]:
# Print a preview of clean_df to check that UnitPrice is now numeric while the
# other price columns still hold the original formatted text.
clean_df.show()

+-----------------+-----------+-----------+----------+------------+----------+-----------+-----------------+-------------+---------+--------------+--------------------+-------------------+----------------+-----------+
|SalesOrderLineKey|ResellerKey|CustomerKey|ProductKey|OrderDateKey|DueDateKey|ShipDateKey|SalesTerritoryKey|OrderQuantity|UnitPrice|ExtendedAmount|UnitPriceDiscountPct|ProductStandardCost|TotalProductCost|SalesAmount|
+-----------------+-----------+-----------+----------+------------+----------+-----------+-----------------+-------------+---------+--------------+--------------------+-------------------+----------------+-----------+
|         43659001|        676|         -1|       349|    20170702|  20170712|   20170709|                5|            1|  2024.99|     $2.024,99|               0,00%|          $1.898,09|       $1.898,09|  $2.024,99|
|         43659002|        676|         -1|       350|    20170702|  20170712|   20170709|                5|            3|  2024

In [20]:
# Print a preview of the raw sales data as loaded from the CSV (price columns
# are still formatted text such as "$2.024,99").
sales_df.show()

+-----------------+-----------+-----------+----------+------------+----------+-----------+-----------------+-------------+---------+--------------+--------------------+-------------------+----------------+-----------+
|SalesOrderLineKey|ResellerKey|CustomerKey|ProductKey|OrderDateKey|DueDateKey|ShipDateKey|SalesTerritoryKey|OrderQuantity|UnitPrice|ExtendedAmount|UnitPriceDiscountPct|ProductStandardCost|TotalProductCost|SalesAmount|
+-----------------+-----------+-----------+----------+------------+----------+-----------+-----------------+-------------+---------+--------------+--------------------+-------------------+----------------+-----------+
|         43659001|        676|         -1|       349|    20170702|  20170712|   20170709|                5|            1|$2.024,99|     $2.024,99|               0,00%|          $1.898,09|       $1.898,09|  $2.024,99|
|         43659002|        676|         -1|       350|    20170702|  20170712|   20170709|                5|            3|$2.024

In [39]:
# Print a preview of clean_df.
# NOTE(review): the saved output of this cell lists UnitPriceClean and
# UnitPriceFinal columns, which the clean_df definition in this notebook does
# not create -- presumably left over from an earlier definition; re-run after
# a kernel restart to refresh it.
clean_df.show()

+---------+--------------+--------------+
|UnitPrice|UnitPriceClean|UnitPriceFinal|
+---------+--------------+--------------+
|$2.024,99|       2024,99|       2024.99|
|$2.024,99|       2024,99|       2024.99|
|$2.024,99|       2024,99|       2024.99|
|$2.039,99|       2039,99|       2039.99|
|$2.039,99|       2039,99|       2039.99|
|$2.039,99|       2039,99|       2039.99|
|$2.039,99|       2039,99|       2039.99|
|   $28,84|         28,84|         28.84|
|   $28,84|         28,84|         28.84|
|    $5,70|          5,70|          5.70|
|    $5,19|          5,19|          5.19|
|   $20,19|         20,19|         20.19|
|  $419,46|        419,46|        419.46|
|  $874,79|        874,79|        874.79|
|  $809,76|        809,76|        809.76|
|  $714,70|        714,70|        714.70|
|  $714,70|        714,70|        714.70|
|    $5,19|          5,19|          5.19|
|   $28,84|         28,84|         28.84|
|  $722,59|        722,59|        722.59|
+---------+--------------+--------