In [1]:
import tables_configs 
from pyspark.sql import DataFrame as SparkDataFrame
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, IntegerType, DoubleType
from pyspark.sql.functions import regexp_replace, col

In [2]:
# Reuse the active Spark session if one exists, otherwise start a new one
# with the default configuration.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [3]:
# Explicit schema for the sales CSV. Key and quantity columns are integers;
# the price/amount/percentage columns are declared as plain strings
# (presumably because the raw values carry formatting symbols that the
# transform_* functions later in this notebook strip -- confirm against the CSV).
sales_schema = (
    StructType()
    .add("SalesOrderLineKey", IntegerType())
    .add("ResellerKey", IntegerType())
    .add("CustomerKey", IntegerType())
    .add("ProductKey", IntegerType())
    .add("OrderDateKey", IntegerType())
    .add("DueDateKey", IntegerType())
    .add("ShipDateKey", IntegerType())
    .add("SalesTerritoryKey", IntegerType())
    .add("OrderQuantity", IntegerType())
    .add("UnitPrice", StringType())
    .add("ExtendedAmount", StringType())
    .add("UnitPriceDiscountPct", StringType())
    .add("ProductStandardCost", StringType())
    .add("TotalProductCost", StringType())
    .add("SalesAmount", StringType())
)

In [5]:
# Load the raw sales CSV with the explicit schema; the first row is a header
# and schema inference is switched off.
# NOTE(review): this is a machine-specific absolute path -- consider a
# configurable data directory so the notebook runs on other machines.
sales_df = spark.read.csv(
    "C:\\Users\\Admin\\Desktop\\KafkaPipeline\\ETLocal2Kafka\\data\\sales.csv",
    schema = sales_schema,
    header = True,
    inferSchema = False,
)

In [7]:
def get_columns_2_transform(table_name:str, columns_2_transform_type:str) -> list:
    """Return the configured columns of a table that have a given transform type.

    The lookup reads ``tables_configs.global_confs[table_name]``, a mapping of
    column name to transform type, and keeps the entries whose type equals
    ``columns_2_transform_type``.

    Args:
        table_name (str): Key of the table in ``tables_configs.global_confs``.
        columns_2_transform_type (str): Transform type to select; must be one
            of ``tables_configs.columns_types_2_transform``.

    Returns:
        list: ``(column_name, transform_type)`` tuples, in the iteration order
        of the table's configuration.

    Raises:
        AssertionError: If the table is missing from (or has an empty entry
            in) the global configuration, or if the requested type is not an
            allowed transform type.

    Example:
    >>> get_columns_2_transform("sales","price")
            [('ExtendedAmount', 'price'),
             ('UnitPrice', 'price'),
             ('TotalProductCost', 'price'),
             ('SalesAmount', 'price')]
    """
    global_confs = tables_configs.global_confs
    # Check membership before subscripting: indexing an unknown table inside
    # the assert would raise a bare KeyError and hide the message below.
    assert table_name in global_confs and global_confs[table_name], f"{table_name} does not exists in global confs."
    assert columns_2_transform_type in tables_configs.columns_types_2_transform, f"{columns_2_transform_type} type not in tables_config allowed types"
    return [
        (column_name, column_type)
        for column_name, column_type in global_confs[table_name].items()
        if column_type == columns_2_transform_type
    ]
    


def _clean_columns_2_float(df:SparkDataFrame, table_name:str, columns_2_transform_type:str, symbols_pattern:str) -> SparkDataFrame:
    """Strip formatting symbols from the configured columns and cast them to float.

    For every column of ``table_name`` configured with
    ``columns_2_transform_type``, the characters matching ``symbols_pattern``
    are removed, every ``,`` is replaced by ``.``, and the result is cast to
    Spark's ``float`` type. The column keeps its name.

    Args:
        df (SparkDataFrame): Dataframe containing the configured columns.
        table_name (str): Table key passed to ``get_columns_2_transform``.
        columns_2_transform_type (str): Transform type of the columns to clean.
        symbols_pattern (str): Regex of the characters to delete before the
            decimal comma is converted.

    Returns:
        SparkDataFrame: A dataframe with the selected columns replaced by
        their numeric version.

    Raises:
        AssertionError: If a configured column is missing from ``df`` (or if
            ``get_columns_2_transform`` rejects its arguments).
    """
    columns_2_transform = [
        column_name
        for column_name, _ in get_columns_2_transform(table_name, columns_2_transform_type)
    ]
    # Every column returned by the configuration must exist in the dataframe.
    assert set(columns_2_transform) <= set(df.columns), f"Mismatch between df columns and global {table_name} columns"
    for column in columns_2_transform:
        # NOTE(review): "float" is 32-bit; consider "double" if downstream
        # aggregations need more precision.
        df = df.withColumn(
            column,
            regexp_replace(
                regexp_replace(col(column), symbols_pattern, ""), "[,]", "."
            ).cast("float"),
        )
    return df


def transform_prices_2_numeric(df:SparkDataFrame, table_name:str) -> SparkDataFrame:
    """Convert the table's ``price`` columns from formatted text to float.

    Removes ``$`` and ``.`` characters and turns ``,`` into ``.`` before
    casting (presumably values look like ``$2.024,99`` -- confirm against the
    raw file).

    Args:
        df (SparkDataFrame): Dataframe holding the price columns as strings.
        table_name (str): Table key used to look up the ``price`` columns.

    Returns:
        SparkDataFrame: Dataframe with the price columns cast to float.
    """
    return _clean_columns_2_float(df, table_name, "price", "[$.]")


def transform_percentages_2_numeric(df:SparkDataFrame, table_name:str) -> SparkDataFrame:
    """Convert the table's ``percentage`` columns from formatted text to float.

    Removes ``%`` characters and turns ``,`` into ``.`` before casting. The
    value is not divided by 100.

    Args:
        df (SparkDataFrame): Dataframe holding the percentage columns as strings.
        table_name (str): Table key used to look up the ``percentage`` columns.

    Returns:
        SparkDataFrame: Dataframe with the percentage columns cast to float.
    """
    return _clean_columns_2_float(df, table_name, "percentage", "[%]")



# Clean the sales data: price columns first, then percentage columns.
clean_sales = transform_percentages_2_numeric(
    transform_prices_2_numeric(sales_df, "sales"),
    "sales",
)

In [8]:
# Preview the cleaned frame (first 20 rows, long values truncated).
clean_sales.show(n=20, truncate=True)

+-----------------+-----------+-----------+----------+------------+----------+-----------+-----------------+-------------+---------+--------------+--------------------+-------------------+----------------+-----------+
|SalesOrderLineKey|ResellerKey|CustomerKey|ProductKey|OrderDateKey|DueDateKey|ShipDateKey|SalesTerritoryKey|OrderQuantity|UnitPrice|ExtendedAmount|UnitPriceDiscountPct|ProductStandardCost|TotalProductCost|SalesAmount|
+-----------------+-----------+-----------+----------+------------+----------+-----------+-----------------+-------------+---------+--------------+--------------------+-------------------+----------------+-----------+
|         43659001|        676|         -1|       349|    20170702|  20170712|   20170709|                5|            1|  2024.99|       2024.99|                 0.0|            1898.09|         1898.09|    2024.99|
|         43659002|        676|         -1|       350|    20170702|  20170712|   20170709|                5|            3|  2024