# Bronze Workload - Sales


##### **Attributes (raw column names in the landing file):**
- **DATE**
- **CE_BRAND_FLVR**
- **BRAND_NM**
- **Btlr_Org_LVL_C_Desc**
- **CHNL_GROUP**
- **TRADE_CHNL_DESC**
- **PKG_CAT**
- **Pkg_Cat_Desc**
- **TSR_PCKG_NM**
- **$volume**
- **YEAR**
- **MONTH**
- **PERIOD**
  
##### **Developer:** Paulo

In [1]:
# Execute the shared session notebook in this kernel. `spark` is used further
# down without being defined in this notebook, so it is presumably created
# here -- TODO confirm.
%run "/home/jovyan/work/workspace/functions/create_spark_session.ipynb"

In [2]:
# Execute the shared paths notebook in this kernel. `landing` and `bronze` are
# used further down without being defined in this notebook, so they are
# presumably declared here -- TODO confirm.
%run "/home/jovyan/work/workspace/functions/paths.ipynb"

In [3]:
# Execute the shared dates notebook in this kernel. `dtcarga` (the load date
# used for the `dtload` column) is not defined in this notebook, so it is
# presumably declared here -- TODO confirm.
%run "/home/jovyan/work/workspace/functions/dates.ipynb"

In [4]:
# Use Spark's LEGACY datetime parser policy (the pre-3.0 parsing behaviour).
# NOTE(review): presumably required by the date parsing/formatting done later
# in this notebook ("M/d/yyyy" for DATE, 'yyyy-MM-dd' for dtload) -- confirm.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

In [5]:
from pyspark.sql.functions import col, date_format, lit , to_date,trim,current_date, regexp_replace
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, LongType, DateType,DecimalType,FloatType

from datetime import datetime
import re

In [6]:
def sanitize_columns(df):
    """Return *df* with every column renamed to a plain identifier.

    Each name has its spaces turned into underscores and then every
    character outside ``[a-zA-Z0-9_]`` removed (this covers ``$`` as well),
    e.g. ``'$volume'`` becomes ``'volume'``.

    The renames are applied positionally in one ``toDF`` call rather than
    one ``withColumnRenamed`` per column, so a single projection is added to
    the query plan regardless of how many columns there are.

    Args:
        df: DataFrame whose columns should be cleaned. Only ``df.columns``
            and ``df.toDF`` are used.

    Returns:
        ``df`` itself when no name needs changing, otherwise a new DataFrame
        with the same columns, in the same order, under the cleaned names.

    Raises:
        ValueError: if a column name is empty after cleaning, or if the
            cleaned names are not unique (the resulting frame would have
            ambiguous column references).
    """
    original_names = list(df.columns)
    # The character class already strips '$', so no separate replace is needed.
    clean_names = [
        re.sub(r'[^a-zA-Z0-9_]', '', name.replace(' ', '_'))
        for name in original_names
    ]

    # Nothing to rename: hand back the input untouched.
    if clean_names == original_names:
        return df

    first_owner = {}
    for original, cleaned in zip(original_names, clean_names):
        if not cleaned:
            raise ValueError(
                f"Column {original!r} has no valid characters left after sanitizing"
            )
        if cleaned in first_owner:
            raise ValueError(
                f"Columns {first_owner[cleaned]!r} and {original!r} "
                f"both sanitize to {cleaned!r}"
            )
        first_owner[cleaned] = original

    return df.toDF(*clean_names)

In [7]:
# Read schema for the landing file: every field is ingested as a nullable
# string, named exactly as in the file header.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        'DATE',
        'CE_BRAND_FLVR',
        'BRAND_NM',
        'Btlr_Org_LVL_C_Desc',
        'CHNL_GROUP',
        'TRADE_CHNL_DESC',
        'PKG_CAT',
        'Pkg_Cat_Desc',
        'TSR_PCKG_NM',
        '$volume',
        'YEAR',
        'MONTH',
        'PERIOD',
    )
])

In [8]:
# Source extract in the landing area.
path = f"{landing}abi_bus_case1_beverage_sales_20210726.csv"

# The file is UTF-16 encoded, tab-separated and has a header row; it is read
# with the explicit `schema` rather than an inferred one.
df = (
    spark.read
    .schema(schema)
    .option("encoding", "UTF-16")
    .option("header", True)
    .option("sep", '\t')
    .csv(path)
)

# Tag every row with the load date derived from `dtcarga` (defined outside
# this cell).
load_date = to_date(date_format(lit(f'{dtcarga}'), 'yyyy-MM-dd'))
df = df.withColumn('dtload', load_date)

In [9]:
# Clean the header names (e.g. '$volume' becomes 'volume') so the following
# steps can refer to columns by plain identifiers.
df = sanitize_columns(df)

In [10]:
# Target column -> typed expression, applied in this order. Everything was
# read as a string, so each entry converts one column to its working type.
typed_columns = [
    # NOTE(review): the target name is lower-case while the source column is
    # upper-case; with Spark's default case-insensitive resolution this
    # replaces the existing column -- confirm the intended casing.
    ('ce_brand_flvr', trim(col('CE_BRAND_FLVR')).cast('int')),
    ('DATE', to_date(trim(col('DATE')), "M/d/yyyy")),
    ('volume', trim(col('volume')).cast('double')),
    ('YEAR', col('YEAR').cast('int')),
    ('MONTH', col('MONTH').cast('int')),
    # Strip every non-alphanumeric character from PERIOD before the int cast.
    ('PERIOD', regexp_replace(col('PERIOD'), r"[^a-zA-Z0-9]", "").cast('int')),
]

for target_name, typed_expr in typed_columns:
    df = df.withColumn(target_name, typed_expr)

In [14]:
# Write the result as a Delta table under the bronze path, partitioned by
# load date.
# NOTE(review): dropna() with no arguments removes every row that has a null
# in any column -- confirm this is the intended filter.
# NOTE(review): mode('overwrite') is used without a replaceWhere condition;
# unless dynamic partition overwrite is configured this replaces the whole
# table, not only the current dtload partition -- confirm.
(
    df.dropna()
    .write
    .format('delta')
    .mode('overwrite')
    .partitionBy("dtload")
    .save(f'{bronze}sales')
)