In [9]:
# coding: utf-8

from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import *

In [10]:
"""
    Este trecho de código é necessário apenas quando o notebook é executado em ambiente local
"""

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# Bootstrap `sc` and `spark` for a local run. When a SparkContext already
# exists, SparkContext('local') raises ValueError; instead of silently
# ignoring it (which could leave `sc`/`spark` undefined in this notebook),
# bind both names to the session that is already running.
try:
    sc = SparkContext('local')
    spark = SparkSession(sc)
except ValueError:
    # A Spark context is already created: reuse it rather than failing.
    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext

#### Recuperando schema interface

In [12]:
# Location of the already-structured CheckoutOrder JSON documents.
# NOTE(review): the bucket is spelled 'vtex.datale' here, while the output path
# written at the end of this notebook uses 'vtex.datalake' - confirm this is
# not a typo.
structured_jsons_path = 's3://vtex.datale/structured_json/{*}CheckoutOrder/*/id/*'

# Load the structured documents; no schema is supplied, so Spark derives one
# from the data itself.
structured_df = spark.read.json(path=structured_jsons_path)

# Keep the resulting schema so it can be reused when reading other datasets.
schema = structured_df.schema

#### Usando o schema para, ao ler dados históricos, convertê-los já para a estrutura ideal

In [13]:
# Location of the historical (versioned) CheckoutOrder JSON documents.
history_datapath = 's3://vtex-analytics-import/vtex-checkout-versioned/{*}CheckoutOrder/*/id/*'

# Read the historical data using the schema captured from the structured
# dataset, so the rows are parsed directly into that structure.
df = spark.read.json(path=history_datapath, schema=schema)

#### Particionando dados e escrevendo

In [14]:
def getYear(var):
    """Return the year part of a timestamp string.

    The value is expected to look like 'YYYY-MM-DDThh:mm:ss': everything
    before the first 'T' is taken as the date, and the text before the first
    '-' of that date is returned.

    Args:
        var: timestamp string, or None (the source column is nullable).

    Returns:
        The year as a string, or None when `var` is None.
    """
    # Null timestamps must not crash the UDF; propagate the null instead.
    if var is None:
        return None
    return var.split('T')[0].split('-')[0]

def getMonth(var):
    """Return the month part of a timestamp string.

    The value is expected to look like 'YYYY-MM-DDThh:mm:ss': everything
    before the first 'T' is taken as the date, and its second '-'-separated
    field is returned.

    Args:
        var: timestamp string, or None (the source column is nullable).

    Returns:
        The month as a string, or None when `var` is None or has no month
        field.
    """
    # Null timestamps must not crash the UDF; propagate the null instead.
    if var is None:
        return None
    date_fields = var.split('T')[0].split('-')
    # A value without a month field yields None rather than an IndexError.
    return date_fields[1] if len(date_fields) > 1 else None

def getDay(var):
    """Return the day part of a timestamp string.

    The value is expected to look like 'YYYY-MM-DDThh:mm:ss': everything
    before the first 'T' is taken as the date, and its third '-'-separated
    field is returned.

    Args:
        var: timestamp string, or None (the source column is nullable).

    Returns:
        The day as a string, or None when `var` is None or has no day field.
    """
    # Null timestamps must not crash the UDF; propagate the null instead.
    if var is None:
        return None
    date_fields = var.split('T')[0].split('-')
    # A value without a day field yields None rather than an IndexError.
    return date_fields[2] if len(date_fields) > 2 else None

# Wrap each date-part helper as a Spark UDF that returns a string column.
udf_getYear, udf_getMonth, udf_getDay = (
    UserDefinedFunction(helper, StringType())
    for helper in (getYear, getMonth, getDay)
)

In [23]:
# Add the YEAR / MONTH / DAY columns used for partitioning, each one derived
# from the LastChange timestamp through the corresponding UDF.
df = (
    df.withColumn('YEAR', udf_getYear(df.LastChange))
      .withColumn('MONTH', udf_getMonth(df.LastChange))
      .withColumn('DAY', udf_getDay(df.LastChange))
)

# TODO: LastChange is a nullable field; the date-extraction helpers must
#       tolerate null values, otherwise this step can fail at execution time.

In [None]:
# Persist the table to S3 in Parquet format, partitioned by the date columns
# and by InstanceId.
# NOTE(review): the write uses 'append' mode, so re-running this cell presumably
# adds the same rows again - confirm this is intended.
(
    df.write
      .partitionBy(['YEAR', 'MONTH', 'DAY', 'InstanceId'])
      .mode('append')
      .parquet('s3://vtex.datalake/consumable_tables/')
)