In [3]:
# coding: utf-8

from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import *

In [4]:
"""
    Este trecho de código é necessário apenas quando o notebook é executado em ambiente local
"""

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# getOrCreate() reuses the session/context that is already running (for
# example when the notebook is attached to a cluster) and only starts a local
# one when none exists. Unlike the previous try/except-pass, this never hides
# an unrelated start-up error and always leaves `spark` and `sc` bound.
spark = SparkSession.builder.master('local').getOrCreate()
sc = spark.sparkContext

#### Recuperando schema interface

In [11]:
# Glob pointing at JSON files that are already in the structured layout.
structured_jsons_path = '../../../../aws_s3/teste/structured_json/checkout/00_CheckoutOrder/001E600A63944702BF4EA7A92FCD3833/id/*'
# Let Spark infer the schema by reading those files.
structured_df = spark.read.json(structured_jsons_path)

# Schema inferred from the structured files.
schema = structured_df.schema

#### Usando o schema para, ao ler dados históricos, convertê-los já para a estrutura ideal

In [29]:
# Glob pointing at the historical (not yet structured) JSON files.
history_datapath = '../../../../aws_s3/teste/history/checkout/001E600A63944702BF4EA7A92FCD3833//id/*'
df = spark.read.json(history_datapath)

# The code below works for Items (it is wrapped in a string literal, so it is not executed)

'''
# Get Item Type
types = filter(lambda f: f.name == "Items", structured_df.schema.fields)
itemType = types[0].dataType

def convertData(dado):
    def _parse_product_categories(raw_product_categories):
        product_categories = []
        for k,v in raw_product_categories.items():
            product_categories.append({
                'id': k,
                'name': v
            })
        return product_categories
    
    if(dado == "Items"):
        items = json.loads(dado)
        for item in items:
            product_categories = _parse_product_categories(item.get('productCategories', {}))
            item['productCategories'] = product_categories

    return items

# Register functions as Spark UDFs 
udf_getData = UserDefinedFunction(convertData, itemType)

df = df.withColumn('Items', udf_getData("Items"))

df.select("Items").show()
'''

'\n# Get Item Type\ntypes = filter(lambda f: f.name == "Items", structured_df.schema.fields)\nitemType = types[0].dataType\n\ndef convertData(dado):\n    def _parse_product_categories(raw_product_categories):\n        product_categories = []\n        for k,v in raw_product_categories.items():\n            product_categories.append({\n                \'id\': k,\n                \'name\': v\n            })\n        return product_categories\n    \n    if(dado == "Items"):\n        items = json.loads(dado)\n        for item in items:\n            product_categories = _parse_product_categories(item.get(\'productCategories\', {}))\n            item[\'productCategories\'] = product_categories\n\n    return items\n\n# Register functions as Spark UDFs \nudf_getData = UserDefinedFunction(convertData, itemType)\n\ndf = df.withColumn(\'Items\', udf_getData("Items"))\n\ndf.select("Items").show()\n'

### Código para converter os dados com spark usando cast

#### TODO: ver a forma de lidar com as comparações em `get_transform_data`, pois o field vem como uma coluna, teria de ver uma forma de pegar seu nome, para poder comparar e fazer as conversões corretas
#### TODO: ver se o `remove_attachments` funciona corretamente

In [31]:
import json
import os

# Substring that remove_attachments looks for (against the lower-cased key)
# to decide which entries to delete.
ATTACHMENT = "attachment"
# Names that get_transform_data checks to pick a field-specific transformation.
JSON_KEY_ITEMMETADATA = "ItemMetadata"
JSON_KEY_ITEMS = "Items"
JSON_KEY_RATESANDBDATA = "RatesAndBenefitsData"
JSON_KEY_CUSTOMDATA = "CustomData"


def remove_attachments(dic):
    """Delete, in place, every entry whose key mentions an attachment.

    A key is removed when its lower-cased form contains ``ATTACHMENT``. The
    search descends into nested dictionaries and into lists at any depth, so
    dictionaries stored inside lists of lists are cleaned as well (previously
    only dictionaries sitting directly inside a list were visited).

    Args:
        dic: dictionary to clean; it is modified in place.

    Returns:
        The same ``dic`` object, after the removal.
    """
    def _scrub(value):
        # Walk containers only; scalar values have nothing to remove.
        if isinstance(value, dict):
            remove_attachments(value)
        elif isinstance(value, list):
            for element in value:
                _scrub(element)

    # Iterate over a snapshot of the keys because entries are deleted on the fly.
    for key in list(dic):
        if ATTACHMENT in key.lower():
            del dic[key]
        else:
            _scrub(dic[key])
    return dic


def parse_items(items):
    """Rewrite each item's ``productCategories`` as a list of id/name records.

    The raw value is a mapping ``{category_id: category_name}``; it is
    replaced, in place, by ``[{'id': category_id, 'name': category_name}, ...]``.
    Items without the key, or with a null value, get an empty list. A value
    that is already a list is kept untouched, so running the function twice
    on the same data is harmless.

    Args:
        items: list of item dictionaries (``None`` is tolerated).

    Returns:
        The same ``items`` object, with every item updated.
    """
    def _parse_product_categories(raw_product_categories):
        if not raw_product_categories:
            # Missing key, explicit null or empty mapping.
            return []
        if isinstance(raw_product_categories, list):
            # Already converted by a previous pass.
            return raw_product_categories
        return [
            {'id': category_id, 'name': category_name}
            for category_id, category_name in raw_product_categories.items()
        ]

    for item in items or []:
        item['productCategories'] = _parse_product_categories(item.get('productCategories'))

    return items


def transform_itemmetadata(itemmetadata):
    """Drop ``assemblyOptions`` from every entry of ``itemmetadata["items"]``.

    The dictionary is modified in place. A value that is not a dictionary, or
    one whose ``"items"`` key is missing or null, is returned unchanged
    instead of raising.

    Args:
        itemmetadata: decoded item-metadata dictionary.

    Returns:
        The same ``itemmetadata`` object.
    """
    if not isinstance(itemmetadata, dict):
        return itemmetadata

    for item in itemmetadata.get("items") or []:
        if isinstance(item, dict):
            item.pop("assemblyOptions", None)
    return itemmetadata
    
    
def transform_ratesandbenefitsdata(data):
    """Drop ``matchedParameters`` and ``additionalInfo`` from every identifier.

    The entries live under ``data["rateAndBenefitsIdentifiers"]`` and are
    modified in place. A value that is not a dictionary, or one whose
    identifier list is missing or null, is returned unchanged instead of
    raising.

    Args:
        data: decoded rates-and-benefits dictionary.

    Returns:
        The same ``data`` object.
    """
    KEY_IDENTIFIERS = "rateAndBenefitsIdentifiers"
    KEY_MATCH_PARAMS = "matchedParameters"
    KEY_ADDINFO = "additionalInfo"

    if not isinstance(data, dict):
        return data

    for identifier in data.get(KEY_IDENTIFIERS) or []:
        if isinstance(identifier, dict):
            identifier.pop(KEY_MATCH_PARAMS, None)
            identifier.pop(KEY_ADDINFO, None)
    return data


def transform_customdata(customdata):
    """Strip the ``cart-extra-context`` entry from the fields of each custom app.

    Only entries of ``customdata["customApps"]`` that have a ``fields``
    mapping containing ``cart-extra-context`` are touched; the change is made
    in place.

    Args:
        customdata: decoded custom-data dictionary.

    Returns:
        The same ``customdata`` object.
    """
    apps_key, fields_key, dropped_key = "customApps", "fields", "cart-extra-context"

    # Nothing to clean when there are no custom apps at all.
    if apps_key not in customdata:
        return customdata

    apps = customdata[apps_key]
    for position in range(len(apps)):
        app = apps[position]
        if fields_key not in app:
            continue
        app_fields = app[fields_key]
        if dropped_key in app_fields:
            del app_fields[dropped_key]
    return customdata
    
def get_transform_data(field, field_name=None):
    """Decode one JSON-encoded column value and apply the clean-up for its column.

    The previous version compared ``field`` (the column *value*) with the
    column-name constants. A value that decodes as JSON can never equal one
    of those names, so no field-specific transformation was ever applied.
    The column name is now received explicitly through ``field_name``.

    Args:
        field: JSON text of the column value, or ``None`` for a null value.
        field_name: name of the column the value belongs to. When omitted,
            only the generic attachment removal is applied.

    Returns:
        The decoded and cleaned structure, or ``None`` when the value is null
        (either a null column value or the JSON literal ``null``).

    Raises:
        ValueError: if ``field`` is not valid JSON (raised by ``json.loads``).
    """
    if field is None:
        # Null column values stay null instead of crashing json.loads.
        return None

    structured_field = json.loads(field)
    if structured_field is None:
        return None

    if field_name == JSON_KEY_ITEMS:
        structured_field = parse_items(structured_field)
    elif field_name == JSON_KEY_ITEMMETADATA:
        structured_field = transform_itemmetadata(structured_field)
    elif field_name == JSON_KEY_RATESANDBDATA:
        structured_field = transform_ratesandbenefitsdata(structured_field)
    elif field_name == JSON_KEY_CUSTOMDATA:
        structured_field = transform_customdata(structured_field)

    if isinstance(structured_field, dict):
        structured_field = remove_attachments(structured_field)

    return structured_field


def transform_struct_df(df_no_structured, fields):
    """Convert the non-string columns of a DataFrame to their structured types.

    For every schema field whose type is not ``StringType`` the column with
    the same name is replaced by the result of ``get_transform_data``, wrapped
    in a UDF whose return type is the field's type.

    Args:
        df_no_structured: DataFrame holding the raw columns
            (assumed to hold JSON text in the converted columns -- TODO confirm).
        fields: iterable of schema fields (objects with ``name`` and ``dataType``).

    Returns:
        The DataFrame with the converted columns.
    """
    def _column_transformer(column_name):
        # A factory is required so that each UDF captures its own column name
        # (a closure created directly in the loop would see only the last one).
        def _transform(value):
            return get_transform_data(value, column_name)
        return _transform

    for field in fields:
        fieldType = field.dataType
        fieldName = field.name

        if fieldType != StringType():
            udf_get_transform_data = UserDefinedFunction(_column_transformer(fieldName), fieldType)
            df_no_structured = df_no_structured.withColumn(fieldName, udf_get_transform_data(fieldName))

    return df_no_structured

In [35]:
# Convert the columns of the history DataFrame using the fields of the structured schema.
df = transform_struct_df(df, structured_df.schema.fields)

# Preview the converted Items column.
df.select("Items").show()

IllegalArgumentException: u'Unsupported class file major version 55'

#### Particionando dados e escrevendo

In [33]:
def _date_component(lastChange, creationDate, position):
    """Return one '-'-separated component of the date part of a timestamp.

    ``lastChange`` is preferred; ``creationDate`` is used only when
    ``lastChange`` is ``None``. The text before the first ``'T'`` is split on
    ``'-'`` and the element at ``position`` is returned (0 = year, 1 = month,
    2 = day). ``None`` is returned when both inputs are ``None``.
    """
    date = lastChange if lastChange is not None else creationDate
    if date is None:
        # Nullable columns: propagate the null instead of failing on None.split.
        return None
    return date.split('T')[0].split('-')[position]


def getYear(lastChange, creationDate=None):
    """Year (as a string) of ``lastChange``, falling back to ``creationDate``.

    ``creationDate`` is optional so the function can also be used as a
    single-column UDF. Returns ``None`` when no date is available.
    """
    return _date_component(lastChange, creationDate, 0)


def getMonth(lastChange, creationDate=None):
    """Month (as a string) of ``lastChange``, falling back to ``creationDate``.

    ``creationDate`` is optional so the function can also be used as a
    single-column UDF. Returns ``None`` when no date is available.
    """
    return _date_component(lastChange, creationDate, 1)


def getDay(lastChange, creationDate=None):
    """Day (as a string) of ``lastChange``, falling back to ``creationDate``.

    ``creationDate`` is optional so the function can also be used as a
    single-column UDF. Returns ``None`` when no date is available.
    """
    return _date_component(lastChange, creationDate, 2)

# Register the date helpers as Spark UDFs; each one is declared to return a string.
udf_getYear = UserDefinedFunction(getYear, StringType())
udf_getMonth = UserDefinedFunction(getMonth, StringType())
udf_getDay = UserDefinedFunction(getDay, StringType())

In [36]:
# Create the columns used for partitioning.
# NOTE(review): only LastChange is passed to each UDF, so no creation-date
# fallback is supplied at these call sites -- confirm this is intended.
df = df.withColumn('YEAR', udf_getYear(df.LastChange))
df = df.withColumn('MONTH', udf_getMonth(df.LastChange))
df = df.withColumn('DAY', udf_getDay(df.LastChange))


df.select("YEAR").show()
# TODO: LastChange is a nullable field. This can cause problems when running
#         the helper functions defined earlier

IllegalArgumentException: u'Unsupported class file major version 55'

In [None]:
# Save table to S3 using Parquet format and partitioning by defined columns.
# The 'append' mode adds the new files to whatever already exists under the target path.
df.write.partitionBy(['YEAR','MONTH','DAY','InstanceId'])\
    .mode('append')\
    .parquet('s3://vtex.datalake/consumable_tables/')