In [1]:
# coding: utf-8

from pyspark.sql.functions import col
from pyspark.sql.functions import from_unixtime
from pyspark.sql.functions import lit
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.functions import UserDefinedFunction 
from pyspark.sql.functions import struct
from pyspark.sql.types import *

import boto3

import os
import json
import urllib
import re
from unicodedata import normalize
from itertools import islice

from gen_struct import transform_struct_json

In [2]:
"""
    Este trecho de código é necessário apenas quando o notebook é executado em ambiente local de teste
"""

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# Reuse the active Spark session when one exists, otherwise start a local one.
# Unlike constructing SparkContext('local') and swallowing the ValueError, this
# always leaves both `spark` and `sc` bound once the cell has run, so later
# cells cannot fail with a NameError when a context was already created.
spark = SparkSession.builder.master('local').getOrCreate()
sc = spark.sparkContext

#### Lendo objetos do S3

In [3]:
s3_resources = boto3.resource('s3')

# Address the bucket directly by name instead of listing every bucket in the
# account and indexing the result of filter(): filter() returns a
# non-subscriptable iterator on Python 3, and listing all buckets is an extra
# API round trip for a name that is already known.
bucket = s3_resources.Bucket('vtex-analytics-import')
prefix = 'vtex-checkout-versioned/95_FulfillmentOrder/001E600A63944702BF4EA7A92FCD3833/'

# Lazy collection of every object summary stored under the prefix.
hexdir_paths = bucket.objects.filter(Prefix=prefix)

# Keep only the objects whose key contains an '/id/' segment. A list (rather
# than a filter() iterator) can be iterated more than once if a later cell is
# re-run.
ids_paths = [obj for obj in hexdir_paths if '/id/' in obj.key]

#### Estruturando objetos e criando dataframe 

In [4]:
s3_client = boto3.client('s3')


def _load_structured_json(s3_object):
    """Download one S3 object, restructure its JSON payload and re-serialise it.

    `s3_object` must expose `bucket_name` and `key`. The payload is parsed,
    passed through `transform_struct_json`, and returned as a JSON string.
    """
    body = s3_client.get_object(
        Bucket=s3_object.bucket_name,
        Key=s3_object.key,
    )['Body']
    return json.dumps(transform_struct_json(json.loads(body.read())))


# One restructured JSON document (as a string) per entry in ids_paths.
structured_jsons = [_load_structured_json(s3_object) for s3_object in ids_paths]

# Hand the JSON strings to Spark and build the DataFrame from them.
jsonRDD = sc.parallelize(structured_jsons)
df = spark.read.json(jsonRDD)

#### Particionando dados e escrevendo

In [6]:
def getYear(var):
    """Return the year component of a 'YYYY-MM-DD[T...]' date string.

    Everything from the first 'T' onwards is ignored. Returns None when `var`
    is None, so that a null column value passed in by a Spark UDF yields a
    null result instead of raising AttributeError.
    """
    if var is None:
        return None
    return var.split('T')[0].split('-')[0]

def getMonth(var):
    """Return the month component of a 'YYYY-MM-DD[T...]' date string.

    Everything from the first 'T' onwards is ignored. Returns None when `var`
    is None, so that a null column value passed in by a Spark UDF yields a
    null result instead of raising AttributeError.
    """
    if var is None:
        return None
    return var.split('T')[0].split('-')[1]

def getDay(var):
    """Return the day component of a 'YYYY-MM-DD[T...]' date string.

    Everything from the first 'T' onwards is ignored. Returns None when `var`
    is None, so that a null column value passed in by a Spark UDF yields a
    null result instead of raising AttributeError.
    """
    if var is None:
        return None
    return var.split('T')[0].split('-')[2]

# Wrap each date-part extractor as a Spark UDF that produces a string column
udf_getYear, udf_getMonth, udf_getDay = (
    UserDefinedFunction(extractor, StringType())
    for extractor in (getYear, getMonth, getDay)
)

In [13]:
# Derive the YEAR / MONTH / DAY partition columns from the CreationDate column
df = (
    df.withColumn('YEAR', udf_getYear(df['CreationDate']))
      .withColumn('MONTH', udf_getMonth(df['CreationDate']))
      .withColumn('DAY', udf_getDay(df['CreationDate']))
)

# Append the table as JSON files (the writer call is .json, not Parquet),
# partitioned by date parts and InstanceId
(
    df.write
      .partitionBy(['YEAR','MONTH','DAY','InstanceId'])
      .mode('append')
      .json('../../../../data/s3/checkout/partitioned')
)

In [40]:
# stdf = spark.read.json('../../../../data/s3/checkout/structured_json/{*}CheckoutOrder/*/id/*')
# schema = stdf.schema
# tdf = spark.read.json('../../../../data/s3/checkout/history/{*}CheckoutOrder/*/id/*', schema=schema)