In [10]:
import os
import requests
import pyspark.sql.functions as F

from pyspark.sql import SparkSession
from tqdm.auto import tqdm
from pyspark.sql.types import StructType, StructField, StringType, IntegerType 

In [11]:
from minio import Minio
from minio.error import S3Error

# CONSTANTES

In [12]:
minio_endpoint = 'minio:9000'
minio_user = os.environ['MINIO_ROOT_USER']
minio_password = os.environ['MINIO_ROOT_PASSWORD']
buckets = ['bronze', 'silver', 'gold']

# CRIAR BUCKETS

In [13]:
minio_client = Minio(
    minio_endpoint,
    access_key=minio_user,
    secret_key=minio_password,
    secure=False,
)

for bucket in buckets:
    if not minio_client.bucket_exists(bucket):
        minio_client.make_bucket(bucket)
        print(f'Make bucket {bucket}!')
    else:
        print(f'Bucket {bucket} already exists!')

Bucket bronze already exists!
Bucket silver already exists!
Bucket gold already exists!


# INICIAR SESSÃO SPARK

In [None]:
spark = (
    SparkSession
        .builder
        .master("spark://spark:7077")
        .appName('MinIO')
        .config('spark.hadoop.fs.s3a.endpoint', minio_endpoint)
        .config('spark.hadoop.fs.s3a.access.key', minio_user)
        .config('spark.hadoop.fs.s3a.secret.key', minio_password)
        .config('spark.hadoop.fs.s3a.path.style.access', 'true')
        .config('spark.hadoop.fs.s3a.connection.ssl.enabled', 'false')
        .config('spark.hadoop.fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
        .config('spark.jars.packages', 'com.amazonaws:aws-java-sdk-bundle:1.12.262,org.apache.hadoop:hadoop-aws:3.3.4')
        .getOrCreate()
)

In [None]:
import json

for index, row in grouped.iterrows():
    if index > 5:  #LIMITANDO A QUANTIDADE DE ITERAÇÕES
        break
        
    order_id = row['order_id']
    item = row['items']
    
    json_data = {
        'order_id': order_id,
        'payment_type': df_pay.loc[df_pay['order_id'] == order_id, 'payment_type'].values[0],
        'payment_value': float(df_pay.loc[df_pay['order_id'] == order_id, 'payment_value'].values[0]),
        'seller_id': df_pay.loc[df_pay['order_id'] == order_id, 'seller_id'].values[0],
        'items': item
    }
    
    with open(f'C:\\Users\\RodrigoPintoMesquita\\Documents\\GitHub\\Faculdade\\Notebooks Trabalhos 2025H1\\PB\\data\\land\\{order_id}.json', 'w') as json_file:
        json.dump(json_data, json_file, indent=4)

# GRAVANDO NA CAMADA BRONZE

In [133]:
import os
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, ArrayType

files = os.listdir(folder)

import os
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, ArrayType

file = 'categories.csv'

categories_schema = StructType([
    StructField('product_id', StringType(), True),
    StructField('product_category_name', StringType(), True)
])

df = spark.read.csv(file, schema=categories_schema, header=True)
df.coalesce(1).write.mode('overwrite').parquet(f's3a://bronze/categories.parquet')

                                                                                

In [162]:
import os
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, ArrayType

folder = 'land/'
files = os.listdir(folder)

item_schema = StructType([
    StructField("order_id", StringType(), True),
    StructField("order_item_id", IntegerType(), True),
    StructField("product_id", StringType(), True),
    StructField("seller_id", StringType(), True),
    StructField("shipping_limit_date", StringType(), True),
    StructField("price", FloatType(), True),
    StructField("freight_value", FloatType(), True)
])

orders_schema = StructType([
    StructField("order_id", StringType(), True),
    StructField("order_purchase_timestamp", StringType(), True),
    StructField("customer_id", StringType(), True),
    StructField("payment_type", StringType(), True),
    StructField("payment_value", FloatType(), True),
    StructField("items", ArrayType(item_schema), True) 
])


for file in files:
    name = file.replace(".json", "")
    df = spark.read.json(f'{folder}{file}', schema=orders_schema, multiLine=True)
    df.coalesce(1).write.mode('overwrite').parquet(f's3a://bronze/vendas/{name}.parquet')

25/05/31 17:03:40 WARN DataSource: All paths were ignored:
  file:/work/notebooks/land/.ipynb_checkpoints
                                                                                

# GRAVANDO NA CAMADA PRATA

In [None]:
%pip install boto3

In [168]:
import boto3
import os

s3 = boto3.client(
    's3',
    endpoint_url=f'http://{minio_endpoint}',
    aws_access_key_id=minio_user,
    aws_secret_access_key=minio_password
)

prefix = 'vendas/'  
response = s3.list_objects_v2(
    Bucket='bronze',
    Prefix=prefix,
    Delimiter='/'
)

pastas = [
    cp['Prefix']
    for cp in response.get('CommonPrefixes', [])
    if '.ipynb_checkpoints' not in cp['Prefix']
]

parquets = []

for pasta in pastas:
    parquets.append(pasta[len(prefix):].rstrip('/'))

parquets

['00143d0f86d6fbd9f9b38ab440ac16f5.parquet',
 '47770eb9100c2d0c44946d9cf07ec65d.parquet',
 '53cdb2fc8bc7dce0b6741e2150273451.parquet',
 '949d5b44dbf5de918fe9c16f97b45f8a.parquet',
 'e481f51cbdc54678b7cc49136f2d6af7.parquet']

In [213]:
folder = 's3a://bronze/'
folder_vendas = f'{folder}vendas/'

df_categories = spark.read.parquet(f'{folder}categories.parquet', schema=orders_schema)

for file in parquets:
    df = spark.read.parquet(f'{folder_vendas}{file}', schema=categories_schema)

    # Parseando o campo timestamp
    df = df.withColumn('year', F.year(F.to_timestamp(df['order_purchase_timestamp'], 'yyyy-MM-dd HH:mm:ss')))
    df = df.withColumn('month', F.month(F.to_timestamp(df['order_purchase_timestamp'], 'yyyy-MM-dd HH:mm:ss')))
    df = df.withColumn('day', F.dayofmonth(F.to_timestamp(df['order_purchase_timestamp'], 'yyyy-MM-dd HH:mm:ss')))
    df = df.withColumn('hour', F.hour(F.to_timestamp(df['order_purchase_timestamp'], 'yyyy-MM-dd HH:mm:ss')))

    # CRIANDO O OBJETO VENDAS_TRANSACAO
    df_trans = df[['order_id', 'order_purchase_timestamp', 'customer_id', 'payment_type', 'payment_value', 'year', 'month', 'day', 'hour']]
    df_trans = df_trans.dropDuplicates()

    (
        df_trans
        .write
        .partitionBy(['year', 'month'])
        .mode('append')
        .parquet(f's3a://silver/vendas_transacoes.parquet')
    )

    # CRIANDO O OBJETO VENDAS_ITENS

    # Acessando a Array dentro da coluna items
    df_itens = df.withColumn("item", F.explode(df["items"]))

    df_itens = df_itens.select(
        F.col("item.order_id"),
        F.col("item.order_item_id"),
        F.col("item.product_id"),
        F.col("item.seller_id"),
        F.col("item.shipping_limit_date"),
        F.col("item.price"),
        F.col("item.freight_value"),
        F.col("order_purchase_timestamp"),
        F.col("year"),
        F.col("month"),
        F.col("day"),
        F.col("hour")
    )

    # Calculando o valor total por item do pedido
    df_itens = df_itens.withColumn('total_item_value', df_itens['price'] + df_itens['freight_value'])

    # Join com a tabela de produtos para trazer a categoria
    df_itens = df_itens.join(df_categories, on='product_id', how='left')

    df_itens = df_itens.select('order_id', 'product_category_name', 'order_item_id', 'product_id', 'price', 'freight_value', 'total_item_value', 'year', 'month', 'day', 'hour')

    (
        df_itens
        .write
        .partitionBy(['product_category_name', 'year'])
        .mode('append')
        .parquet(f's3a://silver/vendas_items.parquet')
    )


                                                                                

# GRAVANDO NA CAMADA GOLD

In [225]:
from pyspark.sql import functions as F

folder = 's3a://silver/'

#CRIANDO O OBJETO payment_type_history
df_trans = spark.read.parquet(f'{folder}vendas_transacoes.parquet', inferSchema=True)

df_payment_type_history = df_trans.groupby(['year', 'month']).agg(
    F.sum('payment_value').alias('total_payment_value'),
    F.count('order_id').alias('qty_items')
)


(
    df_payment_type_history
    .write
    .mode('overwrite')
    .parquet(f's3a://gold/payment_type_history.parquet')
)


#CRIANDO O OBJETO products_history
df_items = spark.read.parquet(f'{folder}vendas_items.parquet', inferSchema=True)

df_products_history = df_items.groupby(['product_category_name','year', 'month']).agg(
    F.sum('total_item_value').alias('total_value'),
    F.sum('price').alias('value_without_freight'),
    F.count('order_id').alias('qty_items')
)


(
    df_products_history
    .write
    .mode('overwrite')
    .parquet(f's3a://gold/products_history.parquet')
)
