In [2]:
import pandas as pd
from google.cloud import storage
!pip install gcsfs --quiet

bucket_name = 'projeto-recomendador'

client = storage.Client()
bucket = client.bucket(bucket_name)

In [13]:
# Load the source datasets

# Olist CSV locations inside the bucket
olist_prefix = f"gs://{bucket_name}/base_olist"
olist_orders = f"{olist_prefix}/olist_orders_dataset.csv"
olist_orders_items = f"{olist_prefix}/olist_order_items_dataset.csv"
olist_orders_products = f"{olist_prefix}/olist_products_dataset.csv"

# Read each file into its own DataFrame
orders_df = pd.read_csv(olist_orders)
items_df = pd.read_csv(olist_orders_items)
products_df = pd.read_csv(olist_orders_products)

# Print the first rows of every frame, each prefixed with its label
for label, frame in (("orders", orders_df), ("items", items_df), ("products", products_df)):
    print(label, frame.head())

orders                            order_id                       customer_id  \
0  e481f51cbdc54678b7cc49136f2d6af7  9ef432eb6251297304e76186b10a928d   
1  53cdb2fc8bc7dce0b6741e2150273451  b0830fb4747a6c6d20dea0b8c802d7ef   
2  47770eb9100c2d0c44946d9cf07ec65d  41ce2a54c0b03bf3443c3d931a367089   
3  949d5b44dbf5de918fe9c16f97b45f8a  f88197465ea7920adcdbec7375364d82   
4  ad21c59c0840e6cb83a9ceb5573f8159  8ab97904e6daea8866dbdbc4fb7aad2c   

  order_status order_purchase_timestamp    order_approved_at  \
0    delivered      2017-10-02 10:56:33  2017-10-02 11:07:15   
1    delivered      2018-07-24 20:41:37  2018-07-26 03:24:27   
2    delivered      2018-08-08 08:38:49  2018-08-08 08:55:23   
3    delivered      2017-11-18 19:28:06  2017-11-18 19:45:59   
4    delivered      2018-02-13 21:18:39  2018-02-13 22:20:29   

  order_delivered_carrier_date order_delivered_customer_date  \
0          2017-10-04 19:55:00           2017-10-10 21:25:13   
1          2018-07-26 14:31:00           

In [14]:
# Joins

# Columns kept from the orders/items join
purchase_columns = ["customer_id", "product_id", "order_purchase_timestamp"]

# orders + items: inner join on order_id, keeping only the purchase columns
orders_items = pd.merge(orders_df, items_df, how="inner", on="order_id")[purchase_columns]

# orders_items + products: left join that attaches each product's category name
category_lookup = products_df[["product_id", "product_category_name"]]
orders_items_products = pd.merge(orders_items, category_lookup, how="left", on="product_id")

# Final frame: the purchase columns followed by the category name
dataset_olist_final = orders_items_products[purchase_columns + ["product_category_name"]]

In [23]:
# Null handling and date conversion.
#
# The cleaning is written as one chain that returns a new frame instead of
# mutating dataset_olist_final in place. That frame comes from a column
# selection on another DataFrame, so inplace dropna / column assignment on it
# is what makes pandas emit SettingWithCopyWarning.
dataset_olist_final = (
    dataset_olist_final
    # drop rows that have a null in any column
    .dropna()
    # parse the purchase timestamp; values that cannot be parsed become NaT
    .assign(
        order_purchase_timestamp=lambda frame: pd.to_datetime(
            frame["order_purchase_timestamp"], errors="coerce"
        )
    )
    # drop the rows whose timestamp failed to parse
    .dropna(subset=["order_purchase_timestamp"])
)

print(dataset_olist_final.head())
# Sanity check: prints the rows that still contain a null (expected to be empty)
print("Linhas com nulos:", dataset_olist_final[dataset_olist_final.isna().any(axis=1)])

                        customer_id                        product_id  \
0  9ef432eb6251297304e76186b10a928d  87285b34884572647811a353c7ac498a   
1  b0830fb4747a6c6d20dea0b8c802d7ef  595fac2a385ac33a80bd5114aec74eb8   
2  41ce2a54c0b03bf3443c3d931a367089  aa4383b373c6aca5d8797843e5594415   
3  f88197465ea7920adcdbec7375364d82  d0b61bfb1de832b15ba9d266ca96e5b0   
4  8ab97904e6daea8866dbdbc4fb7aad2c  65266b2da20d04dbe00c5c2d3bb7859e   

  order_purchase_timestamp  product_category_name  
0      2017-10-02 10:56:33  utilidades_domesticas  
1      2018-07-24 20:41:37             perfumaria  
2      2018-08-08 08:38:49             automotivo  
3      2017-11-18 19:28:06               pet_shop  
4      2018-02-13 21:18:39              papelaria  
Linhas com nulos: Empty DataFrame
Columns: [customer_id, product_id, order_purchase_timestamp, product_category_name]
Index: []


In [24]:
# Write the cleaned purchase history to a local CSV file
local_file = "historico_compras_clientes.csv"

# index=False leaves the DataFrame row index out of the file
dataset_olist_final.to_csv(path_or_buf=local_file, index=False)

print("CSV salvo localmente.")

CSV salvo localmente.


In [25]:
# Upload the local CSV to the bucket

# Object name the file gets inside the bucket
destination_blob = "historico_compras_clientes.csv"

# This cell builds its own client and bucket handle before uploading
client = storage.Client()
bucket = client.bucket(bucket_name)

# Create the blob handle, then send the contents of the local file to it
blob = bucket.blob(blob_name=destination_blob)
blob.upload_from_filename(filename=local_file)

print(f"Upload concluído: gs://{bucket_name}/{destination_blob}")

Upload concluído: gs://projeto-recomendador/historico_compras_clientes.csv
