In [1]:
%%time
%matplotlib inline
%load_ext autoreload
%autoreload 2

import sys
from dataclasses import dataclass
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv


@dataclass
class Config:
    project_dir: Path = Path("../../")
    model_dir: Path = project_dir / "models"
    outputs_dir: Path = project_dir / "outputs"
    data_dir: Path = project_dir / "data"
    raw_dir: Path = data_dir / "raw"
    interim_dir: Path = data_dir / "interim"
    processed_dir: Path = data_dir / "processed"


config = Config()
sys.path.append(str(config.project_dir.resolve()))
load_dotenv()

CPU times: user 435 ms, sys: 154 ms, total: 589 ms
Wall time: 956 ms


True

In [2]:
df_products = pd.read_csv(config.raw_dir / "olist_products_dataset.csv")
df_product_category_name_translation = pd.read_csv(
    config.raw_dir / "product_category_name_translation.csv"
)
df_order_items = pd.read_csv(config.raw_dir / "olist_order_items_dataset.csv")
df_sellers = pd.read_csv(config.raw_dir / "olist_sellers_dataset.csv")


## df_products

In [3]:
df_products.shape

(32951, 9)

In [4]:
df_prodcuts_merged = pd.merge(
    df_products,
    df_product_category_name_translation,
    on="product_category_name",
    how="left",
)
assert df_prodcuts_merged.shape[0] == df_products.shape[0]

In [5]:
print(df_prodcuts_merged["product_category_name"].isnull().sum())
print(df_prodcuts_merged["product_category_name_english"].isnull().sum())
print(
    df_prodcuts_merged.query(
        "~product_category_name.isna() & product_category_name_english.isna()"
    ).shape
)
display(
    df_prodcuts_merged.query(
        "~product_category_name.isna() & product_category_name_english.isna()"
    )
)


610
623
(13, 10)


Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_category_name_english
1628,0105b5323d24fc655f73052694dbbb3a,pc_gamer,59.0,621.0,4.0,2839.0,19.0,16.0,18.0,
5821,6fd83eb3e0799b775e4f946bd66657c0,portateis_cozinha_e_preparadores_de_alimentos,52.0,280.0,1.0,1200.0,25.0,33.0,25.0,
7325,5d923ead886c44b86845f69e50520c3e,portateis_cozinha_e_preparadores_de_alimentos,58.0,284.0,1.0,1200.0,25.0,33.0,25.0,
7478,6727051471a0fc4a0e7737b57bff2549,pc_gamer,60.0,1532.0,3.0,650.0,16.0,22.0,20.0,
8819,bed164d9d628cf0593003389c535c6e0,portateis_cozinha_e_preparadores_de_alimentos,54.0,382.0,2.0,850.0,30.0,21.0,22.0,
11039,1220978a08a6b29a202bc015b18250e9,portateis_cozinha_e_preparadores_de_alimentos,46.0,280.0,1.0,1200.0,25.0,33.0,25.0,
14266,ae62bb0f95af63d64eae5f93dddea8d3,portateis_cozinha_e_preparadores_de_alimentos,59.0,927.0,1.0,10600.0,40.0,20.0,38.0,
16182,1954739d84629e7323a4295812a3e0ec,portateis_cozinha_e_preparadores_de_alimentos,58.0,792.0,4.0,750.0,30.0,30.0,30.0,
16930,dbe520fb381ad695a7e1f2807d20c765,pc_gamer,60.0,840.0,6.0,800.0,18.0,22.0,22.0,
17800,c7a3f1a7f9eef146cc499368b578b884,portateis_cozinha_e_preparadores_de_alimentos,52.0,1372.0,5.0,7350.0,40.0,30.0,23.0,


## df_order_items

In [6]:
df_order_items.shape

(112650, 7)

In [7]:
df_order_items[["order_id", "order_item_id"]].drop_duplicates().shape

(112650, 2)

In [8]:
df_order_items["sum_price_freight_by_order"] = (
    df_order_items["price"] + df_order_items["freight_value"]
)

In [9]:
df_order_items_products_merged = pd.merge(
    df_order_items, df_prodcuts_merged, on="product_id", how="left"
)
assert df_order_items.shape[0] == df_order_items_products_merged.shape[0]


## df_sellers

In [10]:
df_sellers.head()

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP
2,ce3ad9de960102d0677a81f5d0bb7b2d,20031,rio de janeiro,RJ
3,c0f3eea2e14555b6faeea3dd58c1b1c3,4195,sao paulo,SP
4,51a04a8a6bdcb23deccc82b0b80742cf,12914,braganca paulista,SP


In [11]:
df_order_items_products_sellers_merged = pd.merge(
    df_order_items_products_merged, df_sellers, on="seller_id", how="left"
)
assert (
    df_order_items_products_sellers_merged.shape[0]
    == df_order_items_products_merged.shape[0]
)


In [14]:
df_order_items_products_sellers_merged.to_csv(
    config.interim_dir / "olist_item_product_seller_merged.csv", index=False
)
