In [None]:
%%time
%matplotlib inline
%load_ext autoreload
%autoreload 2

import json
import sys
import time
from dataclasses import dataclass
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm


@dataclass
class Config:
    project_dir: Path = Path("../../")
    model_dir: Path = project_dir / "models"
    outputs_dir: Path = project_dir / "outputs"
    data_dir: Path = project_dir / "data"
    raw_dir: Path = data_dir / "raw"
    interim_dir: Path = data_dir / "interim"
    processed_dir: Path = data_dir / "processed"


config = Config()
sys.path.append(str(config.project_dir.resolve()))
load_dotenv()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
CPU times: user 2.19 ms, sys: 4.34 ms, total: 6.52 ms
Wall time: 6.67 ms


True

In [49]:
df_order_reviews = pd.read_csv(config.raw_dir / "olist_order_reviews_dataset.csv")
df_products = pd.read_csv(config.raw_dir / "olist_products_dataset.csv")
df_product_category_name_translation = pd.read_csv(
    config.raw_dir / "product_category_name_translation.csv"
)

## df_products

In [26]:
df_prodcuts_merged = pd.merge(
    df_products,
    df_product_category_name_translation,
    on="product_category_name",
    how="left",
)
assert df_prodcuts_merged.shape[0] == df_products.shape[0]

In [28]:
print(df_prodcuts_merged["product_category_name"].isnull().sum())
print(df_prodcuts_merged["product_category_name_english"].isnull().sum())
print(
    df_prodcuts_merged.query(
        "~product_category_name.isna() & product_category_name_english.isna()"
    ).shape
)
display(
    df_prodcuts_merged.query(
        "~product_category_name.isna() & product_category_name_english.isna()"
    )
)


610
623
(13, 10)


Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_category_name_english
1628,0105b5323d24fc655f73052694dbbb3a,pc_gamer,59.0,621.0,4.0,2839.0,19.0,16.0,18.0,
5821,6fd83eb3e0799b775e4f946bd66657c0,portateis_cozinha_e_preparadores_de_alimentos,52.0,280.0,1.0,1200.0,25.0,33.0,25.0,
7325,5d923ead886c44b86845f69e50520c3e,portateis_cozinha_e_preparadores_de_alimentos,58.0,284.0,1.0,1200.0,25.0,33.0,25.0,
7478,6727051471a0fc4a0e7737b57bff2549,pc_gamer,60.0,1532.0,3.0,650.0,16.0,22.0,20.0,
8819,bed164d9d628cf0593003389c535c6e0,portateis_cozinha_e_preparadores_de_alimentos,54.0,382.0,2.0,850.0,30.0,21.0,22.0,
11039,1220978a08a6b29a202bc015b18250e9,portateis_cozinha_e_preparadores_de_alimentos,46.0,280.0,1.0,1200.0,25.0,33.0,25.0,
14266,ae62bb0f95af63d64eae5f93dddea8d3,portateis_cozinha_e_preparadores_de_alimentos,59.0,927.0,1.0,10600.0,40.0,20.0,38.0,
16182,1954739d84629e7323a4295812a3e0ec,portateis_cozinha_e_preparadores_de_alimentos,58.0,792.0,4.0,750.0,30.0,30.0,30.0,
16930,dbe520fb381ad695a7e1f2807d20c765,pc_gamer,60.0,840.0,6.0,800.0,18.0,22.0,22.0,
17800,c7a3f1a7f9eef146cc499368b578b884,portateis_cozinha_e_preparadores_de_alimentos,52.0,1372.0,5.0,7350.0,40.0,30.0,23.0,


In [29]:
df_prodcuts_merged.to_csv(
    config.interim_dir / "olist_products_translated.csv", index=False
)


## df_order_reviews

In [37]:
print(df_order_reviews.shape)
display(df_order_reviews.isnull().sum())
display(df_order_reviews.head())

(99224, 7)


review_id                      0
order_id                       0
review_score                   0
review_comment_title       87656
review_comment_message     58247
review_creation_date           0
review_answer_timestamp        0
dtype: int64

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53


In [54]:
df_order_reviews.groupby(["review_id", "order_id"])["review_score"].count()

review_id                         order_id                        
0001239bc1de2e33cb583967c2ca4c67  fc046d7776171871436844218f817d7d    1
0001cc6860aeaf5b9017fe4131a52e62  d4665434b01caa9dc3e3e78b3eb3593e    1
00020c7512a52e92212f12d3e37513c0  e28abf2eb2f1fbcbdc2dd0cd9a561671    1
00032b0141443497c898b3093690af51  04fb47576993a3cb0c12d4b25eab6e4e    1
00034d88989f9a4c393bdcaec301537f  5f358d797a49fe2f24352f73426215f6    1
                                                                     ..
fffcfa6087cd3b651c68252342f13cb9  4069c489933782af79afcd3a0e4d693c    1
fffd24e2cf1ca4ee917e2f05be3c01fb  0efaa1dd18856769a1bcc489004fbe3b    1
fffd68e8a9fb73a56a2f504011b0f1f1  fceb38f42fbf13b53a6253648c8d47cf    1
fffee432d53abd67b5b0fd4fc290d8c3  9a54562498faf18f39a0e387976e11a5    1
fffefe7a48d22f7b32046421062219d1  1061bc32577c6b8beb107bf1b5a65175    1
Name: review_score, Length: 99224, dtype: int64

In [58]:
df_order_reviews.groupby("order_id")["review_id"].nunique().sort_values(ascending=False)

order_id
8e17072ec97ce29f0e1f111e598b0c85    3
c88b1d1b157a9999ce368f218a407141    3
03c939fd7fd3b38f8485a0f95798f1f6    3
df56136b8031ecd28e200bb18e6ddb2e    3
29062384ce4975f78aeba6a496510386    2
                                   ..
559609410c90dc1792181a5f260a6600    1
5595480d373b37e5738fc085dd9bcb8e    1
559115c14f48d999adcc027b4a702c8b    1
5590b14b260d2f61b2db5efb8d4a4601    1
fffe41c64501cc87c801fd61db3f6244    1
Name: review_id, Length: 98673, dtype: int64

In [63]:
df_order_reviews_translated.query("order_id == '8e17072ec97ce29f0e1f111e598b0c85'")[
    "review_comment_message_en"
].iloc[1]

'I bought 3 units of the product, but received 2 units that do not match what I purchased. Due to this, my opinion is negative regarding this seller as they did not fulfill what was promised in the sale.'

### translation

In [25]:
def batch_translate_with_openai(texts, batch_size=20):
    """
    テキストのリストをバッチで翻訳する
    """
    client = OpenAI()
    translations = []

    # バッチ処理
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i : i + batch_size]
        input_texts = [{"id": j, "text": text} for j, text in enumerate(batch)]
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "system",
                        "content": """You are a translator. Translate the following Portuguese texts to English. 
                        Respond with a JSON array where each object has 'id' and 'translation' fields. 
                        Maintain the original meaning and tone of the text.""",
                    },
                    {
                        "role": "user",
                        "content": f"Translate these texts: {json.dumps(input_texts, ensure_ascii=False)}",
                    },
                ],
                temperature=0.3,
            )
            try:
                content = response.choices[0].message.content
                # マークダウンの記号を削除
                if content.startswith("```"):
                    content = content.split("\n", 1)[1]  # 最初の行を削除
                if content.endswith("```"):
                    content = content.rsplit("\n", 1)[0]  # 最後の行を削除
                if content.startswith("json"):
                    content = content.split("\n", 1)[1]  # jsonの行を削除
                # 余分な空白を削除
                content = content.strip()
                translated_batch = json.loads(content)
                translated_batch = sorted(translated_batch, key=lambda x: x["id"])
                translations.extend([item["translation"] for item in translated_batch])
            except json.JSONDecodeError as e:
                print(f"JSON parse error: {e}")
                print(f"Raw response: {response.choices[0].message.content}")
                print(f"Cleaned content: {content}")
                translations.extend(batch)
            time.sleep(0.5)
        except Exception as e:
            print(f"Error in batch translation: {e}")
            translations.extend(batch)

    return translations


def process_translations(df, column_name, max_rows=1000):
    """
    データフレームの特定のカラムを翻訳する
    """
    df = df.copy()
    # Nullでないデータのみを抽出
    non_null_mask = df[column_name].notna()
    texts_to_translate = df.loc[non_null_mask, column_name].head(max_rows).tolist()
    print(
        f"Starting translation for {len(texts_to_translate)} texts from {column_name}"
    )
    translations = batch_translate_with_openai(texts_to_translate)
    new_column = f"{column_name}_en"
    df[new_column] = df[column_name]
    indices = df.loc[non_null_mask].head(max_rows).index
    df.loc[indices, new_column] = translations

    return df


max_rows = df_order_reviews.shape[0]
df_order_reviews_translated = process_translations(
    df_order_reviews, "review_comment_title", max_rows
)
df_order_reviews_translated = process_translations(
    df_order_reviews_translated, "review_comment_message", max_rows
)


Starting translation for 11568 texts from review_comment_title


100%|██████████| 579/579 [56:50<00:00,  5.89s/it]  


Starting translation for 40977 texts from review_comment_message


100%|██████████| 2049/2049 [5:28:45<00:00,  9.63s/it]  


In [33]:
df_order_reviews_translated.query("~review_comment_title_en.isna()")

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp,review_comment_title_en,review_comment_message_en
9,8670d52e15e00043ae7de4c01cc2fe06,b9bf720beb4ab3728760088589c62129,4,recomendo,aparelho eficiente. no site a marca do aparelh...,2018-05-22 00:00:00,2018-05-23 16:45:47,I recommend,"Efficient device. On the website, the brand of..."
15,3948b09f7c818e2d86c9a546758b2335,e51478e7e277a83743b6f9991dbfa3fb,5,Super recomendo,"Vendedor confiável, produto ok e entrega antes...",2018-05-23 00:00:00,2018-05-24 03:00:01,I highly recommend,"Reliable seller, product is okay, and delivery..."
19,373cbeecea8286a2b66c97b1b157ec46,583174fbe37d3d5f0d6661be3aad1786,1,Não chegou meu produto,Péssimo,2018-08-15 00:00:00,2018-08-15 04:10:37,My product hasn't arrived,Terrible.
22,d21bbc789670eab777d27372ab9094cc,4fc44d78867142c627497b60a7e0228a,5,Ótimo,Loja nota 10,2018-07-10 00:00:00,2018-07-11 14:10:25,Great,Store gets a 10.
34,c92cdd7dd544a01aa35137f901669cdf,37e7875cdce5a9e5b3a692971f370151,4,Muito bom.,Recebi exatamente o que esperava. As demais en...,2018-06-07 00:00:00,2018-06-09 18:44:02,Very good.,I received exactly what I expected. The other ...
...,...,...,...,...,...,...,...,...,...
99192,0e7bc73fde6782891898ea71443f9904,bd78f91afbb1ecbc6124974c5e813043,4,👍,Aprovado!,2018-07-04 00:00:00,2018-07-05 00:25:13,👍,Approved!
99196,58be140ccdc12e8908ff7fd2ba5c7cb0,0ebf8e35b9807ee2d717922d5663ccdb,5,muito bom produto,"Ficamos muito satisfeitos com o produto, atend...",2018-06-30 00:00:00,2018-07-02 23:09:35,very good product,We are very satisfied with the product; it mee...
99197,51de4e06a6b701cb2be47ea0e689437b,b7467ae483dbe956fe9acdf0b1e6e3f4,3,Não foi entregue o pedido,Bom dia \r\nDas 6 unidades compradas só recebi...,2018-06-05 00:00:00,2018-06-06 10:52:19,The order was not delivered,"Good morning \r\nOut of the 6 units purchased,..."
99199,40743b46a0ee86375cedb95e82b78d75,3e93213bb8fdda91186b4018b2fe0030,5,OTIMA EMBALAGEM,,2018-08-08 00:00:00,2018-08-08 16:56:16,GREAT PACKAGING,


In [55]:
df_order_reviews_translated["review_creation_date"] = pd.to_datetime(
    df_order_reviews_translated["review_creation_date"]
)
df_order_reviews_translated["review_answer_timestamp"] = pd.to_datetime(
    df_order_reviews_translated["review_answer_timestamp"]
)

In [56]:
df_order_reviews_translated.to_csv(
    config.interim_dir / "olist_order_reviews_translated.csv",
    index=False,
)