In [3]:
%%time
%matplotlib inline
%load_ext autoreload
%autoreload 2

import json
import sys
import time
from dataclasses import dataclass
from pathlib import Path

import pandas as pd
import spacy
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm
from transformers import pipeline


@dataclass
class Config:
    """Directory layout of the project, expressed relative to this notebook.

    Only ``project_dir`` carries a literal default. Every other directory
    defaults to ``None`` and is filled in by ``__post_init__`` from its parent
    directory, so ``Config(project_dir=...)`` or ``Config(data_dir=...)``
    relocates the whole subtree instead of leaving paths that still point at
    the class-level ``../../`` default.
    """

    project_dir: Path = Path("../../")
    model_dir: Path = None  # defaults to project_dir / "models"
    outputs_dir: Path = None  # defaults to project_dir / "outputs"
    data_dir: Path = None  # defaults to project_dir / "data"
    raw_dir: Path = None  # defaults to data_dir / "raw"
    interim_dir: Path = None  # defaults to data_dir / "interim"
    processed_dir: Path = None  # defaults to data_dir / "processed"

    def __post_init__(self):
        """Coerce overrides to ``Path`` and derive every directory left unset."""
        self.project_dir = Path(self.project_dir)
        self.model_dir = self._resolve(self.model_dir, self.project_dir / "models")
        self.outputs_dir = self._resolve(
            self.outputs_dir, self.project_dir / "outputs"
        )
        # data_dir must be resolved before the three directories nested in it.
        self.data_dir = self._resolve(self.data_dir, self.project_dir / "data")
        self.raw_dir = self._resolve(self.raw_dir, self.data_dir / "raw")
        self.interim_dir = self._resolve(self.interim_dir, self.data_dir / "interim")
        self.processed_dir = self._resolve(
            self.processed_dir, self.data_dir / "processed"
        )

    @staticmethod
    def _resolve(value, default):
        """Return ``default`` when ``value`` is None, otherwise ``value`` as a Path."""
        return default if value is None else Path(value)


# Instantiate the path configuration that the later cells read from.
config = Config()
# Append the resolved project directory to sys.path
# (presumably so project-local modules can be imported — confirm).
sys.path.append(str(config.project_dir.resolve()))
# Load environment variables via python-dotenv (presumably the OpenAI API key
# used further down — confirm). As the last expression of the cell, its return
# value is what the notebook displays.
load_dotenv()

CPU times: user 2.4 s, sys: 915 ms, total: 3.32 s
Wall time: 6.05 s


True

In [2]:
# Load the raw review table from the raw-data directory.
df_order_reviews = pd.read_csv(
    config.raw_dir.joinpath("olist_order_reviews_dataset.csv")
)


In [4]:
# Keep only the id, the two free-text columns and the creation date, then
# collapse rows that are identical across those four columns.
df_order_reviews_unique_contents = df_order_reviews.loc[
    :,
    [
        "review_id",
        "review_comment_title",
        "review_comment_message",
        "review_creation_date",
    ],
].drop_duplicates()

## df_order_reviews_unique_contents

In [54]:
# Count rows per (review_id, order_id) pair. This has to run on the raw frame:
# df_order_reviews_unique_contents keeps neither order_id nor review_score, so
# grouping it by these columns raises KeyError on a fresh kernel.
df_order_reviews.groupby(["review_id", "order_id"])["review_score"].count()

review_id                         order_id                        
0001239bc1de2e33cb583967c2ca4c67  fc046d7776171871436844218f817d7d    1
0001cc6860aeaf5b9017fe4131a52e62  d4665434b01caa9dc3e3e78b3eb3593e    1
00020c7512a52e92212f12d3e37513c0  e28abf2eb2f1fbcbdc2dd0cd9a561671    1
00032b0141443497c898b3093690af51  04fb47576993a3cb0c12d4b25eab6e4e    1
00034d88989f9a4c393bdcaec301537f  5f358d797a49fe2f24352f73426215f6    1
                                                                     ..
fffcfa6087cd3b651c68252342f13cb9  4069c489933782af79afcd3a0e4d693c    1
fffd24e2cf1ca4ee917e2f05be3c01fb  0efaa1dd18856769a1bcc489004fbe3b    1
fffd68e8a9fb73a56a2f504011b0f1f1  fceb38f42fbf13b53a6253648c8d47cf    1
fffee432d53abd67b5b0fd4fc290d8c3  9a54562498faf18f39a0e387976e11a5    1
fffefe7a48d22f7b32046421062219d1  1061bc32577c6b8beb107bf1b5a65175    1
Name: review_score, Length: 99224, dtype: int64

In [58]:
# Number of distinct reviews attached to each order, largest first. Computed on
# the raw frame because df_order_reviews_unique_contents has no order_id column.
df_order_reviews.groupby("order_id")["review_id"].nunique().sort_values(
    ascending=False
)

order_id
8e17072ec97ce29f0e1f111e598b0c85    3
c88b1d1b157a9999ce368f218a407141    3
03c939fd7fd3b38f8485a0f95798f1f6    3
df56136b8031ecd28e200bb18e6ddb2e    3
29062384ce4975f78aeba6a496510386    2
                                   ..
559609410c90dc1792181a5f260a6600    1
5595480d373b37e5738fc085dd9bcb8e    1
559115c14f48d999adcc027b4a702c8b    1
5590b14b260d2f61b2db5efb8d4a4601    1
fffe41c64501cc87c801fd61db3f6244    1
Name: review_id, Length: 98673, dtype: int64

## translation

In [6]:
def _strip_markdown_fence(content):
    """Return ``content`` without a surrounding Markdown code fence, if any."""
    content = content.strip()
    if content.startswith("```"):
        # Drop the opening fence line, which may carry a language tag ("```json").
        _, _, content = content.partition("\n")
        content = content.rstrip()
        if content.endswith("```"):
            content = content[: -len("```")]
    return content.strip()


def _align_translations(batch, items):
    """Return one output string per entry of ``batch``, matched to ``items`` by id.

    Entries whose id is absent from the response, or whose translation is not a
    string, keep their original text so the output always has ``len(batch)``
    elements in the same order as ``batch``.
    """
    if not isinstance(items, list):
        raise ValueError("Response is not a JSON array")
    by_id = {}
    for item in items:
        if not isinstance(item, dict):
            continue
        try:
            item_id = int(item.get("id"))
        except (TypeError, ValueError):
            continue
        translation = item.get("translation")
        if isinstance(translation, str):
            by_id.setdefault(item_id, translation)
    missing = [j for j in range(len(batch)) if j not in by_id]
    if missing:
        print(f"No translation returned for batch ids {missing}; keeping original text")
    return [by_id.get(j, text) for j, text in enumerate(batch)]


def batch_translate_with_openai(texts, batch_size=20):
    """
    Translate a list of Portuguese texts to English in batches.

    Each batch is sent to the chat-completions API as a JSON array of
    ``{"id", "text"}`` objects and the reply is matched back to the inputs by
    ``id``, so a reply with missing, extra or reordered items cannot shift the
    translations of other texts.

    Args:
        texts: List of strings to translate.
        batch_size: Number of texts sent per API request.

    Returns:
        A list with exactly ``len(texts)`` strings in input order. A text is
        returned untranslated when its batch fails (API error or unparsable
        reply) or when the reply contains no usable translation for it.
    """
    if len(texts) == 0:
        # Nothing to translate: avoid creating a client (and needing an API key).
        return []

    client = OpenAI()
    translations = []

    # Process the texts batch by batch
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i : i + batch_size]
        input_texts = [{"id": j, "text": text} for j, text in enumerate(batch)]
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "system",
                        "content": """You are a translator. Translate the following Portuguese texts to English. 
                        Respond with a JSON array where each object has 'id' and 'translation' fields. 
                        Maintain the original meaning and tone of the text.""",
                    },
                    {
                        "role": "user",
                        "content": f"Translate these texts: {json.dumps(input_texts, ensure_ascii=False)}",
                    },
                ],
                temperature=0.3,
            )
            raw_content = response.choices[0].message.content
            # Remove the Markdown fence and surrounding whitespace
            content = _strip_markdown_fence(raw_content)
            try:
                translated_batch = json.loads(content)
                translations.extend(_align_translations(batch, translated_batch))
            except (json.JSONDecodeError, ValueError) as e:
                print(f"JSON parse error: {e}")
                print(f"Raw response: {raw_content}")
                print(f"Cleaned content: {content}")
                translations.extend(batch)
            time.sleep(0.5)
        except Exception as e:
            # Best effort: a failed batch keeps its original texts so the output
            # stays aligned with the input.
            print(f"Error in batch translation: {e}")
            translations.extend(batch)

    return translations


def process_translations(df, column_name, max_rows=1000):
    """
    Add a ``<column_name>_en`` column to a copy of ``df``.

    The first ``max_rows`` non-null values of ``column_name`` are passed to
    ``batch_translate_with_openai`` and written to the new column; every other
    row of the new column holds the value of ``column_name``. The input frame
    itself is not modified.
    """
    result = df.copy()
    target_column = f"{column_name}_en"

    # Non-null cells in frame order, capped at max_rows.
    selected = result.loc[result[column_name].notna(), column_name].head(max_rows)
    texts_to_translate = selected.tolist()
    print(
        f"Starting translation for {len(texts_to_translate)} texts from {column_name}"
    )

    translations = batch_translate_with_openai(texts_to_translate)

    # Start from the source values, then overwrite the selected rows.
    result[target_column] = result[column_name]
    result.loc[selected.index, target_column] = translations

    return result


# Use the full row count as the cap so no non-null text is skipped, then
# translate the titles first and the messages second.
max_rows = len(df_order_reviews_unique_contents)
df_order_reviews_translated = process_translations(
    df=df_order_reviews_unique_contents,
    column_name="review_comment_title",
    max_rows=max_rows,
)
df_order_reviews_translated = process_translations(
    df=df_order_reviews_translated,
    column_name="review_comment_message",
    max_rows=max_rows,
)


Starting translation for 11519 texts from review_comment_title


  0%|          | 0/576 [00:00<?, ?it/s]

100%|██████████| 576/576 [50:21<00:00,  5.25s/it] 


Starting translation for 40668 texts from review_comment_message


 85%|████████▌ | 1737/2034 [27:01:09<1:14:41, 15.09s/it]     

JSON parse error: Unterminated string starting at: line 13 column 31 (char 1180)
Raw response: ```json
[
    {"id": 0, "translation": "Delivery before the deadline was excellent!!!"},
    {"id": 1, "translation": "Excellent!"},
    {"id": 2, "translation": "Very good, I recommend."},
    {"id": 3, "translation": "Great quality, exactly as in the photo. Arrived before the deadline. I loved it."},
    {"id": 4, "translation": "This supplier, a partner of Lannister Stores, is committed to its customers. They demonstrated professionalism and competence in the delivery time and in packaging the consumer's product."},
    {"id": 5, "translation": "Good to have as a partner."},
    {"id": 6, "translation": "GOOD AFTERNOON, I DID NOT RECEIVE THE COMPLETE ORDER, ONLY THE ROD WAS DELIVERED. THE CURTAIN WAS NOT DELIVERED, I AM WAITING FOR A RESOLUTION TO THIS PROBLEM."},
    {"id": 7, "translation": "Excellent product.\n\nSynthetic wig, similar to hair, fine and matte strands, no artificial shine

100%|██████████| 2034/2034 [27:56:10<00:00, 49.44s/it]  


In [8]:
# Attach the two English columns to every raw review row.
df_order_reviews_merged = df_order_reviews.merge(
    df_order_reviews_translated.loc[
        :, ["review_id", "review_comment_title_en", "review_comment_message_en"]
    ],
    how="left",
    on="review_id",
)
# The left join must not change the number of rows.
assert len(df_order_reviews_merged) == len(df_order_reviews)

# Save the translated reviews to the interim directory.
df_order_reviews_merged.to_csv(
    config.interim_dir.joinpath("olist_order_reviews_translated.csv"), index=False
)


## ポジネガ分析

In [15]:
# Reload the translated reviews from the interim CSV.
df_order_reviews_merged = pd.read_csv(
    config.interim_dir.joinpath("olist_order_reviews_translated.csv")
)
# One row per distinct (review_id, English message) pair, messages present only,
# renumbered from zero.
df_order_reviews_translated_no_null = (
    df_order_reviews_merged.loc[:, ["review_id", "review_comment_message_en"]]
    .dropna(subset=["review_comment_message_en"])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [16]:
# Hugging Face sentiment classifier applied to the English review messages.
sentiment_pipeline = pipeline(
    task="sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment"
)
review_comment_message_en_list = df_order_reviews_translated_no_null[
    "review_comment_message_en"
].tolist()

# Score the reviews one at a time, keeping the top prediction of each.
label_list = []
score_list = []
for review in tqdm(review_comment_message_en_list):
    categorize_result = sentiment_pipeline(review)
    top_prediction = categorize_result[0]
    score_list.append(top_prediction["score"])
    # A prediction scoring below 0.70 is recorded as "not_clear" instead of its label.
    label_list.append(
        top_prediction["label"] if top_prediction["score"] >= 0.70 else "not_clear"
    )


Device set to use mps:0
100%|██████████| 40650/40650 [13:39<00:00, 49.58it/s]


In [None]:
# Add the raw label, its readable name and the score, in that column order.
df_order_reviews_translated_no_null = df_order_reviews_translated_no_null.assign(
    label=label_list,
    label_translated=lambda frame: frame["label"].map(
        {
            "LABEL_0": "Negative",
            "LABEL_1": "Neutral",
            "LABEL_2": "Positive",
            "not_clear": "not_clear",
        }
    ),
    label_score=score_list,
)

# Load the English spaCy language model
nlp = spacy.load("en_core_web_sm")


def extract_meaningful_words(text):
    """Return the lower-cased lemmas of the content words in ``text``.

    A token is kept when it is a non-stop-word noun, verb, adjective or adverb,
    or when its text is "not". Non-string input (for example NaN) yields an
    empty list. Uses the module-level spaCy pipeline ``nlp``.
    """
    if not isinstance(text, str):
        return []

    # Part-of-speech tags to keep
    content_pos = {"NOUN", "VERB", "ADJ", "ADV"}

    lemmas = []
    # The text is lower-cased before it is parsed
    for token in nlp(text.lower()):
        is_content_word = token.pos_ in content_pos and not token.is_stop
        if is_content_word or token.text.lower() == "not":
            # Store the lemma (base form) rather than the surface form
            lemmas.append(token.lemma_.lower())

    return lemmas


# Extract the meaningful words of every review
df_order_reviews_translated_no_null["meaningful_words"] = (
    df_order_reviews_translated_no_null["review_comment_message_en"].map(
        extract_meaningful_words
    )
)

# Bring the sentiment label, score and word list back onto the full review table.
df_reviews_translated_label_merged = df_order_reviews_merged.merge(
    df_order_reviews_translated_no_null.loc[
        :, ["review_id", "label_translated", "label_score", "meaningful_words"]
    ],
    how="left",
    on="review_id",
)
# The left join must not change the number of rows.
assert len(df_order_reviews_merged) == len(df_reviews_translated_label_merged)


## データ調整

In [19]:
def modify_review_score_apply(row):
    """Return the review score, mirrored when it contradicts the sentiment label.

    ``row`` must provide ``label_translated`` and ``review_score``. A "Negative"
    label turns 5 into 1.0 and 4 into 2.0; a "Positive" label turns 1 into 5.0
    and 2 into 4.0. Every other combination returns the score unchanged.
    """
    contradicting_scores = {
        "Negative": {5.0: 1.0, 4.0: 2.0},
        "Positive": {1.0: 5.0, 2.0: 4.0},
    }
    sentiment = row["label_translated"]
    original_score = row["review_score"]

    # Only string labels can match; NaN and other values fall through unchanged.
    if not isinstance(sentiment, str):
        return original_score
    replacements = contradicting_scores.get(sentiment)
    if replacements is None:
        return original_score
    return replacements.get(original_score, original_score)


# Mirror scores that contradict the sentiment label (see modify_review_score_apply).
df_reviews_translated_label_merged["modified_review_score"] = (
    df_reviews_translated_label_merged.apply(modify_review_score_apply, axis=1)
)

# Persist the checkpoint that the categorization section reloads. Without this
# write the later read_csv fails (or silently picks up a stale file) after
# Restart & Run All.
# NOTE(review): the cell that originally wrote this file is not in the notebook;
# confirm that saving after the score adjustment matches the intended contents.
df_reviews_translated_label_merged.to_csv(
    config.interim_dir / "olist_order_reviews_translated_label_merged.csv",
    index=False,
)


In [20]:
# Show only the reviews whose modified score differs from the original score.
df_reviews_translated_label_merged.loc[
    df_reviews_translated_label_merged["modified_review_score"]
    != df_reviews_translated_label_merged["review_score"],
    ["review_score", "modified_review_score", "review_comment_message_en"],
]

Unnamed: 0,review_score,modified_review_score,review_comment_message_en
639,4,2.0,The post office did not deliver.
672,1,5.0,Very good product and reasonable price!
693,2,4.0,"Excellent product, I recommend it to anyone."
727,5,1.0,"The store itself is good, but unfortunately, t..."
734,4,2.0,The only problem is that the date format is th...
...,...,...,...
98512,4,2.0,"I received the wrong flavor, I ordered strawbe..."
98761,5,1.0,I still haven't received the blouse in the del...
98832,4,2.0,"The deadline is essential, they cannot let thi..."
98956,1,5.0,"As soon as I receive the product, I will give ..."


## カテゴライズ

In [4]:
# Reload the labelled reviews from the interim directory.
df_reviews_translated_label_merged = pd.read_csv(
    config.interim_dir.joinpath("olist_order_reviews_translated_label_merged.csv")
)

In [6]:
# System prompt for the review-categorization request. It asks for a JSON array
# of {"id", "categories"} objects whose category names follow "Content_Type".
# NOTE(review): "categoy" in the rules below is a typo for "category". The text
# is left untouched here because editing the prompt can change model output.
categorize_prompt = """
You are an expert in analyzing customer product reviews.

# Task
- Understand the content of the reviews accurately and categorize them.
- Analyze Review Content: Accurately understand the specific points mentioned in the review.
- Determine Review Type: Classify the review segment as either "Issue", "Praise", or "Others".
- Create Category Name: Combine the content description and review type into a single string: "Content_Type". For example, "Performance_Issue" or "Delivery_Praise".
- Return the categorization results in the specified JSON format.
- Create appropriate categories based on the review content.

# Response Format
You must return a JSON array of objects, where each object has:
- "id": the index number of the review (integer)
- "categories": an array of category strings

Example response format:
[
    {"id": 0, "categories": ["Delivery_Issue"]},
    {"id": 1, "categories": ["Customer Service_Issue"]}
]

# Categorization Rules
- Consider the context of the entire review and understand the actual feelings and intent of the customer.
- Keep the category names as few and precise as possible.
- Use concise, clear English category names.
- Multiple categories are basically not allowed.
- If the comment praises something, the categoy name must end with "Praise".
- If the comment describes some issues, the categoy name must end with "Issue".
- If categorization is difficult or the review lacks meaningful content, use ["Unclassifiable"].
- Create new categories as needed based on the review content, but maintain consistency in naming across similar issues.

# Examples
Input: [
    {"id": 0, "text": "I didn't receive the product, and the refund on the card was only made two months later."},
    {"id": 1, "text": "Thanks"},
    {"id": 2, "text": "The product is not exactly as advertised in the image, but the customer service was very helpful."}
]

Output: [
    {"id": 0, "categories": ["Delivery_Issue"]},
    {"id": 1, "categories": ["Unclassifiable"]},
    {"id": 2, "categories": ["Product Mismatch_Issue"]}
]
"""

In [10]:
# One row per distinct (review_id, English message) pair, messages present only,
# renumbered from zero.
df_reviews_unique = (
    df_reviews_translated_label_merged.loc[
        :, ["review_id", "review_comment_message_en"]
    ]
    .dropna(subset=["review_comment_message_en"])
    .drop_duplicates()
    .reset_index(drop=True)
)


def _align_categories(batch_len, items):
    """Return one JSON-encoded category list per batch position, matched by id.

    Positions whose id is absent from ``items`` get ``["Unclassifiable"]``, so
    the result always has ``batch_len`` elements in input order.

    Raises:
        ValueError: if ``items`` is not a list, or an item is not an object
            with an integer-like "id" and a "categories" field.
    """
    if not isinstance(items, list):
        raise ValueError("Response is not a list")
    by_id = {}
    for item in items:
        if not isinstance(item, dict) or "id" not in item or "categories" not in item:
            raise ValueError("Invalid response format")
        try:
            item_id = int(item["id"])
        except (TypeError, ValueError):
            raise ValueError("Invalid response format")
        by_id.setdefault(item_id, item["categories"])
    missing = [j for j in range(batch_len) if j not in by_id]
    if missing:
        print(f"No categories returned for batch ids {missing}; using Unclassifiable")
    return [json.dumps(by_id.get(j, ["Unclassifiable"])) for j in range(batch_len)]


def classify_review_sentiment(texts, batch_size=20):
    """
    Categorize review texts in batches with the chat-completions API.

    Each batch is sent as a JSON array of ``{"id", "text"}`` objects together
    with ``categorize_prompt``, and the reply is matched back to the inputs by
    ``id`` so that missing, extra or reordered items cannot shift the
    categories of other reviews.

    Args:
        texts: List of review strings.
        batch_size: Number of reviews sent per API request.

    Returns:
        A list with exactly ``len(texts)`` elements in input order; each is a
        JSON-encoded list of category names. ``'["Unclassifiable"]'`` is used
        for a whole batch when the request fails or the reply is malformed,
        and for single reviews the reply omits.
    """
    if len(texts) == 0:
        # Nothing to classify: avoid creating a client (and needing an API key).
        return []

    client = OpenAI()
    classifications = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i : i + batch_size]
        input_texts = [{"id": j, "text": text} for j, text in enumerate(batch)]
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "system",
                        "content": categorize_prompt,
                    },
                    {
                        "role": "user",
                        "content": f"Please classify the following reviews and return the result in the specified JSON format: {json.dumps(input_texts, ensure_ascii=False)}",
                    },
                ],
                temperature=0.2,
            )
            try:
                content = response.choices[0].message.content
                # Remove the Markdown fence markers
                content = content.strip()
                if content.startswith("```"):
                    content = content.split("```")[1]
                if content.startswith("json"):
                    content = content.split("\n", 1)[1]
                content = content.strip()
                classified_batch = json.loads(content)
                # Category lists are stored as JSON strings, one per input position
                classifications.extend(_align_categories(len(batch), classified_batch))
            except (json.JSONDecodeError, ValueError) as e:
                print(f"Error processing response: {e}")
                print(f"Raw response: {response.choices[0].message.content}")
                classifications.extend([json.dumps(["Unclassifiable"]) for _ in batch])
            time.sleep(0.5)
        except Exception as e:
            # Best effort: a failed batch is marked Unclassifiable so the output
            # stays aligned with the input.
            print(f"API call error: {e}")
            classifications.extend([json.dumps(["Unclassifiable"]) for _ in batch])

    return classifications


def process_review_classification(
    df, column_name="review_comment_message_en", max_rows=100
):
    """
    Add a ``review_categories`` column to a copy of ``df``.

    The first ``max_rows`` non-null values of ``column_name`` are passed to
    ``classify_review_sentiment``; the JSON strings it returns are decoded into
    Python lists. Rows that were not classified hold ``None``. The input frame
    itself is not modified.
    """
    result = df.copy()

    # Non-null cells in frame order, capped at max_rows.
    selected = result.loc[result[column_name].notna(), column_name].head(max_rows)
    texts_to_classify = selected.tolist()
    print(f"Starting classification for {len(texts_to_classify)} reviews")

    classifications = classify_review_sentiment(texts_to_classify)
    result["review_categories"] = None
    result.loc[selected.index, "review_categories"] = classifications

    def _decode(value):
        """Turn a JSON string back into a list; leave anything else as is."""
        return json.loads(value) if isinstance(value, str) else value

    # Convert the categories from strings back to lists
    result["review_categories"] = result["review_categories"].apply(_decode)

    return result


# Example run: classify every row by passing the full row count as the cap
df_reviews_unique_with_categories = process_review_classification(
    df=df_reviews_unique,
    max_rows=len(df_reviews_unique),
)
# Flatten each category list into one "_"-joined string (None when there is no list).
df_reviews_unique_with_categories["review_categories_str"] = (
    df_reviews_unique_with_categories["review_categories"].map(
        lambda categories: "_".join(categories)
        if isinstance(categories, list)
        else None
    )
)

Starting classification for 40650 reviews


100%|██████████| 2033/2033 [3:28:38<00:00,  6.16s/it]  


In [11]:
# Attach the categories to the full labelled review table.
df_reviews_translated_label_review_categories_merged = (
    df_reviews_translated_label_merged.merge(
        df_reviews_unique_with_categories.loc[
            :, ["review_id", "review_categories", "review_categories_str"]
        ],
        how="left",
        on="review_id",
    )
)
# The left join must not change the number of rows.
assert len(df_reviews_translated_label_review_categories_merged) == len(
    df_reviews_translated_label_merged
)

In [None]:
# Normalize names such as "X Praise" / "X Issue" to the "X_Praise" / "X_Issue"
# form; values that are not plain strings become None.
df_reviews_translated_label_review_categories_merged[
    "review_categories_str_modified"
] = df_reviews_translated_label_review_categories_merged["review_categories_str"].map(
    lambda name: (
        name.replace(" Praise", "_Praise").replace(" Issue", "_Issue")
        if type(name) is str
        else None
    )
)


review_categories_str_modified
Delivery_Praise               9511
Delivery_Issue                6627
Unclassifiable                3884
Product Quality_Praise        2977
Product_Praise                2348
                              ... 
Delivery Cost_Issue              1
Product Replacement_Praise       1
Checkout_Issue                   1
Labeling_Issue                   1
Color_Mismatch_Others            1
Name: count, Length: 1057, dtype: int64

## 出力

In [8]:
# Parse the two timestamp columns and derive month / day period columns from
# them. assign() evaluates the keywords in order, so each lambda sees the
# columns produced by the ones before it.
df_reviews_translated_label_review_categories_merged = (
    df_reviews_translated_label_review_categories_merged.assign(
        review_creation_date=lambda frame: pd.to_datetime(
            frame["review_creation_date"]
        ),
        review_creation_month=lambda frame: frame[
            "review_creation_date"
        ].dt.to_period("M"),
        review_answer_timestamp=lambda frame: pd.to_datetime(
            frame["review_answer_timestamp"]
        ),
        review_answer_month=lambda frame: frame[
            "review_answer_timestamp"
        ].dt.to_period("M"),
        review_answer_date=lambda frame: frame[
            "review_answer_timestamp"
        ].dt.to_period("D"),
    )
)

In [7]:
# Write the final review table to the interim directory.
df_reviews_translated_label_review_categories_merged.to_csv(
    config.interim_dir.joinpath(
        "olist_order_reviews_translated_label_review_categories_merged.csv"
    ),
    index=False,
)
