In [22]:
%%time
%matplotlib inline
%load_ext autoreload
%autoreload 2

import sys
from dataclasses import dataclass
from itertools import combinations
from pathlib import Path

import pandas as pd


@dataclass
class Config:
    project_dir: Path = Path("../../")
    model_dir: Path = project_dir / "models"
    outputs_dir: Path = project_dir / "outputs"
    results_dir: Path = project_dir / "results"
    results_agg_dir: Path = results_dir / "agg"
    results_figs_dir: Path = results_dir / "figs"
    data_dir: Path = project_dir / "data"
    raw_dir: Path = data_dir / "raw"
    interim_dir: Path = data_dir / "interim"
    processed_dir: Path = data_dir / "processed"


config = Config()
sys.path.append(str(config.project_dir.resolve()))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
CPU times: user 1.16 ms, sys: 3.02 ms, total: 4.18 ms
Wall time: 5.79 ms


In [3]:
df_order_info_details = pd.read_csv(config.processed_dir / "order_info_details.csv")
df_reviews = pd.read_csv(
    config.interim_dir
    / "olist_order_reviews_translated_label_review_categories_merged.csv",
)
df_reviews_last = df_reviews.groupby("order_id").last().reset_index()
df_order_info_details_reviews_merged = pd.merge(
    df_order_info_details, df_reviews_last, on="order_id", how="left"
)
assert df_order_info_details_reviews_merged.shape[0] == df_order_info_details.shape[0]

df_review_no_null = df_order_info_details_reviews_merged.dropna(
    subset=["review_comment_message_en"]
)
print(f"df_review_no_null.shape: {df_review_no_null.shape}")


df_review_no_null.shape: (40818, 90)


In [21]:
delivery_issues = df_review_no_null.query("review_categories_str == 'Delivery_Issue'")
word_lists = []
for words_str in delivery_issues["meaningful_words"]:
    words_list = words_str.strip("[]").replace("'", "").split(", ")
    words_list = [w for w in words_list if w]
    word_lists.append(words_list)

trigram_counts = {}
for words in word_lists:
    for triple in combinations(set(words), 3):
        triple = tuple(sorted(triple))
        trigram_counts[triple] = trigram_counts.get(triple, 0) + 1

# 結果をDataFrameに変換
trigram_df = pd.DataFrame(
    [
        {"Word 1": t[0], "Word 2": t[1], "Word 3": t[2], "Frequency": count}
        for t, count in trigram_counts.items()
    ]
)

trigram_df = trigram_df.sort_values("Frequency", ascending=False).reset_index()


上位30件の3単語共起:
    index    Word 1    Word 2    Word 3  Frequency
0    1309       not   product   receive        848
1    1429   deliver       not   product        653
2    2335    arrive       not   product        318
3    1770    office      post   product        275
4    1279  delivery       not   product        268
5    1430       buy   product   receive        263
6    3296  delivery   product   receive        253
7    1432   deliver   product   receive        217
8    1787    office      pick      post        213
9    1782       not    office      post        211
10   1483   deliver       not   receive        205
11   1427       buy       not   product        196
12   1788   deliver    office      post        181
13   3309  delivery       not   receive        178
14   1481       buy       not   receive        172
15   4053   product  purchase   receive        165
16   1278       not   product     store        163
17   1624       not   product  purchase        160
18   2368   delive