In [1]:
%%time
%matplotlib inline
%load_ext autoreload
%autoreload 2

import sys
from dataclasses import dataclass
from pathlib import Path

import pandas as pd


@dataclass
class Config:
    project_dir: Path = Path("../../")
    model_dir: Path = project_dir / "models"
    outputs_dir: Path = project_dir / "outputs"
    results_dir: Path = project_dir / "results"
    results_agg_dir: Path = results_dir / "agg"
    results_figs_dir: Path = results_dir / "figs"
    data_dir: Path = project_dir / "data"
    raw_dir: Path = data_dir / "raw"
    interim_dir: Path = data_dir / "interim"
    processed_dir: Path = data_dir / "processed"


config = Config()
sys.path.append(str(config.project_dir.resolve()))

CPU times: user 270 ms, sys: 84.4 ms, total: 355 ms
Wall time: 612 ms


In [2]:
df_order_info_details = pd.read_csv(config.processed_dir / "order_info_details.csv")
df_reviews = pd.read_csv(
    config.interim_dir
    / "olist_order_reviews_translated_label_review_categories_merged.csv",
)
df_reviews_last = df_reviews.groupby("order_id").last().reset_index()
df_order_info_details_reviews_merged = pd.merge(
    df_order_info_details, df_reviews_last, on="order_id", how="left"
)
assert df_order_info_details_reviews_merged.shape[0] == df_order_info_details.shape[0]

df_review_no_null = df_order_info_details_reviews_merged.dropna(
    subset=["review_comment_message_en"]
)
print(f"df_review_no_null.shape: {df_review_no_null.shape}")


df_review_no_null.shape: (40818, 85)


1. 基本分析
- 全体傾向として，多いクレーム種類は何か
-  前半と後半で傾向ことなるのか？
-  成長率が鈍化している種類に限定したらどう？
2. 詳細分析

## label_translated列とreview_categories_str_modified列の関係性

In [3]:
praise_rows = df_review_no_null.query(
    "review_categories_str_modified.str.endswith('Praise')"
).shape[0]
print(f"Praise in df_review_no_null: {praise_rows}")
issue_rows = df_review_no_null.query(
    "review_categories_str_modified.str.endswith('Issue')"
).shape[0]
print(f"Issue in df_review_no_null: {issue_rows}")
print()
df_review_no_null["label_translated"].value_counts()

Praise in df_review_no_null: 20834
Issue in df_review_no_null: 15603



label_translated
Positive     20768
not_clear     9783
Negative      8184
Neutral       2083
Name: count, dtype: int64

In [4]:
print("Praise")
display(
    df_review_no_null.query("review_categories_str_modified.str.endswith('Praise')")[
        "label_translated"
    ].value_counts()
)
print("Issue")
display(
    df_review_no_null.query("review_categories_str_modified.str.endswith('Issue')")[
        "label_translated"
    ].value_counts()
)

Praise


label_translated
Positive     17760
not_clear     2238
Neutral        830
Negative         6
Name: count, dtype: int64

Issue


label_translated
Negative     8037
not_clear    5741
Neutral       916
Positive      909
Name: count, dtype: int64

In [5]:
print("Praise")
display(
    df_review_no_null.query("review_categories_str_modified.str.endswith('Praise')")[
        "modified_review_score"
    ].value_counts()
)
print("Issue")
display(
    df_review_no_null.query("review_categories_str_modified.str.endswith('Issue')")[
        "modified_review_score"
    ].value_counts()
)

Praise


modified_review_score
5.0    16977
4.0     3292
3.0      524
1.0       25
2.0       16
Name: count, dtype: int64

Issue


modified_review_score
1.0    8456
3.0    2595
2.0    2323
4.0    1411
5.0     818
Name: count, dtype: int64

※ review_categories_str_modified列がIssueでも，label_translated列がnot_clearのやつは文章が長くてむずかったと考えられる

## 1.基本分析


In [6]:
from src.analyzer.agg_utils import agg_review_category_ratio_by_category_name

### 1.1. レビュー全体

- 全体傾向として，多いレビュー種類は何か

#### 全体

In [7]:
agg_dict = agg_review_category_ratio_by_category_name(
    df_review_no_null,
    df_review_no_null["product_category_name_english"].unique().tolist(),
)

Praise Review num:20766
Praise Issue num:15471
Issue ratio:0.38
全体
                         category  count  percentage
0                 Delivery_Praise   9445        0.23
1                  Delivery_Issue   6552        0.16
2                  Unclassifiable   3856        0.09
3          Product Quality_Praise   2956        0.07
4                  Product_Praise   2332        0.06
5          Product Mismatch_Issue   1675        0.04
6           Product Quality_Issue   1090        0.03
7     Product Satisfaction_Praise    606        0.01
8                  Service_Praise    604        0.01
9                  Quality_Praise    572        0.01
10        Customer Service_Praise    551        0.01
11         Customer Service_Issue    450        0.01
12           Product Damage_Issue    293        0.01
13                    Order_Issue    290        0.01
14           Product Defect_Issue    289        0.01
15                         Praise    276        0.01
16                Packaging_Issu

In [8]:
agg_dict["all"].to_csv(config.results_agg_dir / "category_all.csv", index=False)
agg_dict["praise"].to_csv(config.results_agg_dir / "category_praise.csv", index=False)
agg_dict["issue"].to_csv(config.results_agg_dir / "category_issue.csv", index=False)

### 1.2. 前半と後半で傾向ことなるのか？

#### 前半

In [9]:
fh_df_reviews_no_null = df_review_no_null.query(
    "order_purchase_date >= '2017-02' & order_purchase_date <= '2017-10'"
)
sh_df_reviews_no_null = df_review_no_null.query(
    "order_purchase_date >= '2017-12' & order_purchase_date <= '2018-08'"
)

In [10]:
agg_fh_dict = agg_review_category_ratio_by_category_name(
    fh_df_reviews_no_null,
    fh_df_reviews_no_null["product_category_name_english"].unique().tolist(),
)

Praise Review num:6053
Praise Issue num:3722
Issue ratio:0.34
全体
                         category  count  percentage
0                 Delivery_Praise   2710        0.25
1                  Delivery_Issue   1418        0.13
2                  Unclassifiable   1095        0.10
3          Product Quality_Praise    838        0.08
4                  Product_Praise    713        0.06
5          Product Mismatch_Issue    424        0.04
6           Product Quality_Issue    319        0.03
7     Product Satisfaction_Praise    176        0.02
8                  Service_Praise    169        0.02
9                  Quality_Praise    166        0.02
10        Customer Service_Praise    160        0.01
11                         Praise     94        0.01
12         Customer Service_Issue     93        0.01
13           Product Damage_Issue     88        0.01
14           Product Defect_Issue     81        0.01
15                   Store_Praise     71        0.01
16                Packaging_Issue 

In [11]:
agg_fh_dict["all"].to_csv(config.results_agg_dir / "fh_category_all.csv", index=False)
agg_fh_dict["praise"].to_csv(
    config.results_agg_dir / "fh_category_praise.csv", index=False
)
agg_fh_dict["issue"].to_csv(
    config.results_agg_dir / "fh_category_issue.csv", index=False
)

#### 後半

In [12]:
agg_sh_dict = agg_review_category_ratio_by_category_name(
    sh_df_reviews_no_null,
    sh_df_reviews_no_null["product_category_name_english"].unique().tolist(),
)

Praise Review num:10670
Praise Issue num:8641
Issue ratio:0.40
全体
                       category  count  percentage
0               Delivery_Praise   4869        0.23
1                Delivery_Issue   3805        0.18
2                Unclassifiable   1998        0.09
3        Product Quality_Praise   1533        0.07
4                Product_Praise   1181        0.05
5        Product Mismatch_Issue    932        0.04
6         Product Quality_Issue    584        0.03
7                Service_Praise    324        0.02
8   Product Satisfaction_Praise    310        0.01
9                Quality_Praise    291        0.01
10      Customer Service_Praise    278        0.01
11       Customer Service_Issue    260        0.01
12                  Order_Issue    170        0.01
13         Product Damage_Issue    158        0.01
14                Product_Issue    150        0.01
15              Packaging_Issue    148        0.01
16         Product Defect_Issue    140        0.01
17      Order Fu

In [13]:
agg_sh_dict["all"].to_csv(config.results_agg_dir / "sh_category_all.csv", index=False)
agg_sh_dict["praise"].to_csv(
    config.results_agg_dir / "sh_category_praise.csv", index=False
)
agg_sh_dict["issue"].to_csv(
    config.results_agg_dir / "sh_category_issue.csv", index=False
)

### 成長率が鈍化している種類に限定したらどう？　→ 全体傾向と変わらない

In [14]:
low_growth_categories = [
    "watches_gifts",
    "cool_stuff",
    "toys",
    "sports_leisure",
    "bed_bath_table",
    "computers_accessories",
]

### 全体

In [15]:
agg_dict_low_growth = agg_review_category_ratio_by_category_name(
    df_review_no_null,
    low_growth_categories,
)
agg_dict_low_growth["all"].to_csv(
    config.results_agg_dir / "low_growth_category_all.csv", index=False
)
agg_dict_low_growth["praise"].to_csv(
    config.results_agg_dir / "low_growth_category_praise.csv", index=False
)
agg_dict_low_growth["issue"].to_csv(
    config.results_agg_dir / "low_growth_category_issue.csv", index=False
)

Praise Review num:7790
Praise Issue num:5765
Issue ratio:0.38
全体
                         category  count  percentage
0                 Delivery_Praise   3472        0.23
1                  Delivery_Issue   2397        0.16
2                  Unclassifiable   1484        0.10
3          Product Quality_Praise   1148        0.08
4                  Product_Praise    922        0.06
5          Product Mismatch_Issue    679        0.04
6           Product Quality_Issue    450        0.03
7     Product Satisfaction_Praise    224        0.01
8                  Service_Praise    222        0.01
9                  Quality_Praise    217        0.01
10        Customer Service_Praise    212        0.01
11         Customer Service_Issue    180        0.01
12           Product Defect_Issue    111        0.01
13                Packaging_Issue    110        0.01
14                    Order_Issue    108        0.01
15                  Product_Issue    106        0.01
16                         Praise 

#### 前半

In [16]:
agg_dict_low_growth_fh = agg_review_category_ratio_by_category_name(
    fh_df_reviews_no_null,
    low_growth_categories,
)
agg_dict_low_growth_fh["all"].to_csv(
    config.results_agg_dir / "low_growth_fh_category_all.csv", index=False
)
agg_dict_low_growth_fh["praise"].to_csv(
    config.results_agg_dir / "low_growth_fh_category_praise.csv", index=False
)
agg_dict_low_growth_fh["issue"].to_csv(
    config.results_agg_dir / "low_growth_fh_category_issue.csv", index=False
)

Praise Review num:2384
Praise Issue num:1283
Issue ratio:0.31
全体
                         category  count  percentage
0                 Delivery_Praise   1053        0.25
1                  Delivery_Issue    470        0.11
2                  Unclassifiable    420        0.10
3          Product Quality_Praise    342        0.08
4                  Product_Praise    295        0.07
5          Product Mismatch_Issue    155        0.04
6           Product Quality_Issue    111        0.03
7     Product Satisfaction_Praise     70        0.02
8         Customer Service_Praise     63        0.02
9                  Service_Praise     63        0.02
10                 Quality_Praise     63        0.02
11                         Praise     35        0.01
12         Customer Service_Issue     33        0.01
13  Product Recommendation_Praise     33        0.01
14                  Quality_Issue     31        0.01
15                   Store_Praise     30        0.01
16                Packaging_Issue 

In [17]:
agg_dict_low_growth_sh = agg_review_category_ratio_by_category_name(
    sh_df_reviews_no_null,
    low_growth_categories,
)
agg_dict_low_growth_sh["all"].to_csv(
    config.results_agg_dir / "low_growth_sh_category_all.csv", index=False
)
agg_dict_low_growth_sh["praise"].to_csv(
    config.results_agg_dir / "low_growth_sh_category_praise.csv", index=False
)
agg_dict_low_growth_sh["issue"].to_csv(
    config.results_agg_dir / "low_growth_sh_category_issue.csv", index=False
)


Praise Review num:3921
Praise Issue num:3369
Issue ratio:0.41
全体
                       category  count  percentage
0               Delivery_Praise   1743        0.21
1                Delivery_Issue   1473        0.18
2                Unclassifiable    765        0.09
3        Product Quality_Praise    596        0.07
4                Product_Praise    461        0.06
5        Product Mismatch_Issue    408        0.05
6         Product Quality_Issue    261        0.03
7                Service_Praise    118        0.01
8                Quality_Praise    115        0.01
9   Product Satisfaction_Praise    106        0.01
10       Customer Service_Issue    103        0.01
11      Customer Service_Praise    102        0.01
12                  Order_Issue     74        0.01
13              Packaging_Issue     64        0.01
14                Product_Issue     57        0.01
15         Product Defect_Issue     55        0.01
16                       Praise     46        0.01
17               