In [3]:
%%time
%matplotlib inline
%load_ext autoreload
%autoreload 2

import sys
from dataclasses import dataclass
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv


@dataclass
class Config:
    project_dir: Path = Path("../../")
    model_dir: Path = project_dir / "models"
    outputs_dir: Path = project_dir / "outputs"
    data_dir: Path = project_dir / "data"
    raw_dir: Path = data_dir / "raw"
    interim_dir: Path = data_dir / "interim"
    processed_dir: Path = data_dir / "processed"


config = Config()
sys.path.append(str(config.project_dir.resolve()))
load_dotenv()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
CPU times: user 2.13 ms, sys: 2.61 ms, total: 4.74 ms
Wall time: 4.76 ms


True

In [4]:
df_order_info_details = pd.read_csv(config.processed_dir / "order_info_details.csv")
df_reviews = pd.read_csv(
    config.interim_dir
    / "olist_order_reviews_translated_label_review_categories_merged.csv",
)
df_reviews_last = df_reviews.groupby("order_id").last().reset_index()
df_order_info_details_reviews_merged = pd.merge(
    df_order_info_details, df_reviews_last, on="order_id", how="left"
)
assert df_order_info_details_reviews_merged.shape[0] == df_order_info_details.shape[0]

df_review_no_null = df_order_info_details_reviews_merged.dropna(
    subset=["review_comment_message_en"]
)
print(f"df_review_no_null.shape: {df_review_no_null.shape}")


1. 基本分析
- 全体傾向として，多いクレーム種類は何か
-  前半と後半で傾向ことなるのか？
-  成長率が鈍化している種類に限定したらどう？
2. 詳細分析

## label_translated列とreview_categories_str_modified列の関係性

In [6]:
praise_rows = df_review_no_null.query(
    "review_categories_str_modified.str.endswith('Praise')"
).shape[0]
print(f"Praise in df_review_no_null: {praise_rows}")
issue_rows = df_review_no_null.query(
    "review_categories_str_modified.str.endswith('Issue')"
).shape[0]
print(f"Issue in df_review_no_null: {issue_rows}")
print()
df_review_no_null["label_translated"].value_counts()

Praise in df_review_no_null: 20795
Issue in df_review_no_null: 15534



label_translated
Positive     20729
not_clear     9746
Negative      8148
Neutral       2077
Name: count, dtype: int64

In [7]:
print("Praise")
display(
    df_review_no_null.query("review_categories_str_modified.str.endswith('Praise')")[
        "label_translated"
    ].value_counts()
)
print("Issue")
display(
    df_review_no_null.query("review_categories_str_modified.str.endswith('Issue')")[
        "label_translated"
    ].value_counts()
)

Praise


label_translated
Positive     17726
not_clear     2234
Neutral        829
Negative         6
Name: count, dtype: int64

Issue


label_translated
Negative     8001
not_clear    5713
Neutral       911
Positive      909
Name: count, dtype: int64

※ review_categories_str_modified列がIssueでも，label_translated列がnot_clearのやつは文章が長くてむずかったと考えられる

## 1.基本分析


In [9]:
def agg_review_category_ratio_by_category_name(df, product_category_name):
    df_target = df.query("product_category_name_english in @product_category_name")
    df_target_praise = df_target.query(
        "review_categories_str_modified.str.endswith('Praise')"
    )
    df_target_issue = df_target.query(
        "review_categories_str_modified.str.endswith('Issue')"
    )
    print(f"Praise Review num:{df_target_praise['review_id'].nunique()}")
    print(f"Praise Issue num:{df_target_issue['review_id'].nunique()}")
    print(
        f"Issue ratio:{(df_target_issue['review_id'].nunique() / df_target['review_id'].nunique()):.2f}"
    )
    category_count_stats = (
        df_target.groupby(["review_categories_str_modified"])["review_id"]
        .nunique()
        .sort_values(ascending=False)
        .reset_index()
    )
    category_count_stats["percentage"] = (
        category_count_stats["review_id"] / category_count_stats["review_id"].sum()
    ).round(2)
    category_count_stats.columns = ["category", "count", "percentage"]
    print("全体")
    display(category_count_stats.head(20))
    print("praise")
    category_praise_count_stats = (
        df_target.query("review_categories_str_modified.str.endswith('Praise')")
        .groupby(["review_categories_str_modified"])["review_id"]
        .nunique()
        .sort_values(ascending=False)
        .reset_index()
    )
    category_praise_count_stats["percentage"] = (
        category_praise_count_stats["review_id"]
        / category_praise_count_stats["review_id"].sum()
    ).round(2)
    category_praise_count_stats.columns = ["category", "count", "percentage"]
    display(category_praise_count_stats.head(20))
    print("issue")
    category_issue_count_stats = (
        df_target.query("review_categories_str_modified.str.endswith('Issue')")
        .groupby(["review_categories_str_modified"])["review_id"]
        .nunique()
        .sort_values(ascending=False)
        .reset_index()
    )
    category_issue_count_stats["percentage"] = (
        category_issue_count_stats["review_id"]
        / category_issue_count_stats["review_id"].sum()
    ).round(2)
    category_issue_count_stats.columns = ["category", "count", "percentage"]
    display(category_issue_count_stats.head(20))

### 1.1. レビュー全体

- 全体傾向として，多いレビュー種類は何か

#### 全体

In [16]:
agg_review_category_ratio_by_category_name(
    df_review_no_null,
    df_review_no_null["product_category_name_english"].unique().tolist(),
)

Praise Review num:20766
Praise Issue num:15471
Issue ratio:0.38
全体


Unnamed: 0,category,count,percentage
0,Delivery_Praise,9445,0.23
1,Delivery_Issue,6552,0.16
2,Unclassifiable,3856,0.09
3,Product Quality_Praise,2956,0.07
4,Product_Praise,2332,0.06
5,Product Mismatch_Issue,1675,0.04
6,Product Quality_Issue,1090,0.03
7,Product Satisfaction_Praise,606,0.01
8,Service_Praise,604,0.01
9,Quality_Praise,572,0.01


praise


Unnamed: 0,category,count,percentage
0,Delivery_Praise,9445,0.45
1,Product Quality_Praise,2956,0.14
2,Product_Praise,2332,0.11
3,Product Satisfaction_Praise,606,0.03
4,Service_Praise,604,0.03
5,Quality_Praise,572,0.03
6,Customer Service_Praise,551,0.03
7,Praise,276,0.01
8,Product Recommendation_Praise,214,0.01
9,Satisfaction_Praise,181,0.01


issue


Unnamed: 0,category,count,percentage
0,Delivery_Issue,6552,0.42
1,Product Mismatch_Issue,1675,0.11
2,Product Quality_Issue,1090,0.07
3,Customer Service_Issue,450,0.03
4,Product Damage_Issue,293,0.02
5,Order_Issue,290,0.02
6,Product Defect_Issue,289,0.02
7,Packaging_Issue,275,0.02
8,Product_Issue,274,0.02
9,Order Fulfillment_Issue,230,0.01


### 1.2. 前半と後半で傾向ことなるのか？

#### 前半

In [17]:
fh_df_reviews_no_null = df_review_no_null.query(
    "order_purchase_date >= '2017-02' & order_purchase_date <= '2017-10'"
)
sh_df_reviews_no_null = df_review_no_null.query(
    "order_purchase_date >= '2017-12' & order_purchase_date <= '2018-08'"
)

In [18]:
agg_review_category_ratio_by_category_name(
    fh_df_reviews_no_null,
    fh_df_reviews_no_null["product_category_name_english"].unique().tolist(),
)

Praise Review num:6053
Praise Issue num:3722
Issue ratio:0.34
全体


Unnamed: 0,category,count,percentage
0,Delivery_Praise,2710,0.25
1,Delivery_Issue,1418,0.13
2,Unclassifiable,1095,0.1
3,Product Quality_Praise,838,0.08
4,Product_Praise,713,0.06
5,Product Mismatch_Issue,424,0.04
6,Product Quality_Issue,319,0.03
7,Product Satisfaction_Praise,176,0.02
8,Service_Praise,169,0.02
9,Quality_Praise,166,0.02


praise


Unnamed: 0,category,count,percentage
0,Delivery_Praise,2710,0.45
1,Product Quality_Praise,838,0.14
2,Product_Praise,713,0.12
3,Product Satisfaction_Praise,176,0.03
4,Service_Praise,169,0.03
5,Quality_Praise,166,0.03
6,Customer Service_Praise,160,0.03
7,Praise,94,0.02
8,Store_Praise,71,0.01
9,Product Recommendation_Praise,65,0.01


issue


Unnamed: 0,category,count,percentage
0,Delivery_Issue,1418,0.38
1,Product Mismatch_Issue,424,0.11
2,Product Quality_Issue,319,0.09
3,Customer Service_Issue,93,0.02
4,Product Damage_Issue,88,0.02
5,Product Defect_Issue,81,0.02
6,Packaging_Issue,67,0.02
7,Product_Issue,66,0.02
8,Product Condition_Issue,61,0.02
9,Order_Issue,56,0.02


#### 後半

In [19]:
agg_review_category_ratio_by_category_name(
    sh_df_reviews_no_null,
    sh_df_reviews_no_null["product_category_name_english"].unique().tolist(),
)

Praise Review num:10670
Praise Issue num:8641
Issue ratio:0.40
全体


Unnamed: 0,category,count,percentage
0,Delivery_Praise,4869,0.23
1,Delivery_Issue,3805,0.18
2,Unclassifiable,1998,0.09
3,Product Quality_Praise,1533,0.07
4,Product_Praise,1181,0.05
5,Product Mismatch_Issue,932,0.04
6,Product Quality_Issue,584,0.03
7,Service_Praise,324,0.02
8,Product Satisfaction_Praise,310,0.01
9,Quality_Praise,291,0.01


praise


Unnamed: 0,category,count,percentage
0,Delivery_Praise,4869,0.46
1,Product Quality_Praise,1533,0.14
2,Product_Praise,1181,0.11
3,Service_Praise,324,0.03
4,Product Satisfaction_Praise,310,0.03
5,Quality_Praise,291,0.03
6,Customer Service_Praise,278,0.03
7,Praise,134,0.01
8,Product Recommendation_Praise,95,0.01
9,Satisfaction_Praise,94,0.01


issue


Unnamed: 0,category,count,percentage
0,Delivery_Issue,3805,0.44
1,Product Mismatch_Issue,932,0.11
2,Product Quality_Issue,584,0.07
3,Customer Service_Issue,260,0.03
4,Order_Issue,170,0.02
5,Product Damage_Issue,158,0.02
6,Product_Issue,150,0.02
7,Packaging_Issue,148,0.02
8,Product Defect_Issue,140,0.02
9,Order Fulfillment_Issue,137,0.02


### 成長率が鈍化している種類に限定したらどう？　→ 全体傾向と変わらない

In [20]:
low_growth_categories = [
    "watches_gifts",
    "cool_stuff",
    "toys",
    "sports_leisure",
    "bed_bath_table",
    "computers_accessories",
]

#### 前半

In [21]:
agg_review_category_ratio_by_category_name(
    fh_df_reviews_no_null,
    low_growth_categories,
)

Praise Review num:2384
Praise Issue num:1283
Issue ratio:0.31
全体


Unnamed: 0,category,count,percentage
0,Delivery_Praise,1053,0.25
1,Delivery_Issue,470,0.11
2,Unclassifiable,420,0.1
3,Product Quality_Praise,342,0.08
4,Product_Praise,295,0.07
5,Product Mismatch_Issue,155,0.04
6,Product Quality_Issue,111,0.03
7,Product Satisfaction_Praise,70,0.02
8,Customer Service_Praise,63,0.02
9,Service_Praise,63,0.02


praise


Unnamed: 0,category,count,percentage
0,Delivery_Praise,1053,0.44
1,Product Quality_Praise,342,0.14
2,Product_Praise,295,0.12
3,Product Satisfaction_Praise,70,0.03
4,Service_Praise,63,0.03
5,Customer Service_Praise,63,0.03
6,Quality_Praise,63,0.03
7,Praise,35,0.01
8,Product Recommendation_Praise,33,0.01
9,Store_Praise,30,0.01


issue


Unnamed: 0,category,count,percentage
0,Delivery_Issue,470,0.37
1,Product Mismatch_Issue,155,0.12
2,Product Quality_Issue,111,0.09
3,Customer Service_Issue,33,0.03
4,Quality_Issue,31,0.02
5,Packaging_Issue,29,0.02
6,Product Defect_Issue,29,0.02
7,Product_Issue,26,0.02
8,Product Condition_Issue,20,0.02
9,Order Fulfillment_Issue,17,0.01


In [22]:
agg_review_category_ratio_by_category_name(
    sh_df_reviews_no_null,
    low_growth_categories,
)

Praise Review num:3921
Praise Issue num:3369
Issue ratio:0.41
全体


Unnamed: 0,category,count,percentage
0,Delivery_Praise,1743,0.21
1,Delivery_Issue,1473,0.18
2,Unclassifiable,765,0.09
3,Product Quality_Praise,596,0.07
4,Product_Praise,461,0.06
5,Product Mismatch_Issue,408,0.05
6,Product Quality_Issue,261,0.03
7,Service_Praise,118,0.01
8,Quality_Praise,115,0.01
9,Product Satisfaction_Praise,106,0.01


praise


Unnamed: 0,category,count,percentage
0,Delivery_Praise,1743,0.44
1,Product Quality_Praise,596,0.15
2,Product_Praise,461,0.12
3,Service_Praise,118,0.03
4,Quality_Praise,115,0.03
5,Product Satisfaction_Praise,106,0.03
6,Customer Service_Praise,102,0.03
7,Praise,46,0.01
8,Satisfaction_Praise,34,0.01
9,Performance_Praise,34,0.01


issue


Unnamed: 0,category,count,percentage
0,Delivery_Issue,1473,0.44
1,Product Mismatch_Issue,408,0.12
2,Product Quality_Issue,261,0.08
3,Customer Service_Issue,103,0.03
4,Order_Issue,74,0.02
5,Packaging_Issue,64,0.02
6,Product_Issue,57,0.02
7,Product Defect_Issue,55,0.02
8,Quality_Issue,46,0.01
9,Product Size_Issue,42,0.01
