In [48]:
%%time
%matplotlib inline
%load_ext autoreload
%autoreload 2

import sys
from dataclasses import dataclass
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv


@dataclass
class Config:
    """Filesystem layout of the project.

    Every directory other than ``project_dir`` defaults to ``None`` and is
    derived in ``__post_init__`` from the *instance's* ``project_dir`` /
    ``data_dir``. Previously the derived defaults were evaluated once, at
    class-definition time, from the class-level ``project_dir``, so
    ``Config(project_dir=...)`` silently kept pointing at ``../../...``.

    Any directory can still be overridden explicitly by passing it to the
    constructor; only fields left unset are derived.
    """

    # Root of the project; the default is relative to the working directory.
    project_dir: Path = Path("../../")
    # Derived from project_dir when left as None.
    model_dir: "Path | None" = None
    outputs_dir: "Path | None" = None
    data_dir: "Path | None" = None
    # Derived from data_dir when left as None.
    raw_dir: "Path | None" = None
    interim_dir: "Path | None" = None
    processed_dir: "Path | None" = None

    def __post_init__(self):
        """Fill in every directory that was not passed explicitly."""
        if self.model_dir is None:
            self.model_dir = self.project_dir / "models"
        if self.outputs_dir is None:
            self.outputs_dir = self.project_dir / "outputs"
        if self.data_dir is None:
            self.data_dir = self.project_dir / "data"
        # data_dir is resolved first so the sub-directories follow an override of it.
        if self.raw_dir is None:
            self.raw_dir = self.data_dir / "raw"
        if self.interim_dir is None:
            self.interim_dir = self.data_dir / "interim"
        if self.processed_dir is None:
            self.processed_dir = self.data_dir / "processed"


config = Config()
# Make the project root importable from this notebook. The guard keeps the cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
_project_root = str(config.project_dir.resolve())
if _project_root not in sys.path:
    sys.path.append(_project_root)
# Kept as the last expression so the cell still displays load_dotenv()'s return value.
load_dotenv()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
CPU times: user 1.78 ms, sys: 2.71 ms, total: 4.49 ms
Wall time: 4.33 ms


True

In [49]:
# Load the three interim CSVs that get joined on order_id below.
df_reviews = pd.read_csv(
    config.interim_dir.joinpath(
        "olist_order_reviews_translated_label_review_categories_merged.csv"
    )
)
df_orders = pd.read_csv(
    config.interim_dir.joinpath("olist_orders_customer_merged.csv")
)
df_item_products = pd.read_csv(
    config.interim_dir.joinpath("olist_item_product_seller_merged.csv")
)

# Left join: every row of df_orders is kept, with item/product columns attached.
df_orders_item_products_merged = df_orders.merge(
    df_item_products, how="left", on="order_id"
)
# Sanity check: the join must not change the set size of distinct order ids.
assert (
    df_orders["order_id"].nunique()
    == df_orders_item_products_merged["order_id"].nunique()
)

# Left join again, this time attaching the review columns.
df_orders_item_products_review_merged = df_orders_item_products_merged.merge(
    df_reviews, how="left", on="order_id"
)
assert (
    df_orders_item_products_merged["order_id"].nunique()
    == df_orders_item_products_review_merged["order_id"].nunique()
)

In [50]:
df_order_info_details = pd.read_csv(config.processed_dir / "order_info_details.csv")
df_reviews = pd.read_csv(
    config.interim_dir
    / "olist_order_reviews_translated_label_review_categories_merged.csv",
)
# Keep the last review ROW per order (in file order).
# `groupby("order_id").last()` was used here before, but GroupBy.last returns the
# last NON-NULL value of each column independently, so an order with several
# reviews could end up with a row that mixes one review's review_id with another
# review's comment/category. drop_duplicates keeps whole rows intact.
# Rows without an order_id are dropped, as the groupby did implicitly.
df_reviews_last = (
    df_reviews.dropna(subset=["order_id"])
    .drop_duplicates(subset="order_id", keep="last")
    .reset_index(drop=True)
)
df_order_info_details_reviews_merged = pd.merge(
    df_order_info_details, df_reviews_last, on="order_id", how="left"
)
# order_id is unique in df_reviews_last, so the left join must not add rows.
assert df_order_info_details_reviews_merged.shape[0] == df_order_info_details.shape[0]

# Only rows that actually carry an (English-translated) review comment are analysed.
df_review_no_null = df_order_info_details_reviews_merged.dropna(
    subset=["review_comment_message_en"]
)
print(f"df_review_no_null.shape: {df_review_no_null.shape}")


df_review_no_null.shape: (40818, 83)


In [51]:
# Split the reviews into two purchase windows for the before/after comparison.
# order_purchase_date comes from a CSV without date parsing, so these are string
# comparisons. With an inclusive bound such as `<= '2017-10'`, any full date in
# the end month (e.g. '2017-10-15') sorts AFTER '2017-10' and was silently
# excluded. A half-open upper bound on the following month includes the whole end
# month and gives the same result if the column only holds 'YYYY-MM' values.
# NOTE(review): assumes ISO-like 'YYYY-MM[-DD ...]' strings — confirm against the data.
fh_df_reviews_no_null = df_review_no_null.query(
    "order_purchase_date >= '2017-02' & order_purchase_date < '2017-11'"
)
sh_df_reviews_no_null = df_review_no_null.query(
    "order_purchase_date >= '2017-12' & order_purchase_date < '2018-09'"
)

In [52]:
def _count_reviews_by_category(df):
    """Count unique ``review_id`` values per review category.

    Returns a DataFrame with columns ``category``, ``count`` and
    ``percentage`` (each count divided by the sum of all counts in ``df``,
    rounded to two decimals), sorted by ``count`` in descending order.
    """
    stats = (
        df.groupby("review_categories_str_modified")["review_id"]
        .nunique()
        .sort_values(ascending=False)
        .reset_index()
    )
    stats["percentage"] = (stats["review_id"] / stats["review_id"].sum()).round(2)
    stats.columns = ["category", "count", "percentage"]
    return stats


def agg_review_category_ratio_by_category_name(df, product_category_name):
    """Print and display review-category statistics for product categories.

    Args:
        df: DataFrame with the columns ``product_category_name_english``,
            ``review_id`` and ``review_categories_str_modified``.
        product_category_name: a single category name, or an iterable of
            category names, matched against ``product_category_name_english``.

    Prints the number of unique reviews whose category label ends with
    ``Praise`` / ``Issue`` and the share of issue reviews among all unique
    reviews of the selection, then displays the top-20 category tables for
    all reviews, praise reviews only and issue reviews only. In the praise
    and issue tables ``percentage`` is relative to that subset.

    Returns:
        None. If the selection contains no reviews, a short message is
        printed instead of raising ``ZeroDivisionError``.
    """
    # A bare string means "this one category", not an iterable of characters.
    if isinstance(product_category_name, str):
        target_categories = [product_category_name]
    else:
        target_categories = list(product_category_name)

    df_target = df[df["product_category_name_english"].isin(target_categories)]
    total_review_num = df_target["review_id"].nunique()
    if total_review_num == 0:
        print(f"No reviews found for product category: {target_categories}")
        return

    # na=False: rows without a category label count as neither praise nor issue
    # (a mask containing NaN would make the filter fail).
    review_categories = df_target["review_categories_str_modified"]
    df_target_praise = df_target[review_categories.str.endswith("Praise", na=False)]
    df_target_issue = df_target[review_categories.str.endswith("Issue", na=False)]

    praise_review_num = df_target_praise["review_id"].nunique()
    issue_review_num = df_target_issue["review_id"].nunique()
    print(f"Praise Review num:{praise_review_num}")
    # This label used to read "Praise Issue num:", although the value is the issue count.
    print(f"Issue Review num:{issue_review_num}")
    print(f"Issue ratio:{(issue_review_num / total_review_num):.2f}")

    print("全体")
    display(_count_reviews_by_category(df_target).head(20))
    print("praise")
    display(_count_reviews_by_category(df_target_praise).head(20))
    print("issue")
    display(_count_reviews_by_category(df_target_issue).head(20))

# 成長率が低くなった商品群

## watches_gifts

### 前半

In [53]:
# watches_gifts: review-category breakdown for the earlier purchase window (fh_df_reviews_no_null).
agg_review_category_ratio_by_category_name(fh_df_reviews_no_null, "watches_gifts")

Praise Review num:278
Praise Issue num:130
Issue ratio:0.28
全体


Unnamed: 0,category,count,percentage
0,Delivery_Praise,132,0.29
1,Delivery_Issue,44,0.1
2,Product Quality_Praise,39,0.09
3,Unclassifiable,39,0.09
4,Product_Praise,34,0.07
5,Product Quality_Issue,12,0.03
6,Product Mismatch_Issue,11,0.02
7,Product Satisfaction_Praise,10,0.02
8,Product Defect_Issue,9,0.02
9,Service_Praise,9,0.02


praise


Unnamed: 0,category,count,percentage
0,Delivery_Praise,132,0.47
1,Product Quality_Praise,39,0.14
2,Product_Praise,34,0.12
3,Product Satisfaction_Praise,10,0.04
4,Service_Praise,9,0.03
5,Customer Service_Praise,5,0.02
6,Quality_Praise,4,0.01
7,Praise,4,0.01
8,Store_Praise,3,0.01
9,Performance_Praise,2,0.01


issue


Unnamed: 0,category,count,percentage
0,Delivery_Issue,44,0.34
1,Product Quality_Issue,12,0.09
2,Product Mismatch_Issue,11,0.08
3,Product Defect_Issue,9,0.07
4,Product Condition_Issue,7,0.05
5,Customer Service_Issue,7,0.05
6,Product_Issue,4,0.03
7,Product Size_Issue,3,0.02
8,Refund_Issue,2,0.02
9,Service_Issue,2,0.02


### 後半

In [54]:
# watches_gifts: review-category breakdown for the later purchase window (sh_df_reviews_no_null).
agg_review_category_ratio_by_category_name(sh_df_reviews_no_null, "watches_gifts")

Praise Review num:756
Praise Issue num:634
Issue ratio:0.41
全体


Unnamed: 0,category,count,percentage
0,Delivery_Praise,356,0.23
1,Delivery_Issue,251,0.16
2,Unclassifiable,132,0.09
3,Product Quality_Praise,106,0.07
4,Product_Praise,97,0.06
5,Product Mismatch_Issue,81,0.05
6,Product Quality_Issue,43,0.03
7,Packaging_Issue,28,0.02
8,Customer Service_Issue,27,0.02
9,Product Satisfaction_Praise,26,0.02


praise


Unnamed: 0,category,count,percentage
0,Delivery_Praise,356,0.47
1,Product Quality_Praise,106,0.14
2,Product_Praise,97,0.13
3,Product Satisfaction_Praise,26,0.03
4,Service_Praise,20,0.03
5,Customer Service_Praise,19,0.03
6,Quality_Praise,11,0.01
7,Satisfaction_Praise,6,0.01
8,Performance_Praise,6,0.01
9,Recommendation_Praise,6,0.01


issue


Unnamed: 0,category,count,percentage
0,Delivery_Issue,251,0.4
1,Product Mismatch_Issue,81,0.13
2,Product Quality_Issue,43,0.07
3,Packaging_Issue,28,0.04
4,Customer Service_Issue,27,0.04
5,Product Authenticity_Issue,16,0.03
6,Product Defect_Issue,14,0.02
7,Product_Issue,11,0.02
8,Product Size_Issue,11,0.02
9,Order_Issue,9,0.01


## cool_stuff

In [55]:
# cool_stuff: review-category breakdown for the earlier purchase window (fh_df_reviews_no_null).
agg_review_category_ratio_by_category_name(fh_df_reviews_no_null, "cool_stuff")

Praise Review num:355
Praise Issue num:161
Issue ratio:0.28
全体


Unnamed: 0,category,count,percentage
0,Delivery_Praise,162,0.28
1,Product Quality_Praise,64,0.11
2,Delivery_Issue,63,0.11
3,Unclassifiable,49,0.09
4,Product_Praise,45,0.08
5,Product Quality_Issue,11,0.02
6,Customer Service_Praise,11,0.02
7,Service_Praise,10,0.02
8,Product Mismatch_Issue,10,0.02
9,Quality_Praise,7,0.01


praise


Unnamed: 0,category,count,percentage
0,Delivery_Praise,162,0.46
1,Product Quality_Praise,64,0.18
2,Product_Praise,45,0.13
3,Customer Service_Praise,11,0.03
4,Service_Praise,10,0.03
5,Quality_Praise,7,0.02
6,Product Satisfaction_Praise,6,0.02
7,Store_Praise,5,0.01
8,Praise,4,0.01
9,Product Recommendation_Praise,4,0.01


issue


Unnamed: 0,category,count,percentage
0,Delivery_Issue,63,0.39
1,Product Quality_Issue,11,0.07
2,Product Mismatch_Issue,10,0.06
3,Product_Issue,6,0.04
4,Shipping Cost_Issue,4,0.02
5,Quality_Issue,4,0.02
6,Customer Service_Issue,3,0.02
7,Performance_Issue,3,0.02
8,Product Condition_Issue,3,0.02
9,Pricing_Issue,2,0.01


In [56]:
# cool_stuff: review-category breakdown for the later purchase window (sh_df_reviews_no_null).
agg_review_category_ratio_by_category_name(sh_df_reviews_no_null, "cool_stuff")

Praise Review num:343
Praise Issue num:226
Issue ratio:0.36
全体


Unnamed: 0,category,count,percentage
0,Delivery_Praise,168,0.27
1,Delivery_Issue,103,0.16
2,Unclassifiable,55,0.09
3,Product Quality_Praise,52,0.08
4,Product_Praise,38,0.06
5,Product Quality_Issue,15,0.02
6,Quality_Praise,14,0.02
7,Product Mismatch_Issue,13,0.02
8,Product Satisfaction_Praise,8,0.01
9,Customer Service_Issue,8,0.01


praise


Unnamed: 0,category,count,percentage
0,Delivery_Praise,168,0.49
1,Product Quality_Praise,52,0.15
2,Product_Praise,38,0.11
3,Quality_Praise,14,0.04
4,Product Satisfaction_Praise,8,0.02
5,Service_Praise,6,0.02
6,Customer Service_Praise,5,0.01
7,Recommendation_Praise,5,0.01
8,Praise,5,0.01
9,Satisfaction_Praise,4,0.01


issue


Unnamed: 0,category,count,percentage
0,Delivery_Issue,103,0.46
1,Product Quality_Issue,15,0.07
2,Product Mismatch_Issue,13,0.06
3,Customer Service_Issue,8,0.04
4,Packaging_Issue,6,0.03
5,Product Defect_Issue,5,0.02
6,Product_Issue,5,0.02
7,Order_Issue,4,0.02
8,Product Size_Issue,4,0.02
9,Product Damage_Issue,4,0.02


## toys

### 前半

In [57]:
# toys: review-category breakdown for the earlier purchase window (fh_df_reviews_no_null).
agg_review_category_ratio_by_category_name(fh_df_reviews_no_null, "toys")

Praise Review num:294
Praise Issue num:120
Issue ratio:0.26
全体


Unnamed: 0,category,count,percentage
0,Delivery_Praise,152,0.33
1,Unclassifiable,44,0.09
2,Delivery_Issue,42,0.09
3,Product_Praise,32,0.07
4,Product Quality_Praise,28,0.06
5,Product Mismatch_Issue,14,0.03
6,Product Satisfaction_Praise,10,0.02
7,Product Quality_Issue,10,0.02
8,Customer Service_Praise,9,0.02
9,Quality_Praise,8,0.02


praise


Unnamed: 0,category,count,percentage
0,Delivery_Praise,152,0.52
1,Product_Praise,32,0.11
2,Product Quality_Praise,28,0.1
3,Product Satisfaction_Praise,10,0.03
4,Customer Service_Praise,9,0.03
5,Quality_Praise,8,0.03
6,Product Recommendation_Praise,7,0.02
7,Service_Praise,5,0.02
8,Seller_Praise,4,0.01
9,Praise,4,0.01


issue


Unnamed: 0,category,count,percentage
0,Delivery_Issue,42,0.35
1,Product Mismatch_Issue,14,0.12
2,Product Quality_Issue,10,0.08
3,Product Damage_Issue,7,0.06
4,Customer Service_Issue,5,0.04
5,Quality_Issue,5,0.04
6,Shipping Cost_Issue,3,0.02
7,Refund_Issue,3,0.02
8,Product Condition_Issue,3,0.02
9,Packaging_Issue,2,0.02


### 後半

In [58]:
# toys: review-category breakdown for the later purchase window (sh_df_reviews_no_null).
agg_review_category_ratio_by_category_name(sh_df_reviews_no_null, "toys")

Praise Review num:300
Praise Issue num:252
Issue ratio:0.40
全体


Unnamed: 0,category,count,percentage
0,Delivery_Praise,146,0.23
1,Delivery_Issue,104,0.17
2,Unclassifiable,65,0.1
3,Product_Praise,40,0.06
4,Product Quality_Praise,37,0.06
5,Product Mismatch_Issue,27,0.04
6,Product Quality_Issue,25,0.04
7,Customer Service_Praise,12,0.02
8,Service_Praise,8,0.01
9,Product Damage_Issue,6,0.01


praise


Unnamed: 0,category,count,percentage
0,Delivery_Praise,146,0.49
1,Product_Praise,40,0.13
2,Product Quality_Praise,37,0.12
3,Customer Service_Praise,12,0.04
4,Service_Praise,8,0.03
5,Product Satisfaction_Praise,6,0.02
6,Quality_Praise,5,0.02
7,Praise,5,0.02
8,Product Performance_Praise,4,0.01
9,Product Condition_Praise,3,0.01


issue


Unnamed: 0,category,count,percentage
0,Delivery_Issue,104,0.41
1,Product Mismatch_Issue,27,0.11
2,Product Quality_Issue,25,0.1
3,Product Damage_Issue,6,0.02
4,Product Condition_Issue,6,0.02
5,Packaging_Issue,6,0.02
6,Product Size_Issue,5,0.02
7,Product_Issue,5,0.02
8,Order Cancellation_Issue,5,0.02
9,Customer Service_Issue,4,0.02


## sports_leisure

### 前半

In [59]:
# sports_leisure: review-category breakdown for the earlier purchase window (fh_df_reviews_no_null).
agg_review_category_ratio_by_category_name(fh_df_reviews_no_null, "sports_leisure")

Praise Review num:443
Praise Issue num:242
Issue ratio:0.31
全体


Unnamed: 0,category,count,percentage
0,Delivery_Praise,223,0.28
1,Delivery_Issue,116,0.15
2,Unclassifiable,88,0.11
3,Product Quality_Praise,57,0.07
4,Product_Praise,36,0.05
5,Product Mismatch_Issue,26,0.03
6,Customer Service_Praise,14,0.02
7,Store_Praise,11,0.01
8,Service_Praise,10,0.01
9,Packaging_Issue,8,0.01


praise


Unnamed: 0,category,count,percentage
0,Delivery_Praise,223,0.5
1,Product Quality_Praise,57,0.13
2,Product_Praise,36,0.08
3,Customer Service_Praise,14,0.03
4,Store_Praise,11,0.02
5,Service_Praise,10,0.02
6,Product Satisfaction_Praise,8,0.02
7,Praise,6,0.01
8,Recommendation_Praise,5,0.01
9,Product Recommendation_Praise,5,0.01


issue


Unnamed: 0,category,count,percentage
0,Delivery_Issue,116,0.48
1,Product Mismatch_Issue,26,0.11
2,Packaging_Issue,8,0.03
3,Product Quality_Issue,6,0.02
4,Order Fulfillment_Issue,6,0.02
5,Order_Issue,5,0.02
6,Customer Service_Issue,5,0.02
7,Product Defect_Issue,5,0.02
8,Product_Issue,5,0.02
9,Shipping Cost_Issue,4,0.02


# 成長率が伸びている商品群

## health_beauty

### 前半

In [60]:
# health_beauty: review-category breakdown for the earlier purchase window (fh_df_reviews_no_null).
agg_review_category_ratio_by_category_name(fh_df_reviews_no_null, "health_beauty")

Praise Review num:452
Praise Issue num:242
Issue ratio:0.29
全体


Unnamed: 0,category,count,percentage
0,Delivery_Praise,226,0.27
1,Unclassifiable,117,0.14
2,Delivery_Issue,95,0.12
3,Product_Praise,59,0.07
4,Product Quality_Praise,44,0.05
5,Product Mismatch_Issue,32,0.04
6,Product Quality_Issue,16,0.02
7,Praise,13,0.02
8,Service_Praise,13,0.02
9,Quality_Praise,10,0.01


praise


Unnamed: 0,category,count,percentage
0,Delivery_Praise,226,0.5
1,Product_Praise,59,0.13
2,Product Quality_Praise,44,0.1
3,Service_Praise,13,0.03
4,Praise,13,0.03
5,Quality_Praise,10,0.02
6,Product Satisfaction_Praise,9,0.02
7,Customer Service_Praise,7,0.02
8,Recommendation_Praise,6,0.01
9,Store_Praise,5,0.01


issue


Unnamed: 0,category,count,percentage
0,Delivery_Issue,95,0.39
1,Product Mismatch_Issue,32,0.13
2,Product Quality_Issue,16,0.07
3,Product Performance_Issue,9,0.04
4,Packaging_Issue,7,0.03
5,Product_Issue,6,0.02
6,Customer Service_Issue,5,0.02
7,Shipping_Issue,5,0.02
8,Product Authenticity_Issue,5,0.02
9,Product Defect_Issue,5,0.02


### 後半

In [61]:
# health_beauty: review-category breakdown for the later purchase window (sh_df_reviews_no_null).
agg_review_category_ratio_by_category_name(sh_df_reviews_no_null, "health_beauty")

Praise Review num:934
Praise Issue num:675
Issue ratio:0.37
全体


Unnamed: 0,category,count,percentage
0,Delivery_Praise,506,0.28
1,Delivery_Issue,326,0.18
2,Unclassifiable,194,0.11
3,Product Quality_Praise,93,0.05
4,Product_Praise,72,0.04
5,Product Mismatch_Issue,68,0.04
6,Service_Praise,36,0.02
7,Product Quality_Issue,27,0.01
8,Product Satisfaction_Praise,26,0.01
9,Customer Service_Issue,21,0.01


praise


Unnamed: 0,category,count,percentage
0,Delivery_Praise,506,0.54
1,Product Quality_Praise,93,0.1
2,Product_Praise,72,0.08
3,Service_Praise,36,0.04
4,Product Satisfaction_Praise,26,0.03
5,Customer Service_Praise,21,0.02
6,Quality_Praise,16,0.02
7,Product Performance_Praise,9,0.01
8,Product Recommendation_Praise,7,0.01
9,Overall Satisfaction_Praise,7,0.01


issue


Unnamed: 0,category,count,percentage
0,Delivery_Issue,326,0.48
1,Product Mismatch_Issue,68,0.1
2,Product Quality_Issue,27,0.04
3,Customer Service_Issue,21,0.03
4,Packaging_Issue,17,0.03
5,Shipping_Issue,16,0.02
6,Product Damage_Issue,14,0.02
7,Product_Issue,12,0.02
8,Order Fulfillment_Issue,10,0.01
9,Order_Issue,10,0.01
