In [1]:
%%time
%matplotlib inline
%load_ext autoreload
%autoreload 2

import sys
from dataclasses import dataclass
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv


@dataclass
class Config:
    """Relative directory layout used by the notebook.

    Every default below is derived from an earlier field while the class body
    is executed, so the derived paths are fixed at class-definition time:
    passing a different ``project_dir`` to the constructor does not re-derive
    the other fields.
    """

    # Root directory: two levels above the current working directory.
    project_dir: Path = Path("../../")

    # Directories directly under the root.
    model_dir: Path = project_dir.joinpath("models")
    outputs_dir: Path = project_dir.joinpath("outputs")
    results_dir: Path = project_dir.joinpath("results")

    # Sub-directories of ``results_dir``.
    results_agg_dir: Path = results_dir.joinpath("agg")
    results_figs_dir: Path = results_dir.joinpath("figs")

    # Data directory and its sub-directories.
    data_dir: Path = project_dir.joinpath("data")
    raw_dir: Path = data_dir.joinpath("raw")
    interim_dir: Path = data_dir.joinpath("interim")
    processed_dir: Path = data_dir.joinpath("processed")


# Shared path configuration used by the cells below.
config = Config()

# Put the resolved project root on the import search path. The membership
# check keeps the cell idempotent: re-running it in the same kernel does not
# append the same entry again.
_project_root = str(config.project_dir.resolve())
if _project_root not in sys.path:
    sys.path.append(_project_root)

# Kept as the last expression so its return value is shown as the cell output.
load_dotenv()

CPU times: user 295 ms, sys: 91.3 ms, total: 387 ms
Wall time: 662 ms


True

In [2]:
# Read the reviews CSV from the interim data folder, parse its two timestamp
# columns, and derive month/day periods from them. `assign` evaluates its
# keyword arguments in order, so each derived column sees the already-parsed
# timestamp column it depends on.
df_reviews = pd.read_csv(
    config.interim_dir / "olist_order_reviews_translated_label_merged.csv"
).assign(
    review_creation_date=lambda frame: pd.to_datetime(frame["review_creation_date"]),
    review_creation_month=lambda frame: frame["review_creation_date"].dt.to_period("M"),
    review_answer_timestamp=lambda frame: pd.to_datetime(
        frame["review_answer_timestamp"]
    ),
    review_answer_month=lambda frame: frame["review_answer_timestamp"].dt.to_period("M"),
    review_answer_date=lambda frame: frame["review_answer_timestamp"].dt.to_period("D"),
)
# Load the two interim tables that are joined below.
df_orders = pd.read_csv(config.interim_dir.joinpath("olist_orders_customer_merged.csv"))
df_item_products = pd.read_csv(
    config.interim_dir.joinpath("olist_item_product_seller_merged.csv")
)

# Left join on order_id: every row of df_orders is kept, with item columns
# attached where a matching order_id exists in df_item_products.
df_orders_item_products_merged = df_orders.merge(
    df_item_products,
    how="left",
    on="order_id",
)

# Sanity check on the number of distinct order ids before and after the join.
# NOTE(review): a left join keeps every left-hand key, so this comparison looks
# like it can never fail — confirm whether a row-count check or the merge
# `validate=` argument was intended.
_order_ids_after_item_join = df_orders_item_products_merged["order_id"]
assert _order_ids_after_item_join.nunique() == df_orders["order_id"].nunique()
# Left join the reviews onto the order/item table by order_id; rows without a
# matching review are kept with missing review columns.
df_orders_item_products_review_merged = df_orders_item_products_merged.merge(
    df_reviews, how="left", on="order_id"
)

# Sanity check on the number of distinct order ids before and after the join.
# NOTE(review): as with any left join, the left-hand keys are all preserved, so
# this comparison looks like it can never fail — confirm the intended check.
_order_ids_after_review_join = df_orders_item_products_review_merged["order_id"]
assert (
    _order_ids_after_review_join.nunique()
    == df_orders_item_products_merged["order_id"].nunique()
)

In [3]:
# Show the column index of the merged order/item/review table.
df_orders_item_products_review_merged.columns

Index(['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp',
       'order_approved_at', 'order_delivered_carrier_date',
       'order_delivered_customer_date', 'order_estimated_delivery_date',
       'order_purchase_month', 'order_purchase_date', 'order_approved_month',
       'order_approved_date', 'order_delivered_carrier_month',
       'order_delivered_customer_month', 'order_estimated_delivery_month',
       'approved_span', 'estimated_delivery_span',
       'order_delivered_carrier_date_span_from_purchase',
       'order_delivered_customer_date_span_from_purchase',
       'order_delivered_customer_date_span_from_delivery_carrier',
       'is_delivery_delayed', 'customer_unique_id', 'customer_zip_code_prefix',
       'customer_city', 'customer_state', 'order_item_id', 'product_id',
       'seller_id', 'shipping_limit_date', 'price', 'freight_value',
       'sum_price_freight_by_order', 'product_category_name',
       'product_name_lenght', 'product_description_len

In [4]:
# Distinct (review_id, review_comment_message_en) pairs, restricted to rows
# whose English comment is not null, re-indexed from 0.
df_review_no_null = (
    df_reviews.dropna(subset=["review_comment_message_en"])[
        ["review_id", "review_comment_message_en"]
    ]
    .drop_duplicates()
    # drop=True discards the old index directly instead of materialising it as
    # an "index" column that then has to be dropped again.
    .reset_index(drop=True)
)

## クラスタリング

In [5]:
from sentence_transformers import SentenceTransformer, util

# Load the sentence encoder once; the low-growth-category cell reuses `model`.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every deduplicated English comment. The clustering cell maps cluster
# member ids back to this column by position.
_review_comment_column = df_review_no_null["review_comment_message_en"]
embeddings = model.encode(_review_comment_column.to_numpy().tolist())

In [15]:
# Detect communities of similar comments among all review embeddings, using a
# minimum community size of 100 and a threshold of 0.85.
clusters = util.community_detection(
    embeddings,
    min_community_size=100,
    threshold=0.85,
)

# Convert the comment column to a list once, outside the loops, so it is not
# re-converted for every printed sentence.
review_texts = df_review_no_null["review_comment_message_en"].values.tolist()

# Print every member of every community, one comment per line.
for i, cluster in enumerate(clusters):
    print(f"\nCluster {i + 1}, # Elements: {len(cluster)}")
    for sentence_id in cluster:
        print(f"\t{review_texts[sentence_id]}")


Cluster 1, # Elements: 661
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	VERY GOOD.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	VERY GOOD.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.
	Very good.


In [19]:
# Report how many communities were found, then display the total number of
# comments that belong to any community.
print(f"num of clusters: {len(clusters)}")
total_num = sum(len(cluster_members) for cluster_members in clusters)
total_num

num of clusters: 20


4644

In [21]:
# Product categories (English names) selected for the focused analysis below.
low_growth_categories = [
    "watches_gifts",
    "cool_stuff",
    "toys",
    "sports_leisure",
    "bed_bath_table",
    "computers_accessories",
]

# Distinct (review_id, review_comment_message_en) pairs for rows in those
# categories, restricted to non-null English comments, re-indexed from 0.
df_orders_item_products_review_merged_low_growth_categories = (
    df_orders_item_products_review_merged.query(
        "product_category_name_english in @low_growth_categories"
    )
    .dropna(subset=["review_comment_message_en"])[
        ["review_id", "review_comment_message_en"]
    ]
    .drop_duplicates()
    # drop=True discards the old index directly instead of materialising it as
    # an "index" column that then has to be dropped again.
    .reset_index(drop=True)
)

In [27]:
# Count distinct review ids among rows in the selected categories. Rows are
# not filtered on the comment column here, so reviews without a comment are
# included in the count.
df_orders_item_products_review_merged.loc[
    df_orders_item_products_review_merged["product_category_name_english"].isin(
        low_growth_categories
    ),
    "review_id",
].nunique()

36604

In [33]:
# Comment texts for the selected categories, in row order.
low_growth_categories_review_comment_message_en_list = (
    df_orders_item_products_review_merged_low_growth_categories[
        "review_comment_message_en"
    ].tolist()
)

# Encode the comments with the already-loaded sentence encoder.
low_growth_categories_embeddings = model.encode(
    low_growth_categories_review_comment_message_en_list
)

# Detect communities with a minimum size of 100 and a threshold of 0.75.
# NOTE(review): the all-reviews clustering uses threshold=0.85 — confirm the
# different value here is intentional before comparing the two runs.
low_growth_categories_clusters = util.community_detection(
    low_growth_categories_embeddings, min_community_size=100, threshold=0.75
)

# Print every member of every community, one comment per entry.
for cluster_number, member_ids in enumerate(low_growth_categories_clusters, start=1):
    print(f"\nCluster {cluster_number}, # Elements: {len(member_ids)}")
    for text_position in member_ids:
        print(f"\t{low_growth_categories_review_comment_message_en_list[text_position]}")


Cluster 1, # Elements: 489
	The product arrived perfect. Delivered well before the deadline.
	The product arrived well before the delivery deadline in perfect condition. Very good.
	Arrived well before the deadline, great product.
	ARRIVED WELL BEFORE THE DEADLINE. GOOD PRODUCT
	Arrived before the given deadline and the product is of great quality.
	Product arrived well before the deadline, excellent store.
	Arrived well before the deadline, and the product is of excellent quality!
	Perfect product
Arrived well before the deadline
Exceeded my expectations.
Congratulations.
	It arrived well before the deadline. The product came just right and well packaged.
	The product was delivered correctly and before the established deadline. Great quality.
	Product came perfect and before the delivery deadline.
	The product arrived perfectly and much earlier than the deadline! It exceeded my expectations!
	The product came correctly, delivered even before the deadline, I loved it
	The product arri

In [31]:
# Report how many communities were found for the selected categories, then
# display the total number of comments that belong to any of them.
print(f"num of clusters: {len(low_growth_categories_clusters)}")
total_num = sum(
    len(cluster_members) for cluster_members in low_growth_categories_clusters
)
total_num

num of clusters: 18


3356