In [22]:
%%time
%matplotlib inline
%load_ext autoreload
%autoreload 2

import ast
import sys
from collections import Counter
from dataclasses import dataclass
from itertools import combinations
from pathlib import Path
from typing import Optional

import pandas as pd


@dataclass
class Config:
    """Directory layout of the project, rooted at ``project_dir``.

    Every directory other than ``project_dir`` defaults to ``None`` and is filled
    in by ``__post_init__`` from its parent directory on the *instance*. This way
    ``Config(project_dir=...)`` relocates the whole tree and
    ``Config(data_dir=...)`` relocates ``raw_dir`` / ``interim_dir`` /
    ``processed_dir``. Any directory passed explicitly is kept as given.
    ``Config()`` yields the same paths as before.
    """

    project_dir: Path = Path("../../")
    model_dir: Optional[Path] = None
    outputs_dir: Optional[Path] = None
    results_dir: Optional[Path] = None
    results_agg_dir: Optional[Path] = None
    results_figs_dir: Optional[Path] = None
    data_dir: Optional[Path] = None
    raw_dir: Optional[Path] = None
    interim_dir: Optional[Path] = None
    processed_dir: Optional[Path] = None

    def __post_init__(self) -> None:
        """Fill in every directory that was not passed explicitly."""
        # (field, parent field, sub-directory name); parents are listed before
        # their children so a derived parent is available when the child is built.
        derived = (
            ("model_dir", "project_dir", "models"),
            ("outputs_dir", "project_dir", "outputs"),
            ("results_dir", "project_dir", "results"),
            ("results_agg_dir", "results_dir", "agg"),
            ("results_figs_dir", "results_dir", "figs"),
            ("data_dir", "project_dir", "data"),
            ("raw_dir", "data_dir", "raw"),
            ("interim_dir", "data_dir", "interim"),
            ("processed_dir", "data_dir", "processed"),
        )
        for name, parent, leaf in derived:
            if getattr(self, name) is None:
                # Path(...) lets a parent supplied as a plain string still work.
                setattr(self, name, Path(getattr(self, parent)) / leaf)


config = Config()
# Put the resolved project root on sys.path. The membership check keeps a re-run
# of this cell from appending the same entry again.
project_root = str(config.project_dir.resolve())
if project_root not in sys.path:
    sys.path.append(project_root)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
CPU times: user 1.16 ms, sys: 3.02 ms, total: 4.18 ms
Wall time: 5.79 ms


In [3]:
# Load the processed order table and the interim review table.
df_order_info_details = pd.read_csv(config.processed_dir / "order_info_details.csv")
df_reviews = pd.read_csv(
    config.interim_dir
    / "olist_order_reviews_translated_label_review_categories_merged.csv",
)

# Keep exactly one review row per order: the last one in file order.
# `groupby("order_id").last()` is deliberately not used here: it returns the last
# *non-null* value of each column independently, so an order with several reviews
# could end up with a row stitched together from different reviews.
# Rows without an order_id are dropped first, as groupby did implicitly.
df_reviews_last = (
    df_reviews.dropna(subset=["order_id"])
    .drop_duplicates(subset="order_id", keep="last")
    .reset_index(drop=True)
)

# Left join keeps every order; `validate` makes pandas raise if the review side
# ever has duplicate order_ids, which would multiply order rows.
df_order_info_details_reviews_merged = pd.merge(
    df_order_info_details,
    df_reviews_last,
    on="order_id",
    how="left",
    validate="many_to_one",
)
assert df_order_info_details_reviews_merged.shape[0] == df_order_info_details.shape[0]

# Only orders that have an English review comment are kept for the text analysis.
df_review_no_null = df_order_info_details_reviews_merged.dropna(
    subset=["review_comment_message_en"]
)
print(f"df_review_no_null.shape: {df_review_no_null.shape}")


df_review_no_null.shape: (40818, 90)


In [24]:
# Reviews whose category string is exactly "Delivery_Issue".
delivery_issues = df_review_no_null.query("review_categories_str == 'Delivery_Issue'")

# `meaningful_words` stores each word list as the text of a Python list literal
# (e.g. "['not', 'receive', 'product']"). `ast.literal_eval` parses it properly,
# including words that contain quotes or commas, which the strip/replace/split
# approach would mangle. Empty entries are discarded.
word_lists = [
    [word for word in ast.literal_eval(words_str) if word]
    for words_str in delivery_issues["meaningful_words"]
]

# Count, per review, every unordered set of three distinct words. These are
# co-occurrences within a review, not consecutive-word trigrams. Sorting the
# unique words first makes each triple alphabetically ordered, so the same three
# words always map to the same key, and makes the iteration order reproducible
# across kernel restarts (plain set order is not).
trigram_counts = Counter(
    triple
    for words in word_lists
    for triple in combinations(sorted(set(words)), 3)
)

# Explicit columns keep the frame well-formed even when no triple was found.
# Ties in Frequency are broken alphabetically so the ranking is deterministic.
trigram_df = (
    pd.DataFrame(
        [(*triple, count) for triple, count in trigram_counts.items()],
        columns=["Word 1", "Word 2", "Word 3", "Frequency"],
    )
    .sort_values(
        ["Frequency", "Word 1", "Word 2", "Word 3"],
        ascending=[False, True, True, True],
    )
    .reset_index(drop=True)
)


In [26]:
# Display the first 20 rows of the word-triple frequency table.
trigram_df.iloc[:20]

Unnamed: 0,Word 1,Word 2,Word 3,Frequency
0,not,product,receive,848
1,deliver,not,product,653
2,arrive,not,product,318
3,office,post,product,275
4,delivery,not,product,268
5,buy,product,receive,263
6,delivery,product,receive,253
7,deliver,product,receive,217
8,office,pick,post,213
9,not,office,post,211


In [27]:
# Inspect the delivery-issue subset; the notebook renders a truncated preview
# (first and last rows, elided columns) rather than the whole frame.
delivery_issues

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,order_purchase_month,order_purchase_date,...,label_translated,label_score,meaningful_words,modified_review_score,review_categories,review_categories_str,review_categories_str_modified,review_creation_month,review_answer_month,review_answer_date
20,203096f03d82e0dffbc41ebc2e2bcfb7,d2b091571da224a1b36412c18bc3bbfe,delivered,2017-09-18 14:31:30,2017-09-19 04:04:09,2017-10-06 17:50:03,2017-10-09 22:23:46,2017-09-28,2017-09,2017-09-18,...,not_clear,0.501517,"['postal', 'service', 'strike', 'receive', 'no...",2.0,['Delivery_Issue'],Delivery_Issue,Delivery_Issue,2017-10,2017-10,2017-10-01
25,fbf9ac61453ac646ce8ad9783d7d0af6,3a874b4d4c4b6543206ff5d89287f0c3,delivered,2018-02-20 23:46:53,2018-02-22 02:30:46,2018-02-26 22:25:22,2018-03-21 22:03:54,2018-03-12,2018-02,2018-02-20,...,Negative,0.921674,"['take', 'long', 'deliver', 'deadline', 'pass'...",2.0,['Delivery_Issue'],Delivery_Issue,Delivery_Issue,2018-03,2018-03,2018-03-20
26,acce194856392f074dbf9dada14d8d82,7e20bf5ca92da68200643bda76c504c6,delivered,2018-06-04 00:00:13,2018-06-05 00:35:10,2018-06-05 13:24:00,2018-06-16 15:20:55,2018-07-18,2018-06,2018-06-04,...,Negative,0.907309,"['far', 'not', 'receive', 'crib', 'mattress', ...",1.0,['Delivery_Issue'],Delivery_Issue,Delivery_Issue,2018-06,2018-06,2018-06-20
41,6ea2f835b4556291ffdc53fa0b3b95e8,c7340080e394356141681bd4c9b8fe31,delivered,2017-11-24 21:27:48,2017-11-25 00:21:09,2017-12-13 21:14:05,2017-12-28 18:59:23,2017-12-21,2017-11,2017-11-24,...,Negative,0.798783,"['initially', 'purchase', 'date', 'product', '...",1.0,['Delivery_Issue'],Delivery_Issue,Delivery_Issue,2017-12,2017-12,2017-12-28
45,6ebaec694d7025e2ad4a05dba887c032,4f28355e5c17a4a42d3ce2439a1d4501,delivered,2017-05-18 13:55:47,2017-05-18 14:05:17,2017-05-19 12:01:38,2017-05-29 12:47:20,2017-06-09,2017-05,2017-05-18,...,not_clear,0.626985,"['sink', 'break', 'proceed', 'return']",1.0,['Delivery_Issue'],Delivery_Issue,Delivery_Issue,2017-05,2017-05,2017-05-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99348,a69ba794cc7deb415c3e15a0a3877e69,726f0894b5becdf952ea537d5266e543,unavailable,2017-08-23 16:28:04,2017-08-28 15:44:47,,,2017-09-15,2017-08,2017-08-23,...,Negative,0.902107,"['buy', 'product', 'gift', 'day', 'product', '...",1.0,['Delivery_Issue'],Delivery_Issue,Delivery_Issue,2017-09,2017-09,2017-09-20
99396,b3112ca67f3afd4e20cf2ee91fc4f804,6f83c71b6c044fb156d697d4130fe9b5,delivered,2018-08-02 22:46:54,2018-08-02 23:04:06,2018-08-15 17:42:00,2018-08-21 00:03:26,2018-08-16,2018-08,2018-08-02,...,Negative,0.859561,"['not', 'receive', 'product']",2.0,['Delivery_Issue'],Delivery_Issue,Delivery_Issue,2018-08,2018-08,2018-08-18
99397,0fa1fab1d7c1211c824596ed5e111e3c,7f3bd6c94d2daf7b6462d1a894a775b4,delivered,2018-03-13 21:48:57,2018-03-13 22:40:28,2018-03-14 19:27:23,2018-04-05 19:59:49,2018-04-02,2018-03,2018-03-13,...,Negative,0.975119,"['buy', 'store', 'take', 'forever', 'deliver',...",1.0,['Delivery_Issue'],Delivery_Issue,Delivery_Issue,2018-04,2018-04,2018-04-04
99423,38e9133ce29f6bbe35aed9c3863dce01,ad312389a098ceff46ce92c4595c06d0,delivered,2017-10-12 20:54:11,2017-10-14 03:28:24,2017-10-17 17:04:42,2017-11-21 17:06:59,2017-10-31,2017-10,2017-10-12,...,Negative,0.979752,"['not', 'receive', 'product', 'pay', 'not', 'r...",1.0,['Delivery_Issue'],Delivery_Issue,Delivery_Issue,2017-11,2017-11,2017-11-06
