In [1]:
%%time
%matplotlib inline
%load_ext autoreload
%autoreload 2

import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

import pandas as pd
from dotenv import load_dotenv


@dataclass
class Config:
    """Project directory layout, expressed relative to ``project_dir``.

    Only ``project_dir`` has a concrete default. Every other directory defaults
    to ``None`` and is filled in by ``__post_init__`` from its parent, so that
    overriding a parent (``Config(project_dir=...)`` or ``Config(data_dir=...)``)
    re-derives all of its children. A directory passed explicitly is kept as is.

    ``Config()`` yields the same paths as before. Note that the derived values
    now exist on instances only; the class-level attributes are ``None``.
    """

    project_dir: Path = Path("../../")
    model_dir: Optional[Path] = None
    outputs_dir: Optional[Path] = None
    results_dir: Optional[Path] = None
    results_agg_dir: Optional[Path] = None
    results_figs_dir: Optional[Path] = None
    data_dir: Optional[Path] = None
    raw_dir: Optional[Path] = None
    interim_dir: Optional[Path] = None
    processed_dir: Optional[Path] = None

    def __post_init__(self) -> None:
        """Fill in every directory that was not passed explicitly."""
        # Accept plain strings too; everything below relies on the `/` operator.
        self.project_dir = Path(self.project_dir)

        # Parents are resolved before their children so that an overridden
        # parent is what the children are derived from.
        self.model_dir = self._derive(self.model_dir, self.project_dir, "models")
        self.outputs_dir = self._derive(self.outputs_dir, self.project_dir, "outputs")
        self.results_dir = self._derive(self.results_dir, self.project_dir, "results")
        self.results_agg_dir = self._derive(
            self.results_agg_dir, self.results_dir, "agg"
        )
        self.results_figs_dir = self._derive(
            self.results_figs_dir, self.results_dir, "figs"
        )
        self.data_dir = self._derive(self.data_dir, self.project_dir, "data")
        self.raw_dir = self._derive(self.raw_dir, self.data_dir, "raw")
        self.interim_dir = self._derive(self.interim_dir, self.data_dir, "interim")
        self.processed_dir = self._derive(
            self.processed_dir, self.data_dir, "processed"
        )

    @staticmethod
    def _derive(explicit: Optional[Path], parent: Path, name: str) -> Path:
        """Return ``explicit`` when given, otherwise ``parent / name``."""
        if explicit is not None:
            return Path(explicit)
        return parent / name


config = Config()

# Put the resolved project root on sys.path. The membership check keeps the
# cell idempotent: re-running it no longer appends a duplicate entry each time.
project_root = str(config.project_dir.resolve())
if project_root not in sys.path:
    sys.path.append(project_root)

# Left as the final expression so its boolean result is shown in the cell output.
# python-dotenv returns False when no variable was set; if settings seem to be
# missing, check where the .env file lives relative to this notebook.
load_dotenv()

CPU times: user 278 ms, sys: 81.6 ms, total: 360 ms
Wall time: 613 ms


False

In [2]:
# --- Load the three interim CSVs --------------------------------------------
df_reviews = pd.read_csv(
    config.interim_dir
    / "olist_order_reviews_translated_label_review_categories_merged.csv"
)
df_orders = pd.read_csv(config.interim_dir / "olist_orders_customer_merged.csv")
df_item_products = pd.read_csv(
    config.interim_dir / "olist_item_product_seller_merged.csv"
)

# --- orders <- items/products/sellers ---------------------------------------
# Left join on order_id; the assert checks that no order_id was lost.
df_orders_item_products_merged = df_orders.merge(
    df_item_products, how="left", on="order_id"
)
assert (
    df_orders_item_products_merged["order_id"].nunique()
    == df_orders["order_id"].nunique()
)

# --- (orders + items) <- reviews --------------------------------------------
# Same pattern: left join on order_id, then verify the set of orders is intact.
df_orders_item_products_review_merged = df_orders_item_products_merged.merge(
    df_reviews, how="left", on="order_id"
)
assert (
    df_orders_item_products_review_merged["order_id"].nunique()
    == df_orders_item_products_merged["order_id"].nunique()
)

# --- Review-level tables ----------------------------------------------------
# Keep only the review columns used below and drop exact duplicate rows.
review_columns = [
    "review_id",
    "review_comment_message_en",
    "label_score",
    "label_translated",
    "modified_review_score",
]
df_reviews_unique = df_reviews.loc[:, review_columns].drop_duplicates()

# Subset with a non-null translated comment, re-indexed from 0.
df_review_no_null = df_reviews_unique.dropna(
    subset=["review_comment_message_en"]
).reset_index(drop=True)

print(f"df_reviews_unique.shape: {df_reviews_unique.shape}")
print(f"df_review_no_null.shape: {df_review_no_null.shape}")


df_reviews_unique.shape: (98410, 5)
df_review_no_null.shape: (40650, 5)


## Review Score vs. Sentiment

In [3]:
# Signed numeric encoding of the sentiment label. "not_clear" maps to None,
# which becomes a missing value and therefore does not enter the correlation.
label_map = dict(Positive=1, Negative=-1, Neutral=0, not_clear=None)

# Adds the encoded column to df_reviews_unique in place (safe to re-run: it is
# always recomputed from label_translated).
df_reviews_unique["label_numeric"] = (
    df_reviews_unique["label_translated"].map(label_map)
)

# Correlation matrix between the review score and the encoded sentiment.
correlation = df_reviews_unique.loc[
    :, ["modified_review_score", "label_numeric"]
].corr()
display(correlation)

Unnamed: 0,modified_review_score,label_numeric
modified_review_score,1.0,0.904295
label_numeric,0.904295,1.0


## Review Scoring

In [4]:
# Share of each modified_review_score value, shown for all unique reviews and
# for the subset that has a translated comment. Shares are relative to the
# number of non-null scores in the respective frame.
for frame_name, frame in (
    ("df_reviews_unique", df_reviews_unique),
    ("df_review_no_null", df_review_no_null),
):
    print(frame_name)
    scores = frame["modified_review_score"]
    display((scores.value_counts() / scores.count()).sort_index())

df_reviews_unique


modified_review_score
1.0    0.115710
2.0    0.034763
3.0    0.082278
4.0    0.190021
5.0    0.577228
Name: count, dtype: float64

df_review_no_null


modified_review_score
1.0    0.215129
2.0    0.059656
3.0    0.086790
4.0    0.138499
5.0    0.499926
Name: count, dtype: float64

In [5]:
# Mean modified_review_score per sentiment label, restricted to reviews that
# have a translated comment (df_review_no_null).
(
    df_review_no_null
    .groupby("label_translated")["modified_review_score"]
    .mean()
)

label_translated
Negative     1.407134
Neutral      3.376931
Positive     4.759140
not_clear    3.211447
Name: modified_review_score, dtype: float64

## Sentiment Analysis

In [6]:
# Two purchase-month windows compared below. The bounds are compared as text,
# which assumes order_purchase_month holds zero-padded 'YYYY-MM' strings
# -- TODO confirm.
fh_df = df_orders_item_products_review_merged.query(
    "order_purchase_month >= '2017-02' & order_purchase_month <= '2017-10'"
)
sh_df = df_orders_item_products_review_merged.query(
    "order_purchase_month >= '2017-12' & order_purchase_month <= '2018-08'"
)

# Fixed row order for every sentiment-share table.
custom_order = ["Positive", "Negative", "Neutral", "not_clear"]

# NOTE(review): the merged frame was joined to the item table on order_id; if
# an order has several item rows, its review label is counted once per row.
# Confirm that this weighting is intended.
for half_title, half_frame in (("First Half", fh_df), ("Second Half", sh_df)):
    print(half_title)
    label_counts = half_frame["label_translated"].value_counts()
    # Share of each label among rows that have a label, in the fixed order.
    display((label_counts / label_counts.sum()).reindex(custom_order))

First Half


label_translated
Positive     0.522057
Negative     0.183185
Neutral      0.047873
not_clear    0.246884
Name: count, dtype: float64

Second Half


label_translated
Positive     0.461719
Negative     0.216896
Neutral      0.058551
not_clear    0.262833
Name: count, dtype: float64

In [7]:
# Categories inspected individually for a first-half vs. second-half shift in
# sentiment share.
low_growth_categories = [
    "watches_gifts",
    "cool_stuff",
    "toys",
    "sports_leisure",
    "bed_bath_table",
    "computers_accessories",
]

# The date windows do not depend on the category, so reuse the frames that the
# previous cell already filtered (fh_df / sh_df) instead of re-running both
# date queries on the full merged frame for every category.
half_frames = {"First Half": fh_df, "Second Half": sh_df}

# NOTE(review): rows come from a frame joined to the item table on order_id; if
# an order has several item rows in a category, its review label is counted
# once per row. Confirm that this weighting is intended.
for category in low_growth_categories:
    print(f"-----{category}-----")
    for half_title, half_frame in half_frames.items():
        # `category` is a single string, so test equality rather than `in`.
        category_rows = half_frame.query(
            "product_category_name_english == @category"
        )
        print(half_title)
        label_counts = category_rows["label_translated"].value_counts()
        # Share of each label among labelled rows, in the shared display order.
        display((label_counts / label_counts.sum()).reindex(custom_order))

-----watches_gifts-----
First Half


label_translated
Positive     0.578447
Negative     0.166403
Neutral      0.022187
not_clear    0.232964
Name: count, dtype: float64

Second Half


label_translated
Positive     0.488360
Negative     0.211695
Neutral      0.051976
not_clear    0.247970
Name: count, dtype: float64

-----cool_stuff-----
First Half


label_translated
Positive     0.621127
Negative     0.125352
Neutral      0.038028
not_clear    0.215493
Name: count, dtype: float64

Second Half


label_translated
Positive     0.506224
Negative     0.208852
Neutral      0.059474
not_clear    0.225450
Name: count, dtype: float64

-----toys-----
First Half


label_translated
Positive     0.594059
Negative     0.141914
Neutral      0.051155
not_clear    0.212871
Name: count, dtype: float64

Second Half


label_translated
Positive     0.490642
Negative     0.231283
Neutral      0.044118
not_clear    0.233957
Name: count, dtype: float64

-----sports_leisure-----
First Half


label_translated
Positive     0.504977
Negative     0.166516
Neutral      0.052489
not_clear    0.276018
Name: count, dtype: float64

Second Half


label_translated
Positive     0.460941
Negative     0.203299
Neutral      0.059680
not_clear    0.276080
Name: count, dtype: float64

-----bed_bath_table-----
First Half


label_translated
Positive     0.539664
Negative     0.168500
Neutral      0.052113
not_clear    0.239722
Name: count, dtype: float64

Second Half


label_translated
Positive     0.457835
Negative     0.211155
Neutral      0.058101
not_clear    0.272908
Name: count, dtype: float64

-----computers_accessories-----
First Half


label_translated
Positive     0.454741
Negative     0.191810
Neutral      0.077586
not_clear    0.275862
Name: count, dtype: float64

Second Half


label_translated
Positive     0.365193
Negative     0.272943
Neutral      0.067998
not_clear    0.293866
Name: count, dtype: float64