In [12]:
%%time
%matplotlib inline
%load_ext autoreload
%autoreload 2

import sys
from dataclasses import dataclass
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv


@dataclass
class Config:
    project_dir: Path = Path("../../")
    model_dir: Path = project_dir / "models"
    outputs_dir: Path = project_dir / "outputs"
    results_dir: Path = project_dir / "results"
    results_agg_dir: Path = results_dir / "agg"
    results_figs_dir: Path = results_dir / "figs"
    data_dir: Path = project_dir / "data"
    raw_dir: Path = data_dir / "raw"
    interim_dir: Path = data_dir / "interim"
    processed_dir: Path = data_dir / "processed"


config = Config()
sys.path.append(str(config.project_dir.resolve()))
load_dotenv()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
CPU times: user 1.61 ms, sys: 3.24 ms, total: 4.85 ms
Wall time: 5.41 ms


True

In [16]:
df_order_info_details = pd.read_csv(config.processed_dir / "order_info_details.csv")
df_reviews = pd.read_csv(
    config.interim_dir
    / "olist_order_reviews_translated_label_review_categories_merged.csv",
)
df_reviews_last = df_reviews.groupby("order_id").last().reset_index()
df_order_info_details_reviews_merged = pd.merge(
    df_order_info_details, df_reviews_last, on="order_id", how="left"
)
assert df_order_info_details_reviews_merged.shape[0] == df_order_info_details.shape[0]
for col in [
    "order_purchase_timestamp",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
    "shipping_limit_date",
]:
    df_order_info_details_reviews_merged[col] = pd.to_datetime(
        df_order_info_details_reviews_merged[col],
        format="mixed",
    )
df_review_no_null = df_order_info_details_reviews_merged.dropna(
    subset=["review_comment_message_en"]
)
print(f"df_review_no_null.shape: {df_review_no_null.shape}")


df_review_no_null.shape: (40818, 83)


In [3]:
df_review_no_null.query(
    "review_categories_str_modified in ['Delivery_Issue','Delivery_Praise'] "
).groupby(
    [
        "modified_review_score",
    ]
)["distance_between_customer_and_seller"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
modified_review_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1.0,3930.0,731.072685,659.836861,1.705832,321.052393,509.069964,897.150921,3319.140717
2.0,863.0,728.607892,677.532301,1.366758,297.753982,492.879759,896.509923,3190.580333
3.0,1083.0,669.392869,635.067766,1.248978,273.396481,486.469667,825.13515,3315.202943
4.0,1861.0,671.935073,644.68588,1.925268,222.366189,474.046171,864.949363,3173.766834
5.0,7961.0,583.863658,602.649669,0.0,132.650542,406.605523,791.356169,3509.071033


In [4]:
df_review_no_null.groupby("distance_bin").agg(
    {
        "order_estimated_delivery_span_hours_from_purchase": "mean",
        "order_delivered_customer_span_hours_from_purchase": "mean",
        "order_delivered_carrier_span_hours_from_purchase": "mean",
        "order_delivered_customer_span_hours_from_carrier": "mean",
        # "order_delivered_carrier_span_hours_from_limit_date": "mean",
        # "order_delivered_customer_span_hours_from_limit_date": "mean",
        "modified_review_score": "mean",
    }
)

Unnamed: 0_level_0,order_estimated_delivery_span_hours_from_purchase,order_delivered_customer_span_hours_from_purchase,order_delivered_carrier_span_hours_from_purchase,order_delivered_customer_span_hours_from_carrier,modified_review_score
distance_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bin 1,380.219811,170.214057,81.835727,88.377386,3.792228
Bin 2,539.067614,275.718857,79.972045,195.816213,3.713725
Bin 3,589.725652,321.131187,82.464583,238.73625,3.640119
Bin 4,634.305529,363.785484,82.696549,281.3403,3.615385
Bin 5,741.632825,466.700352,84.368165,382.579987,3.609647


In [5]:
df_review_no_null.query(
    "review_categories_str_modified in ['Delivery_Issue','Delivery_Praise'] "
).groupby("distance_bin").agg(
    {
        "order_estimated_delivery_span_hours_from_purchase": "mean",
        "order_delivered_customer_span_hours_from_purchase": "mean",
        "order_delivered_carrier_span_hours_from_purchase": "mean",
        "order_delivered_customer_span_hours_from_carrier": "mean",
        # "order_delivered_carrier_span_hours_from_limit_date": "mean",
        # "order_delivered_customer_span_hours_from_limit_date": "mean",
        "modified_review_score": "mean",
    }
)

Unnamed: 0_level_0,order_estimated_delivery_span_hours_from_purchase,order_delivered_customer_span_hours_from_purchase,order_delivered_carrier_span_hours_from_purchase,order_delivered_customer_span_hours_from_carrier,modified_review_score
distance_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bin 1,373.196985,175.685464,79.394091,96.108625,3.982591
Bin 2,538.696972,312.54175,83.252749,229.334183,3.618267
Bin 3,588.995287,371.012476,85.859014,284.997662,3.513601
Bin 4,635.808414,413.331058,88.195383,325.53393,3.435906
Bin 5,735.791242,525.441218,89.478708,436.369212,3.379135


In [6]:
df_review_no_null.query(
    "review_categories_str_modified in ['Delivery_Issue','Delivery_Praise'] "
).groupby("review_categories_str_modified").agg(
    {
        "order_estimated_delivery_span_hours_from_purchase": "mean",
        "order_delivered_customer_span_hours_from_purchase": "mean",
        "order_delivered_carrier_span_hours_from_purchase": "mean",
        "order_delivered_customer_span_hours_from_carrier": "mean",
        "distance_bin_Bin 1": "sum",
        "distance_bin_Bin 2": "sum",
        "distance_bin_Bin 3": "sum",
        "distance_bin_Bin 4": "sum",
        "distance_bin_Bin 5": "sum",
        # "order_delivered_carrier_span_hours_from_limit_date": "mean",
        # "order_delivered_customer_span_hours_from_limit_date": "mean",
        "modified_review_score": "mean",
        "distance_between_customer_and_seller": "mean",
    }
)

Unnamed: 0_level_0,order_estimated_delivery_span_hours_from_purchase,order_delivered_customer_span_hours_from_purchase,order_delivered_carrier_span_hours_from_purchase,order_delivered_customer_span_hours_from_carrier,distance_bin_Bin 1,distance_bin_Bin 2,distance_bin_Bin 3,distance_bin_Bin 4,distance_bin_Bin 5,modified_review_score,distance_between_customer_and_seller
review_categories_str_modified,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Delivery_Issue,604.542474,579.842681,116.193754,461.732613,840,1162,1295,1397,1628,1.761775,723.212048
Delivery_Praise,563.963212,240.269576,65.896256,174.344116,2147,1827,1793,1700,1909,4.788163,592.290948


In [7]:
df_review_no_null.query(
    "review_categories_str_modified in ['Delivery_Issue','Delivery_Praise'] "
).query("customer_state in ['SP','RJ']").groupby("review_categories_str_modified").agg(
    {
        "order_estimated_delivery_span_hours_from_purchase": "mean",
        "order_delivered_customer_span_hours_from_purchase": "mean",
        "order_delivered_carrier_span_hours_from_purchase": "mean",
        "order_delivered_customer_span_hours_from_carrier": "mean",
        "distance_bin_Bin 1": "sum",
        "distance_bin_Bin 2": "sum",
        "distance_bin_Bin 3": "sum",
        "distance_bin_Bin 4": "sum",
        "distance_bin_Bin 5": "sum",
        # "order_delivered_carrier_span_hours_from_limit_date": "mean",
        # "order_delivered_customer_span_hours_from_limit_date": "mean",
        "modified_review_score": "mean",
        "distance_between_customer_and_seller": "mean",
    }
)

Unnamed: 0_level_0,order_estimated_delivery_span_hours_from_purchase,order_delivered_customer_span_hours_from_purchase,order_delivered_carrier_span_hours_from_purchase,order_delivered_customer_span_hours_from_carrier,distance_bin_Bin 1,distance_bin_Bin 2,distance_bin_Bin 3,distance_bin_Bin 4,distance_bin_Bin 5,modified_review_score,distance_between_customer_and_seller
review_categories_str_modified,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Delivery_Issue,541.448085,522.991113,120.111128,399.673469,809,982,810,594,130,1.72093,358.163762
Delivery_Praise,490.414027,190.895149,64.48921,126.385414,1979,1430,1037,594,190,4.79586,291.449366


## 最も取引が多い，SP,RJの取引に限定して，デリバリーレビューの分析

In [20]:
product_category_name_sorted = (
    df_order_info_details_reviews_merged.groupby("product_category_name_english")[
        "order_id"
    ]
    .nunique()
    .sort_values(ascending=False)
    .index
)

df_review_no_null.query(
    "review_categories_str_modified in ['Delivery_Issue','Delivery_Praise'] "
).query("customer_state in ['SP','RJ']").groupby(
    ["product_category_name_english", "review_categories_str_modified"]
).agg(
    {
        "order_estimated_delivery_span_hours_from_purchase": "mean",
        "order_delivered_customer_span_hours_from_purchase": "mean",
        "order_delivered_carrier_span_hours_from_purchase": "mean",
        "shipping_limit_date": "mean",
        "order_delivered_customer_span_hours_from_carrier": "mean",
        "distance_bin_Bin 1": "sum",
        "distance_bin_Bin 2": "sum",
        "distance_bin_Bin 3": "sum",
        "distance_bin_Bin 4": "sum",
        "distance_bin_Bin 5": "sum",
        # "order_delivered_carrier_span_hours_from_limit_date": "mean",
        # "order_delivered_customer_span_hours_from_limit_date": "mean",
        "modified_review_score": "mean",
        "distance_between_customer_and_seller": "mean",
    }
).reindex(product_category_name_sorted, level=0).to_csv(
    config.results_agg_dir
    / "delivery_review_analysis_df_reviews_no_null_purchase_info_merged_sp_rj_delivery_review.csv"
)