In [11]:
%%time
%matplotlib inline
%load_ext autoreload
%autoreload 2

import sys
from dataclasses import dataclass
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv


@dataclass
class Config:
    project_dir: Path = Path("../../")
    model_dir: Path = project_dir / "models"
    outputs_dir: Path = project_dir / "outputs"
    results_dir: Path = project_dir / "results"
    results_agg_dir: Path = results_dir / "agg"
    results_figs_dir: Path = results_dir / "figs"
    data_dir: Path = project_dir / "data"
    raw_dir: Path = data_dir / "raw"
    interim_dir: Path = data_dir / "interim"
    processed_dir: Path = data_dir / "processed"


config = Config()
sys.path.append(str(config.project_dir.resolve()))
load_dotenv()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
CPU times: user 1.32 ms, sys: 4.44 ms, total: 5.77 ms
Wall time: 14.4 ms


True

In [12]:
df_order_info_details = pd.read_csv(config.processed_dir / "order_info_details.csv")
df_reviews = pd.read_csv(
    config.interim_dir
    / "olist_order_reviews_translated_label_review_categories_merged.csv",
)
df_reviews_last = df_reviews.groupby("order_id").last().reset_index()
df_order_info_details_reviews_merged = pd.merge(
    df_order_info_details, df_reviews_last, on="order_id", how="left"
)
assert df_order_info_details_reviews_merged.shape[0] == df_order_info_details.shape[0]
for col in [
    "order_purchase_timestamp",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
    "shipping_limit_date",
]:
    df_order_info_details_reviews_merged[col] = pd.to_datetime(
        df_order_info_details_reviews_merged[col],
        format="mixed",
    )
df_review_no_null = df_order_info_details_reviews_merged.dropna(
    subset=["review_comment_message_en"]
)
print(f"df_review_no_null.shape: {df_review_no_null.shape}")


df_review_no_null.shape: (40818, 88)


In [13]:
df_review_no_null.columns

Index(['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp',
       'order_approved_at', 'order_delivered_carrier_date',
       'order_delivered_customer_date', 'order_estimated_delivery_date',
       'order_purchase_month', 'order_purchase_date', 'order_purchase_weekday',
       'order_approved_month', 'order_approved_date', 'order_approved_weekday',
       'order_delivered_carrier_month', 'order_delivered_carrier_weekday',
       'order_delivered_customer_month', 'order_delivered_customer_weekday',
       'order_estimated_delivery_month', 'order_estimated_delivery_weekday',
       'is_delivery_to_customers_delayed', 'customer_unique_id',
       'customer_zip_code_prefix', 'customer_city', 'customer_state',
       'total_payment_value', 'payment_type_count', 'most_common_payment_type',
       'highest_value_payment_type', 'highest_payment_value',
       'geolocation_state', 'geolocation_zip_code_prefix', 'customer_lat',
       'customer_lng', 'shipping_limit_date', 's

In [19]:
df_review_about_delivery = df_review_no_null[
    [
        "order_id",
        "product_category_name_english",
        "review_categories_str",
        # 決済承認
        "order_approved_span_hours_from_purchase",
        # 全体の予想と現実
        "order_estimated_delivery_span_hours_from_purchase",
        "order_delivered_customer_span_hours_from_purchase",
        # 販売業者→配送業者
        "order_shipping_limit_span_hours_from_purchase",
        "order_delivered_carrier_span_hours_from_purchase",
        # 配送業者→顧客
        "order_estimated_delivery_span_hours_from_shipping_limit",
        "order_delivered_customer_span_hours_from_carrier",
    ]
].query("review_categories_str in ['Delivery_Issue','Delivery_Praise']")

In [26]:
print("予想")
df_review_about_delivery.groupby("review_categories_str").agg(
    {
        "order_estimated_delivery_span_hours_from_purchase": "mean",
        "order_shipping_limit_span_hours_from_purchase": "mean",
        "order_estimated_delivery_span_hours_from_shipping_limit": "mean",
    }
)


予想


Unnamed: 0_level_0,order_estimated_delivery_span_hours_from_purchase,order_shipping_limit_span_hours_from_purchase,order_estimated_delivery_span_hours_from_shipping_limit
review_categories_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Delivery_Issue,604.492826,170.398967,435.762524
Delivery_Praise,563.878956,154.40739,409.371646


In [27]:
print("実際")
df_review_about_delivery.groupby("review_categories_str").agg(
    {
        "order_approved_span_hours_from_purchase": "mean",
        "order_delivered_customer_span_hours_from_purchase": "mean",
        "order_delivered_carrier_span_hours_from_purchase": "mean",
        "order_delivered_customer_span_hours_from_carrier": "mean",
    }
)

実際


Unnamed: 0_level_0,order_approved_span_hours_from_purchase,order_delivered_customer_span_hours_from_purchase,order_delivered_carrier_span_hours_from_purchase,order_delivered_customer_span_hours_from_carrier
review_categories_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Delivery_Issue,12.356038,580.406563,116.227214,462.253785
Delivery_Praise,10.183448,240.281657,65.901053,174.35659
