In [21]:
%%time
%matplotlib inline
%load_ext autoreload
%autoreload 2

import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

import pandas as pd
from dotenv import load_dotenv


@dataclass
class Config:
    """Directory layout of the project, derived from ``project_dir``.

    Every directory other than ``project_dir`` defaults to ``None`` and is
    filled in by ``__post_init__`` from its parent directory. This means
    ``Config(project_dir=...)`` or ``Config(data_dir=...)`` relocates
    everything beneath the overridden directory, instead of leaving the
    children pointing at the class-level default root. A directory that is
    passed explicitly is kept as given.

    After construction every attribute is a ``Path``; ``Optional`` only
    describes what the constructor accepts.

    Attributes:
        project_dir: Project root (default ``../../``, relative to the
            working directory).
        model_dir: ``project_dir / "models"`` unless overridden.
        outputs_dir: ``project_dir / "outputs"`` unless overridden.
        results_dir: ``project_dir / "results"`` unless overridden.
        results_agg_dir: ``results_dir / "agg"`` unless overridden.
        results_figs_dir: ``results_dir / "figs"`` unless overridden.
        data_dir: ``project_dir / "data"`` unless overridden.
        raw_dir: ``data_dir / "raw"`` unless overridden.
        interim_dir: ``data_dir / "interim"`` unless overridden.
        processed_dir: ``data_dir / "processed"`` unless overridden.
    """

    project_dir: Path = Path("../../")
    model_dir: Optional[Path] = None
    outputs_dir: Optional[Path] = None
    results_dir: Optional[Path] = None
    results_agg_dir: Optional[Path] = None
    results_figs_dir: Optional[Path] = None
    data_dir: Optional[Path] = None
    raw_dir: Optional[Path] = None
    interim_dir: Optional[Path] = None
    processed_dir: Optional[Path] = None

    def __post_init__(self) -> None:
        """Fill in every directory that was not passed explicitly."""
        # Accept plain strings as well as Path objects for the root.
        self.project_dir = Path(self.project_dir)
        # (attribute, parent attribute, sub-directory name). Parents are
        # listed before their children so an overridden results_dir or
        # data_dir is already final when the children are derived from it.
        layout = (
            ("model_dir", "project_dir", "models"),
            ("outputs_dir", "project_dir", "outputs"),
            ("results_dir", "project_dir", "results"),
            ("results_agg_dir", "results_dir", "agg"),
            ("results_figs_dir", "results_dir", "figs"),
            ("data_dir", "project_dir", "data"),
            ("raw_dir", "data_dir", "raw"),
            ("interim_dir", "data_dir", "interim"),
            ("processed_dir", "data_dir", "processed"),
        )
        for attr_name, parent_name, leaf in layout:
            explicit = getattr(self, attr_name)
            if explicit is None:
                resolved = getattr(self, parent_name) / leaf
            else:
                resolved = Path(explicit)
            setattr(self, attr_name, resolved)


config = Config()

# Make the project root importable. The membership check keeps a re-run of
# this cell from appending the same entry to sys.path again.
_project_root = str(config.project_dir.resolve())
if _project_root not in sys.path:
    sys.path.append(_project_root)

# Load environment variables via python-dotenv; the call's return value is the
# cell's displayed result.
load_dotenv()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
CPU times: user 1.77 ms, sys: 2.52 ms, total: 4.29 ms
Wall time: 3.96 ms


True

In [22]:
# Load the order details table and the translated / categorised reviews table.
df_order_info_details = pd.read_csv(config.processed_dir / "order_info_details.csv")
df_reviews = pd.read_csv(
    config.interim_dir
    / "olist_order_reviews_translated_label_review_categories_merged.csv",
)

# Keep exactly one review row per order: the last one in file order.
# This replaces ``groupby("order_id").last()``, which returns the last
# *non-null* value of each column independently and can therefore stitch a
# single row together from several different reviews of the same order.
# Rows without an order_id are dropped (groupby excluded them as well), and
# the set_index / sort_index / reset_index steps reproduce the previous layout:
# order_id as first column, rows sorted by order_id.
# NOTE(review): "last" means last in file order — confirm the CSV is ordered by
# review time if the most recent review is what is wanted.
df_reviews_last = (
    df_reviews.dropna(subset=["order_id"])
    .drop_duplicates(subset="order_id", keep="last")
    .set_index("order_id")
    .sort_index()
    .reset_index()
)

# Left join keeps every row of the order table; ``validate`` makes pandas raise
# if the review side ever contains more than one row per order_id.
df_order_info_details_reviews_merged = pd.merge(
    df_order_info_details,
    df_reviews_last,
    on="order_id",
    how="left",
    validate="many_to_one",
)
assert df_order_info_details_reviews_merged.shape[0] == df_order_info_details.shape[0]

# Timestamp columns arrive as strings from the CSV; parse them in place.
# format="mixed" lets each value be parsed with its own inferred format.
for col in [
    "order_purchase_timestamp",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
    "shipping_limit_date",
]:
    df_order_info_details_reviews_merged[col] = pd.to_datetime(
        df_order_info_details_reviews_merged[col],
        format="mixed",
    )

# Only rows that carry an English review comment are analysed further.
df_review_no_null = df_order_info_details_reviews_merged.dropna(
    subset=["review_comment_message_en"]
)
print(f"df_review_no_null.shape: {df_review_no_null.shape}")


df_review_no_null.shape: (40818, 84)


In [23]:
# Analysis subset: delivery-related reviews from customers in SP / RJ that have
# a carrier-to-customer delivery span.
# The indicator columns are added with ``assign`` at the end of the chain, so
# df_target is always a fresh frame; assigning columns onto the result of
# query/dropna filtering can raise SettingWithCopyWarning. The indicators use
# vectorised comparisons instead of a row-wise ``apply``.
# NOTE(review): rows are dropped on
# "order_delivered_customer_date_span_from_delivery_carrier", while later cells
# use "order_delivered_customer_span_hours_from_carrier" — confirm the two
# columns are null for the same rows.
df_target = (
    df_review_no_null.query(
        "review_categories_str_modified in ['Delivery_Issue','Delivery_Praise'] "
    )
    .query("customer_state in ['SP','RJ']")
    .dropna(subset=["order_delivered_customer_date_span_from_delivery_carrier"])
    .assign(
        # 1 when customer and seller share the same state / city, else 0.
        is_same_state=lambda d: (d["customer_state"] == d["seller_state"]).astype(
            int
        ),
        is_same_city=lambda d: (d["customer_city"] == d["seller_city"]).astype(int),
        # One-hot indicators for the two review categories kept above.
        review_categories_str_modified_Delivery_Issue=lambda d: d[
            "review_categories_str_modified"
        ]
        .eq("Delivery_Issue")
        .astype(int),
        review_categories_str_modified_Delivery_Praise=lambda d: d[
            "review_categories_str_modified"
        ]
        .eq("Delivery_Praise")
        .astype(int),
    )
)


In [24]:
# Preview the first five rows of the analysis subset.
df_target.head(n=5)

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,order_purchase_month,order_purchase_date,...,review_categories,review_categories_str,review_categories_str_modified,review_creation_month,review_answer_month,review_answer_date,is_same_state,is_same_city,review_categories_str_modified_Delivery_Issue,review_categories_str_modified_Delivery_Praise
20,203096f03d82e0dffbc41ebc2e2bcfb7,d2b091571da224a1b36412c18bc3bbfe,delivered,2017-09-18 14:31:30,2017-09-19 04:04:09,2017-10-06 17:50:03,2017-10-09 22:23:46,2017-09-28,2017-09,2017-09-18,...,['Delivery_Issue'],Delivery_Issue,Delivery_Issue,2017-10,2017-10,2017-10-01,1,1,1,0
25,fbf9ac61453ac646ce8ad9783d7d0af6,3a874b4d4c4b6543206ff5d89287f0c3,delivered,2018-02-20 23:46:53,2018-02-22 02:30:46,2018-02-26 22:25:22,2018-03-21 22:03:54,2018-03-12,2018-02,2018-02-20,...,['Delivery_Issue'],Delivery_Issue,Delivery_Issue,2018-03,2018-03,2018-03-20,0,0,1,0
41,6ea2f835b4556291ffdc53fa0b3b95e8,c7340080e394356141681bd4c9b8fe31,delivered,2017-11-24 21:27:48,2017-11-25 00:21:09,2017-12-13 21:14:05,2017-12-28 18:59:23,2017-12-21,2017-11,2017-11-24,...,['Delivery_Issue'],Delivery_Issue,Delivery_Issue,2017-12,2017-12,2017-12-28,0,0,1,0
45,6ebaec694d7025e2ad4a05dba887c032,4f28355e5c17a4a42d3ce2439a1d4501,delivered,2017-05-18 13:55:47,2017-05-18 14:05:17,2017-05-19 12:01:38,2017-05-29 12:47:20,2017-06-09,2017-05,2017-05-18,...,['Delivery_Issue'],Delivery_Issue,Delivery_Issue,2017-05,2017-05,2017-05-31,0,0,1,0
55,40c5e18f7d112b59b3e5113a59a905b3,67407057a7d5ee17d1cd09523f484d13,delivered,2018-06-11 10:25:52,2018-06-11 10:58:32,2018-06-14 13:03:00,2018-06-19 00:31:13,2018-07-16,2018-06,2018-06-11,...,['Delivery_Praise'],Delivery_Praise,Delivery_Praise,2018-06,2018-06,2018-06-20,1,0,0,1


In [25]:
def agg_order_delivered_customer_span_hours_from_carrier_info(
    df, category_name, order_purchase_month
):
    """Summarise one product category and purchase month by review category.

    Rows of ``df`` are filtered to those whose
    ``product_category_name_english`` matches ``category_name`` and whose
    ``order_purchase_month`` matches ``order_purchase_month``. The remaining
    rows are grouped by ``review_categories_str_modified``.

    Args:
        df: Frame containing the filter columns, the grouping column and every
            aggregated column listed in the body.
        category_name: Value compared against ``product_category_name_english``.
        order_purchase_month: Value compared against ``order_purchase_month``.

    Returns:
        DataFrame indexed by ``review_categories_str_modified`` holding the
        count of ``order_id``, the mean of each numeric descriptor column, and
        the sum of ``is_same_state`` / ``is_same_city``.
    """
    averaged_columns = [
        "order_delivered_customer_span_hours_from_carrier",
        "order_estimated_delivery_span_hours_from_purchase",
        "distance_between_customer_and_seller",
        "price",
        "freight_value",
        "sum_price_freight_by_order",
        "product_count",
        "seller_count",
        "product_photos_qty",
        "product_weight_g",
        "product_length_cm",
        "product_height_cm",
        "product_width_cm",
        "seller_city_count",
        "seller_state_count",
    ]
    summed_columns = ["is_same_state", "is_same_city"]

    # Insertion order of the spec determines the column order of the result.
    aggregation_spec = {"order_id": "count"}
    aggregation_spec.update(dict.fromkeys(averaged_columns, "mean"))
    aggregation_spec.update(dict.fromkeys(summed_columns, "sum"))

    # ``@name`` refers to this function's arguments; bare names are columns.
    selected_rows = df.query(
        "product_category_name_english == @category_name and order_purchase_month == @order_purchase_month"
    )
    return selected_rows.groupby(["review_categories_str_modified"]).agg(
        aggregation_spec
    )


In [26]:
# Example: health_beauty orders purchased in 2017-04, split by review category.
agg_order_delivered_customer_span_hours_from_carrier_info(
    df=df_target,
    category_name="health_beauty",
    order_purchase_month="2017-04",
)

Unnamed: 0_level_0,order_id,order_delivered_customer_span_hours_from_carrier,order_estimated_delivery_span_hours_from_purchase,distance_between_customer_and_seller,price,freight_value,sum_price_freight_by_order,product_count,seller_count,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,seller_city_count,seller_state_count,is_same_state,is_same_city
review_categories_str_modified,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Delivery_Issue,4,576.633056,685.347569,392.733781,225.9075,15.3725,241.28,1.0,1.0,1.25,656.25,29.75,15.25,21.0,1.0,1.0,1,1
Delivery_Praise,8,99.670521,574.307361,480.75577,186.69875,20.21,206.90875,1.0,1.0,2.5,4375.0,23.625,20.25,21.875,1.0,1.0,5,0


In [53]:
# Target column and candidate feature columns, both selected from df_target.
target_column = "order_delivered_customer_span_hours_from_carrier"
feature_columns = [
    "order_purchase_month",
    "customer_city",
    "customer_state",
    "price",
    "freight_value",
    "sum_price_freight_by_order",
    "product_count",
    "seller_count",
    "product_photos_qty",
    "product_weight_g",
    "product_length_cm",
    "product_height_cm",
    "product_width_cm",
    "seller_city_count",
    "seller_state_count",
    "product_category_name_english",
    "seller_city",
    "seller_state",
    "distance_between_customer_and_seller",
    "is_same_state",
    "is_same_city",
    # "label_translated",
]

y = df_target[target_column]
x = df_target[feature_columns]