In [1]:
%%time
%matplotlib inline
%load_ext autoreload
%autoreload 2

import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

import pandas as pd


@dataclass
class Config:
    """Filesystem layout of the project, rooted at ``project_dir``.

    Every directory other than ``project_dir`` may be passed explicitly.
    Directories left as ``None`` are derived in ``__post_init__`` from the
    instance's own ``project_dir`` (or ``data_dir`` for the data
    sub-directories), so ``Config(project_dir=...)`` relocates the whole tree.
    After construction every attribute holds a ``Path``.

    Attributes:
        project_dir: Root of the project; defaults to ``../../``.
        model_dir: Defaults to ``project_dir / "models"``.
        outputs_dir: Defaults to ``project_dir / "outputs"``.
        data_dir: Defaults to ``project_dir / "data"``.
        raw_dir: Defaults to ``data_dir / "raw"``.
        interim_dir: Defaults to ``data_dir / "interim"``.
        processed_dir: Defaults to ``data_dir / "processed"``.
    """

    project_dir: Path = Path("../../")
    model_dir: Optional[Path] = None
    outputs_dir: Optional[Path] = None
    data_dir: Optional[Path] = None
    raw_dir: Optional[Path] = None
    interim_dir: Optional[Path] = None
    processed_dir: Optional[Path] = None

    def __post_init__(self) -> None:
        """Resolve every directory that was not supplied explicitly."""

        def _or_default(given, fallback: Path) -> Path:
            # Explicit values win; plain strings are accepted and coerced.
            return fallback if given is None else Path(given)

        self.project_dir = Path(self.project_dir)
        self.model_dir = _or_default(self.model_dir, self.project_dir / "models")
        self.outputs_dir = _or_default(self.outputs_dir, self.project_dir / "outputs")
        self.data_dir = _or_default(self.data_dir, self.project_dir / "data")
        # The data sub-directories hang off data_dir, which must be resolved first.
        self.raw_dir = _or_default(self.raw_dir, self.data_dir / "raw")
        self.interim_dir = _or_default(self.interim_dir, self.data_dir / "interim")
        self.processed_dir = _or_default(
            self.processed_dir, self.data_dir / "processed"
        )

config = Config()

# Put the resolved project root on sys.path so modules located there can be
# imported from this notebook. The membership check keeps sys.path free of
# duplicate entries when this cell is re-run in the same kernel.
project_root = str(config.project_dir.resolve())
if project_root not in sys.path:
    sys.path.append(project_root)

CPU times: user 296 ms, sys: 99.4 ms, total: 395 ms
Wall time: 655 ms


In [2]:
# Read the raw customers and orders CSV files from the raw-data directory.
raw_dir = config.raw_dir
df_customers = pd.read_csv(raw_dir / "olist_customers_dataset.csv")
df_orders = pd.read_csv(raw_dir / "olist_orders_dataset.csv")

In [3]:
# Columns of the orders table that are converted to datetime below.
date_columns = [
    "order_purchase_timestamp",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
]

# Convert every listed column with pd.to_datetime so the .dt accessor and the
# subtractions further down operate on datetime values.
for col in date_columns:
    df_orders[col] = pd.to_datetime(df_orders[col])

# Calendar features, one entry per source column:
#   (source column, monthly-period column, daily-period column, weekday column)
# A daily-period column is produced only for the purchase and approval
# timestamps; the other sources carry None in that slot and skip it.
calendar_features = [
    (
        "order_purchase_timestamp",
        "order_purchase_month",
        "order_purchase_date",
        "order_purchase_weekday",
    ),
    (
        "order_approved_at",
        "order_approved_month",
        "order_approved_date",
        "order_approved_weekday",
    ),
    (
        "order_delivered_carrier_date",
        "order_delivered_carrier_month",
        None,
        "order_delivered_carrier_weekday",
    ),
    (
        "order_delivered_customer_date",
        "order_delivered_customer_month",
        None,
        "order_delivered_customer_weekday",
    ),
    (
        "order_estimated_delivery_date",
        "order_estimated_delivery_month",
        None,
        "order_estimated_delivery_weekday",
    ),
]

# Columns are added in month -> (date) -> weekday order for each source, which
# fixes the column order of the resulting frame.
for source_col, month_col, day_col, weekday_col in calendar_features:
    stamps = df_orders[source_col]
    df_orders[month_col] = stamps.dt.to_period("M")
    if day_col is not None:
        df_orders[day_col] = stamps.dt.to_period("D")
    df_orders[weekday_col] = stamps.dt.day_name()

# Shorthand for the parsed timestamp columns used in the calculations below.
purchased_at = df_orders["order_purchase_timestamp"]
approved_at = df_orders["order_approved_at"]
carrier_at = df_orders["order_delivered_carrier_date"]
delivered_at = df_orders["order_delivered_customer_date"]
estimated_at = df_orders["order_estimated_delivery_date"]

# Elapsed time between order events (timedelta columns).
df_orders["approved_span"] = approved_at - purchased_at
df_orders["estimated_delivery_span"] = estimated_at - purchased_at
df_orders["order_delivered_carrier_date_span_from_purchase"] = (
    carrier_at - purchased_at
)
df_orders["order_delivered_customer_date_span_from_purchase"] = (
    delivered_at - purchased_at
)
df_orders["order_delivered_customer_date_span_from_delivery_carrier"] = (
    delivered_at - carrier_at
)

# Delivery-delay flag: True when the customer delivery timestamp is later than
# the estimated delivery timestamp.
# NOTE(review): a missing (NaT) delivery timestamp compares as False, so such
# rows would be flagged "not delayed" -- confirm this is the intended meaning.
df_orders["is_delivery_to_customers_delayed"] = delivered_at > estimated_at

In [4]:
# Attach the customer columns to every order via a left join on customer_id.
# A left join keeps every order row and can only add rows (when a key matches
# more than one customer row), so the assert verifies no order was duplicated.
df_orders_customer_merged = df_orders.merge(
    df_customers, how="left", on="customer_id"
)
assert len(df_orders_customer_merged) == len(df_orders)

In [5]:
# Persist the merged orders/customers table to the interim data directory.
# to_csv does not create missing parent directories, so make sure the target
# directory exists before writing; exist_ok makes this a no-op on re-runs.
config.interim_dir.mkdir(parents=True, exist_ok=True)
df_orders_customer_merged.to_csv(
    config.interim_dir / "olist_orders_customer_merged.csv", index=False
)