In [10]:
%%time
%matplotlib inline
%load_ext autoreload
%autoreload 2

import math
import sys
from dataclasses import dataclass
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv


@dataclass
class Config:
    project_dir: Path = Path("../../")
    model_dir: Path = project_dir / "models"
    outputs_dir: Path = project_dir / "outputs"
    results_dir: Path = project_dir / "results"
    results_agg_dir: Path = results_dir / "agg"
    results_figs_dir: Path = results_dir / "figs"
    data_dir: Path = project_dir / "data"
    raw_dir: Path = data_dir / "raw"
    interim_dir: Path = data_dir / "interim"
    processed_dir: Path = data_dir / "processed"


config = Config()
sys.path.append(str(config.project_dir.resolve()))
load_dotenv()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
CPU times: user 1.87 ms, sys: 1.42 ms, total: 3.3 ms
Wall time: 2.87 ms


True

In [None]:
df_geolocation = pd.read_csv(config.raw_dir / "olist_geolocation_dataset.csv")
df_geolocation_agg = (
    df_geolocation.groupby(["geolocation_state", "geolocation_zip_code_prefix"])
    .agg(
        {
            "geolocation_lat": "mean",
            "geolocation_lng": "mean",
        }
    )
    .reset_index()
)
df_orders = pd.read_csv(config.interim_dir / "olist_orders_customer_merged.csv")
df_payments = pd.read_csv(config.raw_dir / "olist_order_payments_dataset.csv")
# order_idごとの集計
agg_payments = (
    df_payments.groupby("order_id")
    .agg(
        {
            "payment_value": "sum",  # 支払い金額の合計
            "payment_type": [
                "nunique",  # payment_typeの種類数
                lambda x: x.value_counts().index[0],
            ],  # 最も頻出のpayment_type
        }
    )
    .reset_index()
)

# カラム名を分かりやすく変更
agg_payments.columns = [
    "order_id",
    "total_payment_value",
    "payment_type_count",
    "most_common_payment_type",
]

# さらに、各order_idの最大支払額のpayment_typeを取得
payment_type_by_value = (
    df_payments.groupby(["order_id", "payment_type"])["payment_value"]
    .sum()
    .reset_index()
)

max_payment_type = (
    payment_type_by_value.sort_values("payment_value", ascending=False)
    .groupby("order_id")
    .first()[["payment_type", "payment_value"]]
    .reset_index()
    .rename(
        columns={
            "payment_type": "highest_value_payment_type",
            "payment_value": "highest_payment_value",
        }
    )
)

# 結果を結合
df_agg_payments = pd.merge(agg_payments, max_payment_type, on="order_id", how="left")
df_orders_payments_merged = pd.merge(
    df_orders, df_agg_payments, on="order_id", how="left"
)
assert df_orders_payments_merged.shape[0] == df_orders.shape[0]
df_orders_geolocation_merged = pd.merge(
    df_orders_payments_merged,
    df_geolocation_agg.rename(
        columns={"geolocation_lat": "customer_lat", "geolocation_lng": "customer_lng"}
    ),
    left_on=["customer_zip_code_prefix", "customer_state"],
    right_on=["geolocation_zip_code_prefix", "geolocation_state"],
    how="left",
)
assert df_orders_geolocation_merged.shape[0] == df_orders_payments_merged.shape[0]

df_item_products = pd.read_csv(
    config.interim_dir / "olist_item_product_seller_merged.csv"
)
df_item_products_geolocation_merged = pd.merge(
    df_item_products,
    df_geolocation_agg.rename(
        columns={"geolocation_lat": "seller_lat", "geolocation_lng": "seller_lng"}
    ),
    left_on=["seller_zip_code_prefix", "seller_state"],
    right_on=["geolocation_zip_code_prefix", "geolocation_state"],
    how="left",
)
assert df_item_products_geolocation_merged.shape[0] == df_item_products.shape[0]

In [48]:
# 注文内容情報
df_item_products_geolocation_merged["shipping_limit_date"] = pd.to_datetime(
    df_item_products_geolocation_merged["shipping_limit_date"]
)
df_item_products_geolocation_merged["product_volume_cm3"] = (
    df_item_products_geolocation_merged["product_length_cm"]
    * df_item_products_geolocation_merged["product_height_cm"]
    * df_item_products_geolocation_merged["product_width_cm"]
)
seller_info_agg_by_order_id = (
    df_item_products_geolocation_merged.groupby("order_id")
    .agg(
        {
            "shipping_limit_date": "mean",
            "price": "sum",
            "freight_value": "sum",
            "sum_price_freight_by_order": "sum",
            "seller_lat": "mean",
            "seller_lng": "mean",
            "product_id": "nunique",
            "seller_id": "nunique",
            "product_photos_qty": "mean",
            "product_weight_g": "sum",
            "product_length_cm": "mean",
            "product_height_cm": "mean",
            "product_width_cm": "mean",
            "product_volume_cm3": "sum",
            "seller_city": "nunique",
            "seller_state": "nunique",
        }
    )
    .rename(
        columns={
            "price": "sum_price",
            "freight_value": "sum_freight",
            "product_id": "product_count",
            "product_volume_cm3": "sum_product_volume_cm3",
            "product_weight_g": "sum_product_weight_g",
            "seller_id": "seller_count",
            "seller_city": "seller_city_count",
            "seller_state": "seller_state_count",
        }
    )
).reset_index()

df_orders_geolocation_merged2 = pd.merge(
    df_orders_geolocation_merged,
    seller_info_agg_by_order_id,
    on="order_id",
    how="left",
)

assert df_orders_geolocation_merged2.shape[0] == df_orders_geolocation_merged.shape[0]


def get_most_common_category(x):
    value_counts = x.value_counts()
    return value_counts.index[0] if len(value_counts) > 0 else None


# 最も多いカテゴリを取得
agg_most_common_category_by_order_id = (
    df_item_products_geolocation_merged.groupby("order_id")[
        ["product_category_name_english", "seller_city", "seller_state"]
    ]
    .agg(
        get_most_common_category,
    )
    .reset_index()
)

df_orders_geolocation_merged3 = pd.merge(
    df_orders_geolocation_merged2,
    agg_most_common_category_by_order_id,
    on="order_id",
    how="left",
)

assert df_orders_geolocation_merged3.shape[0] == df_orders_geolocation_merged2.shape[0]

In [49]:
for col in [
    "order_purchase_timestamp",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
]:
    df_orders_geolocation_merged3[col] = pd.to_datetime(
        df_orders_geolocation_merged3[col]
    )
df_orders_geolocation_merged3["order_estimated_delivery_span_hours_from_carrier"] = (
    pd.to_numeric(
        (
            df_orders_geolocation_merged3["order_estimated_delivery_date"]
            - df_orders_geolocation_merged3["shipping_limit_date"]
        ).dt.total_seconds()
        / 3600,
        errors="coerce",
    )
)
df_orders_geolocation_merged3["order_estimated_delivery_span_hours_from_purchase"] = (
    pd.to_numeric(
        (
            df_orders_geolocation_merged3["order_estimated_delivery_date"]
            - df_orders_geolocation_merged3["order_purchase_timestamp"]
        ).dt.total_seconds()
        / 3600,
        errors="coerce",
    )
)

df_orders_geolocation_merged3["order_delivered_carrier_span_hours_from_purchase"] = (
    pd.to_numeric(
        (
            df_orders_geolocation_merged3["order_delivered_carrier_date"]
            - df_orders_geolocation_merged3["order_purchase_timestamp"]
        ).dt.total_seconds()
        / 3600,
        errors="coerce",
    )
)
df_orders_geolocation_merged3["order_delivered_customer_span_hours_from_purchase"] = (
    pd.to_numeric(
        (
            df_orders_geolocation_merged3["order_delivered_customer_date"]
            - df_orders_geolocation_merged3["order_purchase_timestamp"]
        ).dt.total_seconds()
        / 3600,
        errors="coerce",
    )
)
df_orders_geolocation_merged3["order_delivered_customer_span_hours_from_carrier"] = (
    pd.to_numeric(
        (
            df_orders_geolocation_merged3["order_delivered_customer_date"]
            - df_orders_geolocation_merged3["order_delivered_carrier_date"]
        ).dt.total_seconds()
        / 3600,
        errors="coerce",
    )
)
df_orders_geolocation_merged3["order_delivered_carrier_span_hours_from_limit_date"] = (
    pd.to_numeric(
        (
            df_orders_geolocation_merged3["order_delivered_carrier_date"]
            - df_orders_geolocation_merged3["shipping_limit_date"]
        ).dt.total_seconds()
        / 3600,
        errors="coerce",
    )
)
df_orders_geolocation_merged3["order_delivered_customer_span_hours_from_limit_date"] = (
    pd.to_numeric(
        (
            df_orders_geolocation_merged3["order_delivered_customer_date"]
            - df_orders_geolocation_merged3["order_estimated_delivery_date"]
        ).dt.total_seconds()
        / 3600,
    )
)


def haversine(lat1, lng1, lat2, lng2):
    """
    2点間の大圏距離を計算する（Haversine formulaを使用）

    Args:
        lat1: 1点目の緯度 (度)
        lng1: 1点目の経度 (度)
        lat2: 2点目の緯度 (度)
        lng2: 2点目の経度 (度)

    Returns:
        2点間の距離 (km)
    """
    R = 6371  # 地球の半径 (km)

    lat1_rad = math.radians(lat1)
    lng1_rad = math.radians(lng1)
    lat2_rad = math.radians(lat2)
    lng2_rad = math.radians(lng2)

    dlng = lng2_rad - lng1_rad
    dlat = lat2_rad - lat1_rad

    a = (
        math.sin(dlat / 2) ** 2
        + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlng / 2) ** 2
    )
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    distance = R * c
    return distance


df_orders_geolocation_merged3["distance_between_customer_and_seller"] = (
    df_orders_geolocation_merged3.apply(
        lambda row: haversine(
            row["customer_lat"],
            row["customer_lng"],
            row["seller_lat"],
            row["seller_lng"],
        ),
        axis=1,
    )
)
# 距離を5分位に分割
df_orders_geolocation_merged3["distance_bin"] = pd.qcut(
    df_orders_geolocation_merged3["distance_between_customer_and_seller"],
    q=5,
    labels=[f"Bin {i + 1}" for i in range(5)],
)
distance_bin_dummies = pd.get_dummies(
    df_orders_geolocation_merged3["distance_bin"], prefix="distance_bin"
).astype(int)

# 元のデータフレームに結合
df_orders_geolocation_merged3 = pd.concat(
    [df_orders_geolocation_merged3, distance_bin_dummies], axis=1
)

bin_ranges = pd.qcut(
    df_orders_geolocation_merged3["distance_between_customer_and_seller"],
    q=5,
    retbins=True,
)[1]

# ビンの範囲を表示
print("Distance ranges for each bin (km):")
for i in range(len(bin_ranges) - 1):
    print(f"Bin {i + 1}: {bin_ranges[i]:.2f} - {bin_ranges[i + 1]:.2f}")

Distance ranges for each bin (km):
Bin 1: 0.00 - 115.52
Bin 2: 115.52 - 347.34
Bin 3: 347.34 - 529.33
Bin 4: 529.33 - 876.90
Bin 5: 876.90 - 8677.91


In [1]:
df_orders_geolocation_merged3.to_csv(
    config.processed_dir / "order_info_details.csv", index=False
)

NameError: name 'df_orders_geolocation_merged3' is not defined