In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("processed_data/clean_olist_dataset.csv")

In [3]:
# Convert Date Columns
date_cols = [
    'order_purchase_timestamp',
    'order_delivered_customer_date',
    'order_estimated_delivery_date'
]

for col in date_cols:
    df[col] = pd.to_datetime(df[col])

In [4]:
# Reference Date
reference_date = df['order_purchase_timestamp'].max()


In [5]:
# RFM Aggregation
customer_df = df.groupby('customer_unique_id').agg({

    # Recency
    'order_purchase_timestamp': lambda x: (reference_date - x.max()).days,

    # Frequency
    'order_id': 'nunique',

    # Monetary
    'payment_value': 'sum',

    # Avg Review
    'review_score': 'mean'

}).reset_index()

In [6]:
# Rename Columns
customer_df.columns = [
    'customer_id',
    'Recency',
    'Frequency',
    'Monetary',
    'Avg_Review'
]


In [7]:
# Avg_Order_Value
customer_df['Avg_Order_Value'] = (
    customer_df['Monetary'] / customer_df['Frequency']
)


In [8]:
# Avg_Purchase_Gap (Buying Regularity)
df_sorted = df.sort_values(['customer_unique_id', 'order_purchase_timestamp'])

purchase_gap = df_sorted.groupby('customer_unique_id')['order_purchase_timestamp'] \
    .diff().dt.days

avg_gap = purchase_gap.groupby(df_sorted['customer_unique_id']).mean()

customer_df['Avg_Purchase_Gap'] = customer_df['customer_id'].map(avg_gap)


In [9]:
# Product_Variety (Exploration Behavior)
product_variety = df.groupby('customer_unique_id')['product_id'].nunique()

customer_df['Product_Variety'] = customer_df['customer_id'].map(product_variety)

In [10]:
# Avg_Installments (Financial Behavior)
avg_installments = df.groupby('customer_unique_id')['payment_installments'].mean()

customer_df['Avg_Installments'] = customer_df['customer_id'].map(avg_installments)

In [11]:
# Handle Missing Values
customer_df.fillna({
    'Avg_Purchase_Gap': customer_df['Avg_Purchase_Gap'].median(),
    'Avg_Review': customer_df['Avg_Review'].median()
}, inplace=True)


In [12]:
customer_df.head()


Unnamed: 0,customer_id,Recency,Frequency,Monetary,Avg_Review,Avg_Order_Value,Avg_Purchase_Gap,Product_Variety,Avg_Installments
0,0000366f3b9a7992bf8c76cfdf3221e2,111,1,141.9,5.0,141.9,0.0,1,8.0
1,0000b849f77a49e4a4ce2b2a4ca5be3f,114,1,27.19,4.0,27.19,0.0,1,1.0
2,0000f46a3911fa3c0805444483337064,536,1,86.22,3.0,86.22,0.0,1,8.0
3,0000f6ccb0745a6a4b88665a16c9f078,320,1,43.62,4.0,43.62,0.0,1,4.0
4,0004aac84e0df4da2b147fca70cf8255,287,1,196.89,5.0,196.89,0.0,1,6.0


In [13]:
customer_df.shape


(86173, 9)

In [None]:
customer_df.to_csv("processed_data/customer_feature_dataset.csv", index=False)
