# 02 — Feature Engineering (RFM + Customer Features)

This notebook cleans the data, builds transaction-level features, then aggregates them into customer-level RFM features and customer segments.

In [2]:
import os
from pathlib import Path

import pandas as pd

# Location of the raw export. Set the ONLINE_RETAIL_CSV environment variable to
# run the notebook on another machine; the default is the original local path.
DATA_PATH = Path(os.environ.get('ONLINE_RETAIL_CSV', '/Users/prithabera/Downloads/OnlineRetail.csv'))

# Load
raw_df = pd.read_csv(DATA_PATH, encoding='ISO-8859-1')

# Standardize columns
raw_df.columns = [c.strip().lower() for c in raw_df.columns]

# Parse datetime (values that cannot be parsed become NaT rather than raising)
raw_df['invoicedate'] = pd.to_datetime(raw_df['invoicedate'], errors='coerce')

# Rows to keep:
#   - customerid is present
#   - not a cancellation (InvoiceNo starting with 'C')
#   - strictly positive quantity and unit price
is_valid = (
    raw_df['customerid'].notna()
    & ~raw_df['invoiceno'].astype(str).str.startswith('C')
    & (raw_df['quantity'] > 0)
    & (raw_df['unitprice'] > 0)
)

# Take an explicit copy so that adding a column below writes to an independent
# frame instead of a filtered slice (avoids SettingWithCopyWarning).
df = raw_df.loc[is_valid].copy()

# Add sales (line total)
df['sales'] = df['quantity'] * df['unitprice']

# Basic sanity check
print('Shape after cleaning:', df.shape)
df.head()

Shape after cleaning: (397884, 9)


Unnamed: 0,invoiceno,stockcode,description,quantity,invoicedate,unitprice,customerid,country,sales
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34


In [4]:
# RFM feature engineering

# Reference point for recency: one day after the latest invoice in the data,
# so the customer with the most recent purchase gets a recency of 1 day.
snapshot_date = df['invoicedate'].max() + pd.Timedelta('1D')

# One row per customer:
#   recency   - whole days between the snapshot and the customer's last invoice
#   frequency - number of distinct invoices
#   monetary  - total sales
rfm = df.groupby('customerid').agg(
    recency=('invoicedate', lambda dates: (snapshot_date - dates.max()).days),
    frequency=('invoiceno', 'nunique'),
    monetary=('sales', 'sum'),
)

rfm.head()

Unnamed: 0_level_0,recency,frequency,monetary
customerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12346.0,326,1,77183.6
12347.0,2,7,4310.0
12348.0,75,4,1797.24
12349.0,19,1,1757.55
12350.0,310,1,334.4


In [7]:
# RFM scoring

# Each metric is cut into quintiles and labelled 1-5.
# Recency labels run 5..1 because a smaller recency (more recent purchase) is better.
# Frequency and monetary are ranked with method='first' before cutting, which
# gives every customer a distinct rank so tied values do not collapse bin edges.
rfm = rfm.assign(
    r_score=pd.qcut(rfm['recency'], q=5, labels=[5, 4, 3, 2, 1]),
    f_score=pd.qcut(rfm['frequency'].rank(method='first'), q=5, labels=[1, 2, 3, 4, 5]),
    m_score=pd.qcut(rfm['monetary'].rank(method='first'), q=5, labels=[1, 2, 3, 4, 5]),
).assign(
    # Three-character code: R digit, then F digit, then M digit (e.g. '555').
    rfm_score=lambda scored: scored['r_score'].astype(str).str.cat(
        [scored['f_score'].astype(str), scored['m_score'].astype(str)]
    )
)
rfm.head()

Unnamed: 0_level_0,recency,frequency,monetary,r_score,f_score,m_score,rfm_score
customerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12346.0,326,1,77183.6,1,1,5,115
12347.0,2,7,4310.0,5,5,5,555
12348.0,75,4,1797.24,2,4,4,244
12349.0,19,1,1757.55,4,1,4,414
12350.0,310,1,334.4,1,1,2,112


In [8]:
# Segment mapping (simple, interpretable rules)

def segment(row):
    """Map one customer's recency/frequency scores to a segment label.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'r_score' and 'f_score' (for example a
        DataFrame row passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        The first matching label, checked in this order:
        'Champions'      - r_score == 5 and f_score >= 4
        'Loyal'          - r_score >= 4 and f_score >= 3
        'At Risk'        - r_score <= 2 and f_score <= 2
        'Need Attention' - everything else
    """
    # Order matters: a Champions row also satisfies the Loyal rule.
    if row['r_score'] == 5 and row['f_score'] >= 4:
        label = 'Champions'
    elif row['r_score'] >= 4 and row['f_score'] >= 3:
        label = 'Loyal'
    elif row['r_score'] <= 2 and row['f_score'] <= 2:
        label = 'At Risk'
    else:
        label = 'Need Attention'
    return label

# Evaluate the segment rules once per customer row, then tally segment sizes.
rfm['segment'] = rfm.apply(segment, axis='columns')
rfm['segment'].value_counts()

segment
Need Attention    1820
At Risk           1065
Loyal              820
Champions          633
Name: count, dtype: int64

In [9]:
# Save engineered features
from pathlib import Path

out_dir = Path('data/processed')
out_dir.mkdir(parents=True, exist_ok=True)  # create the folder tree if it is missing

rfm_csv = out_dir / 'customer_rfm.csv'
transactions_csv = out_dir / 'transactions_cleaned.csv'

# The rfm index holds the customer ids, so it is written out as a column.
rfm.to_csv(rfm_csv, index=True)

# Also save cleaned transactions for SQL/BI (row index is not written)
df.to_csv(transactions_csv, index=False)

for written_path in (rfm_csv, transactions_csv):
    print('Saved:', written_path)

Saved: data/processed/customer_rfm.csv
Saved: data/processed/transactions_cleaned.csv
