Feature Engineering

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the raw transaction file into a DataFrame, decoding it as ISO-8859-1
sathya = pd.read_csv(
    'Online Retail.csv',
    encoding='ISO-8859-1',
)

In [4]:
# Discard every row whose CustomerID is missing (the frame is modified in place)
sathya.dropna(axis=0, how='any', subset=['CustomerID'], inplace=True)

In [5]:
# Feature Engineering
## Frequency: number of distinct InvoiceNo values per CustomerID
frequency_df = (
    sathya.groupby('CustomerID')['InvoiceNo']
    .nunique()
    .reset_index(name='Frequency')
)

In [6]:
## Amount: per-row value (Quantity x UnitPrice), summed per CustomerID
sathya['TotalAmount'] = sathya['Quantity'].mul(sathya['UnitPrice'])
amount_df = (
    sathya.groupby('CustomerID')['TotalAmount']
    .sum()
    .reset_index(name='Amount')
)

In [7]:
## Creating Recency feature
# Parse the invoice timestamps. errors='coerce' turns any value that does not
# match the format into NaT instead of raising.
invoice_dates = pd.to_datetime(sathya['InvoiceDate'], format='%d-%m-%Y %H:%M', errors='coerce')
# Guard against a silent format mismatch: if nothing parsed, latest_date would be
# NaT and every Recency value NaN, without any error being raised in this cell.
# The check runs before the column is overwritten so the raw strings stay
# available for a corrected re-run.
if len(invoice_dates) > 0 and invoice_dates.isna().all():
    raise ValueError(
        "No InvoiceDate value could be parsed with format '%d-%m-%Y %H:%M'; "
        "check the date format used in the input file."
    )
sathya['InvoiceDate'] = invoice_dates
# Reference point: the most recent invoice timestamp in the whole dataset.
latest_date = sathya['InvoiceDate'].max()
# Latest invoice per customer, then whole days between it and the reference point.
recency_df = sathya.groupby('CustomerID').InvoiceDate.max().reset_index()
recency_df['Recency'] = (latest_date - recency_df['InvoiceDate']).dt.days
recency_df = recency_df[['CustomerID', 'Recency']]

In [8]:
# Join the three per-customer feature tables on CustomerID, one merge at a time
final_df = frequency_df.merge(amount_df, on='CustomerID')
final_df = final_df.merge(recency_df, on='CustomerID')

In [9]:
# Standardize the three feature columns; the scaled values replace the originals
# in final_df, and CustomerID is left as is
feature_cols = ['Frequency', 'Amount', 'Recency']
scaler = StandardScaler()
final_df[feature_cols] = scaler.fit_transform(final_df[feature_cols])

In [10]:
# Write the scaled feature table to disk for the clustering step (row index omitted)
final_df.to_csv(path_or_buf='processed_features.csv', index=False)

In [11]:
# Confirmation message naming the output file
print("Feature Engineering Completed. File saved as 'processed_features.csv'")

Feature Engineering Completed. File saved as 'processed_features.csv'
