In [None]:
# Libraries
import warnings

import numpy as np
import pandas as pd
from scipy.stats import entropy
from sklearn.preprocessing import MinMaxScaler

In [107]:
# Read the raw shopper-behaviour CSV and display its (rows, columns) shape.
data = "../data/raw/e_commerce_shopper_behaviour_and_lifestyle.csv"  # relative path to the raw file
ecom_df = pd.read_csv(filepath_or_buffer=data)
ecom_df.shape

(1000000, 60)

We can see that the dataset has 60 columns and 1M rows (data points).

In [108]:
# Print every column name as one flat list.
column_names = list(ecom_df.columns)
print(column_names)

# Print the dtype pandas assigned to each column.
column_dtypes = ecom_df.dtypes
print(column_dtypes)

['user_id', 'age', 'gender', 'country', 'urban_rural', 'income_level', 'employment_status', 'education_level', 'relationship_status', 'has_children', 'household_size', 'occupation', 'ethnicity', 'language_preference', 'device_type', 'weekly_purchases', 'monthly_spend', 'cart_abandonment_rate', 'review_writing_frequency', 'average_order_value', 'preferred_payment_method', 'coupon_usage_frequency', 'loyalty_program_member', 'referral_count', 'product_category_preference', 'shopping_time_of_day', 'weekend_shopper', 'impulse_purchases_per_month', 'browse_to_buy_ratio', 'return_frequency', 'budgeting_style', 'brand_loyalty_score', 'impulse_buying_score', 'environmental_consciousness', 'health_conscious_shopping', 'travel_frequency', 'hobby_count', 'social_media_influence_score', 'reading_habits', 'exercise_frequency', 'stress_from_financial_decisions', 'overall_stress_level', 'sleep_quality', 'physical_activity_level', 'mental_health_score', 'daily_session_time_minutes', 'product_views_per_

In [109]:
# For every object-dtype (text) column, print its distinct values and their count.
object_columns = ecom_df.select_dtypes(include='object')
for col, values in object_columns.items():
    distinct_values = values.unique()
    print(f"\n{col}:")
    print(distinct_values)
    # nunique() leaves missing values out of the count by default
    print(f"Count: {values.nunique()}")


gender:
['Female' 'Male' 'Non-binary' 'Other']
Count: 4

country:
['Germany' 'Japan' 'India' 'Canada' 'USA' 'France' 'Australia' 'UK'
 'Brazil' 'China']
Count: 10

urban_rural:
['Suburban' 'Urban' 'Rural']
Count: 3

employment_status:
['Self-employed' 'Unemployed' 'Employed' 'Student' 'Retired']
Count: 5

education_level:
['Associate Degree' 'Bachelor' 'High School' 'Master' 'PhD']
Count: 5

relationship_status:
['Single' 'Married' 'Widowed' 'In a relationship' 'Divorced']
Count: 5

occupation:
['Healthcare' 'Finance' 'Engineering' 'Other' 'Marketing' 'Retail' 'IT'
 'Education']
Count: 8

ethnicity:
['Other' 'Hispanic' 'Asian' 'Caucasian' 'African American']
Count: 5

language_preference:
['English' 'Mandarin' 'Hindi' 'German' 'French' 'Spanish']
Count: 6

device_type:
['Mobile' 'Desktop' 'Tablet']
Count: 3

preferred_payment_method:
['PayPal' 'Google Pay' 'Credit Card' 'Apple Pay' 'Debit Card'
 'Bank Transfer']
Count: 6

product_category_preference:
['Groceries' 'Beauty' 'Books' 'Toy

Let us restrict the data to the USA only, as this keeps the analysis focused and helps keep the model from overfitting to the data.

In [110]:
# Keep only the rows whose country is 'USA', then remove the country column.
is_usa = ecom_df['country'] == 'USA'
usa_df = ecom_df.loc[is_usa].drop(columns='country')

# Compare the shapes before and after filtering.
print(f"Original shape: {ecom_df.shape}")
print(f"USA only shape: {usa_df.shape}")


Original shape: (1000000, 60)
USA only shape: (99996, 59)


In [None]:
# Work on an independent copy so that feature engineering does not modify usa_df
df_features = usa_df.copy(deep=True)

# Min-max scaler with the default 0-1 output range
scaler = MinMaxScaler(feature_range=(0, 1))

In [None]:
# ============================================================================
# 1. ENGAGEMENT INTENSITY
# ============================================================================

# Rescale the three engagement signals to the 0-1 range; the raw columns are overwritten.
engagement_cols = ['daily_session_time_minutes', 'product_views_per_day', 'app_usage_frequency']
engagement_scaled = scaler.fit_transform(df_features[engagement_cols])
df_features[engagement_cols] = engagement_scaled

# Unweighted average of the three scaled signals.
engagement_total = sum(df_features[name] for name in engagement_cols)
df_features['avg_daily_engagement_score'] = engagement_total / 3

# Weekly index: the daily score multiplied by seven.
df_features['weekly_engagement_index'] = 7 * df_features['avg_daily_engagement_score']

# ============================================================================
# 2. ADVERTISING RESPONSIVENESS
# ============================================================================

# Clicks per ad view. Zero views are replaced with NaN before dividing so the
# division cannot produce inf; the resulting NaN rates are then set to 0.
# The result of fillna() is assigned back explicitly: calling
# df[col].fillna(..., inplace=True) is chained assignment, which newer pandas
# warns about and which has no effect once Copy-on-Write is active.
ad_views_nonzero = df_features['ad_views_per_day'].replace(0, np.nan)
df_features['ad_response_rate'] = (
    df_features['ad_clicks_per_day'] / ad_views_nonzero
).fillna(0)

# Rescale ad views to 0-1 (this overwrites the raw column) and weight by the response rate.
# NOTE: because ad_views_per_day is overwritten here, re-running this section
# without rebuilding df_features first would compute ad_response_rate from the
# already-scaled views.
df_features[['ad_views_per_day']] = scaler.fit_transform(df_features[['ad_views_per_day']])
df_features['ad_exposure_score'] = df_features['ad_views_per_day'] * df_features['ad_response_rate']

# ============================================================================
# 3. PURCHASE INTENT (REVISED - NO TARGET LEAKAGE)
# ============================================================================

# Reciprocal of browse_to_buy_ratio. A ratio of 0 is replaced with NaN so the
# division cannot produce inf; NaN results (zero or missing ratio) become 0.
# fillna() is chained on the expression and assigned back, instead of the
# original df[col].fillna(..., inplace=True), which is chained assignment and
# does nothing once pandas Copy-on-Write is active.
browse_ratio_nonzero = df_features['browse_to_buy_ratio'].replace(0, np.nan)
df_features['browse_to_buy_inverse'] = (1 / browse_ratio_nonzero).fillna(0)

# Scale the three intent signals in a temporary frame so the raw columns in
# df_features are left as they are.
temp_intent = pd.DataFrame({
    'cart': df_features['cart_items_average'],
    'browse': df_features['browse_to_buy_inverse'],
    'weekly': df_features['weekly_purchases']
})
temp_intent_scaled = scaler.fit_transform(temp_intent)

# Unweighted average of the scaled cart, browse and weekly signals.
df_features['purchase_intent_score'] = (
    temp_intent_scaled[:, 0] +
    temp_intent_scaled[:, 1] +
    temp_intent_scaled[:, 2]
) / 3

# ============================================================================
# 4. DISCOUNT SENSITIVITY
# ============================================================================

# Rescale both discount-related signals to 0-1; the raw columns are overwritten.
discount_cols = ['coupon_usage_frequency', 'impulse_purchases_per_month']
discount_scaled = scaler.fit_transform(df_features[discount_cols])
df_features[discount_cols] = discount_scaled

# Index = plain average of the two scaled signals.
coupon_scaled, impulse_scaled = (df_features[name] for name in discount_cols)
df_features['discount_sensitivity_index'] = (coupon_scaled + impulse_scaled) / 2

# ============================================================================
# 5. REVENUE STRENGTH
# ============================================================================

# Rescale monthly spend to 0-1 (overwriting the raw column) and expose it under a feature name.
spend_scaled = scaler.fit_transform(df_features[['monthly_spend']])
df_features[['monthly_spend']] = spend_scaled
df_features['normalized_spend_score'] = df_features['monthly_spend']

# Three tiers with right-inclusive edges: up to 0.40 Low, up to 0.75 Mid, above that High.
tier_edges = [-np.inf, 0.40, 0.75, np.inf]
tier_names = ['Low', 'Mid', 'High']
df_features['customer_value_tier'] = pd.cut(
    df_features['normalized_spend_score'], bins=tier_edges, labels=tier_names
)

# ============================================================================
# 6. RECENCY (LEAKAGE-CONTROLLED)
# ============================================================================

df_features['last_purchase_date'] = pd.to_datetime(df_features['last_purchase_date'])

# Measure recency against the latest purchase date present in the data rather
# than the wall clock. With pd.Timestamp.now() the day counts (and therefore
# the buckets) changed every time the notebook was run, so the same dataset
# drifted towards 'Dormant' as time passed.
reference_date = df_features['last_purchase_date'].max()
today = reference_date  # original variable name, kept in case other cells read it
df_features['days_since_last_purchase'] = (
    reference_date - df_features['last_purchase_date']
).dt.days

# Right-inclusive buckets: up to 7 days Active, up to 30 Warm, up to 90 Cold, beyond that Dormant.
df_features['recency_bucket'] = pd.cut(
    df_features['days_since_last_purchase'],
    bins=[-np.inf, 7, 30, 90, np.inf],
    labels=['Active', 'Warm', 'Cold', 'Dormant']
)

# ============================================================================
# 7. LOYALTY & ADVOCACY
# ============================================================================

# Rescale the four advocacy signals to 0-1; the raw columns are overwritten.
advocacy_cols = ['brand_loyalty_score', 'review_writing_frequency', 'social_sharing_frequency', 'referral_count']
advocacy_scaled = scaler.fit_transform(df_features[advocacy_cols])
df_features[advocacy_cols] = advocacy_scaled

# Score = plain average of the four scaled signals.
advocacy_total = sum(df_features[name] for name in advocacy_cols)
df_features['advocacy_score'] = advocacy_total / 4

# ============================================================================
# 8. LIFESTYLE & STRESS IMPACT
# ============================================================================

# Rescale the four wellbeing signals to 0-1; the raw columns are overwritten.
stress_cols = ['stress_from_financial_decisions', 'overall_stress_level', 'mental_health_score', 'sleep_quality']
stress_scaled = scaler.fit_transform(df_features[stress_cols])
df_features[stress_cols] = stress_scaled

financial_stress, overall_stress, mental_health, sleep = (
    df_features[name] for name in stress_cols
)

# Mental health and sleep quality enter as (1 - score), so higher scores lower the index.
mental_health_deficit = 1 - mental_health
sleep_deficit = 1 - sleep

df_features['stress_impact_index'] = (
    financial_stress + overall_stress + mental_health_deficit + sleep_deficit
) / 4

# ============================================================================
# 9. SHOPPING REGULARITY
# ============================================================================

def calculate_entropy(value):
    """Placeholder entropy for a single value; always returns 0.

    The original implementation checked ``pd.isna(value)`` but returned 0 on
    both branches, so the check had no effect on the result and has been
    removed. Nothing in the visible notebook calls this function.

    Parameters
    ----------
    value : object
        Ignored.

    Returns
    -------
    int
        Always 0.
    """
    return 0

# Give each distinct shopping_time_of_day label an integer code, numbered in order of first appearance.
time_labels = df_features['shopping_time_of_day'].unique()
shopping_time_mapping = dict(zip(time_labels, range(len(time_labels))))
df_features['shopping_time_numeric'] = df_features['shopping_time_of_day'].map(shopping_time_mapping)

# Score = time-of-day code multiplied by (1 + weekend_shopper).
# NOTE(review): the codes depend on row order and carry no ordinal meaning, and
# the label coded 0 always scores 0 - confirm this is the intended feature.
weekend_multiplier = df_features['weekend_shopper'] + 1
df_features['shopping_consistency_score'] = weekend_multiplier * df_features['shopping_time_numeric']

# ============================================================================
# CREATE TARGET VARIABLE (BINARY CLASSIFICATION)
# ============================================================================

threshold = 0.6  # Adjust based on business needs

# Guard against a scale mismatch between the threshold and the rate column.
# NOTE(review): other rate columns shown in this notebook (e.g.
# purchase_conversion_rate = 95, return_rate = 90) look like 0-100 percentages.
# If cart_abandonment_rate is on that scale too, a threshold of 0.6 flags
# almost every row - confirm the column's scale.
if df_features['cart_abandonment_rate'].max() > 1 and threshold <= 1:
    warnings.warn(
        f"cart_abandonment_rate contains values above 1 but threshold is {threshold}; "
        "if the rate is stored as a 0-100 percentage the threshold is on the wrong scale."
    )

# 1 when the abandonment rate is at or above the threshold, otherwise 0.
df_features['cart_abandonment_flag'] = (df_features['cart_abandonment_rate'] >= threshold).astype(int)

print(f"\nAfter feature engineering shape: {df_features.shape}")

# ============================================================================
# DROP RAW COLUMNS AFTER AGGREGATION
# ============================================================================

columns_to_drop_after_aggregation = [
    # Identifiers
    'user_id',

    # Engagement
    'daily_session_time_minutes',
    'product_views_per_day',
    'app_usage_frequency',

    # Advertising
    'ad_views_per_day',
    'ad_clicks_per_day',
    'notification_response_rate',

    # Purchase Intent
    'cart_items_average',
    'browse_to_buy_ratio',
    'weekly_purchases',
    'browse_to_buy_inverse',

    # Discount
    'coupon_usage_frequency',
    'impulse_purchases_per_month',

    # Revenue
    'monthly_spend',
    'average_order_value',

    # Recency
    'last_purchase_date',
    'days_since_last_purchase',
    'account_age_months',

    # Advocacy
    'brand_loyalty_score',
    'review_writing_frequency',
    'social_sharing_frequency',
    'referral_count',

    # Stress
    'stress_from_financial_decisions',
    'overall_stress_level',
    'mental_health_score',
    'sleep_quality',

    # Shopping
    'shopping_time_of_day',
    'weekend_shopper',
    'shopping_time_numeric',

    # Low-value demographics
    'ethnicity',
    'language_preference',
    'occupation',
    'relationship_status',
    'urban_rural',
    'household_size',

    # Lifestyle noise
    'reading_habits',
    'hobby_count',
    'travel_frequency',
    'exercise_frequency',
    'physical_activity_level',

    # TARGET LEAKAGE - CRITICAL
    'cart_abandonment_rate',
    'checkout_abandonments_per_month',
]

# errors='ignore' below skips names that are not in the frame without any
# message, so a misspelled entry (including a leakage column) would go
# unnoticed. Report such names explicitly; the drop itself stays best-effort.
missing_drop_cols = [
    name for name in columns_to_drop_after_aggregation
    if name not in df_features.columns
]
if missing_drop_cols:
    warnings.warn(
        f"Columns listed for dropping but not present in df_features: {missing_drop_cols}"
    )

df_final = df_features.drop(columns=columns_to_drop_after_aggregation, errors='ignore')

# Summarise what is left and how the target classes are balanced.
print(f"\nFinal dataset shape: {df_final.shape}")
print(f"\nFinal features ({len(df_final.columns)} columns):")
print(df_final.columns.tolist())
print("\nTarget distribution:")
print(df_final['cart_abandonment_flag'].value_counts())

Starting shape: (99996, 59)

After feature engineering shape: (99996, 75)

Final dataset shape: (99996, 33)

Final features (33 columns):
['age', 'gender', 'income_level', 'employment_status', 'education_level', 'has_children', 'device_type', 'preferred_payment_method', 'loyalty_program_member', 'product_category_preference', 'return_frequency', 'budgeting_style', 'impulse_buying_score', 'environmental_consciousness', 'health_conscious_shopping', 'social_media_influence_score', 'wishlist_items_count', 'purchase_conversion_rate', 'premium_subscription', 'return_rate', 'avg_daily_engagement_score', 'weekly_engagement_index', 'ad_response_rate', 'ad_exposure_score', 'purchase_intent_score', 'discount_sensitivity_index', 'normalized_spend_score', 'customer_value_tier', 'recency_bucket', 'advocacy_score', 'stress_impact_index', 'shopping_consistency_score', 'cart_abandonment_flag']

Target distribution:
cart_abandonment_flag
1    94495
0     5501
Name: count, dtype: int64


In [113]:
# Second pass: remove four more columns from the final frame.
additional_drops = ['purchase_conversion_rate', 'return_rate', 'return_frequency', 'wishlist_items_count']

# df_final is reassigned from itself; errors='ignore' lets the cell be re-run
# after these columns are already gone.
df_final = df_final.drop(columns=additional_drops, errors='ignore')

# Summarise the remaining columns and the target balance.
remaining_columns = df_final.columns.tolist()
target_counts = df_final['cart_abandonment_flag'].value_counts()

print(f"Final dataset shape: {df_final.shape}")
print(f"\nFinal features ({len(remaining_columns)} columns):")
print(remaining_columns)
print(f"\nTarget distribution:")
print(target_counts)

Final dataset shape: (99996, 29)

Final features (29 columns):
['age', 'gender', 'income_level', 'employment_status', 'education_level', 'has_children', 'device_type', 'preferred_payment_method', 'loyalty_program_member', 'product_category_preference', 'budgeting_style', 'impulse_buying_score', 'environmental_consciousness', 'health_conscious_shopping', 'social_media_influence_score', 'premium_subscription', 'avg_daily_engagement_score', 'weekly_engagement_index', 'ad_response_rate', 'ad_exposure_score', 'purchase_intent_score', 'discount_sensitivity_index', 'normalized_spend_score', 'customer_value_tier', 'recency_bucket', 'advocacy_score', 'stress_impact_index', 'shopping_consistency_score', 'cart_abandonment_flag']

Target distribution:
cart_abandonment_flag
1    94495
0     5501
Name: count, dtype: int64


In [None]:
# Display the first row of the USA-only frame.
usa_df.iloc[:1]

Unnamed: 0,age,gender,urban_rural,income_level,employment_status,education_level,relationship_status,has_children,household_size,device_type,...,social_media_influence_score,exercise_frequency,wishlist_items_count,checkout_abandonments_per_month,purchase_conversion_rate,app_usage_frequency,notification_response_rate,account_age_months,premium_subscription,return_rate
7,38,Male,Rural,72818,Retired,High School,Divorced,1,8,Desktop,...,5,1,18,8,95,6,83,6,0,90


In [None]:
# Print the column names of the USA-only frame as a flat list.
usa_column_names = list(usa_df.columns)
print(usa_column_names)

['age', 'gender', 'urban_rural', 'income_level', 'employment_status', 'education_level', 'relationship_status', 'has_children', 'household_size', 'device_type', 'weekly_purchases', 'monthly_spend', 'cart_abandonment_rate', 'average_order_value', 'preferred_payment_method', 'coupon_usage_frequency', 'loyalty_program_member', 'referral_count', 'product_category_preference', 'shopping_time_of_day', 'weekend_shopper', 'impulse_purchases_per_month', 'browse_to_buy_ratio', 'return_frequency', 'budgeting_style', 'brand_loyalty_score', 'impulse_buying_score', 'environmental_consciousness', 'health_conscious_shopping', 'social_media_influence_score', 'exercise_frequency', 'wishlist_items_count', 'checkout_abandonments_per_month', 'purchase_conversion_rate', 'app_usage_frequency', 'notification_response_rate', 'account_age_months', 'premium_subscription', 'return_rate']
