In [1]:
# !pip install numpy pandas
import math
import os
import numpy as np
import pandas as pd

np.random.seed(42)


In [2]:
'''
Flow:

-- 11:45 pm --

Assumptions For data creation:
1. Data consist of 1 brand 
2. 500 SKU unique SKU in the brand. Across 10 categories
3. 2 years of data
4. Have 10_000 unique users
6. This is a conversion level data, not event level data
7. At a conversion level, datapoints available: timestamp, sku, sku_category, revenue_through_conversion
8. User level data: 
   - Demographic: Single country, multiple states [A - F]
   - Gender, age


Churn Definition:
1. No purchase in the last 30 days -> We declare customer has churned

–
Training Process & training sample creation:
1. We have 2 years of data across 10_000 customer
2. We create training data for F.E. with churn label by:
    - start with 6 month of data for creating labels, customer with no purchase in next 30 days are marked as churned.
    - Now we keep increasing data for creating labels by one month (7, 8, 9, 10, and so on) until 1 year 10 months.
    - This way we would be able to generate multiple training samples with limited number of users

-- 2:30 pm -- 

3. Train a boosing models -> predict churn probability
    - Would look at PR-AUC

Inference:
1. We inference of whole 2 years of data.
2. Define buckets for churn (>90, 90-70, 70-30, <30)

-- 4:00 pm -- 

Dasboard:
1. Make a streamlit dashboard, host it locally. 
 - Show summary
 - each user level row (high rish customer)
 - corresponding button to each row with trigger an action.

Voice model to make a phone call:
- 

'''

'\nFlow:\n\n-- 11:45 pm --\n\nAssumptions For data creation:\n1. Data consist of 1 brand \n2. 500 SKU unique SKU in the brand. Across 10 categories\n3. 2 years of data\n4. Have 10_000 unique users\n6. This is a conversion level data, not event level data\n7. At a conversion level, datapoints available: timestamp, sku, sku_category, revenue_through_conversion\n8. User level data: \n   - Demographic: Single country, multiple states [A - F]\n   - Gender, age\n\n\nChurn Definition:\n1. No purchase in the last 30 days -> We declare customer has churned\n\n–\nTraining Process & training sample creation:\n1. We have 2 years of data across 10_000 customer\n2. We create training data for F.E. with churn label by:\n    - start with 6 month of data for creating labels, customer with no purchase in next 30 days are marked as churned.\n    - Now we keep increasing data for creating labels by one month (7, 8, 9, 10, and so on) until 1 year 10 months.\n    - This way we would be able to generate mult

# Synthetic Data Creation

In [19]:
brand = 'xyz'
sku = [f'sku_{i}' for i in range(1, 501)]
categories = [f'cat_{i}' for i in range(1, 11)]
gender = ['male', 'female', 'others']
country = ['country_x']
states = [f'state_{i}' for i in range(1, 16)]
users = [f'user_{i}' for i in range(1, 10_001)] 

# category to SKU proportion mapping 
sku_cat_prop_map = [0.1, 0.4, 0.05, 0.05, 0.1, 0.2, 0.08, 0.01, 0.005, 0.005]
sku_cat_prop_map = np.array(sku_cat_prop_map) / np.sum(sku_cat_prop_map)  

# gender proportion mapping (male 40%, female 59%, others 1%)
gender_user_prop_map = np.array([0.4, 0.59, 0.01])

# time window (2 years)
start_date = pd.Timestamp("2023-01-01")
end_date = pd.Timestamp("2024-12-31")
n_days = (end_date - start_date).days + 1

#User level Data
n_users = len(users)
user_genders = np.random.choice(gender, size = n_users, p = gender_user_prop_map)
user_states = np.random.choice(states, size = n_users)
user_countries = np.random.choice(country, size = n_users, p = [1])
user_ages = np.random.randint(18, 71, size=n_users)

df_users = pd.DataFrame({"user_id": users, "country": user_countries, "state": user_states, "gender": user_genders, "age": user_ages})

def generate_random_mappings_bw_two_entity(entity1, entity2, map_dict):

    users_accumulated = entity1
    sku_cat_prop_map_dict = {}

    for i, cat_item in enumerate(entity2):
        sku_for_cat = np.random.choice(users_accumulated, math.floor((map_dict[i])*len(entity1)))
        sku_cat_prop_map_dict[cat_item] = sku_for_cat
        users_accumulated = [x for x in users_accumulated if x not in sku_for_cat]
    
    return sku_cat_prop_map_dict    

cat_to_skus = generate_random_mappings_bw_two_entity(sku, categories, sku_cat_prop_map)

# conversion Level Data(assuming: for each user, number of purchases between 1 and 100) -> This is non-deterministic
purchases_per_user = np.random.randint(1, 101, size=n_users)
total_conversions = purchases_per_user.sum()
print(f"Total conversions to generate: {total_conversions}")

rows = []

for user_id, num_purchases in zip(users, purchases_per_user):
    # Sample timestamps uniformly over 2 years
    day_offsets = np.random.randint(0, n_days, size=num_purchases)
    timestamps = start_date + pd.to_timedelta(day_offsets, unit="D")

    purchase_categories = np.random.choice(categories, size=num_purchases, p=sku_cat_prop_map)

    # for each cat pick a SKU
    purchase_skus = [np.random.choice(cat_to_skus[cat]) for cat in purchase_categories]

    # Synthetic revenue through conversion
    raw_revenue = np.random.lognormal(mean=3.4, sigma=0.6, size=num_purchases)
    revenue = np.round(np.clip(raw_revenue, 1, None), 2)

    for ts, cat, s, rev in zip(timestamps, purchase_categories, purchase_skus, revenue):
        rows.append({"user_id": user_id, "timestamp": ts, "brand": brand, "sku": s, "sku_category": cat, "revenue_through_conversion": rev})

df_conversions = pd.DataFrame(rows)
df_conversions = df_conversions.sort_values("timestamp").reset_index(drop=True)

print(df_users.head())
print('-----------------------------------------------------------------')
print(df_conversions.head())
print('-----------------------------------------------------------------')
print("Users:", df_users["user_id"].nunique())
print('-----------------------------------------------------------------')
print("Conversions:", len(df_conversions))
print('-----------------------------------------------------------------')
print("Category distribution (conversions):")
print(df_conversions["sku_category"].value_counts(normalize=True))
print('-----------------------------------------------------------------')
print("Total Revenue Generated:")
print(df_conversions["revenue_through_conversion"].sum())
print('-----------------------------------------------------------------')


Total conversions to generate: 502359
  user_id    country    state  gender  age
0  user_1  country_x  state_8    male   39
1  user_2  country_x  state_2  female   69
2  user_3  country_x  state_1  female   24
3  user_4  country_x  state_2  female   31
4  user_5  country_x  state_5    male   53
-----------------------------------------------------------------
     user_id  timestamp brand      sku sku_category  \
0  user_3864 2023-01-01   xyz  sku_159        cat_6   
1  user_3217 2023-01-01   xyz  sku_403        cat_2   
2  user_1681 2023-01-01   xyz   sku_91        cat_6   
3   user_300 2023-01-01   xyz  sku_189        cat_2   
4  user_2205 2023-01-01   xyz  sku_418        cat_6   

   revenue_through_conversion  
0                       48.02  
1                       15.22  
2                       37.94  
3                       24.25  
4                       18.44  
-----------------------------------------------------------------
Users: 10000
------------------------------------

In [20]:
df_users.isnull().sum()

user_id    0
country    0
state      0
gender     0
age        0
dtype: int64

In [21]:
df_conversions.isnull().sum()


user_id                       0
timestamp                     0
brand                         0
sku                           0
sku_category                  0
revenue_through_conversion    0
dtype: int64

# Creating training data (Feature Engineering)

In [30]:
df_conversions = df_conversions.merge(df_users, how = 'right', on = 'user_id')

In [32]:
df_conversions.isnull().sum()

user_id                       0
timestamp                     0
brand                         0
sku                           0
sku_category                  0
revenue_through_conversion    0
country                       0
state                         0
gender                        0
age                           0
dtype: int64

In [33]:
df_conversions.head()

Unnamed: 0,user_id,timestamp,brand,sku,sku_category,revenue_through_conversion,country,state,gender,age
0,user_1,2023-01-30,xyz,sku_495,cat_2,17.19,country_x,state_8,male,39
1,user_1,2023-03-24,xyz,sku_282,cat_2,36.54,country_x,state_8,male,39
2,user_1,2023-05-19,xyz,sku_219,cat_2,16.3,country_x,state_8,male,39
3,user_1,2023-08-16,xyz,sku_315,cat_2,52.18,country_x,state_8,male,39
4,user_1,2023-12-17,xyz,sku_18,cat_2,39.82,country_x,state_8,male,39


In [None]:

Defined process
Process of creating training samples:
1. We have 2 years of data at user, timestamp level -> each conversion for a user is a different row.
2. I to create training sample for churn prediction in a way that 
 - Start by looking at first x months of data, create features (unique_categories, unique_sku, count_conversions, sum_revenue, recvency, frequency, etc)
 - look at next 2 months -> if no conversion for a user, mark that user a churned
3. Keep increaisng x by 1-2 months and keep incrementing the trainns sample.

label = churn_in_next_2_months

df_conversions = df_conversions.merge(df_users, on="user_id", how="left")

#training sample creation for churn prediction
#these params can be changed
lookback_months = 6         
forward_window_months = 2   
step_months = 1             

df_conversions['timestamp'] = pd.to_datetime(df_conversions['timestamp'])

def compute_user_features(df_window, window_end):
    if df_window.empty:
        empty_df = pd.DataFrame(columns=["user_id", "unique_categories", "unique_skus", "count_conversions", "sum_revenue", "recency", "frequency"])
        return empty_df

    user_grouped = df_window.groupby("user_id")
    features_created = user_grouped.agg(unique_categories=('sku_category', 'nunique'), unique_skus=('sku', 'nunique'),
        count_conversions=('user_id', 'count'), sum_revenue=('revenue_through_conversion', 'sum'),
        recency=('timestamp', lambda x: (window_end - x.max()).days),
        frequency=('timestamp', lambda x: x.count() / ((x.max() - x.min()).days + 1)if x.count() > 1 else 0.0)).reset_index()

    return features_created


def compute_churn_labels(df_future, all_users):

    active_users = df_future['user_id'].unique()
    label_df = pd.DataFrame({"user_id": all_users})
    label_df["churn_label"] = (~label_df["user_id"].isin(active_users)).astype(int)
    return label_df


def generate_training_samples(df_conv, df_users, lookback_months, forward_months, step_months):
    df = df_conv.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    start_date = df['timestamp'].min()
    end_date = df['timestamp'].max()

    training_rows = []
    current_start = start_date

    while True:
        window_start = current_start
        window_end = window_start + pd.DateOffset(months=lookback_months)
        future_end = window_end + pd.DateOffset(months=forward_months)

        # stop if we can't fit a full forward window
        if future_end > end_date:
            break

        # data in lookback window
        df_window = df[(df['timestamp'] >= window_start) &
                       (df['timestamp'] < window_end)].copy()

        # data in forward window
        df_future = df[(df['timestamp'] >= window_end) &
                       (df['timestamp'] < future_end)].copy()

        if df_window.empty:
            current_start += pd.DateOffset(months=step_months)
            continue

        # features for users with activity in lookback window
        feats = compute_user_features(df_window, window_end)

        # churn labels for all users (we will only keep those with feats)
        labels = compute_churn_labels(df_future, df_users['user_id'])

        # merge features + static user attributes + churn label
        merged = (feats
                  .merge(df_users, on="user_id", how="left")
                  .merge(labels, on="user_id", how="left"))

        merged["feature_window_start"] = window_start
        merged["feature_window_end"] = window_end
        merged["label_window_end"] = future_end

        training_rows.append(merged)

        # slide window
        current_start += pd.DateOffset(months=step_months)

    if not training_rows:
        return pd.DataFrame()

    training_df = pd.concat(training_rows, ignore_index=True)
    return training_df

#final training data
training_df = generate_training_samples(
    df_conversions,
    df_users,
    lookback_months=lookback_months,
    forward_months=forward_window_months,
    step_months=step_months
)

print("Training samples shape:", training_df.shape)
print(training_df.head())
