In [None]:
import pandas as pd
import numpy as np
from enum import Enum
from collections import defaultdict
from utils import generate_transaction_data

In [None]:
# Categorical vocabularies used by the simulation. Members are auto-numbered
# from 1 in declaration order by the functional Enum API.
DrinkSize = Enum("DrinkSize", ["SHORT", "TALL", "GRANDE"])
Merch = Enum("Merch", ["SHIRT", "HAT", "SWEATPANTS"])
Season = Enum("Season", ["SPRING", "SUMMER", "FALL", "WINTER"])

# Un-flavoured drinks: every hot variant first, then the iced variants
# (there is no iced espresso).
PlainCoffeeProducts = Enum(
    "PlainCoffeeProducts",
    names=(
        "espresso_hot",
        "cappuccino_hot",
        "latte_hot",
        "coffee_hot",
        "cappuccino_iced",
        "latte_iced",
        "coffee_iced",
    ),
)

# Flavoured cappuccinos, each offered hot and iced.
SpecialtyCoffeeProducts = Enum(
    "SpecialtyCoffeeProducts",
    names=(
        "carmel_cappuccino_hot",
        "mocha_cappuccino_hot",
        "white_chocolate_cappuccino_hot",
        "carmel_cappuccino_iced",
        "mocha_cappuccino_iced",
        "white_chocolate_cappuccino_iced",
    ),
)

In [None]:
# Show the members of both coffee product enums.
[*SpecialtyCoffeeProducts], [*PlainCoffeeProducts]

In [None]:
# Show the members of the remaining enums.
[*DrinkSize], [*Merch], [*Season]

In [None]:
# Single seeded generator shared by every stochastic step, so a fresh
# top-to-bottom run reproduces the same data.
SEED = 42_000_000_000_000_000
rng = np.random.default_rng(SEED)

Tasks



In [None]:
# Setup: simulation dimensions and per-customer attributes.
# The three rng draws below stay in this order (genders, ages, preferred_stores)
# so the seeded stream produces the same values.
n_stores = 100
n_customers = 1000
n_days = 100
max_num_preferred_stores = 3  # each customer can have 1, 2, or 3 stores they frequent

customer_ids = [*range(n_customers)]
store_ids = [*range(n_stores)]

# 0 = male, 1 = female
genders = rng.choice([0, 1], size=n_customers)
# `high` is exclusive, so ages fall in 15..69.
ages = rng.integers(low=15, high=70, size=n_customers)
# rng.choice samples with replacement by default, so a row may repeat a store.
preferred_stores = rng.choice(
    store_ids,
    size=(n_customers, max_num_preferred_stores),
)

In [None]:
# Per-customer probabilities, one Beta draw per customer.
# Probability per day of buying coffee.
p_coffee_transaction = rng.beta(4, 20, size=n_customers)
# Probability per day of buying merch.
p_merch_transaction = rng.beta(4, 80, size=n_customers)
# Probability the customer prefers plain coffee (over fancier drinks).
p_plain_coffee_preferred = rng.beta(20, 20, size=n_customers)

Notes from Scott
- generate two years of data so that seasonality is present and the candidate can see seasonal trends.
- make customer purchases seasonal
- have some customers drop off, others join, so can see cohorts
- adjust customer join dates to make sure there are joiners and people who have dropped off
- cogs so we can understand the profitability of each store (margin off of the retail price)
- specialty drinks have a high margin
- let them do cohort analysis and/or customer LTV

In [None]:
# Assemble the customer table: one row per customer id, holding demographics,
# purchase probabilities, and a list-valued `stores` column.
cust_data = {
    "age": ages,
    "gender": genders,
    "p_coffee_transaction": p_coffee_transaction,
    "p_merch_transaction": p_merch_transaction,
    "p_plain_coffee_preferred": p_plain_coffee_preferred,
    # Complement of the plain-coffee preference.
    "p_specialty_coffee_preferred": 1 - p_plain_coffee_preferred,
}

# Collapse each row of the (n_customers, 3) preferred-store matrix into one list
# per customer. Built directly from the array instead of making store1..store3
# columns, applying a row-wise lambda, and dropping the columns again.
stores = pd.Series(
    [list(row) for row in preferred_stores],
    index=customer_ids,
    name="stores",
).to_frame()

df = pd.concat(
    [pd.DataFrame(data=cust_data, index=customer_ids), stores],
    axis=1,
)

In [None]:
# Display the assembled customer table.
df

In [None]:
# Display the list-valued preferred-store column.
df["stores"]

In [None]:
# Make a date series of exactly `n_days` consecutive calendar days.
start_date = pd.to_datetime('2023-01-01')
# pd.date_range includes BOTH endpoints, so the last simulated day is
# start + (n_days - 1). Using start + n_days would yield n_days + 1 dates.
end_date = start_date + pd.Timedelta(days=n_days - 1)
date_series = pd.date_range(start=start_date, end=end_date, freq='D')
assert len(date_series) == n_days, "date series must cover exactly n_days days"

# One-column frame indexed by the date itself.
dates = pd.DataFrame(date_series, index=date_series, columns=['Date'])

# Display the DataFrame (rich repr instead of print).
dates

In [None]:
# Pull the per-customer probabilities out as plain lists (position == customer id)
# for the simulation loops.
p_coffee, p_merch, p_plain_coffee = (
    df[column].to_list()
    for column in (
        "p_coffee_transaction",
        "p_merch_transaction",
        "p_plain_coffee_preferred",
    )
)

# Every simulated day, in order.
transaction_dates = dates["Date"].to_list()
transaction_dates

In [None]:

# Simulate purchases day by day.
# On each date every customer gets one uniform draw on [0, 1) for coffee; a draw
# at or below their p_coffee_transaction counts as a purchase. After all coffee
# draws for that date, the same is done for merch against p_merch_transaction.
# Only purchases are recorded, as (date, cust_id, 1).

result_coffee, result_merch = [], []
for date in transaction_dates:
    result_coffee.extend(
        (date, cust_id, 1)
        for cust_id, p_buy in enumerate(p_coffee)
        if rng.uniform() <= p_buy
    )
    result_merch.extend(
        (date, cust_id, 1)
        for cust_id, p_buy in enumerate(p_merch)
        if rng.uniform() <= p_buy
    )

result_coffee, result_merch




In [None]:
# If a purchase happened, also decide which item was bought.
# Coffee: a second uniform draw against the customer's plain-coffee preference
# picks the menu (plain vs specialty), then one product is drawn from that menu.
# Merch: one item is drawn from the Merch enum.
# Records are (date, cust_id, item).

result_coffee, result_merch = [], []
for date in transaction_dates:
    for cust_id, p_customer_buys_coffee in enumerate(p_coffee):
        if rng.uniform() <= p_customer_buys_coffee:
            p_plain = p_plain_coffee[cust_id]
            menu = PlainCoffeeProducts if rng.uniform() < p_plain else SpecialtyCoffeeProducts
            # list(...) hands numpy an explicit sequence of enum members.
            item = rng.choice(list(menu))
            result_coffee.append((date, cust_id, item))

    for cust_id, p_customer_buys_merch in enumerate(p_merch):
        if rng.uniform() <= p_customer_buys_merch:
            item = rng.choice(list(Merch))
            # Merch goes to result_merch; it used to be appended to result_coffee,
            # which left result_merch permanently empty.
            result_merch.append((date, cust_id, item))

result_coffee, result_merch

In [None]:
# Add the number of units per purchase: uniformly 1, 2 or 3 via rng.choice([1,2,3]).
# (An alternative considered was rng.poisson(lam=2) + 1, the +1 guaranteeing at
# least one unit.)
# Records are (date, cust_id, num_units, item).

result_coffee, result_merch = [], []
for date in transaction_dates:
    for cust_id, p_customer_buys_coffee in enumerate(p_coffee):
        if rng.uniform() <= p_customer_buys_coffee:
            num_units = rng.choice([1, 2, 3])
            p_plain = p_plain_coffee[cust_id]
            menu = PlainCoffeeProducts if rng.uniform() < p_plain else SpecialtyCoffeeProducts
            item = rng.choice(list(menu))
            result_coffee.append((date, cust_id, num_units, item))

    for cust_id, p_customer_buys_merch in enumerate(p_merch):
        if rng.uniform() <= p_customer_buys_merch:
            num_units = rng.choice([1, 2, 3])
            item = rng.choice(list(Merch))
            # Merch goes to result_merch; it used to be appended to result_coffee.
            result_merch.append((date, cust_id, num_units, item))

# All coffee rows first, then all merch rows.
columns = ["date", "cust_id", "num_units", "item"]
results = result_coffee + result_merch

transaction_df = pd.DataFrame(results, columns=columns)
transaction_df

In [None]:
# NOTE(review): this cell repeats the units-per-purchase simulation cell; running
# it draws a fresh sample from the shared rng and overwrites transaction_df.
# Units per purchase are uniformly 1, 2 or 3 via rng.choice([1,2,3]).
# Records are (date, cust_id, num_units, item).

result_coffee, result_merch = [], []
for date in transaction_dates:
    for cust_id, p_customer_buys_coffee in enumerate(p_coffee):
        if rng.uniform() <= p_customer_buys_coffee:
            num_units = rng.choice([1, 2, 3])
            p_plain = p_plain_coffee[cust_id]
            menu = PlainCoffeeProducts if rng.uniform() < p_plain else SpecialtyCoffeeProducts
            item = rng.choice(list(menu))
            result_coffee.append((date, cust_id, num_units, item))

    for cust_id, p_customer_buys_merch in enumerate(p_merch):
        if rng.uniform() <= p_customer_buys_merch:
            num_units = rng.choice([1, 2, 3])
            item = rng.choice(list(Merch))
            # Merch goes to result_merch; it used to be appended to result_coffee.
            result_merch.append((date, cust_id, num_units, item))

# All coffee rows first, then all merch rows.
columns = ["date", "cust_id", "num_units", "item"]
results = result_coffee + result_merch

transaction_df = pd.DataFrame(results, columns=columns)
transaction_df

In [None]:
# TODO: need to get the unit price for each transaction.
# Add the store to every transaction: one of the customer's preferred stores,
# drawn uniformly from their `stores` list (a repeated store in that list is
# proportionally more likely).
# Records are (date, cust_id, num_units, item, store) for coffee and merch alike.

# Position in this list == customer id, matching enumerate() below.
customer_stores = df["stores"].to_list()

result_coffee, result_merch = [], []
for date in transaction_dates:
    for cust_id, p_customer_buys_coffee in enumerate(p_coffee):
        if rng.uniform() <= p_customer_buys_coffee:
            store = rng.choice(customer_stores[cust_id])
            num_units = rng.choice([1, 2, 3])
            p_plain = p_plain_coffee[cust_id]
            menu = PlainCoffeeProducts if rng.uniform() < p_plain else SpecialtyCoffeeProducts
            item = rng.choice(list(menu))
            result_coffee.append((date, cust_id, num_units, item, store))

    for cust_id, p_customer_buys_merch in enumerate(p_merch):
        if rng.uniform() <= p_customer_buys_merch:
            # `store` used to be read here without ever being assigned (NameError).
            store = rng.choice(customer_stores[cust_id])
            num_units = rng.choice([1, 2, 3])
            item = rng.choice(list(Merch))
            # Merch goes to result_merch; it used to be appended to result_coffee.
            result_merch.append((date, cust_id, num_units, item, store))

result_coffee, result_merch

In [None]:
# generate transaction data using lifetimes

In [None]:
# assign 1-5 store numbers to each customer with probability distribution
# assign gender, age, or 999 to each customer
# assign regular or specialty coffee probability to each customer with probability distribution

# assign probability of food purchase to each customer with each transaction they make
# specialty purchasers buy more food than regular coffee people
# assign probability of merch to each customer with each transaction they make

#############################################
# FACTS
# specialty purchasers buy more merch than regular coffee people
# merch must be bought same day as food or drink

# start with transaction date
# customers either purchase or not on that date
# customers who purchased in the past are either alive or dead on that date
# pick some percentage of customer numbers from alive customers
# pick store number (each customer number has 2,3, or 4 different store ids )
# pick three stores from store list w/o replacement and save in defaultdict
# store number is a defaultdict(list) where list is their potential store ids
# pick season (Spring Summer Fall Winter)
# pick DOW (Weekday Weekend)
# pick date
# plain or specialty coffee product (each customer prefers plain or specialty but can choose both)
# product id
# number of units 1,2,3 each customer usually buys 1 but can buy more [0.55, 0.35, 0.10]
# drink size SML


In [None]:
# Design sketch for the price lookup: one price per (drink, size, season).
# The string below is a bare expression statement used as a note; it has no effect.
'''
for every drink
for every size
for every season
there is a price

price[drink][size][season]
'''

# Disabled: loading real prices from a spreadsheet.
# NOTE(review): enabling this as written would rebind `df`, which currently
# holds the customer table.
# df = pd.read_excel("prices.xlsx", header=1)
# print(df)
# print(df.info())

# Disabled prototype: a nested dict keyed by enum members, filled with
# placeholder values.
# use this as a separate test for unpacking dicts with enums
# price = defaultdict(dict)
# i = 0
# for product in list(PlainCoffeeProducts):
#     price[product] = defaultdict(dict)
#     for size in list(DrinkSize):
#         price[product][size] = defaultdict(dict)
#         for season in list(Season):
#             price[product][size][season] = i^2  # NOTE: `^` is XOR in Python; use i**2 for a square
#             i = i + 1




# Draw one random drink size; the result is this cell's output.
rng.choice(list(DrinkSize))