In [2]:
import pandas as pd
import numpy as np
from enum import Enum
from collections import defaultdict
import lifetimes

In [4]:
DrinkSize = Enum("DrinkSize", "SHORT TALL GRANDE")
Merch = Enum("Merch", "SHIRT HAT SWEATPANTS")
Season = Enum("Season", "SPRING SUMMER FALL WINTER")
PlainCoffeeProducts = Enum(
    "PlainCoffeeProducts",
    [
        "espresso_hot",
        "cappuccino_hot",
        "latte_hot",
        "coffee_hot",
        "cappuccino_iced",
        "latte_iced",
        "coffee_iced",
    ],
)
SpecialtyCoffeeProducts = Enum(
    "SpecialtyCoffeeProducts",
    [
        "carmel_cappuccino_hot",
        "mocha_cappuccino_hot",
        "white_chocolate_cappuccino_hot",
        "carmel_cappuccino_iced",
        "mocha_cappuccino_iced",
        "white_chocolate_cappuccino_iced",
    ],
)

In [5]:
list(SpecialtyCoffeeProducts), list(PlainCoffeeProducts)

([<SpecialtyCoffeeProducts.carmel_cappuccino_hot: 1>,
  <SpecialtyCoffeeProducts.mocha_cappuccino_hot: 2>,
  <SpecialtyCoffeeProducts.white_chocolate_cappuccino_hot: 3>,
  <SpecialtyCoffeeProducts.carmel_cappuccino_iced: 4>,
  <SpecialtyCoffeeProducts.mocha_cappuccino_iced: 5>,
  <SpecialtyCoffeeProducts.white_chocolate_cappuccino_iced: 6>],
 [<PlainCoffeeProducts.espresso_hot: 1>,
  <PlainCoffeeProducts.cappuccino_hot: 2>,
  <PlainCoffeeProducts.latte_hot: 3>,
  <PlainCoffeeProducts.coffee_hot: 4>,
  <PlainCoffeeProducts.cappuccino_iced: 5>,
  <PlainCoffeeProducts.latte_iced: 6>,
  <PlainCoffeeProducts.coffee_iced: 7>])

In [7]:
list(DrinkSize),list(Merch), list(Season)

([<DrinkSize.SHORT: 1>, <DrinkSize.TALL: 2>, <DrinkSize.GRANDE: 3>],
 [<Merch.SHIRT: 1>, <Merch.HAT: 2>, <Merch.SWEATPANTS: 3>],
 [<Season.SPRING: 1>,
  <Season.SUMMER: 2>,
  <Season.FALL: 3>,
  <Season.WINTER: 4>])

In [8]:
rng = np.random.default_rng(42_000_000_000_000_000)

Tasks



In [21]:
#setup
n_stores = 100
n_customers = 1000
n_days = 100
customer_ids = list(range(n_customers))
genders  = rng.choice([0,1], size=n_customers) #0 male 1 female
ages = rng.integers(low=15, high=70, size=n_customers)
store_ids = list(range(n_stores))
max_num_preferred_stores = 3 #each customer can have 1,2, or 3 stores they frequent
preferred_stores = rng.choice(store_ids, size=(n_customers,max_num_preferred_stores)) 

In [22]:
p_coffee_transaction = rng.beta(a=4,b=20,size=n_customers) #probability per day of buying coffee
p_merch_transaction = rng.beta(a=4,b=80,size=n_customers) ##probability per day of buying merch
p_plain_coffee_preferred = rng.beta(a=20,b=20,size=n_customers) #probability the customer prefers plain coffee (over fancier drinks)

Notes from Scott
- do two years of data to put in seasonality and let it be viewed by the candidate to have some sort of seasonal trends.
- make customer purchases seasonal
- have some customers drop off, others join, so can see cohorts
- adjust customer join dates to make sure there there are joiners and people who have dropped off
- cogs so we can understand the profitability of each store (margin off of the retail price)
- specialty drink have high margin
- let them do cohort analysis and/or customer LTV

In [23]:
cust_data = {"age":ages, "gender":genders, 
"p_coffee_transaction":p_coffee_transaction,
"p_merch_transaction":p_merch_transaction,
"p_plain_coffee_preferred":p_plain_coffee_preferred,
"p_specialty_coffee_preferred": 1-p_plain_coffee_preferred
}
df = pd.concat(
    [pd.DataFrame(data=cust_data, index=customer_ids),
    pd.DataFrame(preferred_stores, index=customer_ids, columns=["store1", "store2", "store3"])
    ],
    axis=1
)
stores = (pd.DataFrame(preferred_stores, index=customer_ids, columns=["store1", "store2", "store3"])
        .apply(lambda row: [row["store1"], row["store2"], row["store3"]], axis=1)
)
stores = pd.DataFrame(stores, index=customer_ids, columns=["stores"])
df = pd.concat([df,stores], axis=1).drop(["store1","store2", "store3"], axis=1)

In [24]:
df

Unnamed: 0,age,gender,p_coffee_transaction,p_merch_transaction,p_plain_coffee_preferred,p_specialty_coffee_preferred,stores
0,27,0,0.077225,0.056611,0.523774,0.476226,"[27, 68, 72]"
1,35,0,0.164177,0.073854,0.508880,0.491120,"[19, 89, 58]"
2,47,1,0.239537,0.038574,0.500985,0.499015,"[5, 65, 6]"
3,60,1,0.324118,0.046529,0.475263,0.524737,"[1, 50, 38]"
4,54,1,0.081124,0.065903,0.601228,0.398772,"[62, 92, 53]"
...,...,...,...,...,...,...,...
995,39,1,0.134717,0.010577,0.395282,0.604718,"[66, 25, 16]"
996,32,0,0.225592,0.056651,0.528280,0.471720,"[26, 76, 15]"
997,53,0,0.203634,0.037987,0.403486,0.596514,"[68, 99, 52]"
998,47,0,0.115904,0.059087,0.649277,0.350723,"[50, 7, 20]"


In [32]:
df["stores"]

0      [27, 68, 72]
1      [19, 89, 58]
2        [5, 65, 6]
3       [1, 50, 38]
4      [62, 92, 53]
           ...     
995    [66, 25, 16]
996    [26, 76, 15]
997    [68, 99, 52]
998     [50, 7, 20]
999    [95, 87, 20]
Name: stores, Length: 1000, dtype: object

In [25]:
# Make a date series
# Define the start and end dates
start_date = pd.to_datetime('2023-01-01')
end_date = start_date + pd.Timedelta(days=n_days)
date_series = pd.date_range(start=start_date, end=end_date, freq='D')
dates = pd.DataFrame(date_series, index=date_series, columns=['Date'])

# Display the DataFrame
print(dates)

                 Date
2023-01-01 2023-01-01
2023-01-02 2023-01-02
2023-01-03 2023-01-03
2023-01-04 2023-01-04
2023-01-05 2023-01-05
...               ...
2023-04-07 2023-04-07
2023-04-08 2023-04-08
2023-04-09 2023-04-09
2023-04-10 2023-04-10
2023-04-11 2023-04-11

[101 rows x 1 columns]


In [26]:
p_coffee = df["p_coffee_transaction"].to_list()
p_merch = df["p_merch_transaction"].to_list()
p_plain_coffee = df["p_plain_coffee_preferred"].to_list()
transaction_dates = dates["Date"].to_list();transaction_dates

[Timestamp('2023-01-01 00:00:00'),
 Timestamp('2023-01-02 00:00:00'),
 Timestamp('2023-01-03 00:00:00'),
 Timestamp('2023-01-04 00:00:00'),
 Timestamp('2023-01-05 00:00:00'),
 Timestamp('2023-01-06 00:00:00'),
 Timestamp('2023-01-07 00:00:00'),
 Timestamp('2023-01-08 00:00:00'),
 Timestamp('2023-01-09 00:00:00'),
 Timestamp('2023-01-10 00:00:00'),
 Timestamp('2023-01-11 00:00:00'),
 Timestamp('2023-01-12 00:00:00'),
 Timestamp('2023-01-13 00:00:00'),
 Timestamp('2023-01-14 00:00:00'),
 Timestamp('2023-01-15 00:00:00'),
 Timestamp('2023-01-16 00:00:00'),
 Timestamp('2023-01-17 00:00:00'),
 Timestamp('2023-01-18 00:00:00'),
 Timestamp('2023-01-19 00:00:00'),
 Timestamp('2023-01-20 00:00:00'),
 Timestamp('2023-01-21 00:00:00'),
 Timestamp('2023-01-22 00:00:00'),
 Timestamp('2023-01-23 00:00:00'),
 Timestamp('2023-01-24 00:00:00'),
 Timestamp('2023-01-25 00:00:00'),
 Timestamp('2023-01-26 00:00:00'),
 Timestamp('2023-01-27 00:00:00'),
 Timestamp('2023-01-28 00:00:00'),
 Timestamp('2023-01-

In [27]:

# for each date, 
# look at each customer 
# randomly decide if they purchased that day or not
# pick a random number on 0,1, 
# if number is less than p_coffee_transaction then purchased=1 else 0
# pick another random number on 0,1, 
# if number is less than p_merch_transaction then purchased=1 else 0
#compute coffee and merch purchases

result_coffee, result_merch = [], []
for date in transaction_dates:
    for cust_id, p_customer_buys_coffee in enumerate(p_coffee):
        purchased = int(rng.uniform() <= p_customer_buys_coffee)
        if purchased:
            result_coffee.append((date,cust_id,purchased))

    for cust_id, p_customer_buys_merch in enumerate(p_merch):
        purchased = int(rng.uniform() <= p_customer_buys_merch)
        if purchased:
            result_merch.append((date,cust_id,purchased))
    
result_coffee, result_merch




([(Timestamp('2023-01-01 00:00:00'), 1, 1),
  (Timestamp('2023-01-01 00:00:00'), 6, 1),
  (Timestamp('2023-01-01 00:00:00'), 10, 1),
  (Timestamp('2023-01-01 00:00:00'), 12, 1),
  (Timestamp('2023-01-01 00:00:00'), 17, 1),
  (Timestamp('2023-01-01 00:00:00'), 48, 1),
  (Timestamp('2023-01-01 00:00:00'), 49, 1),
  (Timestamp('2023-01-01 00:00:00'), 64, 1),
  (Timestamp('2023-01-01 00:00:00'), 68, 1),
  (Timestamp('2023-01-01 00:00:00'), 87, 1),
  (Timestamp('2023-01-01 00:00:00'), 97, 1),
  (Timestamp('2023-01-01 00:00:00'), 100, 1),
  (Timestamp('2023-01-01 00:00:00'), 105, 1),
  (Timestamp('2023-01-01 00:00:00'), 117, 1),
  (Timestamp('2023-01-01 00:00:00'), 118, 1),
  (Timestamp('2023-01-01 00:00:00'), 135, 1),
  (Timestamp('2023-01-01 00:00:00'), 139, 1),
  (Timestamp('2023-01-01 00:00:00'), 147, 1),
  (Timestamp('2023-01-01 00:00:00'), 149, 1),
  (Timestamp('2023-01-01 00:00:00'), 169, 1),
  (Timestamp('2023-01-01 00:00:00'), 172, 1),
  (Timestamp('2023-01-01 00:00:00'), 194, 1),
 

In [28]:
# if purchased, need to figure out what kind of coffee and/or merch purchased


result_coffee, result_merch = [], []
for date in transaction_dates:
    for cust_id, p_customer_buys_coffee in enumerate(p_coffee):
        purchased = int(rng.uniform() <= p_customer_buys_coffee)
        if purchased:
            p_plain = p_plain_coffee[cust_id]
            item = rng.choice(PlainCoffeeProducts) if rng.uniform() <  p_plain else rng.choice(SpecialtyCoffeeProducts)
            result_coffee.append((date,cust_id,item))

    for cust_id, p_customer_buys_merch in enumerate(p_merch):
        purchased = int(rng.uniform() <= p_customer_buys_merch)
        if purchased:
            item = rng.choice(Merch)
            result_coffee.append((date,cust_id,item))
    
result_coffee, result_merch

([(Timestamp('2023-01-01 00:00:00'),
   2,
   <SpecialtyCoffeeProducts.white_chocolate_cappuccino_hot: 3>),
  (Timestamp('2023-01-01 00:00:00'),
   13,
   <SpecialtyCoffeeProducts.carmel_cappuccino_hot: 1>),
  (Timestamp('2023-01-01 00:00:00'),
   20,
   <PlainCoffeeProducts.cappuccino_iced: 5>),
  (Timestamp('2023-01-01 00:00:00'),
   21,
   <SpecialtyCoffeeProducts.carmel_cappuccino_hot: 1>),
  (Timestamp('2023-01-01 00:00:00'), 24, <PlainCoffeeProducts.latte_hot: 3>),
  (Timestamp('2023-01-01 00:00:00'), 38, <PlainCoffeeProducts.coffee_hot: 4>),
  (Timestamp('2023-01-01 00:00:00'),
   41,
   <SpecialtyCoffeeProducts.white_chocolate_cappuccino_hot: 3>),
  (Timestamp('2023-01-01 00:00:00'),
   48,
   <SpecialtyCoffeeProducts.carmel_cappuccino_hot: 1>),
  (Timestamp('2023-01-01 00:00:00'),
   57,
   <PlainCoffeeProducts.espresso_hot: 1>),
  (Timestamp('2023-01-01 00:00:00'),
   60,
   <SpecialtyCoffeeProducts.carmel_cappuccino_iced: 4>),
  (Timestamp('2023-01-01 00:00:00'),
   62,
   <

In [29]:
# need to figure out how many units are purchased rng.choice([1,2,3])


result_coffee, result_merch = [], []
for date in transaction_dates:
    for cust_id, p_customer_buys_coffee in enumerate(p_coffee):
        purchased = int(rng.uniform() <= p_customer_buys_coffee)
        if purchased:
            num_units = rng.choice([1,2,3]) #rng.poisson(lam=2) + 1 # need to add one to ensure at least 1 purchase
            p_plain = p_plain_coffee[cust_id]
            item = rng.choice(PlainCoffeeProducts) if rng.uniform() <  p_plain else rng.choice(SpecialtyCoffeeProducts)
            result_coffee.append((date,cust_id,num_units,item))

    for cust_id, p_customer_buys_merch in enumerate(p_merch):
        purchased = int(rng.uniform() <= p_customer_buys_merch)
        if purchased:
            num_units = rng.choice([1,2,3])
            item = rng.choice(Merch)
            result_coffee.append((date,cust_id,num_units,item))
    
columns = ["date", "cust_id", "num_units", "item"]
results = result_coffee + result_merch

transaction_df = pd.DataFrame(results, columns=columns)
transaction_df

Unnamed: 0,date,cust_id,num_units,item
0,2023-01-01,6,3,SpecialtyCoffeeProducts.carmel_cappuccino_hot
1,2023-01-01,11,3,PlainCoffeeProducts.cappuccino_iced
2,2023-01-01,21,2,SpecialtyCoffeeProducts.white_chocolate_cappuc...
3,2023-01-01,23,2,SpecialtyCoffeeProducts.mocha_cappuccino_iced
4,2023-01-01,25,1,SpecialtyCoffeeProducts.mocha_cappuccino_iced
...,...,...,...,...
21243,2023-04-11,921,1,Merch.SWEATPANTS
21244,2023-04-11,940,2,Merch.HAT
21245,2023-04-11,957,2,Merch.HAT
21246,2023-04-11,986,3,Merch.SHIRT


In [None]:
# need to figure out how many units are purchased rng.choice([1,2,3])


result_coffee, result_merch = [], []
for date in transaction_dates:
    for cust_id, p_customer_buys_coffee in enumerate(p_coffee):
        purchased = int(rng.uniform() <= p_customer_buys_coffee)
        if purchased:
            num_units = rng.choice([1,2,3]) #rng.poisson(lam=2) + 1 # need to add one to ensure at least 1 purchase
            p_plain = p_plain_coffee[cust_id]
            item = rng.choice(PlainCoffeeProducts) if rng.uniform() <  p_plain else rng.choice(SpecialtyCoffeeProducts)
            result_coffee.append((date,cust_id,num_units,item))

    for cust_id, p_customer_buys_merch in enumerate(p_merch):
        purchased = int(rng.uniform() <= p_customer_buys_merch)
        if purchased:
            num_units = rng.choice([1,2,3])
            item = rng.choice(Merch)
            result_coffee.append((date,cust_id,num_units,item))
    
columns = ["date", "cust_id", "num_units", "item"]
results = result_coffee + result_merch

transaction_df = pd.DataFrame(results, columns=columns)
transaction_df

In [19]:
# need to get the unit price for each transaction
# need to add the store


result_coffee, result_merch = [], []
for date in transaction_dates:
    for cust_id, p_customer_buys_coffee in enumerate(p_coffee):
        purchased = int(rng.uniform() <= p_customer_buys_coffee)
        if purchased:
            # store = df["stores"][cust_id]
            num_units = rng.choice([1,2,3]) #rng.poisson(lam=2) + 1 # need to add one to ensure at least 1 purchase
            p_plain = p_plain_coffee[cust_id]
            item = rng.choice(PlainCoffeeProducts) if rng.uniform() <  p_plain else rng.choice(SpecialtyCoffeeProducts)
            result_coffee.append((date,cust_id,num_units,item))

    for cust_id, p_customer_buys_merch in enumerate(p_merch):
        purchased = int(rng.uniform() <= p_customer_buys_merch)
        if purchased:
            num_units = rng.choice([1,2,3])
            item = rng.choice(Merch)
            result_coffee.append((date,cust_id,num_units,item, store))
    
result_coffee, result_merch

([(Timestamp('2023-01-01 00:00:00'),
   1,
   3,
   <SpecialtyCoffeeProducts.carmel_cappuccino_iced: 4>),
  (Timestamp('2023-01-03 00:00:00'),
   0,
   3,
   <SpecialtyCoffeeProducts.carmel_cappuccino_iced: 4>),
  (Timestamp('2023-01-03 00:00:00'),
   1,
   3,
   <SpecialtyCoffeeProducts.carmel_cappuccino_iced: 4>),
  (Timestamp('2023-01-04 00:00:00'), 2, 3, <Merch.SHIRT: 1>),
  (Timestamp('2023-01-05 00:00:00'),
   2,
   3,
   <PlainCoffeeProducts.latte_hot: 3>)],
 [])

In [None]:
# generate transaction data using lifetimes

In [None]:
# assign 1-5 store numbers to each customer with probability distribution
# assign gender, age, or 999 to each customer
# assign regular or specialty coffee probability to each customer with probability distribution

# assign probability of food purchase to each customer with each transaction they make
# specialty purchasers buy more food than regular coffee people
# assign probability of merch to each customer with each transaction they make

#############################################333
# FACTS
# specialty purchasers buy more merch than regular coffee people
# merch must be bought same day as food or drink

# start with transaction date
# customers either purchase or not on that date
# customers who purchased in the past are either alive or dead on that date
# pick some percentage of customer numbers from alive customers
# pick store number (each customer number has 2,3, or 4 different store ids )
# pick three stores from store list w/o replacement and save in defaultdict
# store number is a defaultdict(list) where list is their potential store ids
# pick season (Spring Summer Fall Winter)
# pick DOW (Weekday Weekend)
# pick date
# plain or specialty coffee product (each customer prefers plain or specialty but can choose both)
# product id
# number of units 1,2,3 each customer usually buys 1 but can buy more [0.55, 0.35, 0.10]
# drink size SML


In [None]:
'''
for every drink
for every size
for every season
there is a price

price[drink][size][season]
'''

# df = pd.read_excel("prices.xlsx", header=1)
# print(df)
# print(df.info())

# use this as a separate test for unpacking dicts with enums
# price = defaultdict(dict)
# i = 0
# for product in list(PlainCoffeeProducts):
#     price[product] = defaultdict(dict)
#     for size in list(DrinkSize):
#         price[product][size] = defaultdict(dict)
#         for season in list(Season):
#             price[product][size][season] = i^2
#             i = i + 1




rng.choice(list(DrinkSize))