In [1]:
import pandas as pd
import numpy as np
from enum import Enum
from collections import defaultdict

In [2]:
DrinkSize = Enum("DrinkSize", "SHORT TALL GRANDE")
MerchSize = Enum("MerchSize", "SMALL MEDIUM LARGE")
Season = Enum("Season", "SPRING SUMMER FALL WINTER")
PlainCoffeeProducts = Enum(
    "PlainCoffeeProducts",
    [
        "espresso_hot",
        "cappuccino_hot",
        "latte_hot",
        "coffee_hot",
        "cappuccino_iced",
        "latte_iced",
        "coffee_iced",
    ],
)
SpecialtyCoffeeProducts = Enum(
    "SpecialtyCoffeeProducts",
    [
        "carmel_cappuccino_hot",
        "mocha_cappuccino_hot",
        "white_chocolate_cappuccino_hot",
        "carmel_cappuccino_iced",
        "mocha_cappuccino_iced",
        "white_chocolate_cappuccino_iced",
    ],
)

In [5]:
rng = np.random.default_rng(42_000_000_000_000_000)

In [24]:
n_stores = 6
n_customers = 1000
n_days = 365
customer_ids = list(range(1,n_customers + 1))
genders  = rng.choice([0,1], size=n_customers) #0 male 1 female
ages = rng.integers(low=15, high=70, size=n_customers)
store_ids = list(range(1,n_stores + 1))
p_coffee_transaction = rng.beta(a=4,b=20,size=n_customers)
p_merch_transaction = rng.beta(a=4,b=80,size=n_customers) #ONLY IF COFFEE TRANSACTION

p_plain_coffee_preferred = rng.beta(a=20,b=20,size=n_customers)
preferred_stores = rng.choice(store_ids, size=(n_customers,3))

In [51]:
cust_data = {"age":ages, "gender":genders, 
"p_coffee_transaction":p_coffee_transaction,
"p_merch_transaction":p_merch_transaction,
"p_plain_coffee_preferred":p_plain_coffee_preferred
}
df = pd.concat(
    [pd.DataFrame(data=cust_data, index=customer_ids),
    pd.DataFrame(preferred_stores, index=customer_ids, columns=["store1", "store2", "store3"])
    ],
    axis=1
)
df

Unnamed: 0,age,gender,p_coffee_transaction,p_merch_transaction,p_plain_coffee_preferred,store1,store2,store3
1,43,0,0.132863,0.048753,0.443374,3,6,6
2,42,1,0.208092,0.048547,0.488979,5,3,4
3,49,1,0.357250,0.009368,0.549047,3,1,2
4,39,0,0.189318,0.118291,0.727442,3,4,3
5,62,1,0.211143,0.022669,0.450663,6,1,3
...,...,...,...,...,...,...,...,...
996,37,1,0.062302,0.042431,0.378727,2,6,3
997,47,0,0.106679,0.024215,0.479108,4,1,1
998,15,1,0.150268,0.054428,0.463096,2,5,2
999,36,1,0.250537,0.083372,0.534269,6,1,5


In [56]:
test = (pd.DataFrame(preferred_stores, index=customer_ids, columns=["store1", "store2", "store3"])
        .apply(lambda row: [row["store1"], row["store2"], row["store3"]], axis=1)
)
test.columns = ["stores"]
test.info()

<class 'pandas.core.series.Series'>
Index: 1000 entries, 1 to 1000
Series name: None
Non-Null Count  Dtype 
--------------  ----- 
1000 non-null   object
dtypes: object(1)
memory usage: 15.6+ KB


In [59]:
test = pd.DataFrame(preferred_stores, index=customer_ids, columns=["store1", "store2", "store3"])
test["sum"] = (test.apply(lambda row: [row["store1"], row["store2"], row["store3"]], axis=1)
).drop(["store1", "store2", "store3"], axis=1)
test.info()

ValueError: No axis named 1 for object type Series

In [34]:

test.iloc[:,"merged"] = np.asarray(test[0],test[1], test[2])
test

TypeError: order must be str, not Series

Unnamed: 0,age,gender,p_coffee_transaction,p_merch_transaction,p_plain_coffee_preferred
1,43,0,0.132863,0.048753,0.443374
2,42,1,0.208092,0.048547,0.488979
3,49,1,0.357250,0.009368,0.549047
4,39,0,0.189318,0.118291,0.727442
5,62,1,0.211143,0.022669,0.450663
...,...,...,...,...,...
996,37,1,0.062302,0.042431,0.378727
997,47,0,0.106679,0.024215,0.479108
998,15,1,0.150268,0.054428,0.463096
999,36,1,0.250537,0.083372,0.534269


In [None]:
# assign 1-5 store numbers to each customer with probability distribution
# assign gender, age, or 999 to each customer
# assign regular or specialty coffee probability to each customer with probability distribution

# assign probability of food purchase to each customer with each transaction they make
# specialty purchasers buy more food than regular coffee people
# assign probability of merch to each customer with each transaction they make

#############################################333
# FACTS
# specialty purchasers buy more merch than regular coffee people
# merch must be bought same day as food or drink

# start with transaction date
# customers either purchase or not on that date
# customers who purchased in the past are either alive or dead on that date
# pick some percentage of customer numbers from alive customers
# pick store number (each customer number has 2,3, or 4 different store ids )
# pick three stores from store list w/o replacement and save in defaultdict
# store number is a defaultdict(list) where list is their potential store ids
# pick season (Spring Summer Fall Winter)
# pick DOW (Weekday Weekend)
# pick date
# plain or specialty coffee product (each customer prefers plain or specialty but can choose both)
# product id
# number of units 1,2,3 each customer usually buys 1 but can buy more [0.55, 0.35, 0.10]
# drink size SML


In [1]:
'''
for every drink
for every size
for every season
there is a price

price[drink][size][season]
'''

# df = pd.read_excel("prices.xlsx", header=1)
# print(df)
# print(df.info())

# use this as a separate test for unpacking dicts with enums
# price = defaultdict(dict)
# i = 0
# for product in list(PlainCoffeeProducts):
#     price[product] = defaultdict(dict)
#     for size in list(DrinkSize):
#         price[product][size] = defaultdict(dict)
#         for season in list(Season):
#             price[product][size][season] = i^2
#             i = i + 1




rng.choice(list(DrinkSize))

hello
