In [337]:
# import libraries: 

import pandas as pd
import random
import numpy as np

In [338]:
# read in menu items csvs: 

menu = pd.read_csv("~/repos/jugos_rodier/csvs/menu_items.csv")

menu = menu[['item_id', 'price', 'item_category']]

menu.head()

Unnamed: 0,item_id,price,item_category
0,01avch02,5.0,Sandwiches
1,01avch03,8.62,Sandwiches
2,01hmgd02,4.5,Sandwiches
3,01hmgd03,7.0,Sandwiches
4,01hmsw02,4.6,Sandwiches


In [339]:
# combine item_id and price into a single string: 

    # first convert price into a string:

menu['price'] = menu['price'].astype('str')

    # concatenate them together: 

menu['item_price'] = menu['item_id'] + '_' + menu['price']

    # get rid of 'item_id' and 'price' columns:

menu = menu[['item_price', 'item_category']]

menu.head()

Unnamed: 0,item_price,item_category
0,01avch02_5.0,Sandwiches
1,01avch03_8.62,Sandwiches
2,01hmgd02_4.5,Sandwiches
3,01hmgd03_7.0,Sandwiches
4,01hmsw02_4.6,Sandwiches


In [340]:
# Replace 'Smoothie' and 'Fresh Juice' columns with 'smoothie_juice' for the sake of random item selection:

menu.item_category.replace(['Smoothie', 'Fresh Juice'],
                           'smoothie_juice', 
                           inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  menu.item_category.replace(['Smoothie', 'Fresh Juice'],


In [341]:
# Create empty DataFrame for future storage: 

master_list = []


for i in list(range(7000)):

    # Generate unique transaction_id and create a list for items in a transaction to be stored: 
    trans_id = 'trans' + str(i) + '_'
    transaction = []

    # Iterate over item categories and use probability to determine skip or select 1 or 2 items:
    for category in menu.item_category.unique():

        # Create category based subset of menu items
        to_choose = list(menu.loc[menu['item_category']== category, 'item_price'])
    
 
        # Select random number for probability sake:
        a = random.choice(range(1,100))
        if a >= 40: # 60% chance that a customer orders 1 item
            item1 = random.choice(to_choose) 
            transaction.extend([trans_id + item1])  # add each item to transaction list
            
        elif a > 15:    # 25% chance that customer orders 2 items from the category
            item1 = random.choice(to_choose) + '_'
            transaction.extend([trans_id + item1])  # add item to transaction list
            item2 = random.choice(to_choose) + '_'
            transaction.extend([trans_id + item2])  # add item to transaction list

    master_list.extend(transaction) # add individual transaction to master list that holds all transactions
        
    
#   convert master list to dataframe for further manipulation: 
trans_long = pd.DataFrame(master_list, columns=['trans_item_price'])

trans_long.head()

Unnamed: 0,trans_item_price
0,trans0_01hmsw03_7.23
1,trans0_02crep02_5.0_
2,trans0_02brln02_3.77_
3,trans0_04wtpe02_3.25
4,trans0_05fry03_3.0_


In [342]:
# Split Dataframe into 3 different columns: trans_id, item_id, price 

transaction_item = trans_long.trans_item_price.str.split(pat = '_', 
                                      expand = True)

# name columns
transaction_item.columns = ['trans_id', 'item_id', 'item_price', 'to_drop']

# drop extra column created because some transactions had a trailing '_' character.

transaction_item.drop('to_drop', axis = 1, inplace = True)

# make sure everything looks good.
transaction_item.sample(10)

Unnamed: 0,trans_id,item_id,item_price
14529,trans3340,02brln02,3.77
3021,trans686,02brln02,3.77
17891,trans4103,01chich02,10.0
19320,trans4436,03guan03,6.5
10634,trans2438,02crep02,5.0
21694,trans4989,01chich02,10.0
25585,trans5877,01cmch03,12.0
9397,trans2153,01avch03,8.62
26582,trans6099,01avch03,8.62
29820,trans6842,04gpip02,3.25


In [343]:
# Create quantity column: 

    # Find and label all duplicates:
duplicates = transaction_item.duplicated(keep = False)

transaction_item['duplicate'] = duplicates

    # create quantity column with default value = 1:
transaction_item['quantity'] = 1

    # If row is duplicate, then quantity should = 2:
for i in transaction_item.index:
    if transaction_item.loc[i, 'duplicate'] == True: 
        transaction_item.loc[i, 'quantity'] = 2


    # Remove first duplicate:
        
transaction_item.drop_duplicates(keep='last', inplace = True)

    # drop duplicate table and make sure everything looks good: 

transaction_item.drop('duplicate', axis = 1, inplace = True)

transaction_item.head(25)

Unnamed: 0,trans_id,item_id,item_price,quantity
0,trans0,01hmsw03,7.23,1
1,trans0,02crep02,5.0,1
2,trans0,02brln02,3.77,1
3,trans0,04wtpe02,3.25,1
4,trans0,05fry03,3.0,1
5,trans0,05fry02,2.5,1
6,trans1,01hmsw02,4.6,1
8,trans1,02brln02,3.77,2
9,trans1,03lucu02,5.5,1
10,trans2,01cmch02,6.0,1


In [346]:
# Write to csv: 

transaction_item.to_csv("~/repos/jugos_rodier/csvs/transaction_items.csv",index_label = 'transaction_item_id')
