In [1]:
# imports
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Pull in the data
file = "../data/data.csv"
df_transactions = pd.read_csv(file)
df_transactions.head()

Unnamed: 0,Customer ID,Age,Gender,Transact ID,Transact Date,Transact Time,Outlet,Outlet District,Transact Details ID,Item,Item Description,Quantity,Price,Spending
0,57618,58.0,F,183003,1/1/16,12:01:48 AM,Outlet 17,5,537557,215A,Pork Chop Noodle,1,13.94,13.94
1,57618,58.0,F,183003,1/1/16,12:01:48 AM,Outlet 17,5,537555,34,Fish & Chips,1,13.94,13.94
2,54234,40.0,,183005,1/1/16,12:03:32 AM,Outlet 11,27,537565,552,Mango Pomelo,1,8.36,8.36
3,54234,40.0,,183005,1/1/16,12:03:32 AM,Outlet 11,27,537564,215B,Pork Chop Bee Hoon,1,13.94,13.94
4,73173,30.0,,183010,1/1/16,12:14:51 AM,Outlet 14,16,537571,127,Borsch Soup,2,5.94,11.88


In [3]:
mean = 38
sd = 11

In [4]:
df_customer = pd.DataFrame()
df_customer["customerID"] = df_transactions["Customer ID"]
df_customer["age"] = df_transactions["Age"]
df_customer["gender"] = df_transactions["Gender"]

In [5]:
print("Customer table size ", len(df_customer.index))
df_customer = df_customer.drop_duplicates(keep='first')
print("Customer table size after removing duplicates ", len(df_customer.index))

Customer table size  556580
Customer table size after removing duplicates  44414


In [6]:
df_customer['age'].min()
df_customer['age'].max()

116.0

In [7]:
df_customer['age'] = pd.to_numeric(df_customer['age'])
df_customer['age'] = df_customer['age'].fillna(value=0)
df_customer['gender'] = df_customer['gender'].fillna(value="NA")

In [8]:
mean = df_customer["age"].ix[(df_customer["age"] >= 6) & (df_customer["age"] <= 88)].mean()
sd = df_customer["age"].ix[(df_customer["age"] >= 6) & (df_customer["age"] <= 88)].std()
print("Mean: ", mean, " SD: ", sd)

Mean:  37.3733306078  SD:  11.9079709467


In [9]:

def assignAge(d):
    while(d < 6 or d > 88):
        d = np.random.normal(mean, sd, 1)[0]
    return int(d)
        
df_customer["age"] = df_customer["age"].apply(assignAge)

In [10]:
df_customer.ix[df_customer["customerID"] == 66515]

Unnamed: 0,customerID,age,gender
23291,66515,28,


In [11]:
np.random.normal(mean, sd, 1)[0]

39.6953585646771

In [12]:
def age_group(d):
    if d <= 18:
        return "YOUTH"
    elif d <= 35 and d > 18:
        return "ADULT"
    elif d <= 50 and d > 35:
        return "MIDDLE-AGED"
    else:
        return "SENIOR"

df_customer['age_group'] = df_customer['age'].apply(age_group)

In [13]:
df_transactions.groupby('Transact ID')['Spending'].sum()

Transact ID
183003     27.88
183005     22.30
183010     60.73
183017     71.03
183020     12.24
183028     29.69
183030     25.94
183032     41.33
183034     37.45
183048    152.85
183056     25.82
183058     70.78
183060     35.39
183062     20.85
183066     20.96
183070     37.81
183076     40.12
183082     25.94
183086      4.73
183092     16.61
183093     17.45
183095     18.78
183104     39.15
183113     14.91
183116     56.60
183127     37.58
183146     33.46
183152     86.66
183160     34.91
183163      5.94
           ...  
601782     55.76
601785     34.43
601789     29.94
601790     21.82
601791    196.47
601793     31.27
601795     19.39
601796     52.85
601800     62.18
601805     15.03
601815     34.91
601816     22.55
601817     50.79
601819     25.34
601821     40.86
601825      6.67
601827     23.89
601828     84.12
601830     21.45
601835     12.00
601837     41.45
601839      5.94
601842      9.82
601843     48.98
601849     11.88
601850     25.95
601852     37.69
60

In [14]:
del df_customer['gender']

In [15]:
df_customer.to_csv("../data/customer.csv")

In [16]:
df_customer

Unnamed: 0,customerID,age,age_group
0,57618,58,SENIOR
2,54234,40,MIDDLE-AGED
4,73173,30,ADULT
9,73083,57,SENIOR
17,56459,39,MIDDLE-AGED
20,58559,58,SENIOR
23,58920,32,ADULT
27,51539,30,ADULT
34,58537,27,ADULT
37,58533,31,ADULT


In [17]:
# Rename values in transactions
# df_transactions.head()
df_transactions["Transact_DateTime"] = df_transactions["Transact Date"] + " " + df_transactions["Transact Time"]
# Merge transactions and df_customer

In [18]:
df_transactions["Transact_DateTime_"] = pd.to_datetime(df_transactions["Transact_DateTime"],
                                                     dayfirst=True
                                                     )

In [19]:
del df_transactions["Transact_DateTime"]
df_transactions = df_transactions.rename(columns={"Transact_DateTime_":"Transact_DateTime"})

In [20]:
df_merged = df_transactions.merge(df_customer, how="left",left_on="Customer ID", right_on='customerID')

In [21]:
len(df_merged.index)

556580

In [22]:
del df_merged["Customer ID"]
del df_merged["Age"]
del df_merged["Gender"]

In [23]:
df_merged.to_csv('../data/merged.csv')

In [24]:
print(df_merged.columns)

Index(['Transact ID', 'Transact Date', 'Transact Time', 'Outlet',
       'Outlet District', 'Transact Details ID', 'Item', 'Item Description',
       'Quantity', 'Price', 'Spending', 'Transact_DateTime', 'customerID',
       'age', 'age_group'],
      dtype='object')


In [25]:
# Time of day function
def tod(d):
    today6am = datetime(12,1,1,6,0,0).time()
    today12pm = datetime(12,1,1,12,0,0).time()
    today6pm = datetime(12,1,1,18,0,0).time()
    today12am = datetime(12,1,1,23,59,59).time()
    
    if d.time() > today6am and d.time() < today12pm: # greater than 6am and less than 12pm:
        return "BREAKFAST"
    elif d.time() >= today12pm and d.time() < today6pm: # greater than 12pm and less than 6pm
        return "LUNCH"
    elif d.time() >= today6pm and d.time() <= today12am:
#     else:
        return "DINNER"
    else:
        return "SUPPER"
    
df_merged["time_of_day"] = df_merged["Transact_DateTime"].apply(tod)

In [26]:
# Number of unique transactions in each segment of the day.
df_merged.groupby("time_of_day")['Transact ID'].nunique()

time_of_day
BREAKFAST     4021
DINNER       72760
LUNCH        55767
SUPPER        3901
Name: Transact ID, dtype: int64

In [27]:
print(df_merged.columns)

Index(['Transact ID', 'Transact Date', 'Transact Time', 'Outlet',
       'Outlet District', 'Transact Details ID', 'Item', 'Item Description',
       'Quantity', 'Price', 'Spending', 'Transact_DateTime', 'customerID',
       'age', 'age_group', 'time_of_day'],
      dtype='object')


In [28]:
df_derivedItems = pd.read_csv("../data/derived_items.csv")

In [29]:
df_derivedItems.head()

Unnamed: 0,Item,Item Description,Item Type,Item Category
0,1,Cold / Iced Water,Drink,"Drink,Cold"
1,2,Warm Water,Drink,"Drink,Hot"
2,11,Chs Bk Pork Chop Spag,Main,"Spaghetti,Chicken"
3,14,BP Chicken Chop Spag,Main,"Spaghetti,Chicken"
4,17,Fish Fillet Spag,Main,"Spaghetti,Fish"


In [30]:
df_merged['Item'] = df_merged['Item'].astype(str)
df_merged['Item'] = df_merged['Item'].str.strip()
df_derivedItems['Item'] = df_derivedItems['Item'].astype(str)
# del df_merged['Item Description']

In [31]:
print(len(df_merged.index))
print(len(df_derivedItems.index))

556580
162


In [32]:
df_merged["Item"].dtype

dtype('O')

In [33]:
df_derivedItems['Item'].dtype

dtype('O')

In [34]:
df_merged_ = pd.merge(df_merged,df_derivedItems, 'left', 'Item')

In [35]:
len(df_merged_.index)

556580

In [36]:
# Creating profiles
df_merged.groupby(['age_group','time_of_day'])

<pandas.core.groupby.DataFrameGroupBy object at 0x11859c630>

In [37]:
df_merged_.isnull().any()

Transact ID            False
Transact Date          False
Transact Time          False
Outlet                 False
Outlet District        False
Transact Details ID    False
Item                   False
Item Description_x     False
Quantity               False
Price                  False
Spending               False
Transact_DateTime      False
customerID             False
age                    False
age_group              False
time_of_day            False
Item Description_y     False
Item Type              False
Item Category          False
dtype: bool

In [38]:
df_grouped = df_merged_.groupby(['Transact ID','Item Type']).size()

In [39]:
df_merged_.to_csv('../data/merged.csv')

In [46]:
mainWeight = 0.3
drinksWeight = 0.3
sidesWeight = 0.2
dessertWeight = 0.2

def noPax(d):
    noPax = mainWeight * d['Main'] + drinksWeight * d['Drinks'] 
    + sidesWeight * d['Sides'] + dessertWeight * d['Dessert']
    return noPax

In [53]:
output = {}
for transaction, subframe in df_grouped.groupby(level=0):
    temp_dict = subframe.to_dict()
    new_dict = {}
    for k1,k2 in temp_dict:
        val = temp_dict[(k1,k2)]
        new_dict[k2] = val
    
    if 'Main' not in new_dict: 
        new_dict['Main'] = 0
    
    if 'Drinks' not in new_dict: 
        new_dict['Drinks'] = 0
        
    if 'Sides' not in new_dict: 
        new_dict['Sides'] = 0
        
    if 'Dessert' not in new_dict: 
        new_dict['Dessert'] = 0
    
    numPax = noPax(new_dict)
    numPax = np.ceil(numPax)
    if numPax < 1: numPax = 1
    output[transaction] = numPax
    
        


In [56]:
numpax_df = pd.DataFrame(list(output.items()))

In [63]:
numpax_df.columns

Index(['transactionId', 'NumPax'], dtype='object')

In [64]:
numpax_df['transactionId'] = numpax_df['transactionId'].astype(int)
numpax_df['NumPax'] = numpax_df['NumPax'].astype(int)

In [65]:
df_merged_.columns

Index(['Transact ID', 'Transact Date', 'Transact Time', 'Outlet',
       'Outlet District', 'Transact Details ID', 'Item', 'Item Description_x',
       'Quantity', 'Price', 'Spending', 'Transact_DateTime', 'customerID',
       'age', 'age_group', 'time_of_day', 'Item Description_y', 'Item Type',
       'Item Category'],
      dtype='object')

In [66]:
df_merged_final = pd.merge(df_merged_, numpax_df, how='left', left_on='Transact ID', right_on='transactionId'
                          )

In [74]:
df_merged_final.head()

Unnamed: 0,Transact ID,Transact Date,Transact Time,Outlet,Outlet District,Transact Details ID,Item,Item Description_x,Quantity,Price,...,Transact_DateTime,customerID,age,age_group,time_of_day,Item Description_y,Item Type,Item Category,transactionId,NumPax
0,183003,1/1/16,12:01:48 AM,Outlet 17,5,537557,215A,Pork Chop Noodle,1,13.94,...,2016-01-01 00:01:48,57618,58,SENIOR,SUPPER,Pork Chop Noodle,Main,"Noodle,Pork",183003,1
1,183003,1/1/16,12:01:48 AM,Outlet 17,5,537555,34,Fish & Chips,1,13.94,...,2016-01-01 00:01:48,57618,58,SENIOR,SUPPER,Fish & Chips,Main,"Western,Fish",183003,1
2,183005,1/1/16,12:03:32 AM,Outlet 11,27,537565,552,Mango Pomelo,1,8.36,...,2016-01-01 00:03:32,54234,40,MIDDLE-AGED,SUPPER,Mango Pomelo,Dessert,"Dessert,Cold",183005,1
3,183005,1/1/16,12:03:32 AM,Outlet 11,27,537564,215B,Pork Chop Bee Hoon,1,13.94,...,2016-01-01 00:03:32,54234,40,MIDDLE-AGED,SUPPER,Pork Chop Bee Hoon,Main,"Noodle,Pork",183005,1
4,183010,1/1/16,12:14:51 AM,Outlet 14,16,537571,127,Borsch Soup,2,5.94,...,2016-01-01 00:14:51,73173,30,ADULT,SUPPER,Borsch Soup,Side,"Soup,Beef",183010,1
