In [13]:
# imports
import pandas as pd
import numpy as np
from datetime import datetime

In [14]:
# Pull in the raw data file
file = "../data/data.csv"
df_transactions = pd.read_csv(file)
df_transactions.head()

Unnamed: 0,Customer ID,Age,Gender,Transact ID,Transact Date,Transact Time,Outlet,Outlet District,Transact Details ID,Item,Item Description,Quantity,Price,Spending
0,57618,58.0,F,183003,2016-01-01,12:01:48 AM,Outlet 17,5,537557,215A,Pork Chop Noodle,1,13.94,13.94
1,57618,58.0,F,183003,2016-01-01,12:01:48 AM,Outlet 17,5,537555,34,Fish & Chips,1,13.94,13.94
2,54234,40.0,,183005,2016-01-01,12:03:32 AM,Outlet 11,27,537565,552,Mango Pomelo,1,8.36,8.36
3,54234,40.0,,183005,2016-01-01,12:03:32 AM,Outlet 11,27,537564,215B,Pork Chop Bee Hoon,1,13.94,13.94
4,73173,30.0,,183010,2016-01-01,12:14:51 AM,Outlet 14,16,537571,127,Borsch Soup,2,5.94,11.88


In [15]:
mean = 38
sd = 11

In [16]:
# Creating a the customer table..
df_customer = pd.DataFrame()
df_customer["customerID"] = df_transactions["Customer ID"]
df_customer["age"] = df_transactions["Age"]
df_customer["gender"] = df_transactions["Gender"]

print("Customer table size ", len(df_customer.index))
df_customer = df_customer.drop_duplicates(keep='first')
print("Customer table size after removing duplicates ", len(df_customer.index))

Customer table size  556580
Customer table size after removing duplicates  44414


In [17]:
# Cleaning customer table..
# Chaning age to int
df_customer['age'] = pd.to_numeric(df_customer['age'])
# Filling NaNs with 0 by default
df_customer['age'] = df_customer['age'].fillna(value=0)
# Filling gender with "NA"
df_customer['gender'] = df_customer['gender'].fillna(value="NA")

In [18]:
# Getting mean and sd of age of individuals aged >= 6 and <= 88
mean = df_customer["age"].ix[(df_customer["age"] >= 6) & (df_customer["age"] <= 88)].mean()
sd = df_customer["age"].ix[(df_customer["age"] >= 6) & (df_customer["age"] <= 88)].std()
print("Mean: ", mean, " SD: ", sd)

Mean:  37.3733306078  SD:  11.9079709467


In [19]:
def assignAge(d):
    while(d < 6 or d > 88):
        d = np.random.normal(mean, sd, 1)[0]
    return int(d)
        
df_customer["age"] = df_customer["age"].apply(assignAge)

In [20]:
# Assigning age category.
def age_group(d):
    if d <= 18:
        return "YOUTH"
    elif d <= 35 and d > 18:
        return "ADULT"
    elif d <= 50 and d > 35:
        return "MIDDLE-AGED"
    else:
        return "SENIOR"

df_customer['age_group'] = df_customer['age'].apply(age_group)

In [21]:
del df_customer['gender']

In [22]:
df_customer.to_csv("../data/customer.csv")

In [23]:
df_customer = df_customer.rename(columns={'customerID':'customer_id'})
df_customer.head()

Unnamed: 0,customer_id,age,age_group
0,57618,58,SENIOR
2,54234,40,MIDDLE-AGED
4,73173,30,ADULT
9,73083,57,SENIOR
17,56459,39,MIDDLE-AGED


In [24]:
# Rename values in transactions
# df_transactions.head()
df_transactions["Transact_DateTime"] = df_transactions["Transact Date"] + " " + df_transactions["Transact Time"]
# Merge transactions and df_customer

In [25]:
df_transactions["Transact_DateTime_"] = pd.to_datetime(df_transactions["Transact_DateTime"],
                                                     dayfirst=True
                                                     )

In [26]:
del df_transactions["Transact_DateTime"]
df_transactions = df_transactions.rename(columns={"Transact_DateTime_":"Transact_DateTime"})

In [27]:
df_merged = df_transactions.merge(df_customer, how="left",left_on="Customer ID", right_on='customer_id')

In [28]:
len(df_merged.index)

556580

In [29]:
del df_merged["Customer ID"]
del df_merged["Age"]
del df_merged["Gender"]

In [30]:
df_merged.to_csv('../data/merged.csv')

In [31]:
print(df_merged.columns)

Index(['Transact ID', 'Transact Date', 'Transact Time', 'Outlet',
       'Outlet District', 'Transact Details ID', 'Item', 'Item Description',
       'Quantity', 'Price', 'Spending', 'Transact_DateTime', 'customer_id',
       'age', 'age_group'],
      dtype='object')


In [32]:
# Time of day function
def tod(d):
    today6am = datetime(12,1,1,6,0,0).time()
    today12pm = datetime(12,1,1,12,0,0).time()
    today6pm = datetime(12,1,1,18,0,0).time()
    today12am = datetime(12,1,1,23,59,59).time()
    
    if d.time() > today6am and d.time() < today12pm: # greater than 6am and less than 12pm:
        return "BREAKFAST"
    elif d.time() >= today12pm and d.time() < today6pm: # greater than 12pm and less than 6pm
        return "LUNCH"
    elif d.time() >= today6pm and d.time() <= today12am:
#     else:
        return "DINNER"
    else:
        return "SUPPER"
    
df_merged["time_of_day"] = df_merged["Transact_DateTime"].apply(tod)

In [33]:
# Number of unique transactions in each segment of the day.
df_merged.groupby("time_of_day")['Transact ID'].nunique()

time_of_day
BREAKFAST     4021
DINNER       72760
LUNCH        55767
SUPPER        3901
Name: Transact ID, dtype: int64

In [34]:
print(df_merged.columns)

Index(['Transact ID', 'Transact Date', 'Transact Time', 'Outlet',
       'Outlet District', 'Transact Details ID', 'Item', 'Item Description',
       'Quantity', 'Price', 'Spending', 'Transact_DateTime', 'customer_id',
       'age', 'age_group', 'time_of_day'],
      dtype='object')


In [35]:
df_derivedItems = pd.read_csv("../data/derived_items.csv")

In [36]:
df_derivedItems.groupby(['Item Type']).size()

Item Type
Beverage    39
Dessert     12
Drink        2
Main        61
Side        46
Soup         1
TAKEAWAY     1
dtype: int64

In [37]:
df_merged['Item'] = df_merged['Item'].astype(str)
df_merged['Item'] = df_merged['Item'].str.strip()
df_derivedItems['Item'] = df_derivedItems['Item'].astype(str)
# del df_merged['Item Description']

In [38]:
print(len(df_merged.index))
print(len(df_derivedItems.index))

556580
162


In [39]:
df_merged["Item"].dtype

dtype('O')

In [40]:
df_derivedItems['Item'].dtype

dtype('O')

In [41]:
df_merged_ = pd.merge(df_merged,df_derivedItems, 'left', 'Item')

In [42]:
df_merged_ = df_merged_[df_merged_['Item Type'] != "TAKEAWAY"]

In [43]:
df_merged_.groupby(['Item Type']).size()

Item Type
Beverage    171006
Dessert      27523
Drink        16422
Main        191887
Side        133959
Soup          8009
dtype: int64

In [44]:
len(df_merged_.index)

548806

In [45]:
df_merged_.isnull().any()

Transact ID            False
Transact Date          False
Transact Time          False
Outlet                 False
Outlet District        False
Transact Details ID    False
Item                   False
Item Description_x     False
Quantity               False
Price                  False
Spending               False
Transact_DateTime      False
customer_id            False
age                    False
age_group              False
time_of_day            False
Item Description_y     False
Item Type              False
Item Category          False
dtype: bool

In [46]:
df_grouped = df_merged_.groupby(['Transact ID','Item Type']).size()

In [47]:
df_merged_['Item Type'] = df_merged_['Item Type'].replace('Beverage','Drink')

In [48]:
df_merged_.groupby(['Item Type']).size()

Item Type
Dessert     27523
Drink      187428
Main       191887
Side       133959
Soup         8009
dtype: int64

In [49]:
df_merged_.to_csv('../data/merged.csv')

In [50]:
mainWeight = 0.3
drinksWeight = 0.3
sidesWeight = 0.2
dessertWeight = 0.2

def noPax(d):
    mains = d['Main']
    drinks = d['Drink']
    sides = d['Side']
    dessert = d['Dessert']
    soup = d["Soup"]
    # Number of mains shall be equal to the number of people.
    if mains > 0:
        return mains
    elif drinks > 0:
        return drinks
    elif sides > 0:
        return sides / 2
    elif soup > 0:
        return soup / 2
    elif dessert > 0:
        return dessert

# if number of mains
#     noPax = mainWeight * d['Main'] + drinksWeight * d['Drinks'] 
#     + sidesWeight * d['Sides'] + dessertWeight * d['Dessert']
#     return noPax

In [51]:
output = {}
for transaction, subframe in df_grouped.groupby(level=0):
    temp_dict = subframe.to_dict()
    new_dict = {}
    for k1,k2 in temp_dict:
        val = temp_dict[(k1,k2)]
        new_dict[k2] = val
    
    if 'Main' not in new_dict: 
        new_dict['Main'] = 0
    
    if 'Drink' not in new_dict: 
        new_dict['Drink'] = 0
        
    if 'Side' not in new_dict: 
        new_dict['Side'] = 0
        
    if 'Dessert' not in new_dict: 
        new_dict['Dessert'] = 0
        
    if 'Soup' not in new_dict: 
        new_dict['Soup'] = 0
    
#     print(new_dict)
    numPax = noPax(new_dict)
#     print(numPax)
    numPax = np.ceil(numPax)
    if numPax < 1: numPax = 1
    output[transaction] = numPax
    
        


AttributeError: 'NoneType' object has no attribute 'ceil'

In [None]:
numpax_df = pd.DataFrame(list(output.items()))

In [None]:
numpax_df.columns

In [None]:
numpax_df  = numpax_df.rename(columns={0:'transactionId',1:'NumPax'})

In [None]:
numpax_df['transactionId'] = numpax_df['transactionId'].astype(int)
numpax_df['NumPax'] = numpax_df['NumPax'].astype(int)

In [None]:
df_merged_.columns

In [None]:
df_merged_final = pd.merge(df_merged_, numpax_df, how='left', left_on='Transact ID', right_on='transactionId'
                          )

In [None]:
# df_merged_final.head()
len(df_merged_final.index)

In [None]:
df_merged_final.columns

In [None]:
# df_merged_total = 
profile_item_total_df =  df_merged_final.groupby(['age_group',
                                                  'time_of_day',
                                                  'NumPax'])['Quantity'].sum().reset_index()


In [None]:
item_total_df = df_merged_final.groupby(['age_group',
                                            'time_of_day',
                                            'NumPax',
                                            'Item Description_y',
                                           'Item Type',
                                           'Item Category']).size().reset_index().rename(columns={0:'item_total'})

In [None]:
test_df = pd.merge(profile_item_total_df,item_total_df,how='left',on=['age_group','time_of_day'])

In [None]:
hello_df = test_df.groupby(['age_group','time_of_day','NumPax_x']).size()

In [None]:
test_df = test_df.rename(columns={'NumPax_x':'numpax','Item Description_y':'item_dec'})

In [None]:
del test_df['NumPax_y']

In [None]:
test_df['uscore'] = test_df['item_total'] / test_df['Quantity']

In [None]:
test_df.to_csv('uscore.csv')

In [None]:
df_merged_final.columns

In [None]:
df_merged_final.to_csv('merged_final.csv')

In [None]:
# uscore_dict = {}
# for row in df_merged_final.iterrows():
#     row_details = row[1]
#     t_details_id = row_details['Transact Details ID']
#     age_group = row_details['age_group']
#     tod = row_details['time_of_day']
#     numpax = row_details['NumPax']
#     item_desc = row_details['Item Description_y']
#     item_type = row_details['Item Type']
#     item_cat = row_details['Item Category']
    
#     item_total = item_total_df['item_total'].ix[(item_total_df['age_group'] == age_group) &
#                                                (item_total_df['time_of_day'] == tod) &
#                                                (item_total_df['NumPax'] == numpax) &
#                                                (item_total_df['Item Description_y'] == item_desc) &
#                                                (item_total_df['Item Type'] == item_type) &
#                                                (item_total_df['Item Category'] == item_cat)].item()
    
#     profile_total = profile_item_total_df['Quantity'].ix[(profile_item_total_df['age_group'] == age_group) &
#                                                          (profile_item_total_df['time_of_day'] == tod) &
#                                                          (profile_item_total_df['NumPax'] == numpax)].item()
# #     print(item_total)
# #     print(profile_total)
#     uscore_dict[t_details_id] = item_total / profile_total

In [None]:
df_merged_output.groupby()