In [1]:
import pandas as pd
import numpy as np

## Load csv. The csv has no header row, so read it with header=None
df = pd.read_csv('chesterfield_25-08-2021_09-00-00.csv', header=None)


##TODO EDA notes

# store_name can be dropped and used as table name. Make script to:
# 1. Cleanse data
# 2. Generate db table with table name of store-name_payment-type
# 3. Normalise and split tables
# 4. df.to_sql(table name, etc.)

## Assign column names positionally (the csv carries no header row)
df.columns = ['timestamp', 'store_name', 'customer_name', 'basket_items', 'total_price', 'payment_type', 'card_number']

## Table name is the store name found in the first row
# NOTE(review): presumably each file holds a single store - confirm before relying on this
sql_table_name = df['store_name'].iloc[0]

## Remove sensitive data (customer_name and card_number)
df = df.drop(columns=['customer_name', 'card_number'])

## Convert date-time to whole Unix epoch seconds.
# Subtracting the epoch and floor-dividing by one second does not depend on the
# internal datetime resolution and avoids the deprecated Series.view() plus a
# float round-trip. Timestamps are treated as timezone-naive.
df['timestamp'] = (pd.to_datetime(df['timestamp']) - pd.Timestamp('1970-01-01')) // pd.Timedelta('1s')

## unittest timestamp conversion assert (round-trip back to datetime)
# df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')

# inspect
df.head(10)

Unnamed: 0,timestamp,store_name,basket_items,total_price,payment_type
0,1629882000,Chesterfield,Regular Flavoured iced latte - Hazelnut - 2.75...,5.2,CARD
1,1629882120,Chesterfield,"Large Flavoured iced latte - Caramel - 3.25, R...",17.3,CARD
2,1629882240,Chesterfield,"Large Flat white - 2.45, Regular Latte - 2.15",4.6,CARD
3,1629882360,Chesterfield,"Regular Flavoured latte - Hazelnut - 2.55, Lar...",5.0,CARD
4,1629882480,Chesterfield,"Regular Latte - 2.15, Large Latte - 2.45",4.6,CASH
5,1629882600,Chesterfield,"Large Flavoured iced latte - Caramel - 3.25, R...",12.95,CASH
6,1629882720,Chesterfield,"Large Flavoured latte - Hazelnut - 2.85, Regul...",17.4,CARD
7,1629882840,Chesterfield,"Regular Flavoured iced latte - Vanilla - 2.75,...",13.55,CARD
8,1629883020,Chesterfield,"Regular Flavoured iced latte - Caramel - 2.75,...",12.55,CARD
9,1629883140,Chesterfield,"Large Flat white - 2.45, Large Flavoured latte...",5.3,CARD


In [2]:
## Split basket_items on ', ' into lists, then give every item its own row.
# explode() repeats the original row label for each item, so the index is
# no longer unique after this step.
df['basket_items'] = df['basket_items'].str.split(', ')
df = df.explode('basket_items')

## Convenient 1NF for basket_items: rsplit each entry once from the right into 'item', 'price'
# (splitting from the right keeps flavour suffixes such as ' - Hazelnut' inside 'item')
df[['item', 'price']] = df['basket_items'].str.rsplit(' - ', n=1, expand=True)

## rsplit returns strings; store price as a number so it is usable for arithmetic / numeric db columns
df['price'] = pd.to_numeric(df['price'])

## Further split 'item' on its first space into 'drink_size', 'product_drink'
df[['drink_size', 'product_drink']] = df['item'].str.split(' ', n=1, expand=True)
df = df.rename(columns={'total_price': 'basket_total_price'})

# is more splits meaningful? (probably not)

## New processed table: drop the columns that have been split out or are not needed per item
df_processed = df.drop(columns=['store_name', 'basket_items', 'item', 'basket_total_price'])

# inspect
df_processed.head(15)

Unnamed: 0,timestamp,payment_type,price,drink_size,product_drink
0,1629882000,CARD,2.75,Regular,Flavoured iced latte - Hazelnut
0,1629882000,CARD,2.45,Large,Latte
1,1629882120,CARD,3.25,Large,Flavoured iced latte - Caramel
1,1629882120,CARD,2.75,Regular,Flavoured iced latte - Hazelnut
1,1629882120,CARD,2.75,Regular,Flavoured iced latte - Caramel
1,1629882120,CARD,3.25,Large,Flavoured iced latte - Hazelnut
1,1629882120,CARD,2.55,Regular,Flavoured latte - Hazelnut
1,1629882120,CARD,2.75,Regular,Flavoured iced latte - Hazelnut
2,1629882240,CARD,2.45,Large,Flat white
2,1629882240,CARD,2.15,Regular,Latte


In [3]:
## Timestamp as index? (cannot)
def uniq_vals(dataframe):
    """Print the value counts of ``dataframe`` prefixed with 'Unique vals: '.

    Args:
        dataframe: any object exposing ``value_counts()`` (e.g. a pandas
            Series or DataFrame).

    Returns:
        None. The counts are only printed.
    """
    counts = dataframe.value_counts()
    print(f'Unique vals: {counts}')
# Finding: the timestamp column has one repeat timestamp at 25/08/2021 10:56 (separate purchases, payment type - valid).
# Pass an object with .value_counts() (e.g. the timestamp column itself), not the column name string.

## Alternatively: split store table by payment_type, assuming two cashiers at the same time, each working one order at a time
# Split tables by payment_type, then drop payment_type.
# The mask is built from df_processed itself, so the mask and the frame being
# filtered always share the same (non-unique) index.
card_orders = df_processed[df_processed['payment_type'] == 'CARD'].drop(columns=['payment_type'])
cash_orders = df_processed[df_processed['payment_type'] == 'CASH'].drop(columns=['payment_type'])

# inspect
card_orders
# cash_orders

Unnamed: 0,timestamp,price,drink_size,product_drink
0,1629882000,2.75,Regular,Flavoured iced latte - Hazelnut
0,1629882000,2.45,Large,Latte
1,1629882120,3.25,Large,Flavoured iced latte - Caramel
1,1629882120,2.75,Regular,Flavoured iced latte - Hazelnut
1,1629882120,2.75,Regular,Flavoured iced latte - Caramel
...,...,...,...,...
263,1629909900,3.25,Large,Flavoured iced latte - Vanilla
263,1629909900,2.45,Large,Latte
263,1629909900,3.25,Large,Flavoured iced latte - Vanilla
263,1629909900,3.25,Large,Flavoured iced latte - Caramel
