In [None]:
# import libraries

import re
from datetime import datetime

import matplotlib.pyplot as plt
import missingno
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
#load the data sets
# Both CSV files are read with pandas' default parsing options; the paths are
# relative to the notebook's working directory.
transaction = pd.read_csv("../input/quantium-data-analytics-virtual-experience-program/Transactions.csv")
purchase = pd.read_csv("../input/quantium-data-analytics-virtual-experience-program/PurchaseBehaviour.csv")

# Transaction Data

In [None]:
#take a first look at the data set from transaction
transaction.head()

In [None]:
#check data types
transaction.dtypes

In [None]:
#check if any missing entry
transaction.isnull().sum()

In [None]:
#check missing entry using visualization: no white lines, so no missing entry
missingno.matrix(transaction, figsize=(15,8))

No missing data entry in the transaction table.

In [None]:
#look at the first 20 product names (114 total)
transaction["PROD_NAME"].unique()[:20]

There might be salsa products that are not chips. Let's identify and remove them.

In [None]:
#Collect every distinct product name that mentions "salsa" (case-insensitive).
salsa_mask = transaction['PROD_NAME'].str.contains('salsa', case=False)
with_salsa_name = transaction.loc[salsa_mask, 'PROD_NAME'].unique()
print(with_salsa_name)

Based on Google, some of these products are salsa flavor chips.

Only "Old El Paso Salsa" and "Woolworths Mild/Medium Salsa" are actually salsa.  Others are chips with salsa flavors, so we will keep them.

In [None]:
#Row labels of salsa products: the name mentions "salsa" and also contains
#"Old" or "Woolworths" (all matches are case-insensitive).
prod_names = transaction['PROD_NAME']
mentions_salsa = prod_names.str.contains('salsa', case=False)
mentions_old = prod_names.str.contains('Old', case=False)
mentions_woolworths = prod_names.str.contains('Woolworths', case=False)
salsa_index = transaction.index[mentions_salsa & (mentions_old | mentions_woolworths)]

#Drop those rows and renumber the remaining rows from 0.
chips_tran = transaction.drop(index=salsa_index).reset_index(drop=True)
print("Transactions without Salsa: " + str(len(chips_tran)))

In [None]:
# Add in brand Column
# Lookup from abbreviated / partial first words to one canonical brand label.
brands = {
    'Dorito': 'Doritos',
    'Infzns': 'Infuzions',
    'Snbts': 'Sunbites',
    'Grain': 'Grain Wave',
    'RRD': 'Red Rock Deli',
    'Smith': 'Smiths',
    'GrnWves': 'Grain Wave',
    'WW': 'Woolworths',
    'NCC': 'Natural',
    'Red': 'Red Rock Deli',
}
# The raw brand token is the first whitespace-delimited word of the product name.
first_word = chips_tran['PROD_NAME'].map(lambda name: name.split(None, 1)[0])
chips_tran['BRAND'] = first_word.replace(brands)
chips_tran.head()

In [None]:
#Check if any brand is repeated.
chips_tran.BRAND.value_counts().sort_index()

In [None]:
#total number of brands
len(chips_tran["BRAND"].unique())

In [None]:
#Clean up product names by removing the leading brand word from it.
def _drop_first_word(name):
    """Return `name` without its first whitespace-delimited word.

    Splitting on any run of whitespace mirrors how BRAND is extracted from the
    same column. A name with no second word yields '' instead of raising
    IndexError.
    """
    parts = name.split(None, 1)
    return parts[1] if len(parts) > 1 else ''

# NOTE: PROD_NAME is overwritten here, so re-running this cell removes one
# more word from every name.
chips_tran['PROD_NAME'] = chips_tran['PROD_NAME'].apply(_drop_first_word)
chips_tran.head()

There is other information in the PROD_NAME column; let's find the packet size and calculate the unit price.

In [None]:
#Define a function to get product packet size
def get_size(item):
    """Return the packet size parsed from a product name.

    The size is the first number that is directly followed by a gram unit
    ("g", "gm", "gms", "gram", "grams"; case-insensitive, optional spaces
    before the unit), e.g. "175g" or "150 G". Only that number is used, so
    other digits in the name (such as a pack count) do not leak into the size.
    If no number carries a gram unit, the first run of digits is used.

    Parameters
    ----------
    item : str
        Product name.

    Returns
    -------
    int
        The packet size.

    Raises
    ------
    ValueError
        If the name contains no digits at all.
    """
    match = re.search(r'(\d+)\s*g(?:m|ms|rams?)?\b', item, flags=re.IGNORECASE)
    if match is not None:
        return int(match.group(1))

    # No explicit gram unit: fall back to the first number in the name.
    match = re.search(r'\d+', item)
    if match is None:
        raise ValueError("no packet size found in product name: %r" % (item,))
    return int(match.group(0))

# Derive PKG_SIZE by running get_size over every product name.
chips_tran['PKG_SIZE'] = chips_tran['PROD_NAME'].map(get_size)

#Preview the table with the new package-size column
chips_tran.head()

In [None]:
#Unit price of a row = its total sales divided by the quantity sold in that row.
chips_tran['UNIT_PRICE'] = chips_tran['TOT_SALES'].div(chips_tran['PROD_QTY'])
chips_tran.head()

In [None]:
#check the size of the table
chips_tran.shape

In [None]:
#Again look at data types and if there's empty/missing entry.
chips_tran.info()

Hooray! No empty entry! Let's look at some numerical summaries.

In [None]:
#data summary from transaction table
chips_tran.describe()

It seems like there might be outliers.  Between 75% quantile and max, it's a huge jump for both PROD_QTY and TOT_SALES.

Let's look at the boxplot to identify them.

In [None]:
#Use boxplot to see if there is any outlier.
# The series is passed by keyword because the meaning of the first positional
# argument of seaborn's plotting functions differs between seaborn releases.
sns.boxplot(x=chips_tran.PROD_QTY)

Some rows have a product quantity much larger than 10, so we will set 10 as the threshold and remove them.

In [None]:
#Outliers! Remove them and plot again
# .copy() makes `tran` an independent frame rather than a filtered view of
# chips_tran, so assigning columns to it later does not trigger pandas'
# SettingWithCopyWarning.
tran = chips_tran[chips_tran.PROD_QTY < 10].copy()
print(tran.shape)
# Keyword `x=` keeps the call valid across seaborn releases.
sns.boxplot(x=tran.PROD_QTY)

In [None]:
#look at product quantity summary after removing outliers
tran.PROD_QTY.describe()

In [None]:
#look at total sales summary after removing outliers
tran.TOT_SALES.describe()

Check the distribution after removing outliers. And it looks reasonable.

In [None]:
#Notice column "DATE" is not in standard form.
tran['DATE'].head()

Notice the entries in DATE column are in numbers, let's convert them to standard form with yyyy-mm-dd.

In [None]:
#define a function to convert the dates to standard form: YYYY-MM-DD
def convert_to_datetime(num):
    """Convert a day-count serial number into a ``datetime``.

    Serial 2 corresponds to 1900-01-01, and every additional unit adds one day.
    """
    # Ordinal that serial 0 maps to: 1900-01-01 shifted back by two days.
    base_ordinal = datetime(1900, 1, 1).toordinal() - 2
    return datetime.fromordinal(base_ordinal + num)

#Convert dates to form YYYY-MM-DD
# Skip the conversion when DATE is already a datetime column (for example when
# this cell is re-run), because convert_to_datetime only accepts serial numbers.
# assign() builds a new frame instead of writing into a filtered slice.
if not pd.api.types.is_datetime64_any_dtype(tran['DATE']):
    tran = tran.assign(DATE=tran['DATE'].apply(convert_to_datetime))
tran.head()

In [None]:
#Look at the range of dates
print(tran.DATE.min(), tran.DATE.max())

The table is for the transaction between 2018-7-1 and 2019-6-30, exactly one year.

In [None]:
#Use histogram to look at the distribution of transactions by date: balanced.
sns.set_style('whitegrid')
tran.DATE.hist(figsize=(10,6))

Transactions throughout the year are about the same.

In [None]:
#Print the number of distinct values in the DATE, STORE_NBR, LYLTY_CARD_NBR,
#TXN_ID and PROD_NBR columns, one count per line in that order.
#Notice there are repetitions
for column in ['DATE', 'STORE_NBR', 'LYLTY_CARD_NBR', 'TXN_ID', 'PROD_NBR']:
    print(tran[column].nunique())

There are repeated entries in DATE, STORE_NBR, LYLTY_CARD_NBR, TXN_ID, PROD_NBR columns.

In [None]:
#One day is missing; plot the number of rows per date as a line chart to look for it.
graph = tran.groupby('DATE')[['TXN_ID']].count().sort_index()
ax = graph.plot(figsize=(12,6))
plt.show()

No break in the line is visible in the chart above.
Let's try another way to identify the missing day.

In [None]:
#Find the missing date(s) - Christmas!
from datetime import date, timedelta
dates = sorted(tran.DATE)
first_day, last_day = dates[0], dates[-1]
span_days = (last_day - first_day).days
# Every calendar day from the first observed date up to (not including) the last one.
date_set = {first_day + timedelta(offset) for offset in range(span_days)}
# Calendar days that never appear in the DATE column.
missing = date_set.difference(dates)
print(missing)

2018-12-25 is Christmas Day, a holiday, so it is normal to have no transactions on that day.

In [None]:
#Another way to find the missing day: compare the full calendar with the
#dates that actually have rows.
dategroup = tran[['DATE', 'TXN_ID']].groupby('DATE').count()
full_year = pd.date_range(start='2018-07-01', end='2019-06-30')
full_year.difference(dategroup.index)

In [None]:
#A visualization: transaction over time
import plotly.express as px
# Re-index on every calendar day in the range; days without rows get a count of 0.
all_days = pd.date_range("2018-07-01", "2019-06-30")
dategroup = dategroup.reindex(all_days, fill_value=0)
dategroup['TXN_ID'] = dategroup['TXN_ID'].astype('int')
px.line(dategroup, x=dategroup.index, y=dategroup['TXN_ID'])

No sales on Christmas, and the sales peak right before Christmas.

In [None]:
#Review the clean transaction table.
tran.head()

In [None]:
#look at store numbers (recall there are 272 distinct store numbers)
tran.STORE_NBR.hist()

Transactions in different store numbers seem to be similar.

In [None]:
#Now take a look at some duplicated transaction IDs.
tran[tran.duplicated(['TXN_ID'])].head()

It seems like there are duplicate TXN_ID, and people bought multiple products together. That is common.

In [None]:
#Look at the products in TXN_ID being 48887, and there are two products.
tran.loc[tran['TXN_ID']==48887, :]

In [None]:
#look at PROD_NBR
tran.PROD_NBR.hist()

In [None]:
#total sales histogram
# kde=False is passed so that no density-estimate curve is requested.
# NOTE(review): distplot is deprecated in newer seaborn releases - consider
# histplot once the installed seaborn version is confirmed.
sns.distplot(tran.TOT_SALES, kde=False)

In [None]:
#look at unit price distribution
# NOTE(review): distplot is deprecated in newer seaborn releases - consider
# histplot/displot once the installed seaborn version is confirmed.
sns.distplot(tran.UNIT_PRICE)

In [None]:
#Counts of different brands according to packet size
tran.groupby(['BRAND'])['PKG_SIZE'].value_counts()

Some brands have multiple chips sizes!

In [None]:
#One figure with two stacked panels: transactions per brand, then per package size.
# Both panels are created once with plt.subplots and every plot is bound to its
# own Axes. Calling plt.subplots twice (and then plt.subplot on top of it)
# produced two separate figures, each with only one of its two slots used.
fig, (ax_brand, ax_size) = plt.subplots(2, 1, figsize=(20,18))

#Plot data by brands, 'deep' for categorical variables.
# Data is passed by keyword (x=) so the call is valid across seaborn releases.
sns.countplot(x=tran['BRAND'], palette='deep', ax=ax_brand)
ax_brand.set(ylabel='TRANSACTIONS', title="Transactions of Different Brands")

#Plot data by package size, 'rocket' for quantitative variables.
sns.countplot(x=tran['PKG_SIZE'], palette='rocket_r', ax=ax_size)
ax_size.set(ylabel='TRANSACTIONS', title="Transactions of Different Package Sizes")

It seems that Kettle has most transactions.

And package size 175g is the most popular one.

# Purchase Data

In [None]:
#Take a quick look at the purchase data table
purchase.head()

In [None]:
#look at data types
purchase.info()

It seems there is no missing entry in the purchase table.

In [None]:
#check size of the table
purchase.shape

In [None]:
#check total distinct entries in the 3 columns
purchase.nunique()

All the LYLTY_CARD_NBR are distinct in the purchase table.  Let's see if these numbers match with those in the original transaction table.

In [None]:
#Check whether both original tables contain exactly the same set of loyalty card numbers.
purchase_cards = set(purchase['LYLTY_CARD_NBR'].unique())
transaction_cards = set(transaction['LYLTY_CARD_NBR'].unique())
purchase_cards == transaction_cards

Awesome! The LYLTY_CARD_NBR in two tables are exactly the same. So the tables refer to the same group of customers.

In [None]:
#plot lifestage data to check distribution
plt.figure(figsize=(18,8))
# x= is passed by keyword so the call is valid across seaborn releases.
# The y axis shows how many rows of `purchase` fall into each lifestage, so it
# is labelled as a customer count rather than 'LIFESTAGE' (which is the x axis).
sns.countplot(x=purchase['LIFESTAGE'], palette='rocket_r').set(ylabel='CUSTOMERS')
plt.title("Purchases of Different LIFESTAGES", {'fontsize':15})

From the bar graph, we see that most customers are retirees, older singles/couples, and young singles/couples, while the fewest customers are new families.

In [None]:
#Visualize the premium customers using pie chart and bar graph.
# Count each PREMIUM_CUSTOMER value, then draw the same counts twice, side by side.
fig,axarr = plt.subplots(nrows=1, ncols=2, figsize=(10,5))
pie_ax, bar_ax = axarr
purchase['PREMIUM_CUSTOMER'].value_counts().plot.pie(ax=pie_ax)
purchase['PREMIUM_CUSTOMER'].value_counts().plot.bar(ax=bar_ax)

Most customers are Mainstream, and the fewest customers are Premium.

In [None]:
#Combine LIFESTAGE and PREMIUM_CUSTOMER for comparison.
plt.figure(figsize=(20,8))
# Columns are referenced by name through data=/x=/hue= instead of passing a
# Series positionally, so the call is valid across seaborn releases.
sns.countplot(data=purchase, x='LIFESTAGE', hue='PREMIUM_CUSTOMER', palette='deep')
plt.title("Total Number of Customers by Groups", {'fontsize':15})

Overall, the largest group of customers are the Mainstream young singles/couples, and the smallest group of customers are all from the new families.

# Merge Data

In [None]:
#merge data from the two tables (with outliers removed) - left join.
# validate='many_to_one' makes pandas raise a MergeError if a LYLTY_CARD_NBR
# appears more than once in `purchase`, which would otherwise silently
# duplicate transaction rows in the merged table.
df = tran.merge(purchase, how='left', on='LYLTY_CARD_NBR', validate='many_to_one')
df.shape

In [None]:
#Look at the first 5 rows of the merged table.
df.head()

We are interested in knowing which group of customers has the largest transactions of chips.

Let's group customers based on lifestage and premium type.

In [None]:
#For each (LIFESTAGE, PREMIUM_CUSTOMER) group: sum of TOT_SALES, sum of PROD_QTY,
#and the count of TXN_ID entries (one per row of df in the group).
group_keys = ['LIFESTAGE', 'PREMIUM_CUSTOMER']
aggregations = {'TOT_SALES': 'sum', 'PROD_QTY': 'sum', 'TXN_ID': 'count'}
df_total = df.groupby(group_keys).agg(aggregations).reset_index()

#Build the "GROUP" label as "<LIFESTAGE>_<PREMIUM_CUSTOMER>".
df_total['GROUP'] = df_total['LIFESTAGE'] + '_' + df_total['PREMIUM_CUSTOMER']
df_total.head()

In [None]:
#Grouped bar chart of total sales: one cluster per lifestage, one bar per premium type.
plt.figure(figsize=(20,8))
df_sales = df_total.sort_values(by='TOT_SALES')
sns.barplot(data=df_sales, x='LIFESTAGE', y='TOT_SALES', hue='PREMIUM_CUSTOMER')
plt.title("Total Sales by Lifestages and Premium Types", {'fontsize':15})

Older families-Budget group has the highest sales, followed by young singles/couples-Mainstream group and retirees-Mainstream group.

In [None]:
#Create 3 horizontal bar graphs to display total sales,
#product quantities, and number of transaction rows by group.
# The three panels are created up front and each barplot is bound to its own
# Axes, instead of stacking plt.subplot() panels on top of the extra Axes that
# a bare plt.subplots() call creates.
fig, (ax_sales, ax_qty, ax_txn) = plt.subplots(3, 1, figsize=(20,30))

df_TA = df_total.sort_values('TOT_SALES')
sns.barplot(y='GROUP', x='TOT_SALES', data=df_TA, orient='h', palette='rocket_r', ax=ax_sales)
ax_sales.set_title("Total Sales by Customer Groups")

df_PQ = df_total.sort_values('PROD_QTY')
sns.barplot(y='GROUP', x='PROD_QTY', data=df_PQ, orient='h', palette='rocket_r', ax=ax_qty)
ax_qty.set_title("Total Quantities Purchased by Customer Groups")

# TXN_ID in df_total is the per-group count of TXN_ID entries (transaction
# rows), not a count of distinct customers, so the title says "Transactions".
df_TI = df_total.sort_values('TXN_ID')
sns.barplot(y='GROUP', x='TXN_ID', data=df_TI, orient='h', palette='rocket_r', ax=ax_txn)
ax_txn.set_title("Total Number of Transactions by Groups")

The customer group "OLDER FAMILIES_Budget" has the most sales, the highest quantities purchased, and the most transactions (the third chart counts transaction rows, not distinct customers), while NEW FAMILIES in general have the least of all.

We can also analyze:
1. total sales per customer
2. total quantities purchased per customer
3. average price per bag of chips by group

Let's break down the table more.

In [None]:
#Add three more numerical columns.
#SALES_PC: total sales of the group divided by its TXN_ID count.
# NOTE(review): TXN_ID here is a row count per group, so this is sales per
# transaction row; dividing by the number of distinct LYLTY_CARD_NBR values
# would give a true per-customer figure - confirm which one is intended.
df_total['SALES_PC'] = df_total['TOT_SALES'].div(df_total['TXN_ID'])

#QTY_PC: total quantity of the group divided by the same TXN_ID count.
df_total['QTY_PC'] = df_total['PROD_QTY'].div(df_total['TXN_ID'])

#AVG_PQ: total sales divided by total quantity, i.e. average dollar amount per unit.
df_total['AVG_PQ'] = df_total['TOT_SALES'].div(df_total['PROD_QTY'])

df_total.head()

In [None]:
#Look at simple numerical summaries of the table.
df_total.describe()

In [None]:
#Create 3 horizontal bar graphs to display average sales, average quantities,
#and average unit price by group.
# The three panels are created up front and each barplot is bound to its own
# Axes, instead of stacking plt.subplot() panels on top of the extra Axes that
# a bare plt.subplots() call creates.
fig, (ax_sales, ax_qty, ax_price) = plt.subplots(3, 1, figsize=(20,30))

# SALES_PC and QTY_PC are divided by the per-group TXN_ID count (transaction
# rows), so the titles say "per Transaction" rather than "per Customer".
df_SP = df_total.sort_values('SALES_PC')
sns.barplot(y='GROUP', x='SALES_PC', data=df_SP, orient='h', palette='rocket_r', ax=ax_sales)
ax_sales.set_title("Average Sales per Transaction by Groups")

df_QP = df_total.sort_values('QTY_PC')
sns.barplot(y='GROUP', x='QTY_PC', data=df_QP, orient='h', palette='rocket_r', ax=ax_qty)
ax_qty.set_title("Average Quantities Purchased per Transaction by Groups")

df_AP = df_total.sort_values('AVG_PQ')
sns.barplot(y='GROUP', x='AVG_PQ', data=df_AP, orient='h', palette='rocket_r', ax=ax_price)
ax_price.set_title("Average Unit Price of Chips by Groups")

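There is no big difference among customer groups for average sales per customer, average quantities purchased, or average unit price of chips. Note that the "per customer" figures above are computed by dividing by the number of transaction rows in each group.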