In [1]:
# Load in libraries

import warnings
warnings.filterwarnings('ignore')

#libraries for handling data
import pandas as pd
import numpy as np
import math
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error

#label encoders
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()


#libraries for data visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
%matplotlib inline
import seaborn as sns
sns.set_palette('Set2')

#libraries for modelling
# Regression Modelling Algorithms
import statsmodels.api as sm
#from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor , GradientBoostingRegressor

In [2]:
# Location of the "Online Retail" workbook. The default is the original
# author's local path; set the ONLINE_RETAIL_XLSX environment variable to
# point at your own copy instead of editing this cell.
import os

filepath = os.environ.get(
    "ONLINE_RETAIL_XLSX",
    "/Users/reejungkim/Documents/Git/Production analysis/Online Retail.xlsx",
)

# Open the workbook in a context manager so the file handle is closed as
# soon as the sheet names have been read (a bare pd.ExcelFile(...) leaves it open).
with pd.ExcelFile(filepath) as workbook:
    sheet_names = workbook.sheet_names
sheet_names

['Online Retail']

In [3]:
# Read the 'Online Retail' worksheet into a DataFrame and preview the first rows.
df = pd.read_excel(io=filepath, sheet_name="Online Retail")
df.head(n=5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [4]:
# Revenue of each row: number of units multiplied by the price per unit.
df["TotalPurchase"] = df["Quantity"].mul(df["UnitPrice"])
df.head(n=5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalPurchase
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34


In [5]:
# Summary statistics for the numeric columns. In the recorded output the
# minimum Quantity, UnitPrice and TotalPurchase are negative (presumably
# returns/cancellations/adjustments — confirm), and CustomerID has fewer
# non-null values than the other columns (406829 vs 541909).
df.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID,TotalPurchase
count,541909.0,541909.0,406829.0,541909.0
mean,9.55225,4.611114,15287.69057,17.987795
std,218.081158,96.759853,1713.600303,378.810824
min,-80995.0,-11062.06,12346.0,-168469.6
25%,1.0,1.25,13953.0,3.4
50%,3.0,2.08,15152.0,9.75
75%,10.0,4.13,16791.0,17.4
max,80995.0,38970.0,18287.0,168469.6


In [6]:
# List every distinct value of the Country column.
df["Country"].unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',
       'Israel', 'Finland', 'Bahrain', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

In [7]:
# Restrict the analysis to United Kingdom rows. The explicit .copy() gives
# uk_data its own data, so adding columns to it later does not write into a
# slice of df (avoids SettingWithCopyWarning and any doubt about whether df
# is modified as well).
uk_data = df.loc[df["Country"] == "United Kingdom"].copy()

In [8]:
# Quick look at the first two UK rows.
uk_data.head(n=2)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalPurchase
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34


In [9]:
# Collapse the UK rows to one row per CustomerID:
#   InvoiceDate   -> whole days between the customer's first and last row
#   InvoiceNo     -> number of rows for the customer
#   Quantity      -> total units
#   TotalPurchase -> total amount
# The built-in "size" and "sum" aggregations run in pandas' vectorised
# group-by code instead of calling a Python lambda once per customer.
# Rows without a CustomerID are left out, because groupby drops NaN keys by
# default.
# NOTE(review): "size" counts rows (invoice lines), exactly like the previous
# len(); if this is meant to be the number of orders, "nunique" would count
# distinct invoices instead — confirm before changing, since every metric
# derived from it would shift.
uk_data_group = uk_data.groupby("CustomerID").agg(
    {
        "InvoiceDate": lambda dates: (dates.max() - dates.min()).days,
        "InvoiceNo": "size",
        "Quantity": "sum",
        "TotalPurchase": "sum",
    }
)

uk_data_group

Unnamed: 0_level_0,InvoiceDate,InvoiceNo,Quantity,TotalPurchase
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12346.0,0,2,0,0.00
12747.0,366,103,1275,4196.01
12748.0,372,4642,24210,29072.10
12749.0,209,231,1422,3868.20
12820.0,323,59,722,942.34
...,...,...,...,...
18280.0,0,10,45,180.60
18281.0,0,7,54,80.82
18282.0,118,13,98,176.60
18283.0,333,756,1397,2094.88


In [10]:
# Give the aggregated columns descriptive names. The assignment is positional,
# following the aggregation order: InvoiceDate, InvoiceNo, Quantity, TotalPurchase.
uk_data_group.columns = [
    "num_days",
    "num_transactions",
    "num_units",
    "spent_money",
]
uk_data_group.head(n=5)

Unnamed: 0_level_0,num_days,num_transactions,num_units,spent_money
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12346.0,0,2,0,0.0
12747.0,366,103,1275,4196.01
12748.0,372,4642,24210,29072.1
12749.0,209,231,1422,3868.2
12820.0,323,59,722,942.34


#CLTV = ((Average Order Value x Purchase Frequency)/Churn Rate) x Profit margin.
#Customer Value = Average Order Value * Purchase Frequency

In [11]:
# Average amount per counted transaction for each customer.
uk_data_group["avg_order_value"] = uk_data_group["spent_money"].div(
    uk_data_group["num_transactions"]
)

In [12]:
# Preview the per-customer table with the new avg_order_value column.
uk_data_group.head(n=5)

Unnamed: 0_level_0,num_days,num_transactions,num_units,spent_money,avg_order_value
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
12346.0,0,2,0,0.0,0.0
12747.0,366,103,1275,4196.01,40.737961
12748.0,372,4642,24210,29072.1,6.262839
12749.0,209,231,1422,3868.2,16.745455
12820.0,323,59,722,942.34,15.971864


In [13]:
# Purchase frequency = total counted transactions / number of customers.
# Series.sum() adds the column in vectorised code; the Python built-in sum()
# walked the Series one element at a time.
purchase_frequency = uk_data_group["num_transactions"].sum() / len(uk_data_group)
purchase_frequency

91.6146835443038

In [14]:
# Share of customers that have more than one counted transaction.
repeat_rate = (
    len(uk_data_group.loc[uk_data_group["num_transactions"] > 1])
    / len(uk_data_group)
)

In [15]:
# Display the repeat rate computed in the previous cell.
repeat_rate

0.9807594936708861

In [16]:
# Churn rate: the complement of the repeat rate.
churn_rate = 1.0 - repeat_rate

In [17]:
# Show purchase frequency, repeat rate and churn rate together as a tuple.
purchase_frequency,repeat_rate,churn_rate

(91.6146835443038, 0.9807594936708861, 0.019240506329113893)

In [18]:
# Profit per customer, assuming the business makes roughly 5% profit on
# total sales. The column holds an amount (5% of spent_money), not a ratio.
uk_data_group["profit_margin"] = uk_data_group["spent_money"].mul(0.05)

In [19]:
# Customer value scaled by churn:
# (average order value x purchase frequency) / churn rate.
uk_data_group["CLV"] = (
    uk_data_group["avg_order_value"].mul(purchase_frequency).div(churn_rate)
)


In [20]:
# Customer lifetime value: CLV multiplied by the profit_margin column.
# NOTE(review): profit_margin is computed as spent_money * 0.05, i.e. a money
# amount, so this multiplies two monetary quantities rather than applying a
# ratio as the CLTV formula in this notebook describes — confirm the intent.
uk_data_group["cust_lifetime_value"] = uk_data_group["CLV"].mul(
    uk_data_group["profit_margin"]
)


In [21]:
# Label every row with its invoice month as 'Mon-YYYY' (e.g. 'Dec-2010').
# .dt.strftime formats the whole datetime column in one vectorised call
# instead of a Python lambda per row, and assign() builds a new frame, so
# nothing is written into a possible slice of df and the cell can be re-run
# safely.
uk_data = uk_data.assign(month_yr=uk_data["InvoiceDate"].dt.strftime("%b-%Y"))

In [22]:
# Customer x month revenue matrix: one row per CustomerID, one column per
# month_yr label, cells hold the summed TotalPurchase (0 where the customer
# has no rows in that month). reset_index() turns CustomerID back into a
# regular column.
sale = uk_data.pivot_table(
    values="TotalPurchase",
    index="CustomerID",
    columns="month_yr",
    aggfunc="sum",
    fill_value=0,
).reset_index()

In [24]:
# Preview the customer x month revenue matrix. The month columns appear in
# alphabetical order of their 'Mon-YYYY' labels (Apr-2011, Aug-2011,
# Dec-2010, ...), not in calendar order.
sale.head()

month_yr,CustomerID,Apr-2011,Aug-2011,Dec-2010,Dec-2011,Feb-2011,Jan-2011,Jul-2011,Jun-2011,Mar-2011,May-2011,Nov-2011,Oct-2011,Sep-2011
0,12346.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,12747.0,0.0,301.7,706.27,438.5,0.0,303.04,0.0,376.3,310.78,771.31,312.73,675.38,0.0
2,12748.0,1100.37,659.42,4177.68,1070.27,389.64,418.77,1113.27,2006.26,1011.94,2224.42,9295.57,1385.84,4218.65
3,12749.0,0.0,1750.45,0.0,763.06,0.0,0.0,0.0,0.0,0.0,782.1,572.59,0.0,0.0
4,12820.0,0.0,0.0,0.0,210.35,0.0,170.46,0.0,0.0,0.0,0.0,0.0,343.76,217.77
