# Customer Lifetime Value
In this notebook we are going to calculate Customer Lifetime Value in Python. This notebook follows the steps of the tutorial made available by DataCamp and DigitalJ2

Source: https://www.datacamp.com/community/tutorials/customer-life-time-value

Source: https://blog.digitalj2.com/a-simple-customer-lifetime-value-formula

In [1]:
# Library import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pandas.util import hash_pandas_object

In [2]:
# Read data
data_woo = pd.read_csv("./data/data_woo.csv")
data_ccv = pd.read_csv("./data/data_ccv.csv")
combined_data = pd.concat([data_woo, data_ccv])

In [3]:
# Convert date datatypes
combined_data["today"] = "2020-06-18"
combined_data["today"] = pd.to_datetime(combined_data["today"])
combined_data["last_order_date"] = pd.to_datetime(combined_data["last_order_date"], errors="coerce")
combined_data["days_since_order"] = combined_data["today"] - combined_data["last_order_date"]
combined_data["days_since_order"] = combined_data["days_since_order"].astype(str)
combined_data["days_since_order"] = combined_data["days_since_order"].str.split("days").str[0]

In [4]:
# Create lambda to join order numbers
combine_text = lambda x: ", ".join(x.unique())

# combined_data["days_since_order"] = combined_data["days_since_order"].astype(int)

# Combine customer data in case they are in both datasets
data = combined_data.groupby(["email"]).agg({"first_order_date":np.min,"last_order_date":np.max,"order_number":combine_text,"orders":"sum","quantity":"sum","revenue":"sum"}).reset_index()
# data.columns=["email","revenue","quantity","order_number"]

In [5]:
# Drop unnecessary column
drop_cols = ["order_number","orders","revenue","quantity","first_order_date","last_order_date"]
combined_data = combined_data.drop(drop_cols,axis=1)

data = pd.merge(data,combined_data,how="left",on="email")

In [6]:
# Drop duplicates (Some customers have duplicate info because they have other adress)
before = len(data)
data = data.sort_values(by="days_since_order",ascending=False)
data = data.drop_duplicates(subset="email", keep="first")
after = len(data)
print("Number of rows dropped",str(before-after))

Number of rows dropped 4


In [7]:
# Check NaN values
data.isnull().sum()

email               0
first_order_date    0
last_order_date     2
order_number        0
orders              0
quantity            0
revenue             0
postal_code         2
city                2
country             0
today               0
days_since_order    0
dtype: int64

In [8]:
# Last order data is sometimes empty
data["last_order_date"] = data["last_order_date"].fillna(data["first_order_date"])
  
# data[(data["last_order_date"].isnull()==True)]

In [9]:
# Change datatypes
data["first_order_date"] = pd.to_datetime(data["first_order_date"])
data["last_order_date"] = pd.to_datetime(data["last_order_date"])

In [10]:
data["id"] = hash_pandas_object(data["email"])
data["order_ids"] = hash_pandas_object(data["order_number"])
data = data.drop(["email","order_number"],axis=1)
data

Unnamed: 0,first_order_date,last_order_date,orders,quantity,revenue,postal_code,city,country,today,days_since_order,id,order_ids
41,2017-07-25 08:03:53,2017-07-25 08:03:53,4,9,40.57,1509GP,Zaandam,NL,2020-06-18,NaT,2239494964570881686,1478727216377870648
412,2018-02-21 20:40:44,2018-02-21 20:40:44,4,7,93.79,2151KV,Nieuw-Vennep,NL,2020-06-18,NaT,2236530259561204867,7439576930578883431
268,2017-09-26 08:46:32,2017-09-26 08:46:32,5,6,254.50,7361ET,Beekbergen,NL,2020-06-18,995,16345102021806026516,376077500108802426
426,2017-09-27 15:35:54,2017-09-27 15:35:54,1,1,5.37,6216BW,Maastricht,NL,2020-06-18,994,13468216973299895554,975545377752792286
348,2020-03-10 14:10:00,2020-03-10 14:10:00,1,3,29.00,3024 AC,Rotterdam,NL,2020-06-18,99,1535920729230190226,1292319315654061820
...,...,...,...,...,...,...,...,...,...,...,...,...
486,2017-09-15 17:02:52,2017-09-15 17:02:52,1,2,5.94,1051XS,Amsterdam,NL,2020-06-18,1006,12245595291383653070,15020420061133478993
444,2020-03-09 12:50:00,2020-03-09 12:50:00,1,3,29.00,8911 DS,Leeuwarden,NL,2020-06-18,100,2923691820477789580,9875154782655611572
258,2020-03-09 21:22:00,2020-03-09 21:22:00,1,3,29.00,2991 LD,Barendrecht,NL,2020-06-18,100,11955326381462714367,226852968295271807
316,2020-06-07 21:56:00,2020-06-07 21:56:00,1,2,23.00,1016 HZ,Amsterdam,NL,2020-06-18,10,6696390494039661292,15993959785218874342


In [11]:
# Export data to csv
data.to_csv("./data/data.csv", index=False)