In [1]:
import pandas as pd
import sklearn as sk
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import chart_studio.plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go
from databricks import koalas as ks
from pyspark.sql import SparkSession

# Initialise Plotly's offline notebook mode so figures render inline
pyoff.init_notebook_mode()
# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook
pd.options.mode.chained_assignment = None

# Load the master table from the working directory
df = pd.read_csv("master_table.csv")

# Show column dtypes, then the first five rows
for preview in (df.dtypes, df.head(5)):
    print(preview)

In [2]:
# Drop the index column written out by the CSV export and parse the order date
df = df.drop(columns=['Unnamed: 0']).astype({'Order_Date_Id': 'datetime64[ns]'})
print(df.dtypes)
print(df.shape)

##### RFM Feature engineering #####

# Keep only rows whose sales price is non-zero
df = df[df['Sales_Price'] != 0]
print(df.shape)

# Remove exact duplicate rows
df = df.drop_duplicates()
print(df.shape)

In [3]:
### Recency ###

# One row per distinct customer ID; the RFM features are merged onto this frame later
df_cust = pd.DataFrame({'Customer_Id': df['Customer_Id'].unique()})
print(df_cust)

In [4]:
# Latest order date per customer, kept in its own frame
df_mp = (
    df.groupby('Customer_Id')['Order_Date_Id']
    .max()
    .reset_index()
    .rename(columns={'Order_Date_Id': 'MostRecent'})
)
print(df_mp.head(10))

In [5]:
# Recency = whole days between a customer's last order and the newest order in the dataset
latest_order = df_mp['MostRecent'].max()
df_mp['Recency'] = (latest_order - df_mp['MostRecent']).dt.days

# Attach Recency to the customer table
df_cust = df_cust.merge(df_mp[['Customer_Id', 'Recency']], on='Customer_Id')

for summary in (df_cust.head(10), df_cust.describe()):
    print(summary)

In [6]:
# Histogram of the Recency values across customers
plot_layout = go.Layout(title='Recency')
plot_recency = [go.Histogram(x=df_cust['Recency'])]

fig = go.Figure(data=plot_recency, layout=plot_layout)
pyoff.iplot(fig)

In [7]:
# Find optimal clusters for Recency score, using elbow method
from sklearn.cluster import KMeans

# Inertia (within-cluster sum of squared distances) for each candidate k.
sse = {}
df_recency = df_cust[['Recency']]
for k in range(1, 10):
    # Fit on the Recency column only. The labels are deliberately NOT written
    # back into df_recency: doing so would turn the previous k's labels into a
    # second input feature for every later fit and distort the inertia curve.
    # random_state pins the centroid initialisation so the curve is repeatable.
    kmeans = KMeans(n_clusters=k, max_iter=1000, random_state=0).fit(df_recency)
    sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of clusters")
plt.ylabel("Inertia (SSE)")
plt.show()

In [8]:
# Build 3 clusters (elbow method) for Recency and add it to df.
# A fixed random_state keeps the centroid initialisation, and therefore the
# resulting cluster assignment, repeatable across kernel restarts.
kmeans = KMeans(n_clusters=3, random_state=0)
# fit_predict fits the model and returns the label of every training row in one call
df_cust['RecencyCluster'] = kmeans.fit_predict(df_cust[['Recency']])

# Function for re-numbering clusters by their mean target value (for Recency with ascending=False: 0 = longest time since last order, 2 = most recent)
def order_cluster(cluster_field_name, target_field_name, df, ascending):
    """Renumber cluster labels according to the mean of a target column.

    Clusters are ranked by their mean ``target_field_name``; the cluster that
    comes first in that ordering is relabelled 0, the next one 1, and so on.

    Parameters
    ----------
    cluster_field_name : str
        Column holding the current (arbitrary) cluster labels.
    target_field_name : str
        Column whose per-cluster mean defines the ordering.
    df : pandas.DataFrame
        Input frame; it is not modified.
    ascending : bool
        True  -> the cluster with the smallest mean becomes 0.
        False -> the cluster with the largest mean becomes 0.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index and rows in input order. The
        cluster column holds the rank and is placed as the last column.
    """
    # Rows without a cluster label cannot be ranked, so they are left out.
    labelled = df.dropna(subset=[cluster_field_name])

    cluster_means = labelled.groupby(cluster_field_name)[target_field_name].mean()
    ordered_labels = cluster_means.sort_values(ascending=ascending).index
    rank_of_label = {label: rank for rank, label in enumerate(ordered_labels)}

    # Map labels straight to their rank. This avoids a helper column named
    # 'index', which would clash with an existing 'index' column in df.
    ranks = labelled[cluster_field_name].map(rank_of_label)

    df_final = labelled.drop(columns=[cluster_field_name]).reset_index(drop=True)
    # Assign positionally (as an array) so the fresh index does not misalign the ranks.
    df_final[cluster_field_name] = ranks.to_numpy()
    return df_final

# Renumber the Recency clusters by descending mean Recency
df_cust = order_cluster(
    cluster_field_name='RecencyCluster',
    target_field_name='Recency',
    df=df_cust,
    ascending=False,
)
print(df_cust)

# Descriptive statistics of Recency within each cluster
df_cust.groupby('RecencyCluster').Recency.describe()

In [9]:
### Frequency ###

# Number of rows with an order date per customer
df_freq = (
    df.groupby('Customer_Id')['Order_Date_Id']
    .count()
    .reset_index()
    .rename(columns={'Order_Date_Id': 'Frequency'})
)
print(df_freq)

# Attach Frequency to the customer table
df_cust = df_cust.merge(df_freq, on='Customer_Id')
print(df_cust.head(10))

In [10]:
# Customer table without the Recency columns, used for the Frequency summary and plot
df_cust2 = df_cust.drop(['Recency', 'RecencyCluster'], axis=1)
print(df_cust2.describe())

In [11]:
# Frequency histogram
# Only x is supplied: with the default count aggregation the bar heights are
# the number of customers per Frequency bin, so a y series adds nothing and
# would needlessly embed every Customer_Id in the figure payload.
plot_freq = [
    go.Histogram(
        x=df_cust2['Frequency']
    )
]

plot_layout = go.Layout(
        title='Frequency'
    )
fig = go.Figure(data=plot_freq, layout=plot_layout)
pyoff.iplot(fig)

In [12]:
# Find number of clusters for Frequency
# Inertia (within-cluster sum of squared distances) for each candidate k.
sse = {}
df_frequency = df_cust[['Frequency']]
for k in range(1, 10):
    # Fit on the Frequency column only. The labels are deliberately NOT written
    # back into df_frequency: doing so would turn the previous k's labels into a
    # second input feature for every later fit and distort the inertia curve.
    # random_state pins the centroid initialisation so the curve is repeatable.
    kmeans = KMeans(n_clusters=k, max_iter=1000, random_state=0).fit(df_frequency)
    sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of clusters")
plt.ylabel("Inertia (SSE)")
plt.show()

In [13]:
# Build 3 clusters (elbow method) for Frequency and add it to df.
# A fixed random_state keeps the centroid initialisation, and therefore the
# resulting cluster assignment, repeatable across kernel restarts.
kmeans = KMeans(n_clusters=3, random_state=0)
# fit_predict fits the model and returns the label of every training row in one call
df_cust['FrequencyCluster'] = kmeans.fit_predict(df_cust[['Frequency']])

# Order Frequency clusters (0 = lowest mean Frequency, 2 = highest)
df_cust = order_cluster('FrequencyCluster', 'Frequency', df_cust, True)
print(df_cust)

# Summary about Frequency Clusters
df_cust.groupby('FrequencyCluster')['Frequency'].describe()

In [14]:
### Revenue ###

# Row-level revenue (price times quantity), then the total per customer
df['Revenue'] = df['Sales_Price'] * df['Quantity']
df_rev = df.groupby('Customer_Id')['Revenue'].sum().reset_index()

# Attach Revenue to the customer table
df_cust = df_cust.merge(df_rev, on='Customer_Id')
print(df_cust.head(10))

In [15]:
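# NOTE: left disabled. 'RevenueCluster' and 'OverallScore' are only created in later cells, so this drop would raise a KeyError if run at this point.
#df_cust3 =  df_cust.drop(columns = ['Recency', 'RecencyCluster', 'Frequency', 'FrequencyCluster', 'RevenueCluster', 'OverallScore'], axis=1)
#print(df_cust3.describe())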

In [16]:
# Histogram of total revenue per customer
plot_layout = go.Layout(title='Monetary Value')
plot_rev = [go.Histogram(x=df_cust['Revenue'])]

fig = go.Figure(data=plot_rev, layout=plot_layout)
pyoff.iplot(fig)

In [17]:
# Find number of clusters for Revenue
# Inertia (within-cluster sum of squared distances) for each candidate k.
sse = {}
df_revenue = df_cust[['Revenue']]
for k in range(1, 10):
    # Fit on the Revenue column only. The labels are deliberately NOT written
    # back into df_revenue: doing so would turn the previous k's labels into a
    # second input feature for every later fit and distort the inertia curve.
    # random_state pins the centroid initialisation so the curve is repeatable.
    kmeans = KMeans(n_clusters=k, max_iter=1000, random_state=0).fit(df_revenue)
    sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of clusters")
plt.ylabel("Inertia (SSE)")
plt.show()

In [18]:
# Build 3 clusters (elbow method) for Revenue and add it to df.
# A fixed random_state keeps the centroid initialisation, and therefore the
# resulting cluster assignment, repeatable across kernel restarts.
kmeans = KMeans(n_clusters=3, random_state=0)
# fit_predict fits the model and returns the label of every training row in one call
df_cust['RevenueCluster'] = kmeans.fit_predict(df_cust[['Revenue']])

# Order Revenue clusters (0 = lowest mean Revenue, 2 = highest)
df_cust = order_cluster('RevenueCluster', 'Revenue', df_cust, True)
print(df_cust)

# Summary about Revenue Clusters
df_cust.groupby('RevenueCluster')['Revenue'].describe()

In [19]:
# Overall score = sum of the three ordered cluster numbers; then compare mean R/F/M per score
RFM = ['Recency','Frequency','Revenue']
cluster_columns = ['RecencyCluster', 'FrequencyCluster', 'RevenueCluster']
recency_score, frequency_score, revenue_score = (df_cust[col] for col in cluster_columns)
df_cust['OverallScore'] = recency_score + frequency_score + revenue_score
print(df_cust.groupby('OverallScore')[RFM].mean())

In [20]:
# Label every customer by score band: default Low-Value, above 2 Mid-Value, above 4 High-Value
segment_floor = {'Mid-Value': 2, 'High-Value': 4}
df_cust['Segment'] = 'Low-Value'
# Applied in ascending threshold order so the higher band overwrites the lower one
for label, floor in segment_floor.items():
    df_cust.loc[df_cust['OverallScore'] > floor, 'Segment'] = label
print(df_cust['Segment'].value_counts())