In [None]:
# Import the libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import PIL as pil


# Load the cleaned retail export; InvoiceDate becomes a parsed datetime index
df = pd.read_csv(
    '../input/retail-data/cleaned_data.csv',
    index_col=['InvoiceDate'],
    parse_dates=True,
    low_memory=False,
)

# The saved CSV carries its old index along as an "Unnamed: ..." column; discard any such column
df = df.drop(columns=list(df.filter(regex="Unname").columns))

# Revenue of a line item = unit price x quantity
df['Revenue'] = df['Price'].mul(df['Quantity'])

# UA of the Quantity Column
# sns.displot(df['Quantity'])

# Question 1: how many invoices are there, and how many rows (line items) does each contain?
# size() counts rows per invoice; the largest invoices are listed first.
invoiceCount = df.groupby('Invoice').size().sort_values(ascending=False)

# Question 2: which customers bought the most from the company (by number of rows)?
customerCount = df.groupby('Customer ID').size().sort_values(ascending=False)

# Question 2a: do we have more large customers, or a long list of small ones?
# Show the histogram instead of clearing it, so the figure is actually rendered.
customerGrid = sns.displot(customerCount, bins=50)
customerGrid.set_axis_labels('Rows per customer', 'Number of customers')
plt.show()

# Response: the distribution is right-skewed -- most customers are small, and a
# handful of very large customers pull the mean upwards.
# How do we know? Compare the median with the mean: when the mean sits well above
# the median, a few large values dominate the average.
print(np.median(customerCount)) # 53.0
print(np.mean(customerCount)) # 137

# Can you give me a count of everything?
#
# countArray ends up with one entry per column of df. Each entry is a Series of
# "how many rows have this value", ordered from the most common value down.
colNames = df.columns.tolist()
countArray = [
    df.groupby([columnName]).size().sort_values(ascending=False)
    for columnName in colNames
]

# print(countArray)

# Boss ask: which country gave us the most revenue?
# numeric_only=True keeps the aggregation to numeric columns; without it newer
# pandas versions also try to "sum" text columns such as Description.
revenueByCountry = (
    df.groupby('Country')
    .sum(numeric_only=True)
    .sort_values('Revenue', ascending=False)
)

# Resampling the data by month (specifically for Revenue and Quantity).
# This relies on df still being indexed by InvoiceDate at this point.
monthResample = df[['Quantity', 'Revenue']].resample('M').sum()

# Plotting the Quantity by month (shown rather than cleared, so the figure renders)
fig, ax = plt.subplots()
ax.plot(monthResample['Quantity'])
ax.set(title='Quantity sold per month', xlabel='Month', ylabel='Quantity')
plt.show()

# Plotting Revenue by month
fig, ax = plt.subplots()
ax.plot(monthResample['Revenue'])
ax.set(title='Revenue per month', xlabel='Month', ylabel='Revenue')
plt.show()

# Unique product descriptions. Missing values are dropped first so that every
# entry can safely be lower-cased.
descriptionSet = list(set(df['Description'].dropna()))

# Getting an idea of the types of products / colours most bought by our customers.
# The descriptions are joined WITH a space between them: joining without a
# separator (or stripping the spaces) would fuse everything into one giant
# "word" and leave the word cloud nothing to count.
lowerDescriptionSet = ' '.join(str(description).lower() for description in descriptionSet)

# Collapse any repeated whitespace so the text is a clean, space-separated list of words
descriptionSetCleaned = ' '.join(lowerDescriptionSet.split())

# With the data prepared, the stop words need to be prepared.
# WordCloud expects a collection of individual words, not one concatenated string.
stopWords = set(STOPWORDS)

# Generating the word cloud from the cleaned description text
wordCloud = WordCloud(stopwords=stopWords, max_words=200).generate(descriptionSetCleaned)

# Having Python render the image (axes carry no meaning for an image, so hide them)
plt.imshow(wordCloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# Resetting the index -- InvoiceDate becomes an ordinary column again, which the
# RFM analysis in the following cells relies on
df = df.reset_index()

In [None]:
# The boss is asking for an RFM score for each customer, to know whether he can treat his customers differently

"""

The point of computing RFM scores is that it gives you an idea of the type of customers that you have.
Each customer is scored on three separate metrics:

(R) - Recency - A measure of how recently they last purchased a product or item (customers are ranked by their most recent order)

(F) - Frequency - A measure of how often they purchase our products

(M) - Monetary - A measure of the overall value of the customer (total amount spent on our products)


"""

# Step 1. Format data

# Drop rows with any missing value. The explicit copy gives an independent
# frame, so the column assignments below cannot hit a view of the old data.
df = df.dropna().copy()

# Store Customer ID as a string label. Going through int first removes the
# trailing ".0" that a float-typed ID would otherwise keep.
df['Customer ID'] = df['Customer ID'].astype(int).astype(str)

# Make sure InvoiceDate holds datetime objects
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])


# Step 2. Gathering the R score!

# Recency needs a benchmark: one reference date that every purchase is measured against
recentcyDate = pd.to_datetime('20120101', format='%Y%m%d')

# Whole days between each purchase and the reference date.
# The subtraction yields timedeltas; .abs() makes the gap positive whichever
# side of the reference date the purchase falls on, and .dt.days extracts the
# day count (casting to "timedelta64[D]" is rejected by pandas 2.x).
df['DaysSincePurchase'] = (df['InvoiceDate'] - recentcyDate).abs().dt.days

# Per customer, keep the smallest gap, i.e. their most recent purchase
Recent = df.groupby('Customer ID', as_index=False)['DaysSincePurchase'].min()

# Rename the column headers, then use Customer ID as the index
Recent.columns = ['Customer ID', 'Recency']

# This final step provides us with a formatted dataframe
Recent_df = Recent.set_index('Customer ID')

print(Recent_df)

# Step 3 -- Gathering the F
#
# Frequency = number of invoices per customer. Two routes are shown:
#
#   groupby route
#     1. group by (Customer ID, Invoice) and sum the revenue -> one row per invoice
#     2. group that result by Customer ID and count the rows
#
#   shortcut route
#     1. drop duplicate rows based on the Invoice column
#     2. value-count the Customer ID column
#     3. turn the counts back into a dataframe

# groupby route, part 1: one row per customer/invoice pair with that invoice's revenue
cId_invoice = df.groupby(['Customer ID', 'Invoice'], as_index=False)['Revenue'].sum()

# groupby route, part 2: rows per customer, stored in a 'Frequency' column indexed by Customer ID
FrequencyGroupBy = cId_invoice.groupby('Customer ID').size().to_frame('Frequency')

print(FrequencyGroupBy)


# shortcut route: keep a single row per invoice ...
Frequency_df = df.drop_duplicates(subset='Invoice')
# ... then count how often each customer appears, as a dataframe
Counts = Frequency_df['Customer ID'].value_counts().to_frame()

# Step 4 - The M
#
# Monetary engagement per customer: group by Customer ID and add up the
# Revenue column. The result is indexed by Customer ID with one 'Monetary' column.
Monetary_df = df.groupby("Customer ID")["Revenue"].sum().to_frame("Monetary")

# blank line to separate this table from the earlier printed output
print()
print(Monetary_df)


In [None]:
# Bringing the three measures together.
#
# Every step above has to be run first. The frames are gathered in a list on
# purpose: swap in your own dataframes (whatever you named them) to build the
# table from different inputs.
rfm_dataFrame_list = [Recent_df, FrequencyGroupBy, Monetary_df]

# Place the frames side by side, aligned on their shared Customer ID index
rfm_data = pd.concat(rfm_dataFrame_list, axis='columns')

# Saved for the clustering part at the end (optional when everything lives in one notebook)
rfm_data.to_csv('RFM_cluster.csv')

In this next section I will be using the **.qcut** function that is provided by pandas.
This is done for two reasons, explained below.

#1 It simplifies binning by working out the bin edges for you.
#2 It allows you to label the bins in whatever fashion you want, as we will see.



In [None]:
# Creating a separate container for the RFM scores.
# .copy() is required for it to really be separate: a plain assignment would
# only alias rfm_data, and every score column added below would land in it too.
rfmScoreTable = rfm_data.copy()

# Labels for the Recency bins.
# They run backwards because a bigger Recency value means a less recent purchase.
R_labels = ['5', '4', '3', '2', '1']

# Creating the Recency Score: five equal-sized bins (quintiles)
rfmScoreTable['Recency Score'] = pd.qcut(rfmScoreTable['Recency'], q=5, labels=R_labels)

# --- REPEATING FOR THE NEXT TWO MEASUREMENTS --- #

# F-Labels
F_labels = ['1', '2', '3', '4', '5']

# Binning Frequency directly would throw an error because of the way the bin
# edges are defined, so the values are ranked first: method='first' breaks
# ties by order of appearance and gives every customer a unique rank.
# (The same ranking could be applied to each measure when automating this.)
fre_ranked = rfmScoreTable['Frequency'].rank(method='first')

# Creating the Frequency Score from the ranks
rfmScoreTable['Frequency Score'] = pd.qcut(fre_ranked, q=5, labels=F_labels)

# M-Labels
M_labels = ['1', '2', '3', '4', '5']
rfmScoreTable['Monetary Score'] = pd.qcut(rfmScoreTable['Monetary'], q=5, labels=M_labels)

# Creating a final score column: the three score digits concatenated as text, in R, F, M order
rfmScoreTable['RFM Total'] = (
    rfmScoreTable['Recency Score'].astype(str)
    + rfmScoreTable['Frequency Score'].astype(str)
    + rfmScoreTable['Monetary Score'].astype(str)
)

# Finished RFM Table
print(rfmScoreTable)

# Segment definitions. Each code is the Recency score digit followed by the
# Frequency score digit, i.e. the first two characters of 'RFM Total'.
# Every value is a real tuple (note the trailing comma on one-element tuples).
segmentCodes = {
    'Hibernating': ('11', '12', '21', '22'),
    'At Risk': ('13', '14', '23', '24'),
    'Cant Lose': ('15', '25'),
    'About to sleep': ('31', '32'),
    'Need Attention': ('33',),
    'Loyal customers': ('34', '35', '44', '45'),
    'Promising': ('41',),
    'New customers': ('51',),
    'Potential loyalist': ('42', '43', '52', '53'),
    'Champions': ('54', '55'),
}

# Invert to a lookup table: two-character code -> segment name
segmentByCode = {
    code: segment
    for segment, codes in segmentCodes.items()
    for code in codes
}

# Label every customer from the first two characters of their RFM Total.
# A code with no entry in the table is marked 'Unclassified' instead of being
# skipped, so there is always exactly one label per row.
rfm_classification = (
    rfmScoreTable['RFM Total']
    .str[:2]
    .map(segmentByCode)
    .fillna('Unclassified')
)

rfmScoreTable['RFM Segment'] = rfm_classification
print(rfmScoreTable)


# Share of customers in each segment, with the slices labelled
segmentCounts = rfmScoreTable['RFM Segment'].value_counts()
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(segmentCounts, labels=segmentCounts.index, autopct='%1.1f%%')
ax.set_title('Customers by RFM segment')
plt.show()

In [None]:
# This section will focus on using clustering to classify instead of what we did with the known clusters above
# We will need to import a couple of things
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d

# Step 1 --- Import the dataframe with the RFM values already established
cluster_df = pd.read_csv('./RFM_cluster.csv', index_col='Customer ID')

# Step 2 -- Define a scaler object and fit it to our data.
# The three measures are selected by name so the scaled columns below cannot
# be mislabelled if the CSV ever carries extra columns or a different order.
rfmColumns = ['Recency', 'Frequency', 'Monetary']
scaler = StandardScaler()
scaled_array = scaler.fit_transform(cluster_df[rfmColumns])

# Step 3 -- Transform the array back into a df, keeping the Customer ID index
# so each scaled row can still be traced to its customer
scaled_df = pd.DataFrame(
    scaled_array,
    index=cluster_df.index,
    columns=['RecencyScaled', 'FrequencyScaled', 'MonetaryScaled'],
)

# Step 4 -- Start the clustering
# Declare the model; a fixed random_state makes the clusters reproducible between runs
model = KMeans(n_clusters=10, random_state=42)
# Fit the model and get the cluster label of every row in one call
labels = model.fit_predict(scaled_df)

# Adding label to the df
scaled_df['Label'] = labels

# Step 5 -- Visualising the transformed data, coloured by the label column
fig = plt.figure(figsize=(8,10))
ax = fig.add_subplot(projection='3d')
ax.scatter(xs = scaled_df['RecencyScaled'], ys = scaled_df['FrequencyScaled'], zs = scaled_df['MonetaryScaled'], c=scaled_df['Label'])
ax.set_xlabel("Recency")
ax.set_ylabel("Frequency")
ax.set_zlabel('Monetary')
ax.set_title('Customer Distrubition')
ax.view_init(30, 60)


In [None]:
# Step 6 -- using the model data

# Getting the cluster centers! One row per cluster, one column per scaled feature.
KC = model.cluster_centers_
ClusterCenter = pd.DataFrame(KC)

print(ClusterCenter)

# Step 7 -- Plotting the cluster centers and giving the 3d plot some more data
fig = plt.figure(figsize=(8,10))
ax = fig.add_subplot(projection='3d')
xs, ys, zs = ClusterCenter[0], ClusterCenter[1], ClusterCenter[2]

# Row i of cluster_centers_ is the center of cluster i, so the cluster ids are
# simply the row positions. (Taking them from the order in which labels happen
# to appear in the data would pair ids with the wrong centers.)
centerIds = np.arange(len(ClusterCenter))
ax.scatter(xs, ys, zs, s=100, c=centerIds)

# Enhancing the visual appeal of the graph

# Text tag for each center: '#' followed by the cluster id
pointLabels = ['#' + str(centerId) for centerId in centerIds]

# Nudge each tag away from its marker (by 10% of the coordinate's magnitude in x and y)
for x, y, z, tag in zip(xs + abs(.1 * xs), ys + abs(.1 * ys), zs, pointLabels):
    ax.text(x, y, z, tag)
ax.set_xlabel("Recency")
ax.set_ylabel("Frequency")
ax.set_zlabel('Monetary')
ax.set_title('Cluster Centers')

plt.show()


In [None]:
# --- This section is to optimize the K cluster through a WCSS and reduce the sum to a reasonable value --- #
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d


def wcssScoreGenerator(df, maxClusters = 10, random_state = 42):
    """Fit KMeans for a range of cluster counts and collect the WCSS of each fit.

    @arg df -- dataframe to fit the models on
    @arg maxClusters -- exclusive upper bound on the cluster count: models are
        fitted for 1, 2, ..., maxClusters - 1 clusters
    @arg random_state -- seed handed to every KMeans model so repeated calls
        give the same scores; pass None for unseeded initialisation
    @return -- list of WCSS values (the model's inertia_), where position i
        belongs to i + 1 clusters; None when no usable dataframe was provided
    """
    # Error control: covers both a missing (None) and an empty dataframe
    if df is None or df.empty:
        print('No dataframe was provided')
        return None

    # One WCSS value per cluster count, in increasing order of cluster count
    wcssScores = []
    for clusterCount in range(1, maxClusters):
        model = KMeans(n_clusters=clusterCount, random_state=random_state)
        model.fit(df)
        wcssScores.append(model.inertia_)

    return wcssScores

# Importing the RFM csv and scaling it
cluster_df = pd.read_csv('./RFM_cluster.csv', index_col='Customer ID')

# Define a scaler object and fit it to our data
scaler = StandardScaler()
scaled_array = scaler.fit_transform(cluster_df)

# Transform the array back into a df, keeping the Customer ID index
scaled_df = pd.DataFrame(
    scaled_array,
    index=cluster_df.index,
    columns=['RecencyScaled', 'FrequencyScaled', 'MonetaryScaled'],
)


Scores = wcssScoreGenerator(scaled_df, 15)

# The first score belongs to 1 cluster, the second to 2, and so on. Plotting
# against these counts (not the 0-based list position) keeps the
# 'Number of clusters' axis truthful.
clusterCounts = range(1, len(Scores) + 1)
plt.plot(clusterCounts, Scores, 'x')
plt.title('The Elbow Point Graph')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
    
#
