In [11]:
# Import Data
import pandas as pd
from pre_processing import pre_processing

# Load the raw transactions (file is read with ISO-8859-1 encoding) and parse
# the invoice timestamp so date arithmetic works below.
dataframe = pd.read_csv("Online Retail.csv", encoding="ISO-8859-1")
dataframe["InvoiceDate"] = pd.to_datetime(dataframe["InvoiceDate"])


# Calculate RFM
# Row-level amount: quantity times unit price.
dataframe['TotalSum'] = dataframe['Quantity'] * dataframe['UnitPrice']

# Reference date for recency = latest invoice timestamp in the data.
# Series.max() is vectorized and skips NaT, unlike the Python builtin max().
snapshot_date = dataframe['InvoiceDate'].max()
# NOTE(review): with this snapshot, customers whose last purchase is less than
# one day before it get Recency == 0 -- confirm pre_processing tolerates zeros.

# One row per CustomerID:
#   Recency   - whole days between snapshot_date and the customer's last invoice
#   Frequency - number of non-null InvoiceNo rows (not distinct invoices)
#   Monetary  - sum of TotalSum
# The last-invoice date is aggregated with the built-in 'max' and converted to
# days in a single vectorized step instead of a per-group Python lambda.
datamart = (
    dataframe.groupby('CustomerID')
    .agg({'InvoiceDate': 'max', 'InvoiceNo': 'count', 'TotalSum': 'sum'})
    .rename(
        columns={'InvoiceDate': 'Recency', 'InvoiceNo': 'Frequency', 'TotalSum': 'Monetary'}
    )
    .assign(Recency=lambda rfm: (snapshot_date - rfm['Recency']).dt.days)
    .round(1)
)

# Normalize Data
# pre_processing is a project-local helper; its printed result is a 2-D array
# with one row per customer.
datamart_normalized = pre_processing(datamart)
print(datamart_normalized)

[[ 1.15675823 -2.23027241 -6.28046783]
 [-1.84932793  1.13341891  1.10386252]
 [ 0.3876675  -0.1864624   0.6060052 ]
 ...
 [-0.8379603  -0.83449302 -0.7145243 ]
 [-1.27833502  2.19530371  0.69324521]
 [ 0.09328903  0.42090602  0.61856518]]


In [12]:
# Practical implementation of k-means clustering

from sklearn.cluster import KMeans

# Fit a 2-cluster k-means model on the normalized RFM features.
# random_state fixes the centroid seeding. n_init is pinned to 10 (the
# historical default) because scikit-learn 1.4 changed the default to 'auto',
# which would otherwise alter the fitted labels/centers on newer versions.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=1)
kmeans.fit(datamart_normalized)

# One cluster id per row of datamart_normalized, in the same row order.
cluster_labels = kmeans.labels_
print(cluster_labels)
print(kmeans.cluster_centers_)


[1 0 1 ... 1 0 0]
[[-0.61546258  0.81083453  0.62331087]
 [ 0.5071083  -0.66808436 -0.51357488]]


In [13]:
# Analyze Clusters

# Attach each customer's cluster id to a copy of the RFM table; `datamart`
# itself is left untouched.
datamart_RFM_K2 = datamart.copy()
datamart_RFM_K2['Cluster'] = cluster_labels

# Per-cluster profile: average Recency / Frequency / Monetary, plus the number
# of customers in each cluster (count of Monetary values).
cluster_profile_spec = {
    'Recency': 'mean',
    'Frequency': 'mean',
    'Monetary': ['mean', 'count'],
}
cluster_profile = datamart_RFM_K2.groupby('Cluster').agg(cluster_profile_spec)
cluster_profile.round(1)

Unnamed: 0_level_0,Recency,Frequency,Monetary,Monetary
Unnamed: 0_level_1,mean,mean,mean,count
Cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,30.2,177.2,3670.2,1978
1,141.3,23.5,434.6,2394
