In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing, metrics, cluster

%matplotlib inline

In [None]:
# Load the raw transaction file, decoding it as windows-1252, and preview it.
csv_path = "../input/onlineretail/OnlineRetail.csv"
df = pd.read_csv(csv_path, encoding="windows-1252")
df.head()

In [None]:
# Print column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

In [None]:
# Per-column flag: True when the column contains at least one missing value.
df.isna().any()

In [None]:
# Rows without a CustomerID cannot be attributed to a customer, so keep only
# the rows where the identifier is present, then re-inspect the frame.
has_customer = df["CustomerID"].notna()
df = df[has_customer]
df.info()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Summary statistics of the numeric columns (before the Quantity filter below).
df.describe()

In [None]:
# Keep only rows with a strictly positive Quantity.
# The explicit .copy() makes `df` an independent frame rather than a filtered
# slice of the previous one; without it, the column assignments made in later
# cells (InvoiceDate, TotalPrice) raise SettingWithCopyWarning.
positive_quantity = df["Quantity"] > 0
df = df[positive_quantity].copy()
df.info()

In [None]:
# Summary statistics again, now that non-positive quantities have been removed.
df.describe()

In [None]:
# Preview the first rows of the filtered frame.
df.head()

In [None]:
# Convert InvoiceDate to datetime64 so date arithmetic works downstream.
# `assign` returns a new frame instead of writing into `df` through attribute
# access: attribute-style column assignment is discouraged by pandas, and
# writing into a filtered slice raises SettingWithCopyWarning.
df = df.assign(InvoiceDate=pd.to_datetime(df["InvoiceDate"]))

In [None]:
# Line-level revenue: quantity times unit price.
# Built with `assign` (new frame, column appended last) rather than item
# assignment, so no SettingWithCopyWarning is raised when `df` is a filtered
# slice of an earlier frame.
df = df.assign(TotalPrice=df["Quantity"] * df["UnitPrice"])
df.head()

In [None]:
# Most recent invoice timestamp in the data; used as the reference point
# when computing recency.
last_date = df["InvoiceDate"].max()
last_date

In [None]:
# Collapse the transaction lines into one row per customer.
# Built-in reducers replace the per-group Python lambdas: they run in compiled
# code and give the same values ("size" == len(values), "sum" == np.sum up to
# floating-point rounding).
#   InvoiceDate -> latest purchase timestamp (converted to recency in days below)
#   InvoiceNo   -> number of rows for the customer
#   TotalPrice  -> total amount spent
# NOTE(review): "size" counts invoice *lines*, so a single invoice with many
# products counts many times. If frequency is meant to be the number of
# distinct purchases, "nunique" would be the reducer to use -- confirm intent.
rfm = df.groupby("CustomerID").agg({
    "InvoiceDate": "max",
    "InvoiceNo": "size",
    "TotalPrice": "sum",
})

# Recency: whole days between the customer's last purchase and the last
# purchase in the whole data set.
rfm["InvoiceDate"] = (last_date - rfm["InvoiceDate"]).dt.days

rfm.head()

In [None]:
# Positional rename: the aggregated columns are, in order, the recency,
# frequency and monetary values.
rfm_column_names = ["recency", "frequency", "monetary"]
rfm.columns = rfm_column_names
rfm.head()

In [None]:
# Percentile cut points 20, 40, 60, 80, 100 used to bucket values into quintiles.
quantiles = np.arange(20, 101, 20)
quantiles

In [None]:
def _quintile_index(values):
    """Return the 0-based quintile bucket (0..4) of every entry in `values`.

    Bin edges are the percentiles of `values` listed in `quantiles`; with
    ``right=True`` a value equal to an edge falls into the lower bucket.
    """
    edges = np.percentile(values, quantiles)
    return np.digitize(values, bins=edges, right=True)


# Recency is reversed (lower recency value -> higher score); frequency and
# monetary scores grow with the underlying value. All scores end up in 1..5.
rfm["r_score"] = 5 - _quintile_index(rfm.recency)
rfm["m_score"] = _quintile_index(rfm.monetary) + 1
rfm["f_score"] = _quintile_index(rfm.frequency) + 1

rfm.head()

In [None]:
# Reproducible (fixed-seed) random sample of 10 rows to eyeball the scores.
rfm.sample(10, random_state=123)

In [None]:
# Standardise the three score columns (zero mean, unit variance) so that each
# contributes equally to the Euclidean distances used by K-means.
scaler = preprocessing.StandardScaler()
score_columns = ["r_score", "f_score", "m_score"]
X = scaler.fit_transform(rfm[score_columns].values.astype("float32"))
X

In [None]:
# Elbow method: fit K-means for k = 2..9 and record the inertia of each fit.
inertias = {}
for k in range(2, 10):
    kmeans = cluster.KMeans(n_clusters=k, random_state=1).fit(X)
    inertias[k] = kmeans.inertia_

# Inertia against k; the bend in the curve suggests a reasonable cluster count.
ax = pd.Series(inertias).plot()
ax.set_xlabel("K (num of clusters)")
ax.set_ylabel("Inertia Score")

In [None]:
# Fit the final model with 5 clusters and store each row's cluster label.
k = 5
kmeans = cluster.KMeans(n_clusters=k, random_state=1).fit(X)
rfm["cluster"] = kmeans.labels_

In [None]:
# Number of customers assigned to each cluster.
rfm.cluster.value_counts()

In [None]:
# Euclidean distance (in scaled feature space) from every customer to the
# centroid of the cluster it was assigned to.
rfm["distance"] = 0.0
for i in range(k):
    # Plain boolean array so the same mask indexes both `X` and `rfm`.
    in_cluster = (rfm["cluster"] == i).to_numpy()
    centroid = kmeans.cluster_centers_[i].reshape(1, -1)
    cluster_points = X[in_cluster]
    # Single .loc write instead of chained `rfm["distance"][mask] = ...`:
    # the chained form raises SettingWithCopyWarning and, with pandas
    # copy-on-write enabled, writes to a temporary and leaves `rfm` unchanged.
    rfm.loc[in_cluster, "distance"] = metrics.euclidean_distances(
        centroid, cluster_points
    ).flatten()

# Fixed seed so the displayed sample is reproducible, matching the earlier
# `rfm.sample(10, random_state=123)` cell.
rfm.sample(20, random_state=123)

In [None]:
# Per cluster: mean distance of members to their centroid, and member count.
rfm.groupby("cluster")["distance"].agg(["mean", "count"])