In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [None]:
# Read the source CSV into `df`; the path is relative to the notebook's
# working directory.
data_path = "market_seg.csv"
df = pd.read_csv(data_path)

# Feature Engineering: Recency, Frequency, Monetary (RFM) per customer

In [None]:
# Build one Recency / Frequency / Monetary row per CustomerID from `df`.

# Missing values per column. Only the last expression of a cell is rendered,
# so move this line to its own cell if the counts should actually be shown.
df.isna().sum()

# Drop rows with any missing value (rebinding instead of mutating in place).
df = df.dropna()

# Rows whose invoice number contains "C" are flagged as cancellations (per the
# variable name) and excluded. Casting to str first keeps the mask strictly
# boolean even if pandas parsed some or all invoice numbers as integers.
cancellation = df["InvoiceNo"].astype(str).str.contains("C")
# .copy() yields an independent frame so the column assignments below do not
# trigger SettingWithCopyWarning.
df = df.loc[~cancellation].copy()

df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"])
# NOTE(review): "now" moves on every run, so absolute Recency values are not
# reproducible between runs; consider a fixed snapshot date.
today = pd.Timestamp.now()
# Whole days elapsed between each invoice and `today` (vectorised).
df["Days"] = (today - df["InvoiceDate"]).dt.days
df["Cost"] = df["Quantity"] * df["UnitPrice"]

by_customer = df.groupby("CustomerID")
# Smallest day gap per customer, i.e. days since the most recent invoice.
Recency = by_customer["Days"].min().to_frame()
# Number of distinct invoice numbers per customer.
Frequency = by_customer["InvoiceNo"].nunique().to_frame()
# Sum of Quantity * UnitPrice per customer.
Monetary = by_customer["Cost"].sum().to_frame()

# Feature matrix indexed by CustomerID, one column per RFM component.
X = pd.concat((Recency, Frequency, Monetary), axis=1)
X.columns = ["Recency", "Frequency", "Monetary"]


# Outlier handling (TODO: no outlier treatment is applied yet)

# Clustering without scaling

In [None]:
# Fit k-means on X as-is (no scaling is applied in this cell).
n_clusters = 3
# random_state pins the centroid initialisation so the clustering is the same
# on every run; n_init is set explicitly because its default differs between
# scikit-learn versions.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(X)
# Cluster centres as a table, labelled with the feature names of X.
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)

# Clustering with scaling

In [None]:
# Standardise the features, then fit k-means on the scaled values.
# NOTE: X is overwritten with its scaled version, so cells that expect the
# unscaled features must be run before this one.
scaler = StandardScaler()
# Keep the original row index (fit_transform returns a bare ndarray) so that
# cluster labels can still be matched back to the rows of the input frame.
X = pd.DataFrame(
    scaler.fit_transform(X),
    columns=["Recency", "Frequency", "Monetary"],
    index=X.index,
)

n_clusters = 3
# random_state makes the clustering reproducible; n_init is set explicitly
# because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(X)
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
labels = kmeans.labels_

# Fraction of samples with a negative silhouette coefficient.
negative_silhouette_share = np.sum(silhouette_samples(X, labels) < 0) / X.shape[0]
# Mean silhouette coefficient over all samples.
mean_silhouette = silhouette_score(X, labels)

# Only the last expression of a cell is rendered, so both metrics are
# returned together instead of silently discarding the first one.
pd.Series(
    {
        "negative_silhouette_share": negative_silhouette_share,
        "silhouette_score": mean_silhouette,
    }
)

# Elbow method: choosing the number of clusters

In [None]:
# Elbow plot: inertia (within-cluster sum of squared distances) for 1 to 10
# clusters fitted on X.
cluster_range = range(1, 11)
ssd = []
for k in cluster_range:
    # Fixed random_state / explicit n_init so the curve is identical on every run.
    # NOTE: this rebinds `kmeans`; after the loop it is the 10-cluster model.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X)
    ssd.append(kmeans.inertia_)

plt.plot(cluster_range, ssd, c="r", marker="o")
plt.ylabel("Sum Of Squared Distances")
plt.xlabel("Number Of Clusters")
# Render the figure explicitly instead of echoing the Text repr of xlabel().
plt.show()