In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under /kaggle/input.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
from datetime import datetime, date, time
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans

In [None]:
# Load the raw transaction data into a DataFrame.
# NOTE(review): cp874 is an unusual choice of code page — confirm it is the
# intended encoding for this file.
raw_csv_path = '../input/ecommerce-data/data.csv'
df = pd.read_csv(raw_csv_path, encoding='cp874')

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Summary statistics of the raw data, before any filtering.
df.describe()

In [None]:
# Keep only usable transactions: drop rows with any missing field, then
# require both a positive quantity and a positive unit price.
# NOTE(review): presumably non-positive quantities are the returned items —
# confirm against the data.
df_valid = df.dropna().loc[
    lambda rows: rows['Quantity'].gt(0) & rows['UnitPrice'].gt(0)
]
df_valid.describe()

In [None]:
# Reduce to one row per distinct (invoice, customer) pair so that every
# invoice is counted once regardless of how many line items it has.
invoice_customer_cols = ['InvoiceNo', 'CustomerID']
df_per_invoice = df_valid[invoice_customer_cols].drop_duplicates()
df_per_invoice.describe()

In [None]:
# Shopping frequency: number of distinct invoices per customer.
# The count is stored in the 'InvoiceNo' column of the result.
shopping_freq = (
    df_per_invoice
    .groupby('CustomerID')['InvoiceNo']
    .count()
    .reset_index()
)
shopping_freq.head()

In [None]:
# Distribution of the per-customer invoice counts.
shopping_freq.describe()

In [None]:
# Distinct (customer, invoice timestamp) pairs, with the timestamp parsed
# from text into a datetime column.
df_time = df_valid[['CustomerID', 'InvoiceDate']].drop_duplicates()
df_time = df_time.assign(InvoiceDate=pd.to_datetime(df_time['InvoiceDate']))

In [None]:
# Keep one row per customer: the timestamp of their most recent purchase.
# drop_duplicates(subset='CustomerID') would keep whichever row happens to
# come first in the frame, which is not necessarily the latest date, so the
# recency computed from it would be measured from an arbitrary purchase.
# Taking the per-customer maximum fixes that and is safe to re-run.
df_time = df_time.groupby('CustomerID', as_index=False)['InvoiceDate'].max()

In [None]:
# Reference point for recency: the latest InvoiceDate present in df_time.
most_recent_time = df_time['InvoiceDate'].max()
print(most_recent_time)

In [None]:
# Time elapsed between the reference timestamp and each row's InvoiceDate.
elapsed = most_recent_time - df_time['InvoiceDate']
df_time['NormDate'] = elapsed

In [None]:
# Summary of df_time after adding the NormDate column.
df_time.describe()

In [None]:
# Reduce each elapsed time to its whole-day component, then give the column
# its final name, 'LatestDay'.
df_time['NormDate'] = df_time['NormDate'].dt.days
df_time = df_time.rename(columns={'NormDate': 'LatestDay'})
df_time.describe()

In [None]:
# Preview the first rows of df_time, which now includes the LatestDay column.
df_time.head()

In [None]:
# The raw timestamp is no longer needed once LatestDay has been derived.
df_time = df_time.drop(columns=['InvoiceDate'])

In [None]:
# Summary of df_time after dropping the InvoiceDate column.
df_time.describe()

In [None]:
# Columns needed to compute how much each customer spent.
spending_cols = ['CustomerID', 'InvoiceNo', 'Quantity', 'UnitPrice']
df_spending = df_valid.loc[:, spending_cols]

In [None]:
# Preview the first rows of the columns selected for the spending calculation.
df_spending.head()

In [None]:
# Line-item value is quantity * unit price; summing it per customer gives
# the total amount each customer spent.
# assign() builds a new frame instead of writing a column into df_spending,
# which was sliced from df_valid; the in-place write triggers pandas'
# SettingWithCopyWarning.
df_spending = df_spending.assign(
    MoneySpent=df_spending['Quantity'] * df_spending['UnitPrice']
)
df_spending = df_spending.groupby('CustomerID')[['MoneySpent']].sum().reset_index()
df_spending.head()

In [None]:
# Distribution of total spend per customer.
df_spending.describe()

In [None]:
# Remove fully duplicated rows from each of the three per-customer frames
# before they are merged.
shopping_freq, df_time, df_spending = (
    frame.drop_duplicates()
    for frame in (shopping_freq, df_time, df_spending)
)

In [None]:
# Re-check the frequency frame after de-duplication.
shopping_freq.describe()

In [None]:
# Re-check the recency frame after de-duplication.
df_time.describe()

In [None]:
# Re-check the spending frame after de-duplication.
df_spending.describe()

In [None]:
# Combine frequency, recency and spend into a single frame (merge joins on
# the columns the frames have in common), then derive the average amount
# spent per invoice.
df_processed = (
    shopping_freq
    .merge(df_time)
    .merge(df_spending)
)
df_processed['SpendingPerInvoice'] = df_processed['MoneySpent'].div(df_processed['InvoiceNo'])

In [None]:
# Total spend has been folded into SpendingPerInvoice, so drop the raw total.
df_processed = df_processed.drop(columns=['MoneySpent'])

In [None]:
# Summary statistics of the combined per-customer features.
df_processed.describe()

In [None]:
# Preview the first rows of the combined per-customer features.
df_processed.head()

In [None]:
# 25th, 50th and 75th percentile of the per-customer invoice count; these
# are the cut points used to score frequency.
q1, q2, q3 = df_processed['InvoiceNo'].quantile([0.25, 0.50, 0.75])

In [None]:
# Score every customer from 1 to 4 on recency (R), spend per invoice (M)
# and frequency (F).
score_labels = [1, 2, 3, 4]

df_quartiles = pd.DataFrame()
# R and M: equal-population quartiles. The labels are cast to plain integers
# so the frame handed to the clustering step is numeric, not categorical.
df_quartiles['R'] = pd.qcut(df_processed['LatestDay'], 4, labels=score_labels).astype(int)
df_quartiles['M'] = pd.qcut(df_processed['SpendingPerInvoice'], 4, labels=score_labels).astype(int)

# F: bin the invoice count against the precomputed cut points q1, q2, q3.
# searchsorted(side='left') counts how many cut points lie strictly below a
# value: 0 when f <= q1, 1 when q1 < f <= q2, 2 when q2 < f <= q3, 3 otherwise.
# Adding 1 gives the 1..4 score the original if/elif chain produced, without
# a Python-level loop or a temporary column.
invoice_counts = df_processed['InvoiceNo'].to_numpy()
df_quartiles['F'] = np.searchsorted([q1, q2, q3], invoice_counts, side='left') + 1

In [None]:
# Display the R / M / F score table.
df_quartiles

In [None]:
# Compare candidate cluster counts (2..9) by their mean silhouette score.
# Each entry of `sils` is [score rounded to 4 decimals, number of clusters].
# random_state is pinned so the scores are reproducible across runs, and
# n_init is set explicitly so the number of restarts does not depend on the
# installed scikit-learn version's default.
sils = []
for n_clusters in range(2, 10):
    model = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    labels = model.fit_predict(df_quartiles)
    sil_avg = round(float(silhouette_score(df_quartiles, labels)), 4)
    sils.append([sil_avg, n_clusters])
print(sils)

In [None]:
# Final segmentation: assign every customer to one of 8 clusters.
# NOTE(review): 8 is hard-coded; presumably it was picked from the
# silhouette scores printed earlier — confirm.
# random_state is pinned so the cluster ids are stable between runs; without
# it every downstream table and plot changes on each execution.
model = KMeans(n_clusters=8, n_init=10, random_state=42)
y_kmeans = model.fit_predict(df_quartiles)

In [None]:
# Attach the cluster id to a new frame. Plain assignment
# (df_result = df_processed) would only alias the same object, so adding the
# column would silently modify df_processed as well; assign() returns a
# separate frame and leaves df_processed untouched.
df_result = df_processed.assign(Cluster=y_kmeans)

In [None]:
# Display the per-customer features together with the assigned cluster.
df_result

In [None]:
# Per-cluster profile: mean of every column, plus the number of customers
# in each cluster (aligned on the cluster id).
cluster_sizes = df_result['Cluster'].value_counts()
clusters = df_result.groupby('Cluster').mean().assign(Counts=cluster_sizes)

In [None]:
# Display the per-cluster means and customer counts.
clusters

In [None]:
# Silhouette plot: one horizontal band per cluster, with each sample's
# silhouette coefficient drawn as a bar and the bars sorted within the band.
silhouette_vals = silhouette_samples(df_quartiles, y_kmeans)
avg_score = np.mean(silhouette_vals)

fig, ax = plt.subplots(figsize=(8, 5))

y_lower = 0
for cluster in np.unique(y_kmeans):
    cluster_silhouette_vals = np.sort(silhouette_vals[y_kmeans == cluster])
    y_upper = y_lower + len(cluster_silhouette_vals)

    # Stack this cluster's band directly above the previous one.
    ax.barh(range(y_lower, y_upper), cluster_silhouette_vals, height=1)
    # Label the band with the actual cluster id so it matches the ids used
    # in df_result and in the per-cluster summary table.
    ax.text(-0.03, (y_lower + y_upper) / 2, str(cluster))
    y_lower = y_upper

# The average line and the axis decoration do not depend on the cluster, so
# they are drawn once instead of being repeated on every loop iteration.
ax.axvline(avg_score, linestyle='--', linewidth=2, color='green')
ax.set_yticks([])
ax.set_xlim([-0.1, 1])
ax.set_xlabel('Silhouette coefficient values')
ax.set_ylabel('Cluster labels')
ax.set_title('Silhouette plot for the various clusters')

fig.tight_layout()
plt.show()