In [37]:
# Import Data
import pandas as pd

dataframe = pd.read_csv("Online Retail.csv", encoding = "ISO-8859-1")
dataframe["InvoiceDate"] = pd.to_datetime(dataframe["InvoiceDate"])
dataframe.head()


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [38]:
# Calculate RFM

#Recency: How recently a customer has made a purchase
#Frequency: How often a customer makes a purchase
#Monetary Value: How much money a customer spends on purchases

dataframe['TotalSum'] = dataframe['Quantity'] * dataframe['UnitPrice']
snapshot_date = max(dataframe.InvoiceDate)
datamart = dataframe.groupby('CustomerID').agg(
    {
        'InvoiceDate': lambda x: (snapshot_date - x.max()).days,
        'InvoiceNo': 'count',
        'TotalSum': 'sum'
    }
).round(3)

datamart.rename(

    columns={'InvoiceDate':'Recency', 'InvoiceNo': 'Frequency', 'TotalSum':'Monetary'}, 
    inplace=True
)

print(datamart)

            Recency  Frequency  Monetary
CustomerID                              
12346.0         325          2      0.00
12347.0           1        182   4310.00
12348.0          74         31   1797.24
12349.0          18         73   1757.55
12350.0         309         17    334.40
...             ...        ...       ...
18280.0         277         10    180.60
18281.0         180          7     80.82
18282.0           7         13    176.60
18283.0           3        756   2094.88
18287.0          42         70   1837.28

[4372 rows x 3 columns]


In [57]:
# Preprocessing Data

# K-Means Keys
    # 1- Data must have not skewness
    # 2- Centering and Scaling data to normal (0, 1)

from sklearn.preprocessing import StandardScaler
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

datamart = datamart.where(np.isnan(datamart) == False)

#1-1 Analys data for Skewness 
sns.displot(datamart['Recency'])
plt.plot()

sns.displot(datamart['Frequency'])
plt.plot()

sns.displot(datamart['Monetary'])
plt.plot()

#1-2 Use Logarithmic algorithm to remove skew
datamart[datamart <= 0] = .01
datamart_skewless = np.log(datamart)
print(datamart_skewless)

#2-1 Analys data for centering and scaling
datamart_skewless.describe()

#2-2 Use Scaler algorithm to remove skew
scaler = StandardScaler().fit(datamart_skewless)
datamart_normalized = scaler.transform(datamart_skewless)

print('Mean: {}'.format(datamart_normalized.mean(axis=0).round(2)))
print('Std: {}'.format(datamart_normalized.std(axis=0).round(2)))

             Recency  Frequency  Monetary
CustomerID                               
12346.0     5.783825   0.693147 -2.302585
12347.0     0.000000   5.204007  8.368693
12348.0     4.304065   3.433987  7.494007
12349.0     2.890372   4.290459  7.471676
12350.0     5.733341   2.833213  5.812338
...              ...        ...       ...
18280.0     5.624018   2.302585  5.196285
18281.0     5.192957   1.945910  4.392224
18282.0     1.945910   2.564949  5.173887
18283.0     1.098612   6.628041  7.647252
18287.0     3.737670   4.248495  7.516041

[4372 rows x 3 columns]
Mean: [-0.  0. -0.]
Std: [1. 1. 1.]
