In [None]:
import pandas as pd
import numpy as np
import time
import datetime as dt
import sklearn.cluster as cluster
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_samples, silhouette_score
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the raw transaction export; the path is relative to the notebook's working directory.
# NOTE(review): cp874 is the Windows Thai code page — confirm it is the intended encoding for this file.
retail = pd.read_csv("../input/ecommerce-data/data.csv",encoding='cp874')
retail.head()

In [None]:
retail.shape

In [None]:
retail.info()

### Checking the null values in the dataset

In [None]:
retail.isna().sum().sort_values(ascending=False)

In [None]:
# Share of missing values per column, expressed as a percentage of all rows.
null_share = retail.isna().sum().div(retail.shape[0]).mul(100)
null_share.to_frame(name='% Null Values')

#### Around 25% of transactions do not have a CustomerID and about 0.27% of transactions do not have a product Description.

### Dropping the rows with null values in CustomerID column

In [None]:
# Rows without a CustomerID cannot be attributed to any customer, so discard them.
retail = retail.dropna(subset=['CustomerID'])
retail.shape

In [None]:
retail.isna().sum()

### Checking duplicate rows in the dataset

In [None]:
retail.duplicated().sum()

In [None]:
retail.drop_duplicates(inplace=True)

In [None]:
retail.shape

### Removing the cancelled orders from the dataset

In [None]:
# Keep only rows with a positive Quantity (cancelled orders are presumably
# recorded with non-positive quantities — confirm against the data dictionary).
# .copy() detaches the result from the parent frame so that the column
# assignments made later in the notebook write to an independent DataFrame
# instead of triggering chained-assignment (SettingWithCopy) behaviour.
retail = retail[retail['Quantity'] > 0].copy()

In [None]:
retail.shape

In [None]:
# Headline cardinalities of the cleaned data set, shown as a one-column table.
summary_counts = {
    'Number of Transactions': retail['InvoiceNo'].nunique(),
    'Number of Unique Products Bought': retail['StockCode'].nunique(),
    'Number of Unique Customers': retail['CustomerID'].nunique(),
}
pd.Series(summary_counts, name='Count').to_frame()

# RFM Analysis
RFM (**Recency, Frequency, Monetary**) analysis is a customer segmentation technique that uses past purchase behaviour to divide customers into groups. <br> RFM helps divide customers into various categories or clusters to identify customers who are more likely to respond to promotions and also for future personalization services.
- RECENCY (R): Days since last purchase 
- FREQUENCY (F): Total number of purchases 
- MONETARY VALUE (M): Total money this customer spent.

We will create those 3 customer attributes for each customer.

## Recency
To calculate recency, we need to choose a date point from which we evaluate **how many days ago was the customer's last purchase**.

In [None]:
# Parse the invoice timestamps into datetime64[ns].
# pd.to_datetime is used instead of .astype('datetime64') because pandas 2.x
# rejects the unit-less 'datetime64' dtype string with a TypeError.
retail['InvoiceDate'] = pd.to_datetime(retail['InvoiceDate'])
# Most recent invoice in the data; used as the reference point for Recency.
retail['InvoiceDate'].max()

In [None]:
# Reference date for Recency: the calendar date of the latest invoice in the data.
# Derived from the data rather than hard-coded so the notebook keeps working
# when the input file is refreshed. pd.to_datetime is a no-op if the column
# has already been parsed.
now = pd.to_datetime(retail['InvoiceDate']).max().date()
print(now)

In [None]:
# Calendar date (time of day dropped) of every invoice line.
# The vectorised .dt accessor replaces a row-by-row Python lambda.
retail['Date'] = retail['InvoiceDate'].dt.date

In [None]:
retail.head()

In [None]:
# Latest purchase date per customer.
# The column name (including its spelling) is kept as-is because the
# following cells look it up by this exact name.
recency_df = (
    retail.groupby('CustomerID')['Date']
    .max()
    .rename('LastPurshaceDate')
    .reset_index()
)
recency_df.head()

In [None]:
# Recency = whole days between the reference date `now` and the customer's last purchase.
recency_df['Recency'] = [
    (now - last_purchase).days
    for last_purchase in recency_df['LastPurshaceDate']
]
recency_df.head()

In [None]:
# The raw date is no longer needed once Recency has been derived from it.
recency_df = recency_df.drop(columns='LastPurshaceDate')
recency_df.head()

## Frequency
Frequency helps us to know **how many times a customer purchased from us**. To do that we need to check how many invoices are registered by the same customer.

In [None]:
# Frequency = number of distinct invoices per customer.
# nunique() gives the same count as de-duplicating (InvoiceNo, CustomerID)
# pairs and counting, without copying the whole transaction table into a
# throw-away variable first.
frequency_df = (
    retail.groupby('CustomerID')['InvoiceNo']
    .nunique()
    .reset_index()
)
frequency_df.columns = ['CustomerID','Frequency']
frequency_df.head()

## Monetary
The Monetary attribute answers the question: **How much money did the customer spend over time?**

To do that, we first create a new column, `TotalCost`, holding the total price of each invoice line (Quantity × UnitPrice); summing it per customer then gives the monetary value.

In [None]:
retail['TotalCost'] = retail['Quantity'] * retail['UnitPrice']

In [None]:
retail.head()

In [None]:
# Monetary = total amount spent by each customer across all invoice lines.
monetary_df = (
    retail.groupby('CustomerID')['TotalCost']
    .sum()
    .rename('Monetary')
    .reset_index()
)
monetary_df.head()

# Create RFM Table

In [None]:
# Join the three per-customer attributes into one table keyed by CustomerID.
rfm_df = (
    recency_df
    .merge(frequency_df, on='CustomerID')
    .merge(monetary_df, on='CustomerID')
    .set_index('CustomerID')
)
rfm_df.head()

## Customer segments with RFM Model
Before moving to customer segments, Let's see the application of Pareto Principle – commonly referred to as the 80-20 rule on our dataset by applying it to our RFM variables.

Pareto’s rule says **80% of the results come from 20% of the causes**.

Similarly, **20% customers contribute to 80% of your total revenue**. Let's verify that because that will help us know which customers to focus on when marketing new products.

### Applying 80-20 rule

In [None]:
pareto_cutoff = rfm_df['Monetary'].sum() * 0.8
print("The 80% of total revenue is: ",round(pareto_cutoff,2))

In [None]:
# NOTE: this binds a second name to the same DataFrame object (no copy is made),
# so the 'Rank' column added below also appears on rfm_df.
customers_ranked = rfm_df
# Rank customers by spend; ascending=False gives rank 1 to the highest Monetary value.
customers_ranked['Rank'] = customers_ranked['Monetary'].rank(ascending=False)
customers_ranked.head()

In [None]:
# In-place sort so the biggest spenders come first.
# customers_ranked was bound to rfm_df without a copy, so this reorders rfm_df's rows as well.
customers_ranked.sort_values(by='Rank',ascending=True,inplace=True)
customers_ranked.head()

In [None]:
# Get top 20% of the customers.
# The customer count is read from the table instead of being hard-coded,
# so the cutoff stays correct if the cleaning steps or the input data change.
top_20_cutoff = len(customers_ranked) * 20 /100
top_20_cutoff

In [None]:
# Sum the monetary values over the top 20% of customers by spend rank.
# The rank threshold is computed from the number of customers (rounded to the
# nearest whole customer) rather than typed in as a literal.
top_20_count = round(len(customers_ranked) * 0.20)
revenueByTop20 = customers_ranked[customers_ranked['Rank'] <= top_20_count]['Monetary'].sum()
revenueByTop20

#### We observe that the revenue generated by the top 20% of customers is somewhat less than 80% of the total revenue. The two figures will rarely match exactly, but they are close in our case, which is a good indication that Pareto's rule approximately holds.

### Applying RFM Score Formula

The simplest way to create customers segments from RFM Model is to use **Quartiles**. We assign a score from 1 to 4 to Recency, Frequency and Monetary. Four is the best/highest value, and one is the lowest/worst value. A final RFM score is calculated simply by combining individual RFM score numbers.

Note: Quintiles (score from 1-5) offer better granularity, in case the business needs that but it will be more challenging to create segments since we will have 5 * 5 * 5 possible combinations. So, we will use quartiles.

#### RFM Quartiles

In [None]:
# Quartile cut points for the three RFM attributes only.
# Selecting the columns explicitly keeps helper columns that other cells add
# to rfm_df (e.g. 'Rank', or the string 'RFMScore' on a re-run) out of the
# computation; a non-numeric column would make DataFrame.quantile fail.
quantiles = rfm_df[['Recency','Frequency','Monetary']].quantile(q=[0.25,0.5,0.75])
quantiles

In [None]:
quantiles.to_dict()

### Creation of RFM segmentation table

We will create two scoring functions, one for recency and the other for frequency and monetary value, because a high recency value is bad while high frequency and monetary values are good.

In [None]:
# Arguments (x = value, p = column name to look up in d, d = quartile table)
def RScore(x,p,d):
    """Return the recency score of ``x``: 4 for the lowest quartile down to 1 for the highest.

    ``d[p]`` must be indexable by the quantile levels 0.25, 0.50 and 0.75.
    A lower value scores higher; a value above the 0.75 cut point (or one
    that compares false against every cut point) scores 1.
    """
    cutoffs = d[p]
    for score, level in ((4, 0.25), (3, 0.50), (2, 0.75)):
        if x <= cutoffs[level]:
            return score
    return 1

In [None]:
# Arguments (x = value, p = column name to look up in d, d = quartile table)
def FMScore(x,p,d):
    """Return the frequency/monetary score of ``x``: 1 for the lowest quartile up to 4 for the highest.

    ``d[p]`` must be indexable by the quantile levels 0.25, 0.50 and 0.75.
    A higher value scores higher; a value above the 0.75 cut point (or one
    that compares false against every cut point) scores 4.
    """
    cutoffs = d[p]
    bands = ((0.25, 1), (0.50, 2), (0.75, 3))
    # First band whose upper bound contains x wins; otherwise top score.
    return next((score for level, score in bands if x <= cutoffs[level]), 4)

In [None]:
# Create rfm segmentation table
# NOTE: rfm_segmentation is another name for the rfm_df object (no copy), so the
# three quartile columns below are added to rfm_df too.
rfm_segmentation = rfm_df
# Recency uses RScore (lower value -> higher score); Frequency and Monetary use
# FMScore (higher value -> higher score). `quantiles` supplies the cut points.
rfm_segmentation['R_Quartile'] = rfm_segmentation['Recency'].apply(RScore, args=('Recency',quantiles,))
rfm_segmentation['F_Quartile'] = rfm_segmentation['Frequency'].apply(FMScore, args=('Frequency',quantiles,))
rfm_segmentation['M_Quartile'] = rfm_segmentation['Monetary'].apply(FMScore, args=('Monetary',quantiles,))

In [None]:
rfm_segmentation.head()

Now that we have the score of each customer, we can represent our customer segmentation.<br>
First, we need to combine the scores (R_Quartile, F_Quartile,M_Quartile) together.

In [None]:
# Concatenate the three single-digit scores into one label such as '444'.
score_digits = rfm_segmentation[['R_Quartile', 'F_Quartile', 'M_Quartile']].astype(str)
rfm_segmentation['RFMScore'] = (
    score_digits['R_Quartile'] + score_digits['F_Quartile'] + score_digits['M_Quartile']
)
rfm_segmentation.head()

Best Recency score = 4: most recently purchased. <br>
Best Frequency score = 4: purchased most frequently. <br>
Best Monetary score = 4: spent the most.

Let's see who are our **Champions** (Top 10 customers).

In [None]:
rfm_segmentation[rfm_segmentation['RFMScore']=='444'].sort_values('Monetary', ascending=False).head(10)

**How many customers do we have in each segment?**

In [None]:
# Segment sizes: count the rows matching each score pattern.
rfm_label = rfm_segmentation['RFMScore']
print("Best Customers: ", (rfm_label == '444').sum())
print('Loyal Customers: ', (rfm_segmentation['F_Quartile'] == 4).sum())
print("Big Spenders: ", (rfm_segmentation['M_Quartile'] == 4).sum())
print('Customers at risk of churning: ', (rfm_label == '244').sum())
print('Almost Churned Customers: ', (rfm_label == '144').sum())
print('Churned Customers: ', (rfm_label == '111').sum())

Now that we know our customer segments, we can choose how to target or deal with each segment.

For example:

**Best Customers - Champions**: Reward them. They can be early adopters to new products. Suggest them to share your products with their friends or family using "Referral Program" feature and when any of their referrals make their first purchase then they will also get some cashback or discount on products. It will help to increase conversion rates.

**Loyal Customers and Big Spenders**: Recommend your annual or quarterly membership program to them with additional benefits. By doing so, they will shop more frequently and for more amount.  

**Customers at the risk of churning**: Suggest your "Referral Program" and "Annual Membership Program" both to prevent these customers from churning as they were frequent and high spenders in the past. These should be focussed upon more.

**Almost Churned Customers**: Send them personalized emails and encourage them to shop. Along with that, recommend your top benefits program as they were also the best customers in the past.

**Churned Customers**: They probably bought once or very few times and they bought for very less amount. These should not be focussed more as they are already churned.

### Conclusion
To gain even further insight into customer behavior, we can dig deeper in the relationship between RFM variables.  

RFM model can be used in conjunction with certain predictive models like **K-means clustering**, **Logistic Regression** and **Recommendation Engines** to produce better informative results on customer behavior.

We will go for K-means since it has been widely used for Market Segmentation and it offers the advantage of being simple to implement.

In [None]:
# Model input: the three raw RFM attributes, one row per customer.
# The columns are selected by name instead of dropping a fixed list of helper
# columns, so this cell does not depend on which helper columns earlier cells
# happened to add to rfm_df. .copy() keeps later changes off rfm_df.
rfm_data = rfm_df[['Recency','Frequency','Monetary']].copy()
rfm_data.head()

In [None]:
features = rfm_data.columns

In [None]:
sns.pairplot(rfm_data,diag_kind='kde')

### *Inferences:*
- All the features are highly right skewed.

In [None]:
sns.heatmap(rfm_data.corr(),annot=True)

### *Inferences:*
- There is some decent positive correlation between Monetary and Frequency features.

### We will apply Power Transformation to convert these features into a normal distribution.

In [None]:
from sklearn.preprocessing import PowerTransformer

# Reduce the right skew of the RFM features with a power transform.
# `pt` is kept because a later cell calls pt.inverse_transform.
pt = PowerTransformer()
# Rebuild the frame with the original column names AND the CustomerID index;
# without index=..., the result would silently fall back to a 0..n-1 index.
rfm_data = pd.DataFrame(pt.fit_transform(rfm_data), columns=features, index=rfm_data.index)
rfm_data.head()

In [None]:
sns.pairplot(rfm_data,diag_kind='kde')

In [None]:
sns.heatmap(rfm_data.corr(),annot=True)

### *Inferences:*
- There is high positive correlation between Frequency and Monetary features after applying Power transformation.

# PCA

Applying PCA to reduce the number of dimensions and to remove the correlation between the Frequency and Monetary features.

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise each feature before PCA.
# NOTE(review): rfm_data comes from PowerTransformer(), which standardises its
# output by default, so this step may be redundant — confirm.
sc = StandardScaler()
rfm_scaled = sc.fit_transform(rfm_data)
# Peek at the first five scaled rows (a NumPy array, not a DataFrame).
rfm_scaled[:5]

In [None]:
from sklearn.decomposition import PCA

pca = PCA()
pca_transformed_data = pca.fit_transform(rfm_scaled)

In [None]:
pca.explained_variance_

In [None]:
var_exp = pca.explained_variance_ratio_
var_exp

In [None]:
pca.components_

In [None]:
np.cumsum(var_exp)

In [None]:
# Scree plot: per-component and cumulative share of variance explained.
# The x positions follow the number of fitted components instead of a literal 3,
# so the plot stays valid if the feature set changes.
component_positions = range(len(var_exp))
plt.figure(figsize=(6,4))
plt.bar(component_positions, var_exp, alpha=0.5, align='center', label='Individual explained variance')
plt.step(component_positions, np.cumsum(var_exp), where='mid', label='Cumulative explained variance')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc='best')
plt.tight_layout()
plt.show()

In [None]:
X = rfm_scaled.copy()
pca = PCA(n_components=2)
df_pca = pca.fit_transform(X)

In [None]:
df_pca = pd.DataFrame(df_pca)
df_pca.head()

# K-Means Clustering

In [None]:
X = df_pca.copy()

In [None]:
from sklearn.cluster import KMeans

# Elbow method: fit K-Means for k = 1..14 on the PCA components in X and
# record the inertia (within-cluster sum of squared distances) for each k.
cluster_range = range(1, 15)
cluster_errors = []

for num_clusters in cluster_range:
    # random_state fixes the initialisation so the curve is reproducible.
    clusters = KMeans(n_clusters=num_clusters, n_init=100, init='k-means++', random_state=0)
    clusters.fit(X)
    cluster_errors.append(clusters.inertia_)

# One row per k with its inertia, for the elbow plot in the next cell.
clusters_df = pd.DataFrame({ "num_clusters":cluster_range, "cluster_errors": cluster_errors} )
clusters_df[0:10]

In [None]:
# Elbow plot

plt.figure(figsize=(12,6))
plt.plot(clusters_df['num_clusters'], clusters_df['cluster_errors'], marker = "o" )
plt.xlabel('Number of Clusters')
plt.ylabel('Cluster Errors')

In [None]:
# Average silhouette score for k = 2..15.
# Only the PCA components are scored: if a previous run already attached the
# 'Cluster' label column to df_pca it is dropped here, so re-running the cell
# cannot feed the labels back into the clustering.
pca_features = df_pca.drop(columns='Cluster', errors='ignore')
for k in range(2,16):
    # Named kmeans_k (not `cluster`) to avoid shadowing the sklearn.cluster
    # module alias imported at the top of the notebook.
    kmeans_k = KMeans(n_clusters=k, random_state=0)
    labels = kmeans_k.fit_predict(pca_features)

    sil_avg = silhouette_score(pca_features, labels)
    print('For',k,'clusters, average silhoutte score =',sil_avg)

### *Inferences:*
- The elbow plot shows a sharp bend at 2 clusters.
- The silhouette score is also highest for 2 clusters.
- However, there is still a significant reduction in cluster error as the number of clusters increases from 2 to 4; beyond 4, the reduction is small.
- So, we will choose n_clusters = 4 to obtain a more useful segmentation of our customers.

In [None]:
# Final model with the chosen k = 4.
# random_state is fixed so that cluster ids (and every table/plot that refers
# to them) are the same on each run; without it the labels are arbitrary.
kmeans = KMeans(n_clusters=4, random_state=0)
# Fit on the PCA components only; 'Cluster' is dropped if an earlier run of the
# notebook already attached it to df_pca.
labels = kmeans.fit_predict(df_pca.drop(columns='Cluster', errors='ignore'))
centroids = kmeans.cluster_centers_

print('Centroid Values:')
print(centroids)

In [None]:
# creating new column in df_pca dataframe for cluster number  
df_pca['Cluster'] = labels
df_pca.head()

In [None]:
df_pca['Cluster'].value_counts()

In [None]:
sns.pairplot(df_pca,diag_kind='kde',hue='Cluster')

In [None]:
df_pca.boxplot(by='Cluster', figsize=(15, 10))
plt.show()

In [None]:
# Undo the power transform to get RFM values back on their original scale,
# and re-attach rfm_df's index to the rows.
customers_grouped = pd.DataFrame(pt.inverse_transform(rfm_data),columns=rfm_data.columns,index=rfm_df.index)
# Both assignments use .values, i.e. they are matched to rows by position, not
# by index label — this assumes df_pca, rfm_segmentation and rfm_df all share
# the same row order.
customers_grouped['Cluster'] = df_pca['Cluster'].values
customers_grouped['RFMScore'] = rfm_segmentation['RFMScore'].values
customers_grouped.head()

In [None]:
# Recent customers (R = 4) who score 3-4 on both frequency and spend, excluding '433'.
top_spenders_and_loyal_customers = customers_grouped[
    customers_grouped['RFMScore'].isin(['444', '443', '434'])
]
top_spenders_and_loyal_customers

In [None]:
# Least recent customers (R = 1) with low frequency and low spend.
customers_churned = customers_grouped[
    customers_grouped['RFMScore'].isin(['111', '112', '121'])
]
customers_churned

In [None]:
# Least recent customers (R = 1) who used to buy often and/or spend a lot.
customers_at_risk_of_churning = customers_grouped[
    customers_grouped['RFMScore'].isin(['144', '143', '134', '133', '142', '124'])
]
customers_at_risk_of_churning

In [None]:
# Most recent customers (R = 4) whose frequency and/or spend is still low.
new_customers_or_avg_spenders = customers_grouped[
    customers_grouped['RFMScore'].isin(['422', '411', '412', '421', '413', '431'])
]
new_customers_or_avg_spenders

#### Inferences:
- **Loyal Customers and Big Spenders**: Reward them. They can be early adopters to new products. Suggest them to share your products with their friends or family using "Referral Program" feature and when any of their referrals make their first purchase then they will also get some cashback or discount on products. It will help to increase conversion rates. Recommend your annual or quarterly membership program to them with additional benefits. By doing so, they will shop more frequently and for more amount.

- **Customers Churned**: They probably bought once or very few times and they bought for very less amount. These should not be focussed more as they are already churned.

- **Customers at the risk of churning**: Suggest your "Referral Program" and "Annual Membership Program" both to prevent these customers from churning as they were frequent and high spenders in the past. These should be focussed upon the most to avoid churning.

- **New customers or Average Spenders**: Customers in this category are either new customers who shopped recently but they didn't spend much or the customers who shop frequently but spend very less amount. These customers should also be focussed more as they can turn out to be the best customers in the future by giving them relevant offers and discounts so that they will shop for more and more.

# Modelling

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# plot_roc_curve was removed from scikit-learn in release 1.2 (superseded by
# RocCurveDisplay.from_estimator). Import it only where it still exists so that
# this cell does not abort the whole notebook on current versions.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    plot_roc_curve = None

In [None]:
y = df_pca['Cluster']

In [None]:
# Features are the PCA components only. df_pca also carries the 'Cluster'
# column, which is the target `y`; leaving it in the feature matrix would hand
# the answer to the classifier (target leakage) and make every score trivial.
# Stratify on y so each cluster keeps its share in both splits.
X_train, X_test, y_train, y_test = train_test_split(df_pca.drop(columns='Cluster'), y, test_size=0.3, random_state=42, stratify=y)

In [None]:
lr = LogisticRegression(max_iter=1000,random_state=0)
lr.fit(X_train, y_train)

In [None]:
y_test_predicted = lr.predict(X_test)
y_train_predicted = lr.predict(X_train)

In [None]:
accuracy_train = accuracy_score(y_train, y_train_predicted)
accuracy_test = accuracy_score(y_test, y_test_predicted)
print('Train Set Accuracy for Power Transformed Data:',round(accuracy_train*100,2),'%')
print('Test Set Accuracy for Power Transformed Data:',round(accuracy_test*100,2),'%')

In [None]:
# 5-fold cross-validation of the logistic regression (weighted F1).
kf= KFold(shuffle=True, n_splits=5, random_state=0)
# Score on the PCA components only: the 'Cluster' column is the target and must
# not be part of the feature matrix (target leakage).
score = cross_val_score(lr, df_pca.drop(columns='Cluster'), y, cv=kf, scoring='f1_weighted')
# Reported as 1 - mean F1 and the sample standard deviation of F1 across folds.
print('Bias Error:',1-np.mean(score))
print('Variance Error:',np.std(score,ddof=1))

In [None]:
cm = confusion_matrix(y_test, y_test_predicted)
print(cm)

- #### Precision = TruePositives / (TruePositives + FalsePositives)

- #### Recall = TruePositives / (TruePositives + FalseNegatives)

In [None]:
print(classification_report(y_test,y_test_predicted))

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Cross-validate Gaussian Naive Bayes with the same folds (`kf`) and metric as
# the logistic regression, so the two models are directly comparable.
nb = GaussianNB()
# Score on the PCA components only: the 'Cluster' column is the target and must
# not be part of the feature matrix (target leakage).
score = cross_val_score(nb, df_pca.drop(columns='Cluster'), y, cv=kf, scoring='f1_weighted')
# Reported as 1 - mean F1 and the sample standard deviation of F1 across folds.
print('Bias Error:',1-np.mean(score))
print('Variance Error:',np.std(score,ddof=1))

In [None]:
nb.fit(X_train,y_train)

In [None]:
y_train_predicted = nb.predict(X_train)
y_test_predicted = nb.predict(X_test)

accuracy_train = accuracy_score(y_train, y_train_predicted)
accuracy_test = accuracy_score(y_test, y_test_predicted)

print('Train Set Accuracy for Power Transformed Data:',round(accuracy_train*100,2),'%')
print('Test Set Accuracy for Power Transformed Data:',round(accuracy_test*100,2),'%')

In [None]:
print(confusion_matrix(y_test, y_test_predicted))

In [None]:
print(classification_report(y_test, y_test_predicted))

# Conclusion
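Using classification models such as Logistic Regression and Naive Bayes, we predicted each customer's cluster with the RFM-derived features as independent variables and Cluster as the target variable. The clusters predicted by the classification models align very closely with the K-Means clustering, so we can conclude that our clusters are well separated and can be reproduced from the features. Note that this agreement measures how learnable the K-Means labels are; it does not by itself prove that the segmentation is the best one for the business.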