In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import datetime as dt

# Show every column of wide frames and render floats with five decimal places.
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.5f}'.format

import matplotlib.pyplot as plt
import squarify
import seaborn as sns


import os
# Print the names of the entries directly under ../input/.
print(os.listdir('../input/'))


import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): the blanket "ignore" filter already covers FutureWarning, so the second
# call looks redundant. The blanket filter also hides pandas warnings such as
# SettingWithCopyWarning — confirm that this is intended.
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

# read data

In [None]:
# Load the raw transactions, print (rows, columns) and preview the first five rows.
raw_csv_path = '../input/onlineretail/OnlineRetail.csv'
df = pd.read_csv(raw_csv_path, encoding='unicode_escape')
print(df.shape)
df.head()

In [None]:
# Column dtypes and non-null counts of the raw frame (shows which columns have missing values).
df.info()

# Data Preparation

In [None]:
# Drop every row that has a missing value in any column.
df = df.dropna()

# Negative Quantity values come from refund invoices, whose InvoiceNo contains the letter "C".
# regex=False makes this a literal substring test, and astype(str) keeps the .str accessor
# valid even if read_csv inferred a numeric dtype for InvoiceNo.
is_refund = df["InvoiceNo"].astype(str).str.contains("C", regex=False)

# .copy() turns the filtered result into an independent frame, so the column assignments
# made in later cells (Sales, InvoiceMonth, ...) do not write into a slice of another frame.
df = df[~is_refund].copy()

In [None]:
# Summary statistics at fine-grained percentiles, transposed so each numeric column is a row.
summary_percentiles = [0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]
df.describe(percentiles=summary_percentiles).T

#### Negative quantities are now excluded, because the refund invoices were removed. Outliers (such as the maximum values of the Quantity and UnitPrice variables) are deliberately kept.

# data visualization

In [None]:
# Box plots of Quantity and UnitPrice on one axes; the trailing ';' hides the Axes repr.
df.boxplot(column=["Quantity", "UnitPrice"]);

In [None]:
# Number of invoice lines per country, for the 20 countries that follow the busiest one.

plt.rcParams['figure.figsize'] = (12, 10)
# Positions 1..20 of the descending counts. The first entry is skipped on the assumption
# that it is the United Kingdom, as the title states — TODO confirm against the data.
country_counts = df['Country'].value_counts().iloc[1:21]
sns.barplot(x = country_counts.values, y = country_counts.index, palette = 'PuBuGn_d')
plt.title('Top 20 Countries having Online Retail Market except UK', fontsize = 20)
# The bars are horizontal: counts run along x and country names along y.
plt.xlabel('Count')
plt.ylabel('Names of Countries')
plt.show()

In [None]:
# Total sales amount per country, for the 20 countries that follow the largest one.
color = plt.cm.viridis(np.linspace(0, 1, 20))

# Line-level revenue; this column is reused by later cells.
df['Sales'] = df['UnitPrice'] * df['Quantity']

sales_by_country = df.groupby('Country')['Sales'].sum().sort_values(ascending = False)
# Positions 1..20: the first entry is skipped on the assumption that it is the
# United Kingdom, as the title states — TODO confirm against the data.
sales_by_country.iloc[1:21].plot.bar(figsize = (15, 7), color = color)
plt.title('Top 20 Sales of all the Countries Except UK', fontsize = 20)
plt.xlabel('Names of the Countries')
# The bars show summed UnitPrice * Quantity, i.e. an amount rather than a number of sales.
plt.ylabel('Total sales amount')
plt.show()

In [None]:
from wordcloud import WordCloud
from wordcloud import STOPWORDS

stopwords = set(STOPWORDS)

# Join every description into one text. str(df['Description']) would only give the
# truncated printed form of the Series (a handful of rows plus the Name/Length/dtype
# footer), so the cloud would not reflect the real descriptions.
description_text = ' '.join(df['Description'].astype(str))

wordcloud = WordCloud(background_color = 'white',
                      width = 900,
                      height = 900,
                      stopwords = stopwords).generate(description_text)

plt.rcParams['figure.figsize'] = (12, 12)
plt.axis('off')
plt.imshow(wordcloud)
plt.title('Most Occuring word in the Description list', fontsize = 20)
plt.show()

In [None]:
# Count the distinct customer IDs and the distinct countries present in the data.
n_customers, n_countries = df[['CustomerID', 'Country']].nunique()

# Report both counts.
print("There are {} number of different customers".format(n_customers))
print("There are {} number of different countries who do online retailing from UK".format(n_countries))


In [None]:
# Time-series plot of the sales made to Australian customers.

# InvoiceDate is still text at this point (it is only parsed in a later cell), so it is
# parsed and sorted here to get a real, chronological time axis rather than the raw
# strings in file order.
dataset = (
    df[df['Country'] == 'Australia']
    .assign(InvoiceDate = lambda frame: pd.to_datetime(frame['InvoiceDate']))
    .sort_values('InvoiceDate')
)
dataset.plot(x = 'InvoiceDate', y = 'Sales')
plt.title('Time-Series for Australia', fontsize = 20)
plt.xlabel('Date of Purchase')
plt.ylabel('Sales Amount')
plt.show()

# Cohort Analysis

In [None]:
# Parse InvoiceDate into pandas datetimes; the date arithmetic and the .dt accessor
# used in the following cells need a datetime column.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df.head(2)

In [None]:
# Truncate every invoice timestamp to the first day of its month.
# The period round-trip is vectorized, replacing a row-by-row Python lambda.
df['InvoiceMonth'] = df['InvoiceDate'].dt.to_period('M').dt.to_timestamp()

# Cohort month = the earliest invoice month of each customer, repeated on all of that
# customer's rows.
df['CohortMonth'] = df.groupby('CustomerID')['InvoiceMonth'].transform('min')
df.head()

In [None]:
#function to extract year, month, day as integers
def get_date_int(df, column):
    """Split a datetime column into its year, month and day components.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column.
    column : str
        Name of a column that supports the ``.dt`` accessor.

    Returns
    -------
    tuple
        ``(year, month, day)``, each a Series taken from the ``.dt`` accessor.
    """
    date_parts = df[column].dt
    return tuple(getattr(date_parts, field) for field in ('year', 'month', 'day'))

In [None]:
# Year and month of each row's invoice month and of its customer's cohort month.
# The day component is sliced off rather than unpacked into `_`, because `_` is the name
# IPython uses for the last cell output.
invoice_year, invoice_month = get_date_int(df, 'InvoiceMonth')[:2]
cohort_year, cohort_month = get_date_int(df, 'CohortMonth')[:2]

In [None]:
# Calendar-year and calendar-month offsets between each invoice month and the cohort month.
years_diff = invoice_year.sub(cohort_year)
months_diff = invoice_month.sub(cohort_month)

In [None]:
# Number of months between the invoice month and the customer's cohort month, counted
# from 1: a purchase made in the cohort month itself gets CohortIndex 1.
df['CohortIndex'] = years_diff * 12 + months_diff + 1
df.head(2)

In [None]:
# Distinct active customers for every (cohort month, cohort index) pair, reshaped into a
# matrix with one row per cohort month and one column per cohort index.
active_customers = df.groupby(['CohortMonth', 'CohortIndex'])['CustomerID'].nunique()
cohort_data = active_customers.reset_index()
cohort_counts = cohort_data.pivot(index='CohortMonth', columns='CohortIndex', values='CustomerID')

In [None]:
# Retention: active customers in each period as a percentage of the cohort's size,
# where the cohort size is the first column of cohort_counts.
cohort_sizes = cohort_counts.iloc[:, 0]
retention = cohort_counts.div(cohort_sizes, axis='index').round(3).mul(100)
retention.head(20)

In [None]:
# Row labels such as "Dec '10" are built from the cohort months in the index, so the
# number of labels always matches the number of rows.
month_list = retention.index.strftime("%b '%y").tolist()

# The first period is 100% for every cohort, so the colour scale is capped a little above
# the second-largest column maximum to keep the remaining cells distinguishable.
second_highest = retention.max().nlargest(2).iloc[-1]

plt.figure(figsize=(15,8))
plt.title('Retention by Monthly Cohorts')
sns.heatmap(data=retention,
            annot = True,
            cmap = "Greens",
            vmin = 0.0,
            vmax = second_highest + 3,
            fmt = '.1f',
            linewidth = 0.3,
            yticklabels=month_list)

plt.show()

# RFM analysis

### create a new df called rfm in order to calculate Recency, Frequency and Monetary values.

* Recency is the number of days between this customer's last purchase date and the most recent invoice date in the data
* the number of unique invoices of this customer is Frequency
* the sum of sales is this customer's Monetary

In [None]:
# Reference date for Recency: the most recent invoice timestamp in the data.
# Recency is measured back from this date, so the most recently active customer gets 0 days.
last_date = df['InvoiceDate'].max()
last_date

In [None]:
# One row per customer:
#   Recency   - whole days between the customer's latest invoice and last_date
#   Frequency - number of distinct invoices
#   Monetary  - summed Sales
rfm = df.groupby('CustomerID').agg(
    Recency=('InvoiceDate', lambda dates: (last_date - dates.max()).days),
    Frequency=('InvoiceNo', 'nunique'),
    Monetary=('Sales', 'sum'),
)
rfm.head()

In [None]:
# Inspect the distribution of the three metrics (including any zero values) before scoring.
rfm_percentiles = [0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]
rfm.describe(percentiles=rfm_percentiles).T

# RFM Scores
* the min number of Recency metric means that this customer has just purchased, so the highest score (5) should be given to the lower number of Recency.
* the max number of Frequency and Monetary metrics mean that the customer is purchasing frequently and spending more money, so the highest score (5) should be given to the highest Frequency and Monetary values.

In [None]:
# Quintile scores from 1 to 5. Recency is labelled in reverse so that the smallest
# recency values receive the highest score.
ascending_scores = [1, 2, 3, 4, 5]
descending_scores = ascending_scores[::-1]

rfm["RecencyScore"] = pd.qcut(rfm['Recency'], q=5, labels=descending_scores)

# Frequency is ranked first (ties broken by order of appearance) and the ranks are binned.
frequency_rank = rfm['Frequency'].rank(method="first")
rfm["FrequencyScore"] = pd.qcut(frequency_rank, q=5, labels=ascending_scores)

rfm["MonetaryScore"] = pd.qcut(rfm['Monetary'], q=5, labels=ascending_scores)

In [None]:
# Concatenate the three score digits into one string such as "555"
# (recency digit, then frequency digit, then monetary digit).
trailing_digits = [rfm[name].astype(str) for name in ('FrequencyScore', 'MonetaryScore')]
rfm["RFM_SCORE"] = rfm['RecencyScore'].astype(str).str.cat(trailing_digits)

rfm.head()

In [None]:
# Preview customers whose three scores are all 5.
is_top_scorer = rfm['RFM_SCORE'].eq("555")
rfm.loc[is_top_scorer].head()

# Naming the RFM Scores

In [None]:
# the following dict has been made according to the famous RFM graphic
# Each key is a regular expression over a two-character code: the RecencyScore digit
# followed by the FrequencyScore digit. The Monetary score is not part of the code.
seg_map = {
    r'[1-2][1-2]': 'Hibernating',      # recency 1-2, frequency 1-2
    r'[1-2][3-4]': 'At_Risk',          # recency 1-2, frequency 3-4
    r'[1-2]5': 'Cant_Lose',            # recency 1-2, frequency 5
    r'3[1-2]': 'About_to_Sleep',       # recency 3, frequency 1-2
    r'33': 'Need_Attention',           # recency 3, frequency 3
    r'[3-4][4-5]': 'Loyal_Customers',  # recency 3-4, frequency 4-5
    r'41': 'Promising',                # recency 4, frequency 1
    r'51': 'New_Customers',            # recency 5, frequency 1
    r'[4-5][2-3]': 'Potential_Loyalists', # recency 4-5, frequency 2-3
    r'5[4-5]': 'Best Customers'        # recency 5, frequency 4-5
}

In [None]:
# Segments are assigned from the Recency and Frequency scores only: the two digits are
# joined into a code, and each code is replaced by the segment name whose pattern in
# seg_map matches it.
rf_code = rfm['RecencyScore'].astype(str).str.cat(rfm['FrequencyScore'].astype(str))
rfm['Segment'] = rf_code.replace(seg_map, regex=True)

# Turn the CustomerID index into a regular column.
rfm = rfm.reset_index()
rfm.head(2)

In [None]:
# Mean and row count of each RFM metric per segment.
rfm.groupby("Segment")[["Recency", "Frequency", "Monetary"]].agg(["mean", "count"])

In [None]:
# Preview the first rows of the scored and segmented frame.
rfm.head()

In [None]:
# Number of customers in each segment, as a two-column frame (Segment, counts).
segment_sizes = rfm.groupby('Segment')['CustomerID'].count()
retail_rfm_segments = segment_sizes.rename('counts').reset_index()
retail_rfm_segments.head(15)

In [None]:
# Treemap with one rectangle per segment, sized by the number of customers in it.
segment = retail_rfm_segments['Segment'].tolist()
score = retail_rfm_segments['counts'].tolist()

# seg_map defines ten segment names, so ten colours are listed; with only nine, one
# rectangle could not get a colour of its own.
color_list = ["#248af1", "#eb5d50", "#8bc4f6", "#8c5c94", "#a170e8", "#fba521", "#75bc3f",
              '#50ebde', '#808080', '#f4d03f']

plt.figure(figsize=(12,8))
plt.title('Customer Segments distribution')
# Pass exactly one colour per rectangle.
squarify.plot(sizes=score, label=segment, color=color_list[:len(segment)], alpha=0.7)

plt.show()