In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input mount,
# one path per line, in os.walk order.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the transactions CSV into a DataFrame and report its (rows, columns).
csv_path = '../input/online-retail-ii-uci/online_retail_II.csv'
online_retial = pd.read_csv(csv_path)
print(online_retial.shape)

In [None]:
# Preview the first five rows of the loaded frame.
online_retial.iloc[:5]

In [None]:
# Summary statistics of the loaded frame (DataFrame.describe with its default arguments).
online_retial.describe()

In [None]:
# Summarise the object (string) columns over the WHOLE frame: count, number of
# unique values, most frequent value and its frequency. Chaining .head() first
# would describe only the first five rows, which says nothing about the data.
online_retial.describe(include=['O'])

In [None]:
# Print the frame's column names, non-null counts and dtypes (DataFrame.info).
online_retial.info()

In [None]:
# Convert the InvoiceDate column to pandas datetimes so date arithmetic works later.
invoice_timestamps = pd.to_datetime(online_retial['InvoiceDate'])
online_retial['InvoiceDate'] = invoice_timestamps

In [None]:
# Replace missing values with explicit placeholders:
#   - a null "Customer ID" becomes the sentinel 99999
#   - a null "Description" becomes the label 'Customer_Null'
null_placeholders = {'Customer ID': 99999, 'Description': 'Customer_Null'}
for column_name, placeholder in null_placeholders.items():
    online_retial[column_name] = online_retial[column_name].fillna(placeholder)

In [None]:
# Cancelled invoices are flagged by a "C" prefix on the invoice number, so match
# the prefix rather than a "C" anywhere in the string. na=False treats missing or
# non-string invoice values as "not cancelled".
cancelled_orders = online_retial.loc[online_retial['Invoice'].str.startswith("C", na=False)]

# Only the last expression of a cell is displayed, so print the shape explicitly
# and leave the preview as the final expression; previously the head() call in
# the middle of the cell was evaluated and silently discarded.
print(cancelled_orders.shape)
cancelled_orders.head()

In [None]:
# Step 1: remove every row that belongs to a cancelled invoice.
indx = cancelled_orders.index
online_retial1 = online_retial.drop(index=indx)

# Step 2: remove the remaining rows whose quantity is negative.
negative_quantity_rows = online_retial1.index[online_retial1.Quantity < 0]
online_retial2 = online_retial1.drop(index=negative_quantity_rows)

# Show how many rows survive each step.
for stage_frame in (online_retial, online_retial1, online_retial2):
    print(stage_frame.shape)

In [None]:
# Treat both identifiers as text labels rather than numbers.
for id_column in ("Customer ID", "Invoice"):
    online_retial2[id_column] = online_retial2[id_column].astype(str)

In [None]:
# Best-selling items: total quantity per description, largest first (top five).
quantity_by_item = online_retial2.groupby("Description")[["Quantity"]].sum()
quantity_by_item.sort_values(by="Quantity", ascending=False).head()

In [None]:
# Top 5 countries by number of orders.
# Each row of the frame is one invoice line, so value_counts() on "Country"
# would rank countries by line items, not orders. An order is one invoice,
# hence count the distinct invoice numbers per country.
orders_per_country = online_retial2.groupby("Country")["Invoice"].nunique()
orders_per_country.sort_values(ascending=False).head()

In [None]:
# Collect the distinct stock codes that begin with a letter; these are removed
# from the data in the next cell.
# na=False makes missing / non-string codes evaluate to False instead of NaN;
# a mask containing NaN cannot be used for boolean indexing and would raise.
starts_with_letter = online_retial2['StockCode'].str.contains('^[a-zA-Z]+', regex=True, na=False)
stockcode_outlier = online_retial2.loc[starts_with_letter, 'StockCode'].unique()
stockcode_outlier

In [None]:
# Drop every row whose stock code is one of the outlier codes.
# A single isin() mask replaces the per-code loop, which rebuilt the whole
# frame once for every outlier code.
is_outlier_code = online_retial2['StockCode'].isin(stockcode_outlier)
online_retial2 = online_retial2[~is_outlier_code]

# RFM Analysis

In [None]:
# Line-item revenue: quantity multiplied by unit price, kept as a one-column
# frame named 'Amount' that shares online_retial2's row index.
line_totals = online_retial2["Quantity"] * online_retial2["Price"]
amount = line_totals.to_frame(name='Amount')

amount

In [None]:
# Grand total of all line-item amounts (in Sterling currency).
total_amount = amount.sum()
print(total_amount)

In [None]:
#Spend - Top 5 and least 5 customers

# Placeholder ID that was written into rows with a missing customer.
MISSING_CUSTOMER_ID = 99999

# Pair each line item's customer with its amount. Both inputs carry
# online_retial2's row index, so concat(axis=1) lines them up row for row.
# Rebuilding the IDs through a NumPy array would reset the index to 0..n-1 and
# attach amounts to the wrong customers (np.object is also removed in
# NumPy >= 1.24).
data_cust = pd.concat(objs=[online_retial2[["Customer ID"]], amount], axis=1)

# Total spend per customer.
monetary = data_cust.groupby(by="Customer ID").Amount.sum().reset_index()

# Exclude the placeholder customer. The IDs may be strings (e.g. "99999.0")
# by this point, so compare numerically; a plain `!= 99999` against strings
# never matches and would silently keep the placeholder.
customer_id_numeric = pd.to_numeric(monetary["Customer ID"], errors="coerce")
monetary = monetary[customer_id_numeric != MISSING_CUSTOMER_ID]
monetary.sort_values(by=["Amount"], ascending=False)

In [None]:
#Frequency - Top 5 and least 5 customers

# Placeholder ID that was written into rows with a missing customer.
MISSING_CUSTOMER_ID = 99999

frequency = online_retial2[['Customer ID', 'Invoice']]

# Frequency = number of purchases, i.e. distinct invoices per customer.
# Every row is one invoice line, so count() would measure basket size
# (line items) rather than how often the customer bought.
frequency_df = (
    frequency.groupby("Customer ID").Invoice.nunique()
    .rename("Frequency")
    .reset_index()
)

# Exclude the placeholder customer. The IDs may be strings (e.g. "99999.0")
# by this point, so compare numerically; a plain `!= 99999` against strings
# never matches and would silently keep the placeholder.
customer_id_numeric = pd.to_numeric(frequency_df["Customer ID"], errors="coerce")
frequency_df = frequency_df[customer_id_numeric != MISSING_CUSTOMER_ID]
frequency_df.sort_values(by=['Frequency'], ascending=False)

In [None]:
import datetime as dt  
# Latest invoice timestamp remaining in the cleaned frame.
online_retial2["InvoiceDate"].max()

In [None]:
# Reference date for recency: midnight of the most recent invoice day.
# Derived from the data rather than hard-coded, so the cell stays correct if
# the input file or the upstream filters change.
latest_date = online_retial2["InvoiceDate"].max().normalize()

# Day of each customer's most recent purchase. The time of day is stripped so
# the subtraction counts whole calendar days; with the time left in, a purchase
# on the reference day produced -1 days and every other customer was
# under-counted by one day.
last_purchase_day = (
    online_retial2.groupby("Customer ID")["InvoiceDate"].max().dt.normalize()
)

# One-column frame of timedeltas, indexed by "Customer ID".
recency = (latest_date - last_purchase_day).to_frame("Recency")

# Recency as an integer number of days (Series named "Recency").
recency_df = recency["Recency"].dt.days
recency_df

In [None]:
# Assemble the RFM table: join frequency, monetary and recency on the
# customer identifier, then display the result.
RFM = (
    frequency_df
    .merge(monetary, on="Customer ID")
    .merge(recency_df, on="Customer ID")
)
RFM

In [None]:
# Keep customers with positive recency AND positive frequency.
# Each comparison needs its own parentheses: `&` binds tighter than `>`, so
# `(a) > 0 & (b > 0)` is parsed as `a > (0 & (b > 0))` and the frequency
# condition is never applied.
has_positive_recency = RFM["Recency"] > 0
has_positive_frequency = RFM["Frequency"] > 0
RFM = RFM[has_positive_recency & has_positive_frequency]

In [None]:
# Display the RFM table as this cell's output.
RFM

In [None]:
import plotly.express as px

# Box plots of the Frequency and Amount columns of the RFM table.
fig = px.box(data_frame=RFM, y=['Frequency', 'Amount'])
fig.show()