<a href="https://colab.research.google.com/github/radrams/olist_predict_clv_segments/blob/master/RetentionRate_Visualization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import random
import plotly.offline as pyoff
import plotly.graph_objs as go

# Seed both random number generators so repeated runs give the same results
random.seed(42)
np.random.seed(42)

# Suppress every warning category for the rest of the session
warnings.filterwarnings('ignore')

# Make the default figures a bit bigger and render them at a higher DPI
plt.rcParams.update({
    'figure.figsize': (7, 4.5),
    'figure.dpi': 140,
})

# Seaborn look: "ticks" style, "poster" context with fonts scaled to half size
sns.set(style="ticks")
sns.set_context("poster", font_scale=.5, rc={"grid.linewidth": 5})

############## Retention Rate ###############################
# Load the raw tables
df1 = pd.read_csv('./data/olist_orders_dataset.csv')
df2 = pd.read_csv('./data/olist_customers_dataset.csv')
df3 = pd.read_csv('./data/olist_order_payments_dataset.csv')
df4 = pd.read_csv('./data/olist_order_items_dataset.csv')

# customer_id -> customer_unique_id lookup, indexed by customer_id
customer_cols = ['customer_id', 'customer_unique_id']
customers = df2[customer_cols].set_index('customer_id')

# Order fields used downstream, indexed by customer_id so they can be
# joined to the lookup above
order_cols = ['order_id', 'customer_id', 'order_purchase_timestamp', 'order_status']
orders = df1[order_cols].set_index('customer_id')

# One row per order holding the summed price of all its items; repeated
# order_id rows are kept on purpose so every item contributes to the total
item_cols = ['order_id', 'price']
order_items = (
    df4[item_cols]
    .groupby('order_id')['price']
    .sum()
    .reset_index()
)

# Attach customer_unique_id to each order (both frames are keyed by customer_id)
customers_orders = orders.merge(customers, on='customer_id')

# Derive the purchase timestamp and the period keys built from it
purchase_ts = pd.to_datetime(customers_orders['order_purchase_timestamp'])
customers_orders['PurchaseTimeStamp'] = purchase_ts
# YYYYMM encoded as an integer, e.g. 201803
customers_orders['PurchaseYearMonth'] = purchase_ts.map(lambda ts: 100 * ts.year + ts.month)
customers_orders['PurchaseYear'] = purchase_ts.map(lambda ts: ts.year)

# Bring in the per-order price totals
orders_customers_price = customers_orders.merge(order_items, on='order_id')

# Revenue per customer per month: one row for each (month, customer) pair
orders_customers_price = orders_customers_price.groupby(
    ['PurchaseYearMonth', 'customer_unique_id'], as_index=False
)['price'].sum()

# Retention matrix: one row per customer, one column per purchase month,
# each cell counting that customer's rows for the month
cust_retention = pd.crosstab(
    index=orders_customers_price['customer_unique_id'],
    columns=orders_customers_price['PurchaseYearMonth'],
).reset_index()

# Quick look at the matrix (only has an effect where expression values are displayed)
cust_retention.head()

# Build one record per month (from the second month column onward) with the
# number of active customers in that month and how many of them were also
# active in the calendar month immediately before it.
def _previous_year_month(year_month):
    """Return the YYYYMM integer of the calendar month before *year_month*."""
    year, month = divmod(int(year_month), 100)
    if month == 1:
        # January rolls back to December of the previous year
        return (year - 1) * 100 + 12
    return year * 100 + month - 1


# Column 0 is customer_unique_id; the remaining columns are YYYYMM periods
months = cust_retention.columns[1:]
retention_array = []
for selected_month in months[1:]:
    # The matrix only has columns for months that contain purchases, so the
    # neighbouring column is not necessarily the previous calendar month.
    # Look the previous month up by value instead of by position.
    prev_month = _previous_year_month(selected_month)
    active_now = cust_retention[selected_month] > 0
    if prev_month in months:
        active_both = active_now & (cust_retention[prev_month] > 0)
        retained_count = cust_retention.loc[active_both, selected_month].sum()
    else:
        # No purchases at all in the previous month: nobody can be retained
        retained_count = 0
    retention_array.append({
        'InvoiceYearMonth': int(selected_month),
        'TotalUserCount': cust_retention[selected_month].sum(),
        'RetainedUserCount': retained_count,
    })

# convert the array to dataframe and calculate Retention Rate
# (explicit columns keep the frame well-formed when there are no records)
cust_retention = pd.DataFrame(
    retention_array,
    columns=['InvoiceYearMonth', 'TotalUserCount', 'RetainedUserCount'],
)
# Retained customers as a share of the customers active in the same month
cust_retention['RetentionRate'] = cust_retention['RetainedUserCount'] / cust_retention['TotalUserCount']

# plot the retention rate graph for periods strictly between 201610 and 201809
plot_window = cust_retention.query("InvoiceYearMonth>201610 & InvoiceYearMonth<201809")
retention_trace = go.Scatter(
    x=plot_window['InvoiceYearMonth'],
    # rate expressed as a percentage
    y=plot_window['RetentionRate'] * 100,
    name="organic",
)
plot_data = [retention_trace]

# Periods are treated as categories on the x axis rather than as numbers
plot_layout = go.Layout(
    title='Monthly Retention Rate',
    xaxis=dict(type="category", title="Period"),
    yaxis=dict(title="RetentionRate"),
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)