# Calculating and rendering range retention in Python

2/12/2019

Raphael Vannson

In [1]:
import pandas as pd
from datetime import datetime

# Load data from CSV

In [2]:
# Read the orders export into a DataFrame. The relative path is resolved
# against the notebook's current working directory.
orders_csv_path = '../data/orders.csv'
orders_df = pd.read_csv(orders_csv_path)

# Display the dtype pandas inferred for each column
orders_df.dtypes

FileNotFoundError: [Errno 2] File b'data/orders.csv' does not exist: b'data/orders.csv'

In [None]:
# Preview the first five rows of the raw orders
orders_df.head(n=5)

# Change the `orderdate` column type from String to a Datetime

In [None]:
# Parse the 'YYYY-MM-DD' strings in a single vectorized call instead of a
# per-row datetime.strptime. pd.to_datetime also accepts a column that is
# already datetime (so this cell can be re-run safely) and turns missing
# values into NaT rather than raising.
orders_df['orderdate'] = pd.to_datetime(orders_df['orderdate'], format='%Y-%m-%d')

# Confirm that 'orderdate' now has a datetime dtype
orders_df.dtypes

# Range retention

In [None]:
# Cohorts are defined by the month of each customer's first order.

# Earliest order date per customer (a Series indexed by customerid)
first_order_dates = (
    orders_df
    .groupby('customerid')['orderdate']
    .min()
)

# Move customerid out of the index so the result is a two-column
# DataFrame: customerid, orderdate
cohorts_df = first_order_dates.reset_index()


# Extract the month from the order date
def get_month(date):
    """Return the ``month`` attribute of ``date``.

    For ``datetime`` / pandas ``Timestamp`` values this is the calendar
    month number; the year is not part of the result.
    """
    month_number = date.month
    return month_number

# Derive the cohort id (join_month) from the first order date, then
# discard the date itself.
# NOTE(review): get_month returns only the month number, so customers whose
# first orders fall in the same month of different years end up in the same
# cohort - confirm the data covers a single calendar year.
cohorts_df = (
    cohorts_df
    .assign(join_month=cohorts_df['orderdate'].apply(get_month))
    .drop(columns='orderdate')
)

# One row per customer: customerid and the month of the first purchase
# (join_month), which doubles as the cohort id
cohorts_df.head()

In [None]:
# Cohort size = number of customers per join_month

cohort_sizes_df = (
    cohorts_df
    .groupby('join_month')['customerid']
    .count()
    .reset_index(name='cohort_size')  # columns: join_month, cohort_size
)

cohort_sizes_df

In [None]:
# Returning-customer activity per cohort.
# Result: one row per customer for each 'activity_index' (months after
# the join month) in which that customer ordered at least once.

# Attach each customer's join_month to every one of their orders
orders_with_cohort = orders_df.merge(cohorts_df, how='inner', on='customerid')

# Month of each order, and its offset from the customer's join month.
# NOTE(review): both values are bare month numbers, so an order placed in a
# later calendar year with a smaller-or-equal month number yields a
# non-positive index and is removed by the filter below - confirm the data
# stays within a single calendar year.
orders_with_cohort['activity_month'] = orders_with_cohort['orderdate'].apply(get_month)
orders_with_cohort['activity_index'] = (
    orders_with_cohort['activity_month'] - orders_with_cohort['join_month']
)

# Keep only activity after the join month, reduced to the identifying
# columns, with at most one row per customer per activity_index
came_back = orders_with_cohort['activity_index'] > 0
df = (
    orders_with_cohort
    .loc[came_back, ['customerid', 'join_month', 'activity_index']]
    .drop_duplicates()
)

df.head()

In [None]:
# Number of returning customers for each (cohort, months-after-join) pair

activity_size_df = (
    df
    .groupby(['join_month', 'activity_index'])['customerid']
    .count()
    .reset_index(name='activity_size')  # columns: join_month, activity_index, activity_size
)

activity_size_df.head()

In [None]:
# Retention rate per cohort for each month after activation

# Put each cohort's size next to its per-month returning-customer counts
retention_df = cohort_sizes_df.merge(activity_size_df, how='inner', on='join_month')

# Share of the cohort that came back, as a percentage with two decimals
returning_share = retention_df['activity_size'] / retention_df['cohort_size']
retention_df['retention_rate'] = (returning_share * 100).round(2)

retention_df.head()

In [None]:
# Reshape to one row per cohort (join_month, cohort_size) and one column
# per activity_index, with retention_rate in the cells. pivot_table's
# default aggregation (mean) applies if several rows land in one cell.
retention_tbl = pd.pivot_table(
    retention_df,
    values='retention_rate',
    index=['join_month', 'cohort_size'],
    columns='activity_index',
)

retention_tbl

"Plot" the table to make it easier to find the high / low retention cells and visually detect whether a pattern appears.

In [None]:
# See https://matplotlib.org/tutorials/colors/colormaps.html
# for valid cmaps. Examples:
#retention_tbl.style.background_gradient(cmap = 'Blues')
#retention_tbl.style.background_gradient(cmap = 'Wistia')

# NaNs are replaced with 0 before styling: per the original author's note,
# style.background_gradient() does not handle columns containing NaNs
# (observed starting with Python 3.7).
filled_tbl = retention_tbl.fillna(0)

# Keep the Styler as the last expression so the notebook renders it
filled_tbl.style.background_gradient(cmap='Greens')