In [113]:
import plotly.plotly as py
import plotly.offline as off
from plotly.graph_objs import Scatter, Layout
from plotly import tools

In [114]:
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd

from collections import OrderedDict

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Set up plotly's notebook integration before any figure is drawn.
# NOTE(review): presumably connected=True makes the notebook load plotly.js from a CDN
# instead of embedding it -- confirm against the plotly.offline documentation.
init_notebook_mode(connected=True)
import plotly.plotly as py
import plotly.graph_objs as go

In [115]:
# Load the pre-computed frame that backs the model-performance chart.
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source.
df_plot = pd.read_pickle('dataframe_for_plots.pkl')

# Relabel the journey stage of the third row. This is a single positional write on the
# frame itself: the chained form `df_plot['info_level'].iloc[2] = ...` may assign into a
# temporary copy (SettingWithCopyWarning) and silently leave `df_plot` unchanged.
df_plot.iloc[2, df_plot.columns.get_loc('info_level')] = 'first_purchase'

In [116]:
# Turn the snake_case journey-stage labels into readable text (underscores -> spaces).
info_level_list = [label.replace('_', ' ') for label in df_plot['info_level']]

df_plot['info_level'] = info_level_list

In [117]:
# Round every threshold column to one decimal place for display.
# Series.round is vectorised and does not depend on the element type exposing a
# `.round` method (iterating a float column yields plain Python floats in current
# pandas, which have no such method).
# NOTE(review): assumes these columns are numeric -- confirm against the pickled frame.
for col in ['1000', '2000', '5000', '10000']:
    df_plot[col] = df_plot[col].round(1)

In [118]:
# Summary table: for each high-value threshold, the percentage of revenue and the
# percentage of customers associated with it.
threshold_labels = ['£100', '£200', '£500', '£1000']
thres_df = pd.DataFrame(
    {
        'High value threshold': threshold_labels,
        'Revenue (%)': [88.3, 82.5, 71.0, 60.8],
        'Customer (%)': [17.0, 10.9, 5.4, 2.8],
    },
    columns=['High value threshold', 'Revenue (%)', 'Customer (%)'],
)

# Transposed view: one row per metric, one column per threshold. The first transposed
# row only repeats the threshold labels, so it is dropped and the labels become the
# column names instead.
trans_df = thres_df.transpose().iloc[1:]
trans_df.columns = threshold_labels

Predicting high value customers is important for two different reasons. 

The first is to flag predicted high value customers so that resources can be targeted more efficiently. 

The second is to allow marketing to optimise, as quickly as possible, the advertising and channels that bring in high value customers, in preference to channels that bring in customers more cheaply but of significantly lower value. 

### What is a high value customer?

In this work we have defined a high value customer as someone who spends above a given threshold in the first six months. As in our previous work, we have seen that only a few users significantly change their behaviour, so this simplification allows for very concrete metrics without losing the details. In what follows, we use four different thresholds: £100, £200, £500 and £1000.

The following graph shows the percentage of customers above each threshold and the percentage of revenue that they contribute.

In [119]:
# One bar trace per threshold. Every trace has the same two x categories (the metric
# names held in trans_df's index), so each metric shows one bar per threshold.
data = [
    go.Bar(x=trans_df.index, y=trans_df[col], name=col)
    for col in trans_df.columns
]

layout = go.Layout(
    title='Comparison of the different thresholds',
    # 'overlay' draws the bars of one metric on top of each other rather than side by side.
    barmode='overlay',
    # The bars cover both the revenue and the customer metric, so the axis label
    # must not name only one of them.
    yaxis=dict(title='Percentage (%)'),
)
fig = go.Figure(data=data, layout=layout)
off.iplot(fig, filename='side-by-side-subplot')

In [120]:
# Left panel: percentage of revenue per threshold (default x/y axes).
trace1 = go.Bar(
    x=thres_df['High value threshold'],
    y=thres_df['Revenue (%)'],
    name='Revenue',
)

# Right panel: percentage of customers per threshold, drawn on the second axis pair.
trace2 = go.Bar(
    x=thres_df['High value threshold'],
    y=thres_df['Customer (%)'],
    xaxis='x2',
    yaxis='y2',
    name='Customers',
)

data = [trace1, trace2]
layout = go.Layout(
    title='Comparison of the different thresholds',
    # The two x-axis domains split the figure width, leaving a gap between the panels.
    xaxis=dict(domain=[0, 0.4]),
    yaxis=dict(title='Revenue (%)'),
    xaxis2=dict(domain=[0.55, 1]),
    # Anchor the second y axis to the second x axis so it sits beside the right panel.
    yaxis2=dict(anchor='x2', title='Customers (%)'),
)
fig = go.Figure(data=data, layout=layout)
off.iplot(fig, filename='side-by-side-subplot')

In [121]:
# One marker per threshold: percentage of customers (x) against percentage of
# revenue (y); the threshold label is attached to each point as its text.
trace1 = go.Scatter(
    x=thres_df['Customer (%)'],
    y=thres_df['Revenue (%)'],
    text=thres_df['High value threshold'],
    mode='markers',
    marker=dict(
        size=15,
        color='rgba(255, 182, 193, .9)',
        line=dict(width=4),
    ),
)

data = [trace1]
# No `barmode` here: the figure contains no bar traces, so the setting had no effect.
layout = go.Layout(
    title='Comparison of the different thresholds',
    xaxis=dict(title='Customer (%)', range=[0, 25]),
    yaxis=dict(title='Revenue (%)', range=[0, 100]),
)

fig = go.Figure(data=data, layout=layout)
off.iplot(fig, filename='grouped-bar')


The choice of threshold should not be set in stone; it should depend on the task at hand. For instance, for personalised engagement the £500 threshold might be appropriate, whereas for prioritising initial phone calls and introductory packages the £100 threshold might be more appropriate.

## The model

After an initial exploration phase, we selected the relevant features and built a Machine Learning model. The model assigns to each customer a probability of being a high value customer (HVC). If we rank the customers by this probability, the model becomes an effective tool for prioritising which customers to target. 

The blue line on the plot shows a random sorting of the clients. Without a model, the HVCs are scattered randomly through the list of clients, so in the first 20% of the list we can expect to find 20% of the HVCs, in the first 50% of the list 50% of the HVCs, and so on. 

If we sort the list according to the probability assigned by our model, however, we can see that 50% of HVCs can be found in less than 10% of the list of customers. This means a large part of the HVCs can be reached by targeting a small percentage of customers. 

As an example, if we choose the £200 threshold and look at the data available after 1 week, we see that we would need to call 8.3% of customers to reach 50% of the customers who will go on to spend over £200 in the first six months. To get the same coverage of HVCs by calling customers at random, we would have to call 50% of the users, so the model is roughly 6 times as efficient (50% / 8.3% ≈ 6).


## Model performance
The following graph shows how the model performs at each stage in the customer journey from sign up through to when they have been users for 30 days.

As a metric to compare the models we have used "% of customers we need to interact with to reach 50% of the HVCs". 



In [122]:
# (df_plot column, legend label) for each high-value threshold.
# NOTE(review): the column names are ten times the pound labels -- presumably a
# different unit in the source data; confirm before changing either side.
threshold_series = [
    ('1000', '£100'),
    ('2000', '£200'),
    ('5000', '£500'),
    ('10000', '£1000'),
]

# One bar trace per threshold, all sharing the journey stage on the x axis.
data = [
    go.Bar(x=df_plot['info_level'], y=df_plot[column], name=label)
    for column, label in threshold_series
]

layout = go.Layout(
    title='Performance of model',
    # 'overlay' draws the bars of one stage on top of each other rather than side by side.
    barmode='overlay',
    xaxis=dict(title='Position in journey'),
    yaxis=dict(title='Percentage of customers to reach 50% of HV customers'),
)

fig = go.Figure(data=data, layout=layout)
off.iplot(fig, filename='grouped-bar')
