In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from math import exp
import pickle



### Logistic Regression

We forecast the probability that a customer in a particular cohort will churn during the lead time before their next renewal. 

A key feature of the model for forecasting churn and retention is that the relationship between engagement and retention is subject to diminishing returns, i.e.
- even the most engaged customer has a chance of churning
- even the least engaged customer has a chance of being retained

Although engagement is not directly measurable, we assume that behaviour can be estimated from the customer metrics that we've produced. 

Each behavioural metric score is multiplied by an engagement strength (weight/coefficient) that captures how much the behaviour (or group of behaviours) contributes to engagement. Overall engagement is the sum of the contributions for each behaviour, plus an _intercept_ term which shifts the sigmoidal curve such that a user with zero engagement (average user) has a realistic probability forecast for retention and churn.

We set up the model to predict _retention_ because this is easier to interpret: a positive number to represent something good is more intuitive than a negative number.

<u>Relationship between metrics and retention probability</u>

The _retention impact_ of a metric or group of metrics is the difference that it makes to the retention probability for a customer to be one standard deviation above the average in this metric, assuming that all the other metrics are exactly average.

If the retention impact for a metric is 2%, a customer who is one standard deviation
above average on that metric and average in all the other metrics has a forecast retention probability 2% higher than the average retention probability.

In [2]:
# Put the data in the form needed for regression
def prepare_data(data_set_path, ext='_groupscore', as_retention=True):
    """Load the scored data set and split it into metrics and a Boolean outcome.

    The file that is read is ``data_set_path`` with ``'.csv'`` replaced by
    ``'<ext>.csv'``; its first two columns are used as the index.

    Parameters
    ----------
    data_set_path : str
        Path of the base data set. Only used to derive the scored file's path.
    ext : str, default '_groupscore'
        Suffix inserted before ``.csv`` to locate the scored file.
    as_retention : bool, default True
        If True the outcome is inverted, so that True means "not churned".
        If False the outcome is the ``is_churn`` flag itself.

    Returns
    -------
    X : pandas.DataFrame
        Every column of the scored file except ``is_churn``.
    y : pandas.Series
        Boolean outcome aligned with ``X``.
    """
    score_save_path = data_set_path.replace('.csv', '{}.csv'.format(ext))
    grouped_data = pd.read_csv(score_save_path, index_col=[0, 1])

    # Separate the outcome and convert it to Boolean. The builtin ``bool`` is
    # used because the ``np.bool`` alias no longer exists in current NumPy.
    y = grouped_data['is_churn'].astype(bool)
    if as_retention:
        # True for retention
        y = ~y

    # Separate the metrics
    X = grouped_data.drop(['is_churn'], axis=1)

    return X, y

In [3]:
# Complement of the logistic sigmoid: 1 - 1 / (1 + exp(-x))
def s_curve(x):
    """Return ``1 - 1 / (1 + exp(-x))`` for a scalar ``x``.

    This is the complement of the logistic sigmoid, i.e. it equals
    ``sigmoid(-x)``: 0.5 at ``x == 0``, approaching 0 for large positive ``x``
    and 1 for large negative ``x``.

    The value is computed in an algebraically equivalent form that only ever
    exponentiates a non-positive number, so ``math.exp`` cannot overflow for
    inputs of large magnitude.
    """
    if x >= 0:
        # exp(-x) / (1 + exp(-x)) == 1 - 1 / (1 + exp(-x))
        decay = exp(-x)
        return decay / (1.0 + decay)
    # 1 / (1 + exp(x)) is the same quantity, safe when x is very negative
    return 1.0 / (1.0 + exp(x))

In [None]:
# Calculate the impact of being one standard deviation above average
def calculate_impacts(retain_reg):
    """Compute the per-coefficient impact of a one-standard-deviation increase.

    Parameters
    ----------
    retain_reg : fitted model
        Object exposing ``intercept_`` and ``coef_``; the first row of
        ``coef_`` is used.

    Returns
    -------
    one_stdev_impact : numpy.ndarray
        For every coefficient ``c``:
        ``s_curve(-intercept - c) - s_curve(-intercept)``.
    average_retain : float
        ``s_curve(-intercept)``, the baseline forecast for a customer whose
        metric scores are all zero (i.e. exactly average).
    """
    # intercept_ is array-like; take the scalar explicitly rather than relying
    # on the implicit one-element-array -> float conversion inside math.exp,
    # which NumPy has deprecated.
    intercept = float(np.ravel(retain_reg.intercept_)[0])

    # Baseline forecast for a perfectly average customer (all scores zero)
    average_retain = s_curve(-intercept)
    # Forecast when one metric is one standard deviation above average
    one_stdev_retain = np.array([s_curve(-intercept - c) for c in retain_reg.coef_[0]])
    # The impact is the probability difference for one standard deviation above average
    one_stdev_impact = one_stdev_retain - average_retain

    return one_stdev_impact, average_retain

In [None]:
# Save a summary of the regression model
def save_regression_summary(data_set_path, retain_reg,ext=''):
    """Write a CSV summarising the fitted regression.

    The summary has one row per entry of the ``_groupmets.csv`` file that
    accompanies ``data_set_path``, plus a final ``'offset'`` row for the
    intercept. It is written to ``data_set_path`` with ``'.csv'`` replaced by
    ``'_logreg_summary<ext>.csv'``.

    Parameters
    ----------
    data_set_path : str
        Path of the base data set; used to derive the input and output paths.
    retain_reg : fitted model
        Object exposing ``coef_`` and ``intercept_``.
    ext : str, default ''
        Suffix appended to the output file's base name.
    """
    impacts, baseline = calculate_impacts(retain_reg)

    # The metric groups supply the row labels and the metrics in each group
    groups_path = data_set_path.replace('.csv', '_groupmets.csv')
    metric_groups = pd.read_csv(groups_path, index_col=0)

    # Each column is the per-group values followed by the intercept's entry
    summary_columns = {
        'group_metric_offset': np.append(metric_groups.index, 'offset'),
        'weight': np.append(retain_reg.coef_[0], retain_reg.intercept_),
        'retain_impact': np.append(impacts, baseline),
        'group_metrics': np.append(metric_groups['metrics'], '(baseline)'),
    }
    summary = pd.DataFrame.from_dict(summary_columns)

    summary_path = data_set_path.replace('.csv', '_logreg_summary{}.csv'.format(ext))
    summary.to_csv(summary_path, index=False)

In [None]:
# Save the regression model itself by pickling it
def save_regression_model(data_set_path, retain_reg, ext=''):
    """Pickle ``retain_reg`` to a file derived from ``data_set_path``.

    The output path is ``data_set_path`` with ``'.csv'`` replaced by
    ``'_logreg_model<ext>.pkl'``.
    """
    model_suffix = '_logreg_model{}.pkl'.format(ext)
    target_path = data_set_path.replace('.csv', model_suffix)
    with open(target_path, 'wb') as model_file:
        pickle.dump(retain_reg, model_file)

In [None]:
def save_dataset_predictions(data_set_path, retain_reg, X,ext='', as_retention=True):
    """Save the model's class probabilities for every row of ``X`` to CSV.

    The output path is ``data_set_path`` with ``'.csv'`` replaced by
    ``'_predictions<ext>.csv'``.

    Parameters
    ----------
    data_set_path : str
        Path of the base data set; only used to derive the output path.
    retain_reg : fitted model
        Object exposing ``predict_proba``; it must return two columns.
    X : pandas.DataFrame
        Metrics to predict on; its index is reused for the output.
    ext : str, default ''
        Suffix appended to the output file's base name.
    as_retention : bool, default True
        Whether the model was fitted with retention as the True outcome.
        Pass False for a model fitted on the churn flag so that the two
        probability columns are labelled the right way round.
    """
    predictions = retain_reg.predict_proba(X)

    # The first probability column belongs to the False outcome and the second
    # to the True outcome (scikit-learn orders columns by sorted class label).
    # Which of those means "churn" depends on how the outcome was encoded.
    if as_retention:
        column_names = ['churn_prob', 'retain_prob']
    else:
        column_names = ['retain_prob', 'churn_prob']

    # Make a new DataFrame for saving the predictions
    predict_df = pd.DataFrame(predictions, index=X.index, columns=column_names)
    predict_path = data_set_path.replace('.csv', '_predictions{}.csv'.format(ext))
    predict_df.to_csv(predict_path, header=True)
    print('Saved dataset predictions to ' + predict_path)

The `LogisticRegression` object takes a few parameters:
- `fit_intercept=True` - tells the logistic regression that an offset is included in the model.
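- `solver='liblinear'`, `penalty='l1'` - control the method used to find the weights and the offset. With `penalty='l1'` the model uses L1 (lasso) regularization rather than ridge (L2) regression; it shrinks the weights of uninformative or redundant metrics towards zero, which helps when many metrics are correlated.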

In [None]:
# Perform logistic regression
def logistic_regression(data_set_path, as_retention=True):
    """Fit a logistic regression on the scored data set and save the results.

    Loads the metrics and outcome with ``prepare_data``, fits an
    L1-penalised ``LogisticRegression`` (liblinear solver, with intercept),
    then saves a summary, the pickled model, and the predictions for the
    training data. Output files carry no suffix when ``as_retention`` is True
    and the suffix ``'_churn'`` otherwise.

    Parameters
    ----------
    data_set_path : str
        Path of the base data set; all input and output paths derive from it.
    as_retention : bool, default True
        If True the model is fitted with retention as the True outcome,
        otherwise with churn as the True outcome.
    """
    # Load the metrics and the outcome in the form the regression needs
    metrics, outcome = prepare_data(data_set_path, as_retention=as_retention)

    # Fit the model coefficients based on the churn data
    retain_reg = LogisticRegression(penalty='l1', solver='liblinear', fit_intercept=True)
    retain_reg.fit(metrics, outcome)

    # Churn-oriented runs are written to separately named files
    if as_retention:
        file_ext = ''
    else:
        file_ext = '_churn'

    # Save a summary of the result, the model, and its predictions.
    # NOTE(review): as called here, the prediction file's columns are labelled
    # churn_prob / retain_prob regardless of as_retention; for
    # as_retention=False the True outcome is churn, so those labels look
    # swapped -- TODO confirm.
    save_regression_summary(data_set_path, retain_reg, file_ext)
    save_regression_model(data_set_path, retain_reg, file_ext)
    save_dataset_predictions(data_set_path, retain_reg, metrics, file_ext)