In [2]:
import pandas as pd
import numpy as np

In this notebook, we score the current customer dataset in preparation for prediction. This involves the following steps:

1) Reload the score parameters saved from the historical customer dataset.
2) Convert the current customer metrics to scores using the historical dataset
statistics.
3) Reload the loading matrix created from the historical dataset.
4) Calculate average group scores for the current customers using the reloaded
loading matrix.

In [3]:
# Transform skewed scores
def transform_skew_columns(data, skew_col_names):
    """Apply the skew transform log(1 + x) to the named columns, in place.

    Args:
        data: DataFrame holding the metric columns; it is modified in place.
        skew_col_names: iterable of column labels in ``data`` to transform.

    Returns:
        None. Each listed column of ``data`` is overwritten with its
        transformed values.
    """
    for name in skew_col_names:
        # Shift by one before taking the logarithm so that zero maps to zero.
        shifted = 1.0 + data[name]
        data[name] = np.log(shifted)

When a distribution is normal or has thin tails, the most extreme values aren't too extreme relative to the middle of the distribution.

If the distribution of a metric has fat tails, the extreme values are further from the middle of the range, and there are more extreme values. Because of this, we use a 'fat-tails' version of the scoring formula, $\ln\left(x + \sqrt{x^2 + 1}\right)$ (the inverse hyperbolic sine of the metric $x$):

In [4]:
# Loop over all columns with fat tails and apply a fat-tail score formula
def transform_fattail_columns(data, fattail_col_names):
    """Apply the fat-tail transform ln(x + sqrt(x^2 + 1)) to the named columns, in place.

    The transform is the inverse hyperbolic sine. It is evaluated with
    ``np.arcsinh`` rather than the literal formula because the literal form
    loses all precision for large negative inputs: ``x + sqrt(x^2 + 1)``
    rounds to 0 and the logarithm becomes ``-inf``.

    Args:
        data: DataFrame holding the metric columns; it is modified in place.
        fattail_col_names: iterable of column labels in ``data`` to transform.

    Returns:
        None. Each listed column of ``data`` is overwritten with its
        transformed values.
    """
    for col in fattail_col_names:
        # arcsinh(x) == ln(x + sqrt(x^2 + 1)), but stable for every finite x.
        data[col] = np.arcsinh(data[col])

In [11]:
# Reload the previously saved data with the appropriate index columns
def reload_churn_data(data_set_path, suffix, is_customer_data):
    """Read a previously saved CSV that sits alongside the base dataset file.

    The file name is derived from ``data_set_path`` by replacing ``'.csv'``
    with ``'_<suffix>.csv'``.

    Args:
        data_set_path: path of the base dataset CSV.
        suffix: tag inserted before the ``.csv`` extension to select the file.
        is_customer_data: when truthy, the first two CSV columns are used as
            the index; otherwise only the first column is.

    Returns:
        The loaded DataFrame.
    """
    if is_customer_data:
        index_columns = [0, 1]
    else:
        index_columns = 0
    suffixed_path = data_set_path.replace('.csv', f'_{suffix}.csv')
    return pd.read_csv(suffixed_path, index_col=index_columns)

NOTE: Generally, it's a good idea to use scores for all metric groups, and regular (natural scale) metrics for any metrics that were not grouped, because this is easier for businesspeople to understand. For this reason, `save_segment_data` takes the columns for the groups and then adds the original unscaled metrics.

In [6]:
# Take the columns for the groups and add the original unscaled metrics.
def save_segment_data(current_data_grouped, current_data, load_mat_df, data_set_path):
    """Save a segmenting dataset: group scores plus the ungrouped metrics.

    A loading-matrix column with more than one non-zero entry is treated as a
    metric group and taken from ``current_data_grouped``; a column with exactly
    one non-zero entry is treated as an ungrouped metric and taken from
    ``current_data``. Columns with no non-zero entries are left out.

    Args:
        current_data_grouped: DataFrame with one column per loading-matrix column.
        current_data: DataFrame of per-metric values, joined on its index.
        load_mat_df: loading matrix (rows are metrics, columns are groups).
        data_set_path: base dataset path; the output is written to this path
            with ``'.csv'`` replaced by ``'_current_groupmets_segment.csv'``.

    Returns:
        None. The result is written to disk.
    """
    # Count the non-zero loadings in each column of the loading matrix
    loadings_per_column = load_mat_df.astype(bool).sum(axis=0)
    grouped_names = load_mat_df.columns[loadings_per_column > 1]
    ungrouped_names = load_mat_df.columns[loadings_per_column == 1]

    # Make a version of the dataset for segmenting
    segment_frame = current_data_grouped[grouped_names].join(current_data[ungrouped_names])
    segment_save_path = data_set_path.replace('.csv', '_current_groupmets_segment.csv')
    segment_frame.to_csv(segment_save_path, header=True)


In [7]:
# Group the current customer data
def group_current_data(scaled_data, load_mat_df, data_set_path):
    """Compute group scores by multiplying the scores with the loading matrix.

    Args:
        scaled_data: DataFrame of metric scores; it must contain a column for
            every row label of ``load_mat_df``.
        load_mat_df: loading matrix (rows are metrics, columns are groups).
        data_set_path: base dataset path; the result is written to this path
            with ``'.csv'`` replaced by ``'_current_groupscore.csv'``.

    Returns:
        DataFrame of group scores with the same index as ``scaled_data`` and
        the columns of ``load_mat_df``. It is also saved to disk.
    """
    # Put the score columns in the same order as the loading matrix rows
    ordered_scores = scaled_data[load_mat_df.index.values]

    # Matrix product: (customers x metrics) @ (metrics x groups)
    group_scores = ordered_scores.to_numpy() @ load_mat_df.to_numpy()
    current_data_grouped = pd.DataFrame(
        group_scores,
        columns=load_mat_df.columns.values,
        index=ordered_scores.index,
    )

    # Persist the grouped scores next to the base dataset
    score_save_path = data_set_path.replace('.csv','_current_groupscore.csv')
    current_data_grouped.to_csv(score_save_path, header=True)
    print('Saving grouped results to %s' % score_save_path)
    return current_data_grouped

In [8]:
def score_current_data(current_data, score_df, data_set_path):
    """Convert metrics to scores using previously saved means and standard deviations.

    Args:
        current_data: DataFrame of metrics; it must contain a column for every
            row label of ``score_df``.
        score_df: DataFrame indexed by metric name with ``'mean'`` and
            ``'std'`` columns.
        data_set_path: base dataset path; the scores are written to this path
            with ``'.csv'`` replaced by ``'_current_scores.csv'``.

    Returns:
        DataFrame of scores, ``(metric - mean) / std``, with columns ordered
        like the rows of ``score_df``. It is also saved to disk.
    """
    # Put the metric columns in the same order as the score parameters
    ordered_metrics = current_data[score_df.index.values]

    # Centre on the saved mean, then scale by the saved standard deviation
    centred = ordered_metrics - score_df['mean']
    scaled_data = centred / score_df['std']

    score_save_path = data_set_path.replace('.csv','_current_scores.csv')
    scaled_data.to_csv(score_save_path, header=True)
    print('Saving score results to %s' % score_save_path)
    return scaled_data

In [9]:
def rescore_metrics(data_set_path):
    """Score and group the current customer data using the saved historical parameters.

    Reloads the loading matrix (``_load_mat``), the score parameters
    (``_score_params``) and the current customer metrics (``_current``) that
    sit alongside ``data_set_path``, then:

    1. applies the skew and fat-tail transforms to the flagged metrics,
    2. converts the transformed metrics to scores,
    3. computes the group scores with the loading matrix,
    4. saves a segmenting dataset of group scores plus ungrouped metrics.

    The transforms run on a copy of the loaded data so that the segmenting
    dataset receives the ungrouped metrics in their original, untransformed
    scale. Transforming in place would leak log-scaled values into that file.

    Args:
        data_set_path: path of the base dataset CSV that all input and output
            file names are derived from.

    Returns:
        None. All results are written to disk by the helper functions.

    Raises:
        AssertionError: if the current data columns do not match the metrics
            in the score parameters or in the loading matrix.
    """
    # Reload the loading matrix
    load_mat_df = reload_churn_data(data_set_path, 'load_mat', is_customer_data=False)

    # Reload the parameters saved during scoring
    score_df = reload_churn_data(data_set_path, 'score_params', is_customer_data=False)

    # Load the current customer data
    current_data = reload_churn_data(data_set_path, 'current', is_customer_data=True)

    assert set(score_df.index.values) == set(current_data.columns.values), "Data to re-score does not match transform params"
    assert set(load_mat_df.index.values) ==set(current_data.columns.values), "Data to re-score does not match loading matrix"

    # Work on a copy: the transform helpers modify their input in place, and
    # the untransformed metrics are still needed for the segmenting dataset.
    transformed_data = current_data.copy()

    # Transform any columns which were determined to be skewed (i.e. has a skew score)
    transform_skew_columns(transformed_data, score_df[score_df['skew_score']].index.values)

    # Transform any columns which were determined to be fat-tailed (i.e. has a fat-tail score)
    transform_fattail_columns(transformed_data, score_df[score_df['fattail_score']].index.values)

    scaled_data = score_current_data(transformed_data, score_df, data_set_path)
    grouped_data = group_current_data(scaled_data, load_mat_df, data_set_path)

    # Pass the original (natural scale) metrics, not the transformed copy
    save_segment_data(grouped_data, current_data, load_mat_df, data_set_path)


In [12]:
# Base dataset path; every input and output file name is derived from it
data_set_path = '../../output/socialnet7_dataset2.csv'

# Run the full re-scoring pipeline for the current customers
rescore_metrics(data_set_path=data_set_path)

Saving score results to ../../output/socialnet7_dataset2_current_scores.csv
Saving grouped results to ../../output/socialnet7_dataset2_current_groupscore.csv
