This notebook is to fine tune the XGBoost model and perform evaulation on user data for 7 day Free Trial.

The whole process has been divided into 2 notebooks: 

- Part 1: Data Preprocessing: 6.0_sk_fine_tuning_FT_propensity_data_preprocessing.ipynb 
- part 2: Data Modeling and Evaluation: 6.0_sk_fine_tuning_FT_propensity_data_modeling.ipynb (this notebook)

In [369]:
!free -m

             total       used       free     shared    buffers     cached
Mem:         70342      39816      30526          0        936       8534
-/+ buffers/cache:      30345      39997
Swap:            0          0          0


In [371]:
import warnings
warnings.filterwarnings('ignore')

In [303]:
import os
import json
import numpy as np
import pandas as pd

In [5]:
from sklearn.metrics import accuracy_score, auc, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [6]:
import matplotlib.pyplot as plt
import seaborn as sn

In [7]:
import boto3
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.predictor import csv_serializer
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
import sagemaker.xgboost as xgboost

In [8]:
RANDOM_STATE=101
SMALL_DATASET=False

In [9]:
TARGET_COL="FLG_TARGET"
SEGMENT_COL="FT_SEGMENT"

In [10]:
BUCKET = "datascience-hbo-users"
PREFIX = "users/sk/FT_propensity/7_day"
DATA_PREFIX=PREFIX+"/model_input_data"
MODEL_PREFIX=PREFIX+"/model_artifacts"
INFERENCE_PREFIX=PREFIX+"/inference"

# 1. Data Ingestion

In [316]:
import gc
gc.collect()

0

# 2. Data Processing/Cleaning

## Sanity Check of S3 data

In [372]:
train_file_name="train.csv"
test_file_name="test.csv"
val_file_name="val.csv"

s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/{}'.format(BUCKET, DATA_PREFIX, train_file_name), content_type='csv')
s3_input_test = sagemaker.s3_input(s3_data='s3://{}/{}/{}'.format(BUCKET, DATA_PREFIX, test_file_name), content_type='csv')
s3_input_val = sagemaker.s3_input(s3_data='s3://{}/{}/{}'.format(BUCKET, DATA_PREFIX, val_file_name), content_type='csv')



In [13]:
TRAIN_DATA_URL=s3_input_train.config["DataSource"]["S3DataSource"]["S3Uri"]
TEST_DATA_URL=s3_input_test.config["DataSource"]["S3DataSource"]["S3Uri"]
VAL_DATA_URL=s3_input_val.config["DataSource"]["S3DataSource"]["S3Uri"]

# df_train_s3=pd.read_csv(TRAIN_DATA_URL , header=None)
#df_test_s3=pd.read_csv(TEST_DATA_URL, header=None)
# df_val_s3=pd.read_csv(VAL_DATA_URL, header=None)

In [14]:
# assert not df_train_s3.isnull().sum().any()
# assert not df_test_s3.isnull().sum().any()
# assert not df_val_s3.isnull().sum().any()

In [15]:
train_side_info_file_name="train_side_info.csv"
test_side_info_file_name="test_side_info.csv"
val_side_info_file_name="val_side_info.csv"

s3_input_train_side_info = sagemaker.s3_input(s3_data='s3://{}/{}/{}'.format(BUCKET, DATA_PREFIX, train_side_info_file_name), content_type='csv')
s3_input_test_side_info = sagemaker.s3_input(s3_data='s3://{}/{}/{}'.format(BUCKET, DATA_PREFIX, test_side_info_file_name), content_type='csv')
s3_input_val_side_info = sagemaker.s3_input(s3_data='s3://{}/{}/{}'.format(BUCKET, DATA_PREFIX, val_side_info_file_name), content_type='csv')

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [86]:
# TRAIN_DATA_SIDE_INFO_URL=s3_input_train_side_info.config["DataSource"]["S3DataSource"]["S3Uri"]
TEST_DATA_SIDE_INFO_URL=s3_input_test_side_info.config["DataSource"]["S3DataSource"]["S3Uri"]
VAL_DATA_SIDE_INFO_URL=s3_input_val_side_info.config["DataSource"]["S3DataSource"]["S3Uri"]
side_info_cols=["UNIQUE_ID", "HBO_UUID", "PERIOD_RANK", "FT_SEGMENT"]
# df_train_side_info_s3=pd.read_csv(TRAIN_DATA_SIDE_INFO_URL, names=side_info_cols)
df_test_side_info_s3=pd.read_csv(VAL_DATA_SIDE_INFO_URL, names=side_info_cols)
df_val_side_info_s3=pd.read_csv(TEST_DATA_SIDE_INFO_URL, names=side_info_cols)

In [288]:
df_test_side_info_s3.shape

(684451, 5)

In [370]:
# assert df_train_s3.shape[1]==df_test_s3.shape[1]
# assert df_train_s3.shape[1]==df_val_s3.shape[1]

In [62]:
# del df_train_s3
# del df_test_s3
# del df_val_s3

# 4. Feature Engineering

In [None]:
# Create a flag STREAM for NUM_STREAM_ADJ>0
# Discuss with Cindy to create more features

# 5. Model Training

In [24]:
region = boto3.Session().region_name
smclient = boto3.Session().client('sagemaker')
role = sagemaker.get_execution_role()
sess = sagemaker.Session()
container = get_image_uri(region, 'xgboost', repo_version='latest')

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
	get_image_uri(region, 'xgboost', '1.0-1').


## 5.1 Model 1

In [25]:
xgb_model_1 = sagemaker.estimator.Estimator(container,
                                    role, 
                                    train_instance_count=5, 
                                    train_instance_type='ml.m4.4xlarge',
                                    output_path='s3://{}/{}/{}'.format(BUCKET, MODEL_PREFIX, 'baseline'),
                                    sagemaker_session=sess)
xgb_model_1.set_hyperparameters(
                        eval_metric='auc'
                        , alpha=1.218487609
                        , eta=0.225242353
                        , max_depth=10
                        , min_child_weight=2.284773815
                        , num_round=100
                        , objective='binary:logistic'
                        , rate_drop=0.3
                        , tweedie_variance_power=1.4
                        )

xgb_model_1.fit({'train': s3_input_train, 'validation': s3_input_val})



2020-07-15 21:36:12 Starting - Starting the training job...
2020-07-15 21:36:16 Starting - Launching requested ML instances......
2020-07-15 21:37:30 Starting - Preparing the instances for training.........
2020-07-15 21:39:07 Downloading - Downloading input data...............
2020-07-15 21:41:34 Training - Downloading the training image..[33mArguments: train[0m
[33m[2020-07-15:21:41:55:INFO] Running distributed xgboost training.[0m
[36mArguments: train[0m
[36m[2020-07-15:21:41:55:INFO] Running distributed xgboost training.[0m
[32mArguments: train[0m
[35mArguments: train[0m
[35m[2020-07-15:21:41:55:INFO] Running distributed xgboost training.[0m
[32m[2020-07-15:21:41:56:INFO] Running distributed xgboost training.[0m
[34mArguments: train[0m
[34m[2020-07-15:21:41:56:INFO] Running distributed xgboost training.[0m
[33m[2020-07-15:21:41:58:INFO] Number of hosts: 5, master IP address: 10.2.149.189, host IP address: 10.2.149.194.[0m
[33m[2020-07-15:21:41:58:INFO] Finish

[34m2020-07-15 21:44:06,247 INFO [14]#011train-auc:0.896495#011validation-auc:0.893952[0m
[34m2020-07-15 21:44:11,425 INFO [15]#011train-auc:0.897159#011validation-auc:0.894474[0m
[34m2020-07-15 21:44:16,749 INFO [16]#011train-auc:0.89776#011validation-auc:0.894941[0m
[34m2020-07-15 21:44:22,027 INFO [17]#011train-auc:0.898277#011validation-auc:0.895342[0m
[34m2020-07-15 21:44:27,399 INFO [18]#011train-auc:0.898804#011validation-auc:0.895748[0m
[34m2020-07-15 21:44:32,608 INFO [19]#011train-auc:0.899202#011validation-auc:0.896036[0m
[34m2020-07-15 21:44:37,827 INFO [20]#011train-auc:0.899694#011validation-auc:0.896417[0m
[34m2020-07-15 21:44:42,919 INFO [21]#011train-auc:0.900196#011validation-auc:0.896803[0m
[34m2020-07-15 21:44:48,068 INFO [22]#011train-auc:0.900694#011validation-auc:0.897155[0m
[34m2020-07-15 21:44:53,421 INFO [23]#011train-auc:0.901198#011validation-auc:0.897564[0m
[34m2020-07-15 21:44:58,840 INFO [24]#011train-auc:0.901712#011validation-auc:0.

[35m[2020-07-15:21:52:21:INFO] Master host is not alive. Training might have finished. Shutting down.... Check the logs for algo-1 machine.[0m
[32m[2020-07-15:21:52:24:INFO] Master host is not alive. Training might have finished. Shutting down.... Check the logs for algo-1 machine.[0m
[33m[2020-07-15:21:52:54:INFO] Master host is not alive. Training might have finished. Shutting down.... Check the logs for algo-1 machine.[0m

2020-07-15 21:53:11 Uploading - Uploading generated training model
2020-07-15 21:53:11 Completed - Training job completed
[36m[2020-07-15:21:52:55:INFO] Master host is not alive. Training might have finished. Shutting down.... Check the logs for algo-1 machine.[0m
Training seconds: 4220
Billable seconds: 4220


## 5.3 Hyperparameter Tuning Job 1

## 5.4 Hyperparameter Tuning Job 2

## 5.5 Hyperparameter Tuning Job 3

# 6. Model Prediction

## Batch Prediction

In [41]:
def batch_transform(model, s3_data_path, dataset_type="train", bucket=BUCKET, prefix=INFERENCE_PREFIX):
    model_transformer = model.transformer(instance_count=2,
                                  instance_type='ml.m4.xlarge',
                                  strategy='MultiRecord',
                                  assemble_with='Line',
                                  output_path='s3://{}/{}/transform/{}'.format(bucket, prefix, dataset_type)
                                  , accept="text/csv")

    return model_transformer.transform(data=s3_data_path, content_type='text/csv', split_type='Line', input_filter='$[1:]', join_source='Input', output_filter='$[0,-1]')

In [42]:
batch_transform(model=xgb_model_1, s3_data_path=VAL_DATA_URL, dataset_type="val")



In [43]:
batch_transform(model=xgb_model_1, s3_data_path=TEST_DATA_URL, dataset_type="test")



In [52]:
val_batch_output_file_name='s3://{}/{}/transform/val/{}.out'.format(BUCKET, INFERENCE_PREFIX, val_file_name)

val_batch_output_file_name

's3://datascience-hbo-users/users/sk/FT_propensity/7_day/inference/transform/val/val.csv.out'

In [53]:
test_batch_output_file_name='s3://{}/{}/transform/test/{}.out'.format(BUCKET, INFERENCE_PREFIX, test_file_name)

test_batch_output_file_name

's3://datascience-hbo-users/users/sk/FT_propensity/7_day/inference/transform/test/test.csv.out'

In [60]:
df_val_batch_results_s3=pd.read_csv(val_batch_output_file_name, header=None, names=["ACTUAL", "PROBABILITY"])

df_val_batch_results_s3.head()

Unnamed: 0,ACTUAL,PROBABILITY
0,0,0.088643
1,0,0.032327
2,1,0.806418
3,1,0.919064
4,1,0.829879


In [81]:
df_val_batch_results_s3.shape

(616006, 2)

In [323]:
df_test_batch_results_s3=pd.read_csv(test_batch_output_file_name, header=None, names=["ACTUAL", "PROBABILITY"])

df_test_batch_results_s3.head()

Unnamed: 0,ACTUAL,PROBABILITY
0,1,0.875091
1,1,0.768698
2,1,0.77561
3,1,0.931324
4,1,0.521629


In [324]:
val_batch_auc=roc_auc_score(df_val_batch_results_s3["ACTUAL"],df_val_batch_results_s3["PROBABILITY"]) 
test_batch_auc=roc_auc_score(df_test_batch_results_s3["ACTUAL"],df_test_batch_results_s3["PROBABILITY"])

In [325]:
val_batch_auc

0.9106199423653188

In [326]:
test_batch_auc

0.9111673832627701

In [327]:
df_test_side_info_s3.PERIOD_RANK.value_counts()/df_test_side_info_s3.shape[0]

7    0.142858
6    0.142858
5    0.142858
4    0.142858
2    0.142858
3    0.142856
1    0.142856
Name: PERIOD_RANK, dtype: float64

In [328]:
df_test_side_info_s3[["ACTUAL"]]=df_test_batch_results_s3[["ACTUAL"]]

In [329]:
df_test_side_info_s3.groupby(by=["PERIOD_RANK", "ACTUAL"]).size().reset_index().rename(columns={0:'COUNT_UUID'})

Unnamed: 0,PERIOD_RANK,ACTUAL,COUNT_UUID
0,1,0,42296
1,1,1,55482
2,2,0,42297
3,2,1,55482
4,3,0,42296
5,3,1,55482
6,4,0,42297
7,4,1,55482
8,5,0,42297
9,5,1,55482


In [330]:
df_test_batch_results_s3[["FT_SEGMENT"]]=df_test_side_info_s3[["FT_SEGMENT"]]

In [331]:
df_test_batch_results_s3_step_1.head()

Unnamed: 0,ACTUAL,PROBABILITY,FT_SEGMENT,PREDICTED,COUNT_FT_SEGMENT
0,1,0.875091,04: Hooked & Ongoing,1,16172
1,1,0.768698,01: Friends & BBT Fan,1,42499
2,1,0.77561,12: Never Stream,1,173851
3,1,0.931324,05: Hooked On Library,1,79133
4,1,0.521629,09: Mobile First,1,66415


In [332]:
assert df_test_batch_results_s3.ACTUAL.isnull().sum()==0
assert df_test_batch_results_s3.PROBABILITY.isnull().sum()==0

# 7. Model Evaluation

## Find the optimal threshold using AUC curve

In [333]:
THRESHOLD=0.5

test_pred=np.where(df_test_batch_results_s3[["PROBABILITY"]] > THRESHOLD, 1, 0)

df_test_batch_results_s3["PREDICTED"]=test_pred

assert df_test_batch_results_s3.PREDICTED.isnull().sum()==0

In [334]:
df_test_batch_results_s3_step_1.head()

Unnamed: 0,ACTUAL,PROBABILITY,FT_SEGMENT,PREDICTED,COUNT_FT_SEGMENT
0,1,0.875091,04: Hooked & Ongoing,1,16172
1,1,0.768698,01: Friends & BBT Fan,1,42499
2,1,0.77561,12: Never Stream,1,173851
3,1,0.931324,05: Hooked On Library,1,79133
4,1,0.521629,09: Mobile First,1,66415


In [335]:
val_pred=np.where(df_val_batch_results_s3[["PROBABILITY"]] > THRESHOLD, 1, 0)

df_val_batch_results_s3["PREDICTED"]=val_pred

assert df_val_batch_results_s3.PREDICTED.isnull().sum()==0

In [336]:
def plot_confusion_matrix(mtx):
    sn.set(font_scale=1.4) 
    sn.heatmap(pd.DataFrame(mtx), annot=True, annot_kws={"size": 12}, fmt='g') 
    plt.show() 


def plot_auc(train_labels, preds_train_xgb, val_labels, preds_val_xgb, test_labels, preds_test_xgb):
    print("Training AUC", roc_auc_score(train_labels, preds_train_xgb)) 
    print("Test AUC", roc_auc_score(test_labels, preds_test_xgb) )
    print("Validation AUC", roc_auc_score(val_labels, preds_val_xgb) )
    
    fpr, tpr, thresholds = roc_curve(val_labels, preds_val_xgb)
    roc_auc = auc(fpr, tpr) 

    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % (roc_auc))
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    ax2 = plt.gca().twinx()
    ax2.set_ylim([thresholds[-1],thresholds[0]])
    ax2.set_xlim([fpr[0],fpr[-1]])
    print(plt.figure())

## 7.1 AUC

In [337]:
# train_auc=roc_auc_score(df_train_predictions["ACTUAL"], df_train_predictions["PROBABILITY"])
val_batch_auc=roc_auc_score(df_val_batch_results_s3["ACTUAL"],df_val_batch_results_s3["PROBABILITY"]) 
test_batch_auc=roc_auc_score(df_test_batch_results_s3["ACTUAL"],df_test_batch_results_s3["PROBABILITY"])

In [338]:
val_batch_auc

0.9106199423653188

In [339]:
test_batch_auc

0.9111673832627701

## 7.2 Classification Report

In [340]:
# train_classification_report=classification_report(df_train_predictions["ACTUAL"]
#                                           , df_train_predictions["PREDICTED"])

val_classification_report=classification_report(df_val_batch_results_s3["ACTUAL"]
                                          , df_val_batch_results_s3["PREDICTED"])

test_classification_report=classification_report(df_test_batch_results_s3["ACTUAL"]
                                          , df_test_batch_results_s3["PREDICTED"])

In [341]:
print(val_classification_report)

              precision    recall  f1-score   support

           0       0.89      0.73      0.80    266469
           1       0.82      0.93      0.87    349537

    accuracy                           0.84    616006
   macro avg       0.85      0.83      0.84    616006
weighted avg       0.85      0.84      0.84    616006



In [342]:
print(test_classification_report)

              precision    recall  f1-score   support

           0       0.89      0.73      0.80    296077
           1       0.82      0.93      0.87    388374

    accuracy                           0.85    684451
   macro avg       0.85      0.83      0.84    684451
weighted avg       0.85      0.85      0.84    684451



## 7.3 Accuracy

In [343]:
# train_accuracy_score=accuracy_score(df_train_predictions["ACTUAL"]
#                                           , df_train_predictions["PREDICTED"])

val_accuracy_score=accuracy_score(df_val_batch_results_s3["ACTUAL"]
                                          , df_val_batch_results_s3["PREDICTED"])

test_accuracy_score=accuracy_score(df_test_batch_results_s3["ACTUAL"]
                                          , df_test_batch_results_s3["PREDICTED"])

In [344]:
val_accuracy_score

0.8444057363077633

In [345]:
test_accuracy_score

0.8451035939753174

## 7.4 Confusion Matrix

In [346]:
# train_confusion_matrix=confusion_matrix(df_train_predictions["ACTUAL"]
#                                           , df_train_predictions["PREDICTED"])

# val_confusion_matrix=confusion_matrix(df_val_predictions["ACTUAL"]
#                                           , df_val_predictions["PREDICTED"])

test_confusion_matrix=confusion_matrix(df_test_batch_results_s3["ACTUAL"]
                                          , df_test_batch_results_s3["PREDICTED"])

In [347]:
test_confusion_matrix

array([[217117,  78960],
       [ 27059, 361315]])

# 8.  Segment Summary

In [362]:
df_test_batch_results_s3=df_test_batch_results_s3[df_test_batch_results_s3.FT_SEGMENT!="missing"]

df_segment=df_test_batch_results_s3.groupby(SEGMENT_COL).agg({SEGMENT_COL:["count"]}).reset_index().rename(columns={"count":f"COUNT_{SEGMENT_COL}"})
df_segment.columns=df_segment.columns.values[1]

df_segment.sort_values(SEGMENT_COL, inplace=True)

#df_test_batch_results_s3[SEGMENT_COL]=df_test_batch_results_s3[SEGMENT_COL].apply(lambda x : x.split(":")[1])

df_test_batch_results_s3_step_1=df_test_batch_results_s3.merge(df_segment, on=SEGMENT_COL, how="left")

df_test_batch_results_s3_model_1=df_test_batch_results_s3_step_1[["ACTUAL", SEGMENT_COL, f"COUNT_{SEGMENT_COL}", "PREDICTED", "PROBABILITY"]]

col_names=["ACTUAL", SEGMENT_COL, f"COUNT_{SEGMENT_COL}", "PREDICTED", "PROBABILITY"]
df_test_batch_results_s3_model_1.columns=col_names

def report_aggregate_accuracy(data):
    return data.groupby(SEGMENT_COL).apply(lambda group: accuracy_score(group.ACTUAL, group.PREDICTED)).reset_index().rename(columns={0:"ACCURACY"})

df_acc_result_model_1=report_aggregate_accuracy(df_test_batch_results_s3_model_1)

df_acc_result_model_1.columns=[SEGMENT_COL,"BASE_MODEL_ACC"]

df_test_acc_final=df_segment.merge(df_acc_result_model_1, on=SEGMENT_COL, how="left")


# test_accuracy_score_model_1=accuracy_score(df_test_batch_results_s3["ACTUAL"]
#                                           , df_test_batch_results_s3["PREDICTED"])

df_test_acc_final.loc[len(df_test_acc_final)] = ["13: Overall"
                                                  , df_test_batch_results_s3.shape[0]
                                                  , test_accuracy_score
                                                ]
df_test_acc_final

Unnamed: 0,FT_SEGMENT,COUNT_FT_SEGMENT,BASE_MODEL_ACC
0,01: Friends & BBT Fan,42499,0.89595
1,02: Talk Show & News Fan,9790,0.892646
2,03: Series Viewer,65403,0.877529
3,04: Hooked & Ongoing,16172,0.88072
4,05: Hooked On Library,79133,0.87516
5,06: Series Dabbler,38691,0.879145
6,07: All Caught Up,33872,0.880019
7,08: Series Abandoner,35706,0.838122
8,09: Mobile First,66415,0.805074
9,10: Movie Exclusive,97978,0.839811


In [349]:
import time
TIME_STR = time.strftime("%Y%m%d-%H%M%S")

In [350]:
RESULT_FILE_NAME=f"unbalanced_data_result_{TIME_STR}.csv"
df_test_acc_final.to_csv(RESULT_FILE_NAME, index=False)

In [351]:
df_test_batch_results_s3.FT_SEGMENT.value_counts(dropna=False)

12: Never Stream            173851
10: Movie Exclusive          97978
05: Hooked On Library        79133
09: Mobile First             66415
03: Series Viewer            65403
01: Friends & BBT Fan        42499
06: Series Dabbler           38691
08: Series Abandoner         35706
07: All Caught Up            33872
11: Special Interest         24939
04: Hooked & Ongoing         16172
02: Talk Show & News Fan      9790
Name: FT_SEGMENT, dtype: int64

In [363]:
def roc_auc_score_with_value_error_handler(y_true, y_pred):
    try:
        return roc_auc_score(y_true, y_pred)
    except:
        return np.nan

In [364]:
df_test_batch_results_s3_model_1.head()

Unnamed: 0,ACTUAL,FT_SEGMENT,COUNT_FT_SEGMENT,PREDICTED,PROBABILITY
0,1,04: Hooked & Ongoing,16172,1,0.875091
1,1,01: Friends & BBT Fan,42499,1,0.768698
2,1,12: Never Stream,173851,1,0.77561
3,1,05: Hooked On Library,79133,1,0.931324
4,1,09: Mobile First,66415,1,0.521629


In [365]:
def report_aggregate_roc_auc(data):
    return data.groupby(SEGMENT_COL).apply(lambda group: roc_auc_score_with_value_error_handler(group.ACTUAL, group.PROBABILITY)).reset_index().rename(columns={0:"AUC"})

df_auc_result_model_1=report_aggregate_roc_auc(df_test_batch_results_s3_model_1)

df_auc_result_model_1.columns=[SEGMENT_COL,"BASE_MODEL_AUC"]

df_test_auc_final=df_segment.merge(df_auc_result_model_1, on=SEGMENT_COL, how="left")
df_test_auc_final

Unnamed: 0,FT_SEGMENT,COUNT_FT_SEGMENT,BASE_MODEL_AUC
0,01: Friends & BBT Fan,42499,0.911798
1,02: Talk Show & News Fan,9790,0.937311
2,03: Series Viewer,65403,0.902559
3,04: Hooked & Ongoing,16172,0.921688
4,05: Hooked On Library,79133,0.901649
5,06: Series Dabbler,38691,0.918221
6,07: All Caught Up,33872,0.93206
7,08: Series Abandoner,35706,0.91056
8,09: Mobile First,66415,0.886274
9,10: Movie Exclusive,97978,0.908077


# 9. Accuracy on balanced test dataset

In [355]:
def create_balanced_test_data_set(data, random_state=RANDOM_STATE):
    df_majority=data[data.ACTUAL==1]
    df_minority=data[data.ACTUAL==0]
    num_minority_class=df_minority.shape[0]
    df_majority_frac=df_majority.sample(num_minority_class, random_state=random_state)
    df_balanced_test=df_minority.append(df_majority_frac)
    return df_balanced_test
    
df_balanced_test_s3=create_balanced_test_data_set(df_test_batch_results_s3)

In [356]:
df_balanced_test_s3.head()

Unnamed: 0,ACTUAL,PROBABILITY,FT_SEGMENT,PREDICTED
5,0,0.004202,09: Mobile First,0
7,0,0.759041,05: Hooked On Library,1
10,0,0.013879,09: Mobile First,0
14,0,0.263935,12: Never Stream,0
17,0,0.128843,01: Friends & BBT Fan,0


In [357]:
df_test_batch_results_s3.ACTUAL.value_counts()

1    388373
0    296076
Name: ACTUAL, dtype: int64

In [358]:
df_balanced_test_s3.ACTUAL.value_counts()

1    296076
0    296076
Name: ACTUAL, dtype: int64

In [359]:
df_test_batch_results_s3.FT_SEGMENT.value_counts()/df_test_batch_results_s3.shape[0]

12: Never Stream            0.254001
10: Movie Exclusive         0.143149
05: Hooked On Library       0.115616
09: Mobile First            0.097034
03: Series Viewer           0.095556
01: Friends & BBT Fan       0.062092
06: Series Dabbler          0.056529
08: Series Abandoner        0.052168
07: All Caught Up           0.049488
11: Special Interest        0.036437
04: Hooked & Ongoing        0.023628
02: Talk Show & News Fan    0.014303
Name: FT_SEGMENT, dtype: float64

In [360]:
df_balanced_test_s3.FT_SEGMENT.value_counts()/df_test_batch_results_s3.shape[0]

12: Never Stream            0.226346
10: Movie Exclusive         0.125130
05: Hooked On Library       0.096412
09: Mobile First            0.085818
03: Series Viewer           0.078859
01: Friends & BBT Fan       0.051466
06: Series Dabbler          0.047841
08: Series Abandoner        0.045678
07: All Caught Up           0.042804
11: Special Interest        0.033005
04: Hooked & Ongoing        0.019931
02: Talk Show & News Fan    0.011862
Name: FT_SEGMENT, dtype: float64

In [214]:
df_segment=df_balanced_test_s3.groupby(SEGMENT_COL).agg({SEGMENT_COL:["count"]}).reset_index().rename(columns={"count":f"COUNT_{SEGMENT_COL}"})
df_segment.columns=df_segment.columns.values[1]

df_segment.sort_values(SEGMENT_COL, inplace=True)

# df_balanced_test_s3[SEGMENT_COL]=df_balanced_test_s3[SEGMENT_COL].apply(lambda x : x.split(":")[1])

df_balanced_test_s3=df_balanced_test_s3.merge(df_segment, on=SEGMENT_COL, how="left")

df_balanced_test_s3_model_1=df_balanced_test_s3[["ACTUAL", SEGMENT_COL, f"COUNT_{SEGMENT_COL}", "PREDICTED"]]

col_names=["ACTUAL", SEGMENT_COL, f"COUNT_{SEGMENT_COL}", "PREDICTED"]
df_balanced_test_s3_model_1.columns=col_names

def report_aggregate_accuracy(data):
    return data.groupby(SEGMENT_COL).apply(lambda group: accuracy_score(group.ACTUAL, group.PREDICTED)).reset_index().rename(columns={0:"ACCURACY"})

df_acc_result_model_1=report_aggregate_accuracy(df_balanced_test_s3_model_1)

df_acc_result_model_1.columns=[SEGMENT_COL,"BASE_MODEL_ACC"]

df_balanced_test_acc_final=df_segment.merge(df_acc_result_model_1, on=SEGMENT_COL, how="left")

balanced_val_accuracy_score_model_1=accuracy_score(df_balanced_test_s3["ACTUAL"]
                                          , df_balanced_test_s3["PREDICTED"])

df_balanced_test_acc_final.loc[len(df_balanced_test_acc_final)] = ["13: Overall"
                                                  , df_balanced_test_s3.shape[0]
                                                  , balanced_val_accuracy_score_model_1
                                                ]
df_balanced_test_acc_final

Unnamed: 0,FT_SEGMENT,COUNT_FT_SEGMENT,BASE_MODEL_ACC
0,01: Friends & BBT Fan,35203,0.879726
1,02: Talk Show & News Fan,8112,0.876849
2,03: Series Viewer,53960,0.857802
3,04: Hooked & Ongoing,13599,0.862931
4,05: Hooked On Library,66011,0.856297
5,06: Series Dabbler,32748,0.863198
6,07: All Caught Up,29260,0.867157
7,08: Series Abandoner,31313,0.826462
8,09: Mobile First,58759,0.790347
9,10: Movie Exclusive,85663,0.827183


In [293]:
BALANCED_RESULT_FILE_NAME=f"balanced_data_result_{TIME_STR}.csv"
df_balanced_test_acc_final.to_csv(BALANCED_RESULT_FILE_NAME, index=False)

In [212]:
test_accuracy_score

0.8316755438619008