This notebook contains the data modeling and evaluation code for the 7 Day Free Trial propensity model. Compared with the test and validation sets, each training set carries one additional feature, __WEIGHT__, which represents the weight of a particular training instance. The objective of this notebook is to train different models with variations in the weight of training instances for both active and dormant users. The only difference between the models is the input data. The whole process has been divided into 2 notebooks:

- Part 1: Data Preprocessing: 7.0_sk_csv_weights_fine_tuning_FT_propensity_data_preprocessing.ipynb.
- Part 2: Data Modeling and Evaluation: 7.0_sk_csv_weights_fine_tuning_FT_propensity_data_modeling.ipynb (this notebook).

In [8]:
!free -m

             total       used       free     shared    buffers     cached
Mem:         70342      66981       3361          0       1019      13057
-/+ buffers/cache:      52904      17438
Swap:            0          0          0


In [9]:
import gc
gc.collect()

0

In [10]:
import time
# Timestamp captured once per kernel session; it is embedded in RESULT_FILE_NAME further down
# so each run writes its segment-accuracy CSV to a distinct file.
TIME_STR = time.strftime("%Y%m%d-%H%M%S")

In [11]:
import warnings
warnings.filterwarnings('ignore')

In [12]:
import os
import json
import numpy as np
import pandas as pd

In [13]:
from sklearn.metrics import accuracy_score, auc, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [14]:
import matplotlib.pyplot as plt
import seaborn as sn

In [15]:
import boto3
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.predictor import csv_serializer
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
import sagemaker.xgboost as xgboost

In [16]:
# Seed forwarded to the XGBoost `seed` hyperparameter inside fit_model.
RANDOM_STATE=101
# NOTE(review): SMALL_DATASET is not referenced by any visible cell of this notebook —
# confirm whether it is still needed.
SMALL_DATASET=False

In [17]:
TARGET_COL="FLG_TARGET"
SEGMENT_COL="FT_SEGMENT"

In [18]:
# S3 layout used by every cell below. All prefixes are relative to BUCKET.
BUCKET = "datascience-hbo-users"
PREFIX = "users/sk/FT_propensity/7_day"
# Model input CSVs (test/val sets, side-info files, and the csv_weights training sets below it).
DATA_PREFIX=PREFIX+"/model_input_data"
# Training job output; fit_model appends the job name to this prefix.
MODEL_PREFIX=PREFIX+"/model_artifacts"
# Batch-transform output; batch_transform appends "transform/<dataset_type>" to this prefix.
INFERENCE_PREFIX=PREFIX+"/inference"

In [19]:
TRAIN_DATA_PREFIX=DATA_PREFIX+"/csv_weights"

# 1. Data Ingestion

In [20]:
train_01_file_name="train_01.csv"
train_10_file_name="train_10.csv"
train_99_file_name="train_99.csv"

In [21]:
train_25_file_name="train_25.csv"
train_50_file_name="train_50.csv"
train_75_file_name="train_75.csv"

test_file_name="test.csv"
val_file_name="val.csv"

In [22]:
# Full S3 URIs of the weighted training sets (01 / 10 / 99 variants) under TRAIN_DATA_PREFIX.
# The numeric suffix identifies the weight variant produced by the preprocessing notebook.
train_file_name_s3_dormant_weight_01_perc='s3://{}/{}/{}'.format(BUCKET, TRAIN_DATA_PREFIX, train_01_file_name)
train_file_name_s3_dormant_weight_10_perc='s3://{}/{}/{}'.format(BUCKET, TRAIN_DATA_PREFIX, train_10_file_name)
train_file_name_s3_dormant_weight_99_perc='s3://{}/{}/{}'.format(BUCKET, TRAIN_DATA_PREFIX, train_99_file_name)

In [23]:
# Full S3 URIs of the weighted training sets (25 / 50 / 75 variants) under TRAIN_DATA_PREFIX.
train_file_name_s3_dormant_weight_25_perc='s3://{}/{}/{}'.format(BUCKET, TRAIN_DATA_PREFIX, train_25_file_name)
train_file_name_s3_dormant_weight_50_perc='s3://{}/{}/{}'.format(BUCKET, TRAIN_DATA_PREFIX, train_50_file_name)
train_file_name_s3_dormant_weight_75_perc='s3://{}/{}/{}'.format(BUCKET, TRAIN_DATA_PREFIX, train_75_file_name)

In [24]:
# Wrap each training URI as a CSV input channel for the estimator's `train` channel.
# The SDK warns (see this cell's output) that `s3_input` is renamed to `TrainingInput`
# in SageMaker Python SDK v2, so these calls need updating when the SDK is upgraded.
s3_input_train_01 = sagemaker.s3_input(s3_data=train_file_name_s3_dormant_weight_01_perc, content_type='csv')
s3_input_train_10 = sagemaker.s3_input(s3_data=train_file_name_s3_dormant_weight_10_perc, content_type='csv')
s3_input_train_99 = sagemaker.s3_input(s3_data=train_file_name_s3_dormant_weight_99_perc, content_type='csv')

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [25]:
s3_input_train_25 = sagemaker.s3_input(s3_data=train_file_name_s3_dormant_weight_25_perc, content_type='csv')
s3_input_train_50 = sagemaker.s3_input(s3_data=train_file_name_s3_dormant_weight_50_perc, content_type='csv')
s3_input_train_75 = sagemaker.s3_input(s3_data=train_file_name_s3_dormant_weight_75_perc, content_type='csv')

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [27]:
# Test and validation sets live directly under DATA_PREFIX (not TRAIN_DATA_PREFIX).
# Per the notebook introduction, they do not carry the extra WEIGHT feature.
s3_input_test = sagemaker.s3_input(s3_data='s3://{}/{}/{}'.format(BUCKET, DATA_PREFIX, test_file_name), content_type='csv')
s3_input_val = sagemaker.s3_input(s3_data='s3://{}/{}/{}'.format(BUCKET, DATA_PREFIX, val_file_name), content_type='csv')

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [28]:
TEST_DATA_URL=s3_input_test.config["DataSource"]["S3DataSource"]["S3Uri"]
VAL_DATA_URL=s3_input_val.config["DataSource"]["S3DataSource"]["S3Uri"]

In [29]:
# NOTE(review): the variable is named *test* side info but the file read is "val_side_info.csv".
# FT_SEGMENT from this file is later attached row-by-row to the *test* predictions, so if a
# separate test side-info file exists this is likely the wrong file — confirm against the
# preprocessing notebook.
s3_input_test_side_info = sagemaker.s3_input(s3_data='s3://{}/{}/{}'.format(BUCKET, DATA_PREFIX, "val_side_info.csv"), content_type='csv')

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [30]:
# Pull the plain S3 URI back out of the input-channel config so pandas can read it directly.
TEST_DATA_SIDE_INFO_URL=s3_input_test_side_info.config["DataSource"]["S3DataSource"]["S3Uri"]
# The side-info CSV is read with explicit column names (names=...), i.e. it is treated as headerless.
side_info_cols=["UNIQUE_ID", "HBO_UUID", "PERIOD_RANK", "FT_SEGMENT"]
df_test_side_info_s3=pd.read_csv(TEST_DATA_SIDE_INFO_URL, names=side_info_cols)

In [31]:
df_test_side_info_s3.shape

(616006, 4)

# 5. Model Training

In [61]:
# Session-level SageMaker handles shared by fit_model (container, role, sess).
region = boto3.Session().region_name
# NOTE(review): smclient is not used by any visible cell — confirm whether it is still needed.
smclient = boto3.Session().client('sagemaker')
role = sagemaker.get_execution_role()
sess = sagemaker.Session()
# NOTE(review): repo_version='latest' is not a pinned image version; the SDK message printed
# below this cell suggests get_image_uri(region, 'xgboost', '1.0-1'). Switching images may
# change training behaviour, so confirm before pinning.
container = get_image_uri(region, 'xgboost', repo_version='latest')

	get_image_uri(region, 'xgboost', '1.0-1').


## 5.1 Model 1

In [200]:
def fit_model(job_name, s3_train_path, s3_val_path, wait=False):
    """Launch a SageMaker XGBoost training job for one weighted training set.

    All models in this notebook share the hyperparameters below; only the
    training channel differs between calls.

    Parameters
    ----------
    job_name : str
        Label used ONLY to build the S3 output path
        (``s3://BUCKET/MODEL_PREFIX/<job_name>``). It is not passed to
        ``fit()``, so the actual training job name is generated by the SDK.
    s3_train_path
        Input channel (e.g. ``sagemaker.s3_input``) for the ``train`` channel.
    s3_val_path
        Input channel for the ``validation`` channel.
    wait : bool, default False
        Forwarded to ``Estimator.fit``. With the default ``False`` the call
        returns as soon as the job is submitted; the returned estimator must
        not be used to create a transformer until the job has completed.
        Pass ``True`` to block until training finishes.

    Returns
    -------
    sagemaker.estimator.Estimator
        The estimator on which ``fit`` was called.
    """
    output_path = 's3://{}/{}/{}'.format(BUCKET, MODEL_PREFIX, job_name)
    model = sagemaker.estimator.Estimator(container,
                                    role,
                                    train_instance_count=5,
                                    train_instance_type='ml.m4.4xlarge',
                                    output_path=output_path,
                                    sagemaker_session=sess)
    # csv_weights=1 is set because the training CSVs carry the per-instance WEIGHT
    # column described in the notebook introduction.
    # NOTE(review): per the XGBoost parameter docs, rate_drop belongs to the dart booster
    # and tweedie_variance_power to the reg:tweedie objective; with objective
    # 'binary:logistic' and no booster override they presumably have no effect — confirm
    # before removing.
    model.set_hyperparameters(
                            eval_metric='auc'
                            , alpha=1.218487609
                            , eta=0.225242353
                            , max_depth=10
                            , min_child_weight=2.284773815
                            , num_round=100
                            , objective='binary:logistic'
                            , rate_drop=0.3
                            , tweedie_variance_power=1.4
                            , csv_weights=1
                            , seed=RANDOM_STATE
                            )

    model.fit({'train': s3_train_path, 'validation': s3_val_path}, wait=wait)
    return model

In [203]:
model_01=fit_model(job_name="csv-weights-01",s3_train_path=s3_input_train_01, s3_val_path=s3_input_val)



In [202]:
model_10=fit_model(job_name="csv-weights-10",s3_train_path=s3_input_train_10, s3_val_path=s3_input_val)



In [204]:
model_99=fit_model(job_name="csv-weights-99",s3_train_path=s3_input_train_99, s3_val_path=s3_input_val)



In [76]:
model_25=fit_model(job_name="csv-weights-25",s3_train_path=s3_input_train_25, s3_val_path=s3_input_val)



2020-07-16 05:20:00 Starting - Starting the training job...
2020-07-16 05:20:02 Starting - Launching requested ML instances.........
2020-07-16 05:21:51 Starting - Preparing the instances for training.........
2020-07-16 05:23:17 Downloading - Downloading input data............
2020-07-16 05:25:16 Training - Downloading the training image.[34mArguments: train[0m
[34m[2020-07-16:05:25:32:INFO] Running distributed xgboost training.[0m
[34mArguments: train[0m
[34m[2020-07-16:05:25:35:INFO] Running distributed xgboost training.[0m
[35mArguments: train[0m
[35m[2020-07-16:05:25:37:INFO] Running distributed xgboost training.[0m
[32mArguments: train[0m
[32m[2020-07-16:05:25:38:INFO] Running distributed xgboost training.[0m
[36mArguments: train[0m
[36m[2020-07-16:05:25:38:INFO] Running distributed xgboost training.[0m
[35m[2020-07-16:05:25:41:INFO] Number of hosts: 5, master IP address: 10.2.113.137, host IP address: 10.2.86.40.[0m
[35m[2020-07-16:05:25:41:INFO] Finished 

In [85]:
model_50=fit_model(job_name="csv-weights-50", s3_train_path=s3_input_train_50, s3_val_path=s3_input_val)



2020-07-16 05:44:23 Starting - Starting the training job...
2020-07-16 05:44:25 Starting - Launching requested ML instances......
2020-07-16 05:45:44 Starting - Preparing the instances for training.........
2020-07-16 05:47:15 Downloading - Downloading input data...............
2020-07-16 05:49:47 Training - Training image download completed. Training in progress.[32mArguments: train[0m
[32m[2020-07-16:05:49:42:INFO] Running distributed xgboost training.[0m
[34mArguments: train[0m
[36mArguments: train[0m
[34m[2020-07-16:05:49:47:INFO] Running distributed xgboost training.[0m
[32m[2020-07-16:05:49:48:INFO] Number of hosts: 5, master IP address: 10.0.210.248, host IP address: 10.0.247.228.[0m
[32m[2020-07-16:05:49:48:INFO] Finished Yarn configuration files setup.
[0m
[32mstarting datanode, logging to /opt/amazon/hadoop/logs/hadoop--datanode-ip-10-0-247-228.ec2.internal.out[0m
[36m[2020-07-16:05:49:48:INFO] Running distributed xgboost training.[0m
[34m[2020-07-16:05:49:

In [89]:
model_75=fit_model(job_name="csv-weights-75", s3_train_path=s3_input_train_75, s3_val_path=s3_input_val)



2020-07-16 06:06:10 Starting - Starting the training job...
2020-07-16 06:06:13 Starting - Launching requested ML instances......
2020-07-16 06:07:28 Starting - Preparing the instances for training......
2020-07-16 06:08:37 Downloading - Downloading input data...............
2020-07-16 06:11:09 Training - Downloading the training image...
2020-07-16 06:11:29 Training - Training image download completed. Training in progress.[33mArguments: train[0m
[35mArguments: train[0m
[35m[2020-07-16:06:11:30:INFO] Running distributed xgboost training.[0m
[33m[2020-07-16:06:11:30:INFO] Running distributed xgboost training.[0m
[32mArguments: train[0m
[32m[2020-07-16:06:11:30:INFO] Running distributed xgboost training.[0m
[34mArguments: train[0m
[34m[2020-07-16:06:11:30:INFO] Running distributed xgboost training.[0m
[36mArguments: train[0m
[36m[2020-07-16:06:11:30:INFO] Running distributed xgboost training.[0m
[35m[2020-07-16:06:11:33:INFO] Number of hosts: 5, master IP address: 1

# 6. Model Prediction

##  Batch Prediction

In [77]:
def batch_transform(model, s3_data_path, dataset_type="train", bucket=BUCKET, prefix=INFERENCE_PREFIX):
    """Start a batch-transform job that scores a CSV file with a trained estimator.

    Output is written under ``s3://<bucket>/<prefix>/transform/<dataset_type>``.

    The transform is configured so that the first input column is withheld from
    the model (``input_filter='$[1:]'``), each prediction is joined back onto its
    input row (``join_source='Input'``), and only the first and last columns of
    the joined row are kept (``output_filter='$[0,-1]'``). Downstream cells read
    that two-column output as ACTUAL and PROBABILITY_xx.

    Parameters
    ----------
    model
        Fitted estimator exposing ``transformer(...)``.
    s3_data_path : str
        S3 URI of the CSV to score.
    dataset_type : str
        Sub-folder name for this job's output.
    bucket, prefix : str
        Destination bucket and key prefix.

    Returns
    -------
    Whatever ``Transformer.transform`` returns.
    """
    destination = 's3://{}/{}/transform/{}'.format(bucket, prefix, dataset_type)
    transformer_options = dict(
        instance_count=2,
        instance_type='ml.m4.xlarge',
        strategy='MultiRecord',
        assemble_with='Line',
        output_path=destination,
        accept="text/csv",
    )
    transform_options = dict(
        data=s3_data_path,
        content_type='text/csv',
        split_type='Line',
        input_filter='$[1:]',
        join_source='Input',
        output_filter='$[0,-1]',
    )

    scoring_job = model.transformer(**transformer_options)
    return scoring_job.transform(**transform_options)

In [211]:
batch_transform(model=model_01, s3_data_path=VAL_DATA_URL, dataset_type="val_01")

batch_transform(model=model_01, s3_data_path=TEST_DATA_URL, dataset_type="test_01")



In [210]:
batch_transform(model=model_10, s3_data_path=VAL_DATA_URL, dataset_type="val_10")

batch_transform(model=model_10, s3_data_path=TEST_DATA_URL, dataset_type="test_10")



In [84]:
batch_transform(model=model_25, s3_data_path=VAL_DATA_URL, dataset_type="val_25")

batch_transform(model=model_25, s3_data_path=TEST_DATA_URL, dataset_type="test_25")



In [88]:
batch_transform(model=model_50, s3_data_path=VAL_DATA_URL, dataset_type="val_50")

batch_transform(model=model_50, s3_data_path=TEST_DATA_URL, dataset_type="test_50")



In [90]:
batch_transform(model=model_75, s3_data_path=VAL_DATA_URL, dataset_type="val_75")

batch_transform(model=model_75, s3_data_path=TEST_DATA_URL, dataset_type="test_75")



In [212]:
batch_transform(model=model_99, s3_data_path=VAL_DATA_URL, dataset_type="val_99")

batch_transform(model=model_99, s3_data_path=TEST_DATA_URL, dataset_type="test_99")



In [48]:
val_01_batch_output_file_name='s3://{}/{}/transform/val_01/{}.out'.format(BUCKET, INFERENCE_PREFIX, val_file_name)
test_01_batch_output_file_name='s3://{}/{}/transform/test_01/{}.out'.format(BUCKET, INFERENCE_PREFIX, test_file_name)

In [49]:
val_10_batch_output_file_name='s3://{}/{}/transform/val_10/{}.out'.format(BUCKET, INFERENCE_PREFIX, val_file_name)
test_10_batch_output_file_name='s3://{}/{}/transform/test_10/{}.out'.format(BUCKET, INFERENCE_PREFIX, test_file_name)

In [50]:
val_25_batch_output_file_name='s3://{}/{}/transform/val_25/{}.out'.format(BUCKET, INFERENCE_PREFIX, val_file_name)
test_25_batch_output_file_name='s3://{}/{}/transform/test_25/{}.out'.format(BUCKET, INFERENCE_PREFIX, test_file_name)

In [51]:
val_50_batch_output_file_name='s3://{}/{}/transform/val_50/{}.out'.format(BUCKET, INFERENCE_PREFIX, val_file_name)
test_50_batch_output_file_name='s3://{}/{}/transform/test_50/{}.out'.format(BUCKET, INFERENCE_PREFIX, test_file_name)

In [52]:
val_75_batch_output_file_name='s3://{}/{}/transform/val_75/{}.out'.format(BUCKET, INFERENCE_PREFIX, val_file_name)
test_75_batch_output_file_name='s3://{}/{}/transform/test_75/{}.out'.format(BUCKET, INFERENCE_PREFIX, test_file_name)

In [53]:
val_99_batch_output_file_name='s3://{}/{}/transform/val_99/{}.out'.format(BUCKET, INFERENCE_PREFIX, val_file_name)
test_99_batch_output_file_name='s3://{}/{}/transform/test_99/{}.out'.format(BUCKET, INFERENCE_PREFIX, test_file_name)

In [54]:
# The batch-transform output is read as a headerless two-column CSV: the first column is
# labelled ACTUAL and the second PROBABILITY_01 (see output_filter in batch_transform).
df_val_01_batch_results_s3=pd.read_csv(val_01_batch_output_file_name, header=None, names=["ACTUAL", "PROBABILITY_01"])

df_val_01_batch_results_s3.head()

Unnamed: 0,ACTUAL,PROBABILITY_01
0,0,0.080114
1,0,0.044429
2,1,0.778404
3,1,0.886454
4,1,0.817366


In [56]:
df_test_01_batch_results_s3=pd.read_csv(test_01_batch_output_file_name, header=None, names=["ACTUAL", "PROBABILITY_01"])

df_test_01_batch_results_s3.head()

Unnamed: 0,ACTUAL,PROBABILITY_01
0,1,0.906986
1,1,0.746571
2,1,0.764113
3,1,0.926163
4,1,0.638732


In [57]:
df_test_10_batch_results_s3=pd.read_csv(test_10_batch_output_file_name, header=None, names=["ACTUAL", "PROBABILITY_10"])

df_test_10_batch_results_s3.head()

Unnamed: 0,ACTUAL,PROBABILITY_10
0,1,0.890552
1,1,0.817726
2,1,0.703
3,1,0.921882
4,1,0.590999


In [68]:
df_test_25_batch_results_s3=pd.read_csv(test_25_batch_output_file_name, header=None, names=["ACTUAL", "PROBABILITY_25"])

df_test_25_batch_results_s3.head()

Unnamed: 0,ACTUAL,PROBABILITY_25
0,0,0.101064
1,0,0.113517
2,1,0.803135
3,1,0.918991
4,1,0.776272


In [70]:
df_test_50_batch_results_s3=pd.read_csv(test_50_batch_output_file_name, header=None, names=["ACTUAL", "PROBABILITY_50"])

df_test_50_batch_results_s3.head()

Unnamed: 0,ACTUAL,PROBABILITY_50
0,0,0.089866
1,0,0.042694
2,1,0.728799
3,1,0.903947
4,1,0.799724


In [71]:
df_test_50_batch_results_s3.shape

(616006, 2)

In [72]:
df_test_75_batch_results_s3=pd.read_csv(test_75_batch_output_file_name, header=None, names=["ACTUAL", "PROBABILITY_75"])

df_test_75_batch_results_s3.head()

Unnamed: 0,ACTUAL,PROBABILITY_75
0,0,0.092495
1,0,0.038197
2,1,0.786436
3,1,0.89625
4,1,0.776982


In [75]:
df_test_99_batch_results_s3=pd.read_csv(test_99_batch_output_file_name, header=None, names=["ACTUAL", "PROBABILITY_99"])

df_test_99_batch_results_s3.head()

Unnamed: 0,ACTUAL,PROBABILITY_99
0,1,0.829689
1,1,0.871813
2,1,0.817325
3,1,0.893623
4,1,0.640096


In [240]:
df_test_batch_results_s3=df_test_25_batch_results_s3.copy()

In [241]:
# Attach the segment label by index alignment (both frames carry the default RangeIndex).
# NOTE(review): this is only correct if the side-info file has the same rows in the same order
# as the scored test file; the side info above is read from "val_side_info.csv" — confirm.
df_test_batch_results_s3[["FT_SEGMENT"]]=df_test_side_info_s3[["FT_SEGMENT"]]

In [242]:
df_test_batch_results_s3.head()

Unnamed: 0,ACTUAL,PROBABILITY_25,FT_SEGMENT
0,1,0.892717,04: Hooked & Ongoing
1,1,0.862303,01: Friends & BBT Fan
2,1,0.728982,12: Never Stream
3,1,0.925202,05: Hooked On Library
4,1,0.617099,09: Mobile First


In [243]:
# Collect every model's test probabilities side by side in one frame.
# The base frame is a copy of the model-25 results, so PROBABILITY_25 is already present.
# Each assignment aligns on the row index, so it presumes all result files list the same
# rows in the same order — TODO confirm.
df_test_batch_results_s3[["PROBABILITY_01"]]=df_test_01_batch_results_s3[["PROBABILITY_01"]]
df_test_batch_results_s3[["PROBABILITY_10"]]=df_test_10_batch_results_s3[["PROBABILITY_10"]]
df_test_batch_results_s3[["PROBABILITY_99"]]=df_test_99_batch_results_s3[["PROBABILITY_99"]]

df_test_batch_results_s3[["PROBABILITY_50"]]=df_test_50_batch_results_s3[["PROBABILITY_50"]]
df_test_batch_results_s3[["PROBABILITY_75"]]=df_test_75_batch_results_s3[["PROBABILITY_75"]]

# Null checks: a shorter source frame would leave NaN in the unmatched rows after alignment.
assert df_test_batch_results_s3.ACTUAL.isnull().sum()==0
assert df_test_batch_results_s3.PROBABILITY_01.isnull().sum()==0
assert df_test_batch_results_s3.PROBABILITY_10.isnull().sum()==0
assert df_test_batch_results_s3.PROBABILITY_99.isnull().sum()==0
assert df_test_batch_results_s3.PROBABILITY_25.isnull().sum()==0
assert df_test_batch_results_s3.PROBABILITY_50.isnull().sum()==0
assert df_test_batch_results_s3.PROBABILITY_75.isnull().sum()==0

# 7. Model Evaluation

## Find the optimal threshold using AUC curve

In [246]:
# Probability cut-off used to turn every model's score into a 0/1 prediction.
# NOTE(review): the section heading says the threshold is found from the curve, but no
# visible cell derives it; 0.5 is hard-coded and shared by all six models.
THRESHOLD=0.5


In [247]:
# Binarise model-01 probabilities: 1 where the score is strictly greater than THRESHOLD, else 0.
# The cells that follow repeat this step for the 10 / 99 / 25 / 50 / 75 models.
test_01_pred=np.where(df_test_batch_results_s3[["PROBABILITY_01"]] > THRESHOLD, 1, 0)

df_test_batch_results_s3["PREDICTED_01"]=test_01_pred

assert df_test_batch_results_s3.PREDICTED_01.isnull().sum()==0

In [248]:
test_10_pred=np.where(df_test_batch_results_s3[["PROBABILITY_10"]] > THRESHOLD, 1, 0)

df_test_batch_results_s3["PREDICTED_10"]=test_10_pred

assert df_test_batch_results_s3.PREDICTED_10.isnull().sum()==0

In [249]:
test_99_pred=np.where(df_test_batch_results_s3[["PROBABILITY_99"]] > THRESHOLD, 1, 0)

df_test_batch_results_s3["PREDICTED_99"]=test_99_pred

assert df_test_batch_results_s3.PREDICTED_99.isnull().sum()==0

In [250]:
test_25_pred=np.where(df_test_batch_results_s3[["PROBABILITY_25"]] > THRESHOLD, 1, 0)

df_test_batch_results_s3["PREDICTED_25"]=test_25_pred

assert df_test_batch_results_s3.PREDICTED_25.isnull().sum()==0

In [251]:
test_50_pred=np.where(df_test_batch_results_s3[["PROBABILITY_50"]] > THRESHOLD, 1, 0)

df_test_batch_results_s3["PREDICTED_50"]=test_50_pred

assert df_test_batch_results_s3.PREDICTED_50.isnull().sum()==0

In [252]:
test_75_pred=np.where(df_test_batch_results_s3[["PROBABILITY_75"]] > THRESHOLD, 1, 0)

df_test_batch_results_s3["PREDICTED_75"]=test_75_pred

assert df_test_batch_results_s3.PREDICTED_75.isnull().sum()==0

In [253]:
df_test_batch_results_s3.head()

Unnamed: 0,ACTUAL,PROBABILITY_25,FT_SEGMENT,PROBABILITY_01,PROBABILITY_10,PROBABILITY_99,PROBABILITY_50,PROBABILITY_75,PREDICTED_01,PREDICTED_10,PREDICTED_99,PREDICTED_25,PREDICTED_50,PREDICTED_75
0,1,0.892717,04: Hooked & Ongoing,0.906986,0.890552,0.829689,0.917956,0.895725,1,1,1,1,1,1
1,1,0.862303,01: Friends & BBT Fan,0.746571,0.817726,0.871813,0.806593,0.859296,1,1,1,1,1,1
2,1,0.728982,12: Never Stream,0.764113,0.703,0.817325,0.789017,0.808102,1,1,1,1,1,1
3,1,0.925202,05: Hooked On Library,0.926163,0.921882,0.893623,0.930162,0.936535,1,1,1,1,1,1
4,1,0.617099,09: Mobile First,0.638732,0.590999,0.640096,0.549491,0.52449,1,1,1,1,1,1


In [2]:
def plot_confusion_matrix(mtx):
    """Draw a confusion matrix as an annotated seaborn heatmap and show it.

    Parameters
    ----------
    mtx
        Matrix-like object accepted by ``pd.DataFrame`` (e.g. the array
        returned by ``confusion_matrix``).
    """
    sn.set(font_scale=1.4)
    matrix_frame = pd.DataFrame(mtx)
    annotation_style = {"size": 12}
    # fmt='g' applies general number formatting to the cell annotations.
    sn.heatmap(matrix_frame, annot=True, annot_kws=annotation_style, fmt='g')
    plt.show()

## 7.1 AUC

In [227]:
val_01_batch_auc=roc_auc_score(df_val_01_batch_results_s3["ACTUAL"],df_val_01_batch_results_s3["PROBABILITY_01"]) 
test_01_batch_auc=roc_auc_score(df_test_01_batch_results_s3["ACTUAL"],df_test_01_batch_results_s3["PROBABILITY_01"])

In [228]:
test_10_batch_auc=roc_auc_score(df_test_10_batch_results_s3["ACTUAL"],df_test_10_batch_results_s3["PROBABILITY_10"])

In [229]:
# No earlier visible cell loads df_val_25_batch_results_s3 (only the val_01 results are read),
# so this cell raised NameError on a fresh kernel. Load it here the same way the val_01
# results are loaded, from the val_25 batch-transform output path defined above.
df_val_25_batch_results_s3=pd.read_csv(val_25_batch_output_file_name, header=None, names=["ACTUAL", "PROBABILITY_25"])

# Overall ROC AUC of the model-25 scores on the validation and test sets.
val_25_batch_auc=roc_auc_score(df_val_25_batch_results_s3["ACTUAL"],df_val_25_batch_results_s3["PROBABILITY_25"])
test_25_batch_auc=roc_auc_score(df_test_25_batch_results_s3["ACTUAL"],df_test_25_batch_results_s3["PROBABILITY_25"])

In [132]:
test_50_batch_auc=roc_auc_score(df_test_50_batch_results_s3["ACTUAL"],df_test_50_batch_results_s3["PROBABILITY_50"])

In [133]:
test_75_batch_auc=roc_auc_score(df_test_75_batch_results_s3["ACTUAL"],df_test_75_batch_results_s3["PROBABILITY_75"])

In [236]:
test_99_batch_auc=roc_auc_score(df_test_99_batch_results_s3["ACTUAL"],df_test_99_batch_results_s3["PROBABILITY_99"])

In [231]:
test_01_batch_auc

0.908855001190498

In [232]:
test_10_batch_auc

0.9099662875329954

In [135]:
test_25_batch_auc

0.910506254744957

In [233]:
test_50_batch_auc

0.9098799900066254

In [234]:
test_75_batch_auc

0.9086022323527512

In [237]:
test_99_batch_auc

0.8978237646640321

## 7.2 Classification Report

In [257]:
test_01_classification_report=classification_report(df_test_batch_results_s3["ACTUAL"]
                                          , df_test_batch_results_s3["PREDICTED_01"])

print(test_01_classification_report)

              precision    recall  f1-score   support

           0       0.89      0.73      0.80    296077
           1       0.82      0.93      0.87    388374

    accuracy                           0.84    684451
   macro avg       0.85      0.83      0.84    684451
weighted avg       0.85      0.84      0.84    684451



In [258]:
test_10_classification_report=classification_report(df_test_batch_results_s3["ACTUAL"]
                                          , df_test_batch_results_s3["PREDICTED_10"])

print(test_10_classification_report)

              precision    recall  f1-score   support

           0       0.89      0.73      0.80    296077
           1       0.82      0.93      0.87    388374

    accuracy                           0.84    684451
   macro avg       0.85      0.83      0.84    684451
weighted avg       0.85      0.84      0.84    684451



In [152]:
test_25_classification_report=classification_report(df_test_batch_results_s3["ACTUAL"]
                                          , df_test_batch_results_s3["PREDICTED_25"])

print(test_25_classification_report)

              precision    recall  f1-score   support

           0       0.89      0.73      0.80    296077
           1       0.82      0.93      0.87    388374

    accuracy                           0.84    684451
   macro avg       0.85      0.83      0.84    684451
weighted avg       0.85      0.84      0.84    684451



In [153]:
test_50_classification_report=classification_report(df_test_batch_results_s3["ACTUAL"]
                                          , df_test_batch_results_s3["PREDICTED_50"])

print(test_50_classification_report)

              precision    recall  f1-score   support

           0       0.89      0.73      0.80    296077
           1       0.82      0.93      0.87    388374

    accuracy                           0.84    684451
   macro avg       0.85      0.83      0.84    684451
weighted avg       0.85      0.84      0.84    684451



In [154]:
test_75_classification_report=classification_report(df_test_batch_results_s3["ACTUAL"]
                                          , df_test_batch_results_s3["PREDICTED_75"])

print(test_75_classification_report)

              precision    recall  f1-score   support

           0       0.89      0.73      0.80    296077
           1       0.82      0.93      0.87    388374

    accuracy                           0.84    684451
   macro avg       0.85      0.83      0.84    684451
weighted avg       0.85      0.84      0.84    684451



In [259]:
test_99_classification_report=classification_report(df_test_batch_results_s3["ACTUAL"]
                                          , df_test_batch_results_s3["PREDICTED_99"])

print(test_99_classification_report)

              precision    recall  f1-score   support

           0       0.88      0.73      0.80    296077
           1       0.82      0.93      0.87    388374

    accuracy                           0.84    684451
   macro avg       0.85      0.83      0.83    684451
weighted avg       0.85      0.84      0.84    684451



## 7.3 Accuracy

In [260]:
test_01_accuracy_score=accuracy_score(df_test_batch_results_s3["ACTUAL"]
                                          , df_test_batch_results_s3["PREDICTED_01"])
test_10_accuracy_score=accuracy_score(df_test_batch_results_s3["ACTUAL"]
                                          , df_test_batch_results_s3["PREDICTED_10"])
test_99_accuracy_score=accuracy_score(df_test_batch_results_s3["ACTUAL"]
                                          , df_test_batch_results_s3["PREDICTED_99"])

In [155]:
test_25_accuracy_score=accuracy_score(df_test_batch_results_s3["ACTUAL"]
                                          , df_test_batch_results_s3["PREDICTED_25"])
test_50_accuracy_score=accuracy_score(df_test_batch_results_s3["ACTUAL"]
                                          , df_test_batch_results_s3["PREDICTED_50"])
test_75_accuracy_score=accuracy_score(df_test_batch_results_s3["ACTUAL"]
                                          , df_test_batch_results_s3["PREDICTED_75"])

In [261]:
test_01_accuracy_score

0.8431575087186665

In [262]:
test_10_accuracy_score

0.8441685379961458

In [156]:
test_25_accuracy_score

0.844310257417989

In [157]:
test_50_accuracy_score

0.8443540881670127

In [158]:
test_75_accuracy_score

0.8440487339488144

In [263]:
test_99_accuracy_score

0.840139031135903

## 7.4 Confusion Matrix

In [264]:
test_01_confusion_matrix=confusion_matrix(df_test_batch_results_s3["ACTUAL"]
                                          , df_test_batch_results_s3["PREDICTED_01"])

In [265]:
test_10_confusion_matrix=confusion_matrix(df_test_batch_results_s3["ACTUAL"]
                                          , df_test_batch_results_s3["PREDICTED_10"])

In [159]:
test_25_confusion_matrix=confusion_matrix(df_test_batch_results_s3["ACTUAL"]
                                          , df_test_batch_results_s3["PREDICTED_25"])

In [266]:
test_50_confusion_matrix=confusion_matrix(df_test_batch_results_s3["ACTUAL"]
                                          , df_test_batch_results_s3["PREDICTED_50"])

In [267]:
test_75_confusion_matrix=confusion_matrix(df_test_batch_results_s3["ACTUAL"]
                                          , df_test_batch_results_s3["PREDICTED_75"])

In [268]:
test_99_confusion_matrix=confusion_matrix(df_test_batch_results_s3["ACTUAL"]
                                          , df_test_batch_results_s3["PREDICTED_99"])

# 8.1 Segment Summary

## 8.1.1 Segment Accuracy

In [269]:
# Drop rows whose segment is the literal placeholder "missing" before any per-segment summary.
# NOTE: this rebinds df_test_batch_results_s3, so the overall metrics computed in earlier
# cells still include the "missing" rows.
df_test_batch_results_s3=df_test_batch_results_s3[df_test_batch_results_s3.FT_SEGMENT!="missing"]

# One row per segment with its row count, as columns [SEGMENT_COL, COUNT_<SEGMENT_COL>],
# ordered by segment label. size() + reset_index(name=...) yields the flat two-column frame
# directly, replacing the previous agg() call whose MultiIndex columns had to be flattened
# by unpacking a column tuple.
df_segment=(
    df_test_batch_results_s3
    .groupby(SEGMENT_COL)
    .size()
    .reset_index(name=f"COUNT_{SEGMENT_COL}")
    .sort_values(SEGMENT_COL)
)

In [271]:
#df_test_batch_results_s3[SEGMENT_COL]=df_test_batch_results_s3[SEGMENT_COL].apply(lambda x : x.split(":")[1])

df_test_batch_results_s3_step_1=df_test_batch_results_s3.merge(df_segment, on=SEGMENT_COL, how="left")

In [327]:
df_test_batch_results_s3_model_01=df_test_batch_results_s3_step_1[["ACTUAL", SEGMENT_COL, "PREDICTED_01", "PROBABILITY_01"]]
df_test_batch_results_s3_model_10=df_test_batch_results_s3_step_1[["ACTUAL", SEGMENT_COL, "PREDICTED_10", "PROBABILITY_10"]]
df_test_batch_results_s3_model_99=df_test_batch_results_s3_step_1[["ACTUAL", SEGMENT_COL, "PREDICTED_99", "PROBABILITY_99"]]

In [328]:
df_test_batch_results_s3_model_25=df_test_batch_results_s3_step_1[["ACTUAL", SEGMENT_COL, "PREDICTED_25", "PROBABILITY_25"]]
df_test_batch_results_s3_model_50=df_test_batch_results_s3_step_1[["ACTUAL", SEGMENT_COL, "PREDICTED_50", "PROBABILITY_50"]]
df_test_batch_results_s3_model_75=df_test_batch_results_s3_step_1[["ACTUAL", SEGMENT_COL, "PREDICTED_75", "PROBABILITY_75"]]

col_names=["ACTUAL", SEGMENT_COL, "PREDICTED", "PROBABILITY"]
df_test_batch_results_s3_model_01.columns=col_names
df_test_batch_results_s3_model_10.columns=col_names
df_test_batch_results_s3_model_99.columns=col_names

df_test_batch_results_s3_model_25.columns=col_names
df_test_batch_results_s3_model_50.columns=col_names
df_test_batch_results_s3_model_75.columns=col_names

def report_aggregate_accuracy(data):
    """Compute classification accuracy separately for each segment.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the SEGMENT_COL column plus ``ACTUAL`` and ``PREDICTED``.

    Returns
    -------
    pandas.DataFrame
        One row per segment with columns [SEGMENT_COL, "ACCURACY"].
    """
    def _segment_accuracy(group):
        return accuracy_score(group.ACTUAL, group.PREDICTED)

    per_segment = data.groupby(SEGMENT_COL).apply(_segment_accuracy)
    # reset_index() on the unnamed result labels the value column 0; give it a readable name.
    return per_segment.reset_index().rename(columns={0:"ACCURACY"})

df_acc_result_model_01=report_aggregate_accuracy(df_test_batch_results_s3_model_01)
df_acc_result_model_10=report_aggregate_accuracy(df_test_batch_results_s3_model_10)
df_acc_result_model_99=report_aggregate_accuracy(df_test_batch_results_s3_model_99)

df_acc_result_model_01.columns=[SEGMENT_COL,"MODEL_01_ACC"]
df_acc_result_model_10.columns=[SEGMENT_COL,"MODEL_10_ACC"]
df_acc_result_model_99.columns=[SEGMENT_COL,"MODEL_99_ACC"]

df_acc_result_model_25=report_aggregate_accuracy(df_test_batch_results_s3_model_25)
df_acc_result_model_50=report_aggregate_accuracy(df_test_batch_results_s3_model_50)
df_acc_result_model_75=report_aggregate_accuracy(df_test_batch_results_s3_model_75)

df_acc_result_model_25.columns=[SEGMENT_COL,"MODEL_25_ACC"]
df_acc_result_model_50.columns=[SEGMENT_COL,"MODEL_50_ACC"]
df_acc_result_model_75.columns=[SEGMENT_COL,"MODEL_75_ACC"]

df_test_acc_final=df_segment.merge(df_acc_result_model_01, on=SEGMENT_COL, how="left").merge(df_acc_result_model_10, on=SEGMENT_COL, how="left").merge(df_acc_result_model_25, on=SEGMENT_COL, how="left").merge(df_acc_result_model_50, on=SEGMENT_COL, how="left").merge(df_acc_result_model_75, on=SEGMENT_COL, how="left").merge(df_acc_result_model_99, on=SEGMENT_COL, how="left")


# Append a synthetic "13: Overall" row after the per-segment rows.
# NOTE(review): the row count here comes from the frame AFTER "missing" segments were dropped,
# whereas the test_XX_accuracy_score values were computed in section 7.3 BEFORE that filter,
# so the count and the accuracies in this row describe different row sets — confirm intent.
df_test_acc_final.loc[len(df_test_acc_final)] = ["13: Overall"
                                                  , df_test_batch_results_s3.shape[0]
                                                  , test_01_accuracy_score
                                                  , test_10_accuracy_score
                                                  , test_25_accuracy_score
                                                  , test_50_accuracy_score
                                                  , test_75_accuracy_score
                                                  , test_99_accuracy_score
                                                ]
df_test_acc_final

Unnamed: 0,FT_SEGMENT,COUNT_FT_SEGMENT,MODEL_01_ACC,MODEL_10_ACC,MODEL_25_ACC,MODEL_50_ACC,MODEL_75_ACC,MODEL_99_ACC
0,01: Friends & BBT Fan,42499,0.896868,0.896656,0.896374,0.895527,0.895198,0.892115
1,02: Talk Show & News Fan,9790,0.891624,0.893973,0.89142,0.88856,0.885495,0.877017
2,03: Series Viewer,65403,0.878293,0.878538,0.878584,0.876733,0.874562,0.86944
3,04: Hooked & Ongoing,16172,0.881771,0.881029,0.882142,0.880472,0.87769,0.874969
4,05: Hooked On Library,79133,0.875412,0.875867,0.876057,0.8754,0.873454,0.869789
5,06: Series Dabbler,38691,0.879223,0.879042,0.878189,0.877568,0.875863,0.870719
6,07: All Caught Up,33872,0.880255,0.881495,0.880196,0.879606,0.87748,0.873553
7,08: Series Abandoner,35706,0.841343,0.83829,0.837534,0.836694,0.831289,0.815185
8,09: Mobile First,66415,0.80798,0.806264,0.806489,0.802996,0.801596,0.781917
9,10: Movie Exclusive,97978,0.841515,0.841771,0.840066,0.839076,0.83632,0.825747


In [275]:
RESULT_FILE_NAME=f"csv_weights_unbalanced_data_result_{TIME_STR}.csv"


In [None]:
df_test_acc_final.to_csv(RESULT_FILE_NAME, index=False)

## 8.1.2 Segment AUC

In [326]:
def roc_auc_score_with_value_error_handler(y_true, y_pred):
    """Return the ROC AUC, or NaN when the score cannot be computed.

    ``roc_auc_score`` raises ``ValueError`` when the score is undefined for the
    given labels (for example a segment whose ``y_true`` contains a single
    class). Only that exception is translated to NaN; any other error
    propagates so genuine problems are not silently hidden.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_pred : array-like
        Predicted scores / probabilities.

    Returns
    -------
    float
        The ROC AUC, or ``np.nan`` if ``roc_auc_score`` raised ``ValueError``.
    """
    try:
        return roc_auc_score(y_true, y_pred)
    except ValueError:
        return np.nan

def report_aggregate_roc_auc(data):
    """Compute one ROC AUC value per SEGMENT_COL group.

    ``data`` must expose ``ACTUAL`` and ``PROBABILITY`` columns in addition to
    SEGMENT_COL. Returns a frame with SEGMENT_COL and an "AUC" column; groups for
    which the AUC cannot be computed get NaN (see
    ``roc_auc_score_with_value_error_handler``).
    """
    def _segment_auc(group):
        return roc_auc_score_with_value_error_handler(group.ACTUAL, group.PROBABILITY)

    auc_by_segment = data.groupby(SEGMENT_COL).apply(_segment_auc)
    # apply() yields an unnamed Series, so reset_index() labels the value column 0.
    return auc_by_segment.reset_index().rename(columns={0:"AUC"})

In [329]:
# Per-segment ROC AUC for every model variant; each frame's columns are relabelled
# so the six results can sit side by side after the joins below.
df_auc_result_model_01 = report_aggregate_roc_auc(df_test_batch_results_s3_model_01)
df_auc_result_model_01.columns = [SEGMENT_COL, "MODEL_01_AUC"]

df_auc_result_model_10 = report_aggregate_roc_auc(df_test_batch_results_s3_model_10)
df_auc_result_model_10.columns = [SEGMENT_COL, "MODEL_10_AUC"]

df_auc_result_model_25 = report_aggregate_roc_auc(df_test_batch_results_s3_model_25)
df_auc_result_model_25.columns = [SEGMENT_COL, "MODEL_25_AUC"]

df_auc_result_model_50 = report_aggregate_roc_auc(df_test_batch_results_s3_model_50)
df_auc_result_model_50.columns = [SEGMENT_COL, "MODEL_50_AUC"]

df_auc_result_model_75 = report_aggregate_roc_auc(df_test_batch_results_s3_model_75)
df_auc_result_model_75.columns = [SEGMENT_COL, "MODEL_75_AUC"]

df_auc_result_model_99 = report_aggregate_roc_auc(df_test_batch_results_s3_model_99)
df_auc_result_model_99.columns = [SEGMENT_COL, "MODEL_99_AUC"]

# Left-join each model's AUC column onto df_segment.
df_test_auc_final = (
    df_segment
    .merge(df_auc_result_model_01, on=SEGMENT_COL, how="left")
    .merge(df_auc_result_model_10, on=SEGMENT_COL, how="left")
    .merge(df_auc_result_model_25, on=SEGMENT_COL, how="left")
    .merge(df_auc_result_model_50, on=SEGMENT_COL, how="left")
    .merge(df_auc_result_model_75, on=SEGMENT_COL, how="left")
    .merge(df_auc_result_model_99, on=SEGMENT_COL, how="left")
)

# Append a trailing "13: Overall" row holding the whole-test-set row count and AUCs.
df_test_auc_final.loc[len(df_test_auc_final)] = [
    "13: Overall",
    df_test_batch_results_s3.shape[0],
    test_01_batch_auc,
    test_10_batch_auc,
    test_25_batch_auc,
    test_50_batch_auc,
    test_75_batch_auc,
    test_99_batch_auc,
]
df_test_auc_final

Unnamed: 0,FT_SEGMENT,COUNT_FT_SEGMENT,MODEL_01_AUC,MODEL_10_AUC,MODEL_25_AUC,MODEL_50_AUC,MODEL_75_AUC,MODEL_99_AUC
0,01: Friends & BBT Fan,42499,0.912866,0.911331,0.912266,0.909608,0.906954,0.889402
1,02: Talk Show & News Fan,9790,0.936001,0.936079,0.935345,0.931045,0.923469,0.880753
2,03: Series Viewer,65403,0.902958,0.903697,0.9052,0.900372,0.893115,0.862412
3,04: Hooked & Ongoing,16172,0.923039,0.924454,0.922715,0.920564,0.914354,0.888077
4,05: Hooked On Library,79133,0.902296,0.902238,0.903125,0.898632,0.895904,0.872248
5,06: Series Dabbler,38691,0.918775,0.920215,0.919491,0.916813,0.913274,0.889444
6,07: All Caught Up,33872,0.932265,0.933639,0.933189,0.930221,0.927044,0.909321
7,08: Series Abandoner,35706,0.913783,0.912385,0.911504,0.908337,0.902715,0.878446
8,09: Mobile First,66415,0.888009,0.887382,0.886484,0.884815,0.880468,0.852399
9,10: Movie Exclusive,97978,0.908977,0.909103,0.908193,0.90683,0.902652,0.886324


# 8.2 Accuracy on balanced test dataset

In [277]:
def create_balanced_test_data_set(data, random_state=RANDOM_STATE):
    """Downsample the larger ACTUAL class so both classes have the same row count.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``ACTUAL`` column; only rows with ACTUAL equal to 0 or 1 are kept.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` so the draw is reproducible.

    Returns
    -------
    pandas.DataFrame
        All rows of the smaller class followed by an equally sized random sample
        (without replacement) of the larger class. Original index labels are kept.
    """
    df_positive = data[data.ACTUAL == 1]
    df_negative = data[data.ACTUAL == 0]

    # Pick the smaller class at run time instead of assuming class 1 is the majority.
    # On a tie, class 0 is kept intact and class 1 is sampled.
    if len(df_negative) <= len(df_positive):
        df_minority, df_majority = df_negative, df_positive
    else:
        df_minority, df_majority = df_positive, df_negative

    df_majority_sampled = df_majority.sample(len(df_minority), random_state=random_state)

    # pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0.
    return pd.concat([df_minority, df_majority_sampled])
    
# Class-balanced copy of the test predictions (default random_state, so the draw is repeatable).
df_balanced_test_batch_results_s3 = create_balanced_test_data_set(
    data=df_test_batch_results_s3
)

In [279]:
# Overall accuracy of each model variant on the class-balanced test set.
balanced_test_01_accuracy_score = accuracy_score(
    df_balanced_test_batch_results_s3["ACTUAL"],
    df_balanced_test_batch_results_s3["PREDICTED_01"],
)
balanced_test_10_accuracy_score = accuracy_score(
    df_balanced_test_batch_results_s3["ACTUAL"],
    df_balanced_test_batch_results_s3["PREDICTED_10"],
)
balanced_test_25_accuracy_score = accuracy_score(
    df_balanced_test_batch_results_s3["ACTUAL"],
    df_balanced_test_batch_results_s3["PREDICTED_25"],
)
balanced_test_50_accuracy_score = accuracy_score(
    df_balanced_test_batch_results_s3["ACTUAL"],
    df_balanced_test_batch_results_s3["PREDICTED_50"],
)
balanced_test_75_accuracy_score = accuracy_score(
    df_balanced_test_batch_results_s3["ACTUAL"],
    df_balanced_test_batch_results_s3["PREDICTED_75"],
)
balanced_test_99_accuracy_score = accuracy_score(
    df_balanced_test_batch_results_s3["ACTUAL"],
    df_balanced_test_batch_results_s3["PREDICTED_99"],
)

In [281]:
# Class distribution of the original (unbalanced) test set.
df_test_batch_results_s3["ACTUAL"].value_counts()

1    388373
0    296076
Name: ACTUAL, dtype: int64

In [282]:
# Class distribution after balancing: both classes should show the same count.
df_balanced_test_batch_results_s3["ACTUAL"].value_counts()

1    296076
0    296076
Name: ACTUAL, dtype: int64

In [286]:
# Drop rows whose segment is the literal "missing" so they do not form a group of their own.
# NOTE(review): the overall balanced accuracy scores are computed before this filter, while
# the "13: Overall" row count is taken after it; the two only agree if no row is "missing".
df_balanced_test_batch_results_s3 = df_balanced_test_batch_results_s3[
    df_balanced_test_batch_results_s3[SEGMENT_COL] != "missing"
]

# Row count per segment as a flat two-column frame: SEGMENT_COL and COUNT_<SEGMENT_COL>.
# size() + reset_index(name=...) builds the flat columns directly, instead of aggregating
# the grouping column and then flattening the resulting MultiIndex by assigning one of its
# tuples as the column list.
df_balanced_segment = (
    df_balanced_test_batch_results_s3
    .groupby(SEGMENT_COL)
    .size()
    .reset_index(name=f"COUNT_{SEGMENT_COL}")
    .sort_values(SEGMENT_COL)
)

In [287]:
# Left-join df_segment (presumably the per-segment counts built earlier) onto every
# balanced test row; rows without a match in df_segment are kept.
df_balanced_test_batch_results_s3_step_1 = df_balanced_test_batch_results_s3.merge(
    df_segment,
    how="left",
    on=SEGMENT_COL,
)

In [288]:
# One narrow frame per model variant: label, segment and that model's prediction column.
df_balanced_test_batch_results_s3_model_01 = df_balanced_test_batch_results_s3_step_1[
    ["ACTUAL", SEGMENT_COL, "PREDICTED_01"]
]
df_balanced_test_batch_results_s3_model_10 = df_balanced_test_batch_results_s3_step_1[
    ["ACTUAL", SEGMENT_COL, "PREDICTED_10"]
]
df_balanced_test_batch_results_s3_model_99 = df_balanced_test_batch_results_s3_step_1[
    ["ACTUAL", SEGMENT_COL, "PREDICTED_99"]
]

In [None]:

# Narrow frames for the remaining model variants (25 / 50 / 75).
df_balanced_test_batch_results_s3_model_25 = df_balanced_test_batch_results_s3_step_1[
    ["ACTUAL", SEGMENT_COL, "PREDICTED_25"]
]
df_balanced_test_batch_results_s3_model_50 = df_balanced_test_batch_results_s3_step_1[
    ["ACTUAL", SEGMENT_COL, "PREDICTED_50"]
]
df_balanced_test_batch_results_s3_model_75 = df_balanced_test_batch_results_s3_step_1[
    ["ACTUAL", SEGMENT_COL, "PREDICTED_75"]
]

# Give all six frames the same column labels so one aggregation helper can
# read a common "PREDICTED" column from any of them.
col_names = ["ACTUAL", SEGMENT_COL, "PREDICTED"]

df_balanced_test_batch_results_s3_model_01.columns = col_names
df_balanced_test_batch_results_s3_model_10.columns = col_names
df_balanced_test_batch_results_s3_model_25.columns = col_names
df_balanced_test_batch_results_s3_model_50.columns = col_names
df_balanced_test_batch_results_s3_model_75.columns = col_names
df_balanced_test_batch_results_s3_model_99.columns = col_names

def report_aggregate_accuracy(data):
    """Compute one accuracy value per SEGMENT_COL group.

    ``data`` must expose ``ACTUAL`` and ``PREDICTED`` columns in addition to
    SEGMENT_COL. Returns a frame with SEGMENT_COL and an "ACCURACY" column.
    """
    def _segment_accuracy(group):
        return accuracy_score(group.ACTUAL, group.PREDICTED)

    accuracy_by_segment = data.groupby(SEGMENT_COL).apply(_segment_accuracy)
    # apply() yields an unnamed Series, so reset_index() labels the value column 0.
    return accuracy_by_segment.reset_index().rename(columns={0:"ACCURACY"})

# Per-segment accuracy on the balanced test set for every model variant; each frame's
# columns are relabelled so the six results can sit side by side after the joins below.
df_balanced_acc_result_model_01 = report_aggregate_accuracy(df_balanced_test_batch_results_s3_model_01)
df_balanced_acc_result_model_01.columns = [SEGMENT_COL, "MODEL_01_ACC"]

df_balanced_acc_result_model_10 = report_aggregate_accuracy(df_balanced_test_batch_results_s3_model_10)
df_balanced_acc_result_model_10.columns = [SEGMENT_COL, "MODEL_10_ACC"]

df_balanced_acc_result_model_25 = report_aggregate_accuracy(df_balanced_test_batch_results_s3_model_25)
df_balanced_acc_result_model_25.columns = [SEGMENT_COL, "MODEL_25_ACC"]

df_balanced_acc_result_model_50 = report_aggregate_accuracy(df_balanced_test_batch_results_s3_model_50)
df_balanced_acc_result_model_50.columns = [SEGMENT_COL, "MODEL_50_ACC"]

df_balanced_acc_result_model_75 = report_aggregate_accuracy(df_balanced_test_batch_results_s3_model_75)
df_balanced_acc_result_model_75.columns = [SEGMENT_COL, "MODEL_75_ACC"]

df_balanced_acc_result_model_99 = report_aggregate_accuracy(df_balanced_test_batch_results_s3_model_99)
df_balanced_acc_result_model_99.columns = [SEGMENT_COL, "MODEL_99_ACC"]

# Left-join each model's accuracy column onto the balanced per-segment counts.
df_balanced_test_acc_final = (
    df_balanced_segment
    .merge(df_balanced_acc_result_model_01, on=SEGMENT_COL, how="left")
    .merge(df_balanced_acc_result_model_10, on=SEGMENT_COL, how="left")
    .merge(df_balanced_acc_result_model_25, on=SEGMENT_COL, how="left")
    .merge(df_balanced_acc_result_model_50, on=SEGMENT_COL, how="left")
    .merge(df_balanced_acc_result_model_75, on=SEGMENT_COL, how="left")
    .merge(df_balanced_acc_result_model_99, on=SEGMENT_COL, how="left")
)

# Append a trailing "13: Overall" row holding the balanced-set row count and accuracies.
df_balanced_test_acc_final.loc[len(df_balanced_test_acc_final)] = [
    "13: Overall",
    df_balanced_test_batch_results_s3.shape[0],
    balanced_test_01_accuracy_score,
    balanced_test_10_accuracy_score,
    balanced_test_25_accuracy_score,
    balanced_test_50_accuracy_score,
    balanced_test_75_accuracy_score,
    balanced_test_99_accuracy_score,
]

df_balanced_test_acc_final

In [180]:
# Output CSV name for the balanced-test results; TIME_STR stamps each run with its own file name.
BALANCED_RESULT_FILE_NAME=f"csv_weights_balanced_data_result_{TIME_STR}.csv"
# Persist the balanced per-segment accuracy table (without the index column).
df_balanced_test_acc_final.to_csv(BALANCED_RESULT_FILE_NAME, index=False)

## 8.3 Confusion Matrix by Segment

In [299]:
# Rows per segment in the test set; dropna=False would surface a NaN segment as its own row.
df_test_batch_results_s3[SEGMENT_COL].value_counts(dropna=False)

12: Never Stream            173851
10: Movie Exclusive          97978
05: Hooked On Library        79133
09: Mobile First             66415
03: Series Viewer            65403
01: Friends & BBT Fan        42499
06: Series Dabbler           38691
08: Series Abandoner         35706
07: All Caught Up            33872
11: Special Interest         24939
04: Hooked & Ongoing         16172
02: Talk Show & News Fan      9790
Name: FT_SEGMENT, dtype: int64

In [315]:
# Test-set rows belonging to the "01: Friends & BBT Fan" segment.
df_test_batch_results_s3_friends_and_bbt_fan = df_test_batch_results_s3.loc[
    df_test_batch_results_s3[SEGMENT_COL] == "01: Friends & BBT Fan"
]

In [314]:
# Test-set rows belonging to the "02: Talk Show & News Fan" segment.
df_test_batch_results_s3_talk_show_n_news_fan = df_test_batch_results_s3.loc[
    df_test_batch_results_s3[SEGMENT_COL] == "02: Talk Show & News Fan"
]

In [313]:
# Test-set rows belonging to the "03: Series Viewer" segment.
df_test_batch_results_s3_series_viewer = df_test_batch_results_s3.loc[
    df_test_batch_results_s3[SEGMENT_COL] == "03: Series Viewer"
]

In [312]:
# Test-set rows belonging to the "04: Hooked & Ongoing" segment.
df_test_batch_results_s3_hooked_and_ongoing = df_test_batch_results_s3.loc[
    df_test_batch_results_s3[SEGMENT_COL] == "04: Hooked & Ongoing"
]

In [309]:
# Test-set rows belonging to the "05: Hooked On Library" segment.
df_test_batch_results_s3_hooked_on_library = df_test_batch_results_s3.loc[
    df_test_batch_results_s3[SEGMENT_COL] == "05: Hooked On Library"
]

In [308]:
# Test-set rows belonging to the "06: Series Dabbler" segment.
df_test_batch_results_s3_series_dabbler = df_test_batch_results_s3.loc[
    df_test_batch_results_s3[SEGMENT_COL] == "06: Series Dabbler"
]

In [307]:
# Test-set rows belonging to the "07: All Caught Up" segment.
df_test_batch_results_s3_all_caught_up = df_test_batch_results_s3.loc[
    df_test_batch_results_s3[SEGMENT_COL] == "07: All Caught Up"
]

In [306]:
# Test-set rows belonging to the "08: Series Abandoner" segment.
df_test_batch_results_s3_series_abandoner = df_test_batch_results_s3.loc[
    df_test_batch_results_s3[SEGMENT_COL] == "08: Series Abandoner"
]

In [304]:
# Test-set rows belonging to the "09: Mobile First" segment.
df_test_batch_results_s3_mobile_first = df_test_batch_results_s3.loc[
    df_test_batch_results_s3[SEGMENT_COL] == "09: Mobile First"
]

In [302]:
# Test-set rows belonging to the "10: Movie Exclusive" segment.
df_test_batch_results_s3_movie_exclusive = df_test_batch_results_s3.loc[
    df_test_batch_results_s3[SEGMENT_COL] == "10: Movie Exclusive"
]

In [300]:
# Test-set rows belonging to the "11: Special Interest" segment.
df_test_batch_results_s3_spl_interest = df_test_batch_results_s3.loc[
    df_test_batch_results_s3[SEGMENT_COL] == "11: Special Interest"
]

In [297]:
# Test-set rows belonging to the "12: Never Stream" segment.
df_test_batch_results_s3_never_stream = df_test_batch_results_s3.loc[
    df_test_batch_results_s3[SEGMENT_COL] == "12: Never Stream"
]

In [317]:
# Sanity check: (rows, columns) of the Friends & BBT Fan subset.
df_test_batch_results_s3_friends_and_bbt_fan.shape

(42499, 14)

In [318]:
# Sanity check: (rows, columns) of the Talk Show & News Fan subset.
df_test_batch_results_s3_talk_show_n_news_fan.shape

(9790, 14)

In [319]:
# Sanity check: (rows, columns) of the Series Viewer subset.
df_test_batch_results_s3_series_viewer.shape

(65403, 14)

In [305]:
# Sanity check: (rows, columns) of the Mobile First subset.
df_test_batch_results_s3_mobile_first.shape

(66415, 14)

In [303]:
# Sanity check: (rows, columns) of the Movie Exclusive subset.
df_test_batch_results_s3_movie_exclusive.shape

(97978, 14)

In [301]:
# Sanity check: (rows, columns) of the Special Interest subset.
df_test_batch_results_s3_spl_interest.shape

(24939, 14)

In [298]:
# Sanity check: (rows, columns) of the Never Stream subset.
df_test_batch_results_s3_never_stream.shape

(173851, 14)

## 8.3.1 Friends & BBT Fan vs. Never Stream

In [322]:
# Confusion matrices for the PREDICTED_01 model on two segments: Friends & BBT Fan and Never Stream.
friends_bbt_test_01_confusion_matrix = confusion_matrix(
    y_true=df_test_batch_results_s3_friends_and_bbt_fan["ACTUAL"],
    y_pred=df_test_batch_results_s3_friends_and_bbt_fan["PREDICTED_01"],
)

never_stream_test_01_confusion_matrix = confusion_matrix(
    y_true=df_test_batch_results_s3_never_stream["ACTUAL"],
    y_pred=df_test_batch_results_s3_never_stream["PREDICTED_01"],
)

In [323]:
# Friends & BBT Fan, PREDICTED_01. In scikit-learn's confusion_matrix, rows are actual classes and columns are predicted classes.
friends_bbt_test_01_confusion_matrix

array([[ 8268,  3618],
       [  765, 29848]])

In [325]:
# Never Stream, PREDICTED_01. In scikit-learn's confusion_matrix, rows are actual classes and columns are predicted classes.
never_stream_test_01_confusion_matrix

array([[70818, 22904],
       [11489, 68640]])

In [330]:
# Confusion matrices for the PREDICTED_99 model on two segments: Friends & BBT Fan and Never Stream.
friends_bbt_test_99_confusion_matrix = confusion_matrix(
    y_true=df_test_batch_results_s3_friends_and_bbt_fan["ACTUAL"],
    y_pred=df_test_batch_results_s3_friends_and_bbt_fan["PREDICTED_99"],
)

never_stream_test_99_confusion_matrix = confusion_matrix(
    y_true=df_test_batch_results_s3_never_stream["ACTUAL"],
    y_pred=df_test_batch_results_s3_never_stream["PREDICTED_99"],
)

In [331]:
# Friends & BBT Fan, PREDICTED_99. In scikit-learn's confusion_matrix, rows are actual classes and columns are predicted classes.
friends_bbt_test_99_confusion_matrix

array([[ 8169,  3717],
       [  868, 29745]])

In [332]:
# Never Stream, PREDICTED_99. In scikit-learn's confusion_matrix, rows are actual classes and columns are predicted classes.
never_stream_test_99_confusion_matrix

array([[73335, 20387],
       [11577, 68552]])