This notebook contains the data-processing code that creates several training sets for the 7-Day Free Trial propensity model. Unlike the test and validation sets, each training set has an additional feature, __WEIGHT__, which represents the weight of a particular training instance. The objective of this notebook is to create datasets for comparing model performance when the csv_weights hyperparameter is used to optimize the model on both active and dormant users. The only difference between the models is the input data. The whole process is divided into 2 notebooks:

- Part 1: Data Preprocessing: 7.0_sk_csv_weights_fine_tuning_FT_propensity_data_preprocessing.ipynb (this notebook).
- Part 2: Data Modeling and Evaluation: 7.0_sk_csv_weights_fine_tuning_FT_propensity_data_modeling.ipynb.

In [1]:
import pandas as pd

In [1]:
# S3 bucket and key prefixes used throughout this notebook.
BUCKET = "datascience-hbo-users"
PREFIX = "users/sk/FT_propensity/7_day"

# Sub-prefixes derived from PREFIX / DATA_PREFIX.
DATA_PREFIX = "{}/model_input_data".format(PREFIX)
MODEL_PREFIX = "{}/model_artifacts".format(PREFIX)
INFERENCE_PREFIX = "{}/inference".format(PREFIX)
DATA_OUTPUT_PREFIX = "{}/csv_weights".format(DATA_PREFIX)

In [None]:
def _s3_uri(prefix, file_name):
    """Build the s3:// URI of `file_name` under `prefix` in BUCKET."""
    return 's3://{}/{}/{}'.format(BUCKET, prefix, file_name)


# Input: the unweighted training set.
train_file_path_s3 = _s3_uri(DATA_PREFIX, "train.csv")

# Outputs: one weighted training set per dormant-user weight; the numeric
# suffix in each name mirrors the suffix of the target file.
train_file_name_s3_dormant_weight_01_perc = _s3_uri(DATA_OUTPUT_PREFIX, "train_01.csv")
train_file_name_s3_dormant_weight_10_perc = _s3_uri(DATA_OUTPUT_PREFIX, "train_10.csv")
train_file_name_s3_dormant_weight_99_perc = _s3_uri(DATA_OUTPUT_PREFIX, "train_99.csv")
train_file_name_s3_dormant_weight_25_perc = _s3_uri(DATA_OUTPUT_PREFIX, "train_25.csv")
train_file_name_s3_dormant_weight_50_perc = _s3_uri(DATA_OUTPUT_PREFIX, "train_50.csv")
train_file_name_s3_dormant_weight_75_perc = _s3_uri(DATA_OUTPUT_PREFIX, "train_75.csv")

In [None]:
# train.csv is read without a header row; column names are attached separately.
df_raw_train = pd.read_csv(filepath_or_buffer=train_file_path_s3, header=None)

In [None]:
# The column names are read from the first column of the header-less
# train_columns.csv; "FLG_TARGET" is prepended to that list before it is
# applied to the training frame.
train_columns_file_path_s3 = 's3://%s/%s/%s' % (BUCKET, DATA_PREFIX, "train_columns.csv")
df_raw_train_columns = pd.read_csv(train_columns_file_path_s3, header=None)
cols = ["FLG_TARGET"] + list(df_raw_train_columns[0].values)

df_raw_train.columns = cols

In [None]:
def add_weight_col(data, weight):
    """Return a copy of `data` with a per-row "WEIGHT" column at position 1.

    Rows are weighted by their "NUM_STREAMS_ADJ" value:

    * ``NUM_STREAMS_ADJ == 0`` -> ``weight``
    * ``NUM_STREAMS_ADJ > 0``  -> ``1 - weight``

    Rows matching neither condition (e.g. NaN or negative values) keep their
    existing "WEIGHT" value if `data` already has that column, and are NaN
    otherwise.

    The input frame is NOT modified. The previous implementation wrote the
    "WEIGHT" column into `data` itself, so the caller's raw frame silently
    gained a column that was overwritten on every subsequent call.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a "NUM_STREAMS_ADJ" column.
    weight : float
        Weight assigned to rows with zero streams.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns of `data` plus "WEIGHT" as the second column.

    Raises
    ------
    KeyError
        If `data` has no "NUM_STREAMS_ADJ" column.
    """
    streams = data["NUM_STREAMS_ADJ"]

    # Build the weights in a standalone Series so the caller's frame is untouched.
    if "WEIGHT" in data.columns:
        weights = data["WEIGHT"].copy()
    else:
        weights = pd.Series(float("nan"), index=data.index, dtype="float64")
    weights.loc[streams > 0] = 1 - weight
    weights.loc[streams == 0] = weight

    # drop() returns a new frame; inserting into it leaves `data` unchanged.
    result = data.drop(columns="WEIGHT", errors="ignore")
    result.insert(1, "WEIGHT", weights)
    return result

In [None]:
# One weighted copy of the training set per dormant-user weight. The generator
# is consumed left to right, so add_weight_col is called in the listed order.
(
    df_dormant_weight_01_perc,
    df_dormant_weight_99_perc,
    df_dormant_weight_10_perc,
    df_dormant_weight_25_perc,
    df_dormant_weight_50_perc,
    df_dormant_weight_75_perc,
) = (
    add_weight_col(df_raw_train, dormant_weight)
    for dormant_weight in (0.01, 0.99, 0.1, 0.25, 0.50, 0.75)
)

In [None]:
# Sanity-check every weighted training set: "WEIGHT" must be the second
# column and the frame must contain no missing values. Each assertion carries
# a message so a failure identifies the offending dataset and check.
weighted_frames = {
    "df_dormant_weight_01_perc": df_dormant_weight_01_perc,
    "df_dormant_weight_99_perc": df_dormant_weight_99_perc,
    "df_dormant_weight_10_perc": df_dormant_weight_10_perc,
    "df_dormant_weight_25_perc": df_dormant_weight_25_perc,
    "df_dormant_weight_50_perc": df_dormant_weight_50_perc,
    "df_dormant_weight_75_perc": df_dormant_weight_75_perc,
}

for frame_name, frame in weighted_frames.items():
    assert frame.columns[1] == "WEIGHT", (
        "{}: WEIGHT is not the second column".format(frame_name)
    )
    assert not frame.isnull().any().any(), (
        "{}: contains missing values".format(frame_name)
    )

In [None]:
# Spot-check target, weight and stream count on the last 10 rows.
df_dormant_weight_01_perc.loc[:, ["FLG_TARGET", "WEIGHT", "NUM_STREAMS_ADJ"]].tail(10)

In [16]:
# Spot-check target, weight and stream count on the first rows.
df_dormant_weight_10_perc.loc[:, ["FLG_TARGET", "WEIGHT", "NUM_STREAMS_ADJ"]].head()

Unnamed: 0,FLG_TARGET,WEIGHT,NUM_PROFILE,NUM_ADULT_PROFILE,NUM_KID_PROFILE,FLG_TURN_OFF_AUTORENEW,TOTAL_HBONOW_WATCH_SEC_ADJ,ROKU_PERCENT_ADJ_NOW,PS_PERCENT_ADJ_NOW,IPHONE_PERCENT_ADJ_NOW,...,x3_1: Friends Fan,x3_2: BBT Fan,x3_2: Education Issue,x3_3: Friends & BBT Mixed,x3_3: Heavy User Interaction,x3_4: A Few User Interaction,x3_5: Abandon Profile Creation,x3_6: No Action After Signup,x3_7: ERROR,x3_missing
0,0,0.9,1.0,1.0,0.0,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1,0.9,3.0,3.0,0.0,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,1,0.9,1.0,1.0,0.0,0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,0
3,0,0.9,2.0,2.0,0.0,0,46391.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,1,0.9,3.0,3.0,0.0,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
# Spot-check target, weight and stream count on the first rows.
df_dormant_weight_25_perc.loc[:, ["FLG_TARGET", "WEIGHT", "NUM_STREAMS_ADJ"]].head()

Unnamed: 0,FLG_TARGET,WEIGHT,NUM_PROFILE,NUM_ADULT_PROFILE,NUM_KID_PROFILE,FLG_TURN_OFF_AUTORENEW,TOTAL_HBONOW_WATCH_SEC_ADJ,ROKU_PERCENT_ADJ_NOW,PS_PERCENT_ADJ_NOW,IPHONE_PERCENT_ADJ_NOW,...,x3_1: Friends Fan,x3_2: BBT Fan,x3_2: Education Issue,x3_3: Friends & BBT Mixed,x3_3: Heavy User Interaction,x3_4: A Few User Interaction,x3_5: Abandon Profile Creation,x3_6: No Action After Signup,x3_7: ERROR,x3_missing
0,0,0.75,1.0,1.0,0.0,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1,0.75,3.0,3.0,0.0,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,1,0.75,1.0,1.0,0.0,0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,0
3,0,0.75,2.0,2.0,0.0,0,46391.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,1,0.75,3.0,3.0,0.0,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
# Spot-check target, weight and stream count on the first rows.
df_dormant_weight_50_perc.loc[:, ["FLG_TARGET", "WEIGHT", "NUM_STREAMS_ADJ"]].head()

Unnamed: 0,FLG_TARGET,WEIGHT,NUM_PROFILE,NUM_ADULT_PROFILE,NUM_KID_PROFILE,FLG_TURN_OFF_AUTORENEW,TOTAL_HBONOW_WATCH_SEC_ADJ,ROKU_PERCENT_ADJ_NOW,PS_PERCENT_ADJ_NOW,IPHONE_PERCENT_ADJ_NOW,...,x3_1: Friends Fan,x3_2: BBT Fan,x3_2: Education Issue,x3_3: Friends & BBT Mixed,x3_3: Heavy User Interaction,x3_4: A Few User Interaction,x3_5: Abandon Profile Creation,x3_6: No Action After Signup,x3_7: ERROR,x3_missing
0,0,0.5,1.0,1.0,0.0,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1,0.5,3.0,3.0,0.0,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,1,0.5,1.0,1.0,0.0,0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,0
3,0,0.5,2.0,2.0,0.0,0,46391.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,1,0.5,3.0,3.0,0.0,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Spot-check target, weight and stream count on the first rows.
df_dormant_weight_75_perc.loc[:, ["FLG_TARGET", "WEIGHT", "NUM_STREAMS_ADJ"]].head()

In [30]:
# Spot-check target, weight and stream count on the last 10 rows.
df_dormant_weight_99_perc.loc[:, ["FLG_TARGET", "WEIGHT", "NUM_STREAMS_ADJ"]].tail(10)

Unnamed: 0,FLG_TARGET,WEIGHT,NUM_STREAMS_ADJ
5544042,0,0.01,2
5544043,0,0.01,62
5544044,1,0.01,1
5544045,0,0.01,32
5544046,0,0.99,0
5544047,1,0.99,0
5544048,0,0.01,2
5544049,0,0.01,1
5544050,0,0.01,1
5544051,0,0.01,13


In [32]:
# Written without header or index, matching how train.csv is read above.
df_dormant_weight_01_perc.to_csv(
    path_or_buf=train_file_name_s3_dormant_weight_01_perc,
    index=False,
    header=False,
)

In [18]:
# Written without header or index, matching how train.csv is read above.
df_dormant_weight_10_perc.to_csv(
    path_or_buf=train_file_name_s3_dormant_weight_10_perc,
    index=False,
    header=False,
)

In [67]:
# Written without header or index, matching how train.csv is read above.
df_dormant_weight_25_perc.to_csv(
    path_or_buf=train_file_name_s3_dormant_weight_25_perc,
    index=False,
    header=False,
)

In [68]:
# Written without header or index, matching how train.csv is read above.
df_dormant_weight_50_perc.to_csv(
    path_or_buf=train_file_name_s3_dormant_weight_50_perc,
    index=False,
    header=False,
)

In [69]:
# Written without header or index, matching how train.csv is read above.
df_dormant_weight_75_perc.to_csv(
    path_or_buf=train_file_name_s3_dormant_weight_75_perc,
    index=False,
    header=False,
)

In [33]:
# Written without header or index, matching how train.csv is read above.
df_dormant_weight_99_perc.to_csv(
    path_or_buf=train_file_name_s3_dormant_weight_99_perc,
    index=False,
    header=False,
)