This notebook is used to fine-tune the XGBoost model and perform evaluation on user data for the 7-day Free Trial.

The whole process has been divided into 2 notebooks: 

- Part 1: Data Preprocessing: 6.0_sk_fine_tuning_FT_propensity_data_preprocessing.ipynb (this notebook)
- Part 2: Data Modeling and Evaluation: 6.0_sk_fine_tuning_FT_propensity_data_modeling.ipynb

In [None]:
import warnings
# Ignore every warning for the rest of the kernel session (presumably to keep
# the notebook output readable). Note this also hides deprecation warnings.
warnings.filterwarnings('ignore')

In [None]:
import os
import json
import numpy as np
import pandas as pd
import sagemaker

from sklearn.preprocessing import OneHotEncoder

In [None]:
# Fixed seed value; presumably intended for reproducible sampling/splitting
# (TODO confirm where it is consumed).
RANDOM_STATE = 101

# Boolean switch, off by default; presumably selects a reduced dataset for
# quick experiments (TODO confirm where it is consumed).
SMALL_DATASET = False

# S3 bucket handed to the SageMaker session as its default bucket.
BUCKET = 'datascience-hbo-users'

# Key prefix inside BUCKET for the 7-day free-trial propensity data.
PREFIX = 'users/sk/FT_propensity/7_day'

In [None]:
import sys
# Install the SageMaker SDK into the interpreter backing this kernel
# (`sys.executable`), rather than whichever `pip` happens to be on PATH.
# NOTE(review): `sagemaker` is already imported in an earlier cell; on a fresh
# environment this install has to run before that import can succeed.
!{sys.executable} -m pip install sagemaker

In [None]:
# SageMaker session whose default S3 bucket is the project bucket (BUCKET)
# instead of the SDK's auto-generated one.
sagemaker_session = sagemaker.Session(default_bucket=BUCKET)

In [None]:
# Evaluate the session's default bucket so the notebook displays it as the
# cell output.
sagemaker_session.default_bucket()

In [None]:
# Print the installed `sagemaker` package metadata (version, location, ...).
!pip show sagemaker

In [None]:
import boto3
import sagemaker
from sagemaker.sklearn.processing import SKLearnProcessor

# AWS context for this notebook: current region, a low-level SageMaker client,
# the execution role, and a default SageMaker session.
region = boto3.Session().region_name
smclient = boto3.Session().client('sagemaker')
role = sagemaker.get_execution_role()
sess = sagemaker.Session()

# Settings for the scikit-learn processing job. The processor is bound to
# `sagemaker_session` (created in an earlier cell), not to `sess` above.
_processor_kwargs = {
    'framework_version': '0.20.0',
    'role': role,
    'instance_type': 'ml.r5.24xlarge',
    'output_kms_key': 'alias/aws/s3',
    'instance_count': 1,
    'sagemaker_session': sagemaker_session,
    'base_job_name': 'FT-Propensity',
}
sklearn_processor = SKLearnProcessor(**_processor_kwargs)

In [None]:
%%time

from sagemaker.processing import ProcessingInput, ProcessingOutput

# Derive both S3 locations from the notebook-level BUCKET / PREFIX constants
# instead of repeating the literal bucket and prefix, so the paths cannot drift
# from the bucket the SageMaker session is configured with. The resulting URIs
# are identical to the previously hard-coded ones.
s3_root = f's3://{BUCKET}/{PREFIX}'
output_destination = f'{s3_root}/model_input_data'
input_source = f'{s3_root}/snowflake-hbomax-staging'

# Run preprocessing.py as a SageMaker processing job:
#   * `input_source` is made available in the container at
#     /opt/ml/processing/input;
#   * whatever the script writes to /opt/ml/processing/{train,test,val} is
#     uploaded to `output_destination`;
#   * the split ratio is passed to the script as a command-line argument.
# NOTE(review): all three outputs share one S3 destination, so this relies on
# preprocessing.py writing distinctly named files per split - otherwise the
# uploads would overwrite each other. Confirm against the script.
sklearn_processor.run(
    code='preprocessing.py',
    inputs=[
        ProcessingInput(
            source=input_source,
            destination='/opt/ml/processing/input',
        ),
    ],
    outputs=[
        ProcessingOutput(
            output_name=f'{split}_data',
            source=f'/opt/ml/processing/{split}',
            destination=output_destination,
        )
        for split in ('train', 'test', 'val')
    ],
    arguments=['--train-test-split-ratio', '0.1'],
)