In [None]:
import joblib
import sagemaker
import pandas as pd
from src.config import config as cfg
from sagemaker.pipeline import PipelineModel
from utilities.evaluation import ModelEvaluation
from sagemaker import get_execution_role, s3_input
from sagemaker.model_monitor import DataCaptureConfig
from utilities.utils import download_model, decompress_model, prediction_df
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner
from aws.sagemaker_helper import sklearn_preprocessor, sagemaker_model, batch_transform, calculate_prediction

In [None]:
# Open a SageMaker session and resolve the IAM execution role used by every
# training / transform / deploy call further down in the notebook.
session = sagemaker.Session()
role = get_execution_role()

In [None]:
# Bucket and key prefix come from the project configuration module.
bucket = cfg.S3_BUCKET
prefix = cfg.S3_PREFIX

# Common S3 root shared by every dataset and artifact location below.
s3_root = f's3://{bucket}/{prefix}'

# Raw CSV splits.
train_path = f'{s3_root}/train/train.csv'
test_path = f'{s3_root}/test/test.csv'
validation_path = f'{s3_root}/validation/validation.csv'

# Destination for job outputs and for endpoint data capture.
output_path = f'{s3_root}/output'
s3_capture_upload_path = f'{s3_root}/data_capture'

## Preprocesamiento

In [None]:
# Build the scikit-learn preprocessing estimator through the project helper.
preprocessor = sklearn_preprocessor(
    entry_point='sklearn_preprocessing.py',
    role=role,
    output_dir=output_path,
)

# Fit the preprocessor on the raw training CSV under a fixed job name.
train_channel = s3_input(train_path, content_type='text/csv')
preprocessor.fit({'train': train_channel}, job_name='scikit-preprocessing-slg')

In [None]:
# Run the fitted preprocessor over the training split; results are written
# under output_path (training later reads train.csv.out from there).
batch_transform(
    model=preprocessor,
    data=train_path,
    output_dir=output_path,
)

In [None]:
# Run the fitted preprocessor over the test split; results are written
# under output_path (evaluation later reads test.csv.out from there).
batch_transform(
    model=preprocessor,
    data=test_path,
    output_dir=output_path,
)

In [None]:
# Run the fitted preprocessor over the validation split. The XGBoost
# 'validation' channel reads validation.csv.out from output_path, so this
# split must be transformed too (the test split already has its own cell).
batch_transform(model=preprocessor, data=validation_path, output_dir=output_path)

## XGBoost

In [None]:
# Static hyperparameters for the first XGBoost training job.
hyperparams = dict(
    objective='binary:logistic',
    early_stopping_rounds=10,
    num_round=250,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=4,
)

# Training job name; also used later to locate the model artifact in S3.
job_name = 'xgboost-model-slg'

# Build the XGBoost estimator through the project helper.
xgb = sagemaker_model(
    image='xgboost',
    role=role,
    region_name=cfg.AWS_REGION,
    hyperparams=hyperparams,
    output_dir=output_path,
)

# Training and validation channels point at the preprocessed CSVs
# (the *.csv.out files under output_path).
train_input = s3_input(f'{output_path}/train.csv.out', content_type='text/csv')
validation_input = s3_input(f'{output_path}/validation.csv.out', content_type='text/csv')
data_channel = {'train': train_input, 'validation': validation_input}

xgb.fit(data_channel, job_name=job_name)

#### Evaluación del modelo

In [None]:
# Ground-truth labels for the test split: a header-less, ';'-separated file
# loaded into a single column named 'label'.
test_label_path = f's3://{bucket}/{prefix}/test/test_label.csv'
test_label = pd.read_csv(test_label_path, sep=';', names=['label'])

In [None]:
# Fetch the artifact of the first training job from S3 into models/<job_name>/.
download_model(
    local_dir=f'models/{job_name}/model.tar.gz',
    model_path=f'{output_path}/{job_name}/output/model.tar.gz',
)

In [None]:
# Unpack the downloaded model archive with the project helper.
decompress_model(
    local_dir=f'models/{job_name}/model.tar.gz',
)

In [None]:
# Load the unpacked booster from the working directory.
# NOTE(review): joblib.load unpickles the file; only load artifacts you trust.
model = joblib.load('xgboost-model')

# Score the preprocessed test split using a 0.5 score threshold.
prediction = prediction_df(
    model,
    file_path=f'{output_path}/test.csv.out',
    score=0.5,
)

# Pair the observed labels with the predictions for evaluation.
evaluation = ModelEvaluation(
    observed=test_label['label'],
    predicted=prediction['prediction'],
)

In [None]:
# Generate the report for the ModelEvaluation object built in the previous
# cell (the report format is defined in utilities.evaluation).
evaluation.generate_report()

In [None]:
# Compute the metrics first, then print them, then produce the confusion matrix.
# The order matters: print_metrics() presumably relies on calculate_metrics()
# having been called — confirm in utilities.evaluation.
evaluation.calculate_metrics()
evaluation.print_metrics()
evaluation.confusion_matrix()

## Optimización de hiperparámetros

In [None]:
# Static hyperparameters for the estimator handed to the tuner below.
hyperparams = dict(
    objective='binary:logistic',
    early_stopping_rounds=10,
    num_round=250,
    max_depth=4,
    subsample=0.7,
    colsample_bytree=0.7,
    scale_pos_weight=4,
)

# Name passed to the tuner's fit() call below.
job_name = 'xgboost-model-slg-1'

# Base XGBoost estimator that the hyperparameter tuner will clone per trial.
xgb_model = sagemaker_model(
    image='xgboost',
    role=role,
    region_name=cfg.AWS_REGION,
    hyperparams=hyperparams,
    output_dir=output_path,
)

In [None]:
# Search space: tree depth (integer) and row subsampling ratio (continuous).
tuning_ranges = {
    'max_depth': IntegerParameter(min_value=3, max_value=6),
    'subsample': ContinuousParameter(min_value=0.5, max_value=0.9),
}

# Maximize validation F1 over at most 10 training jobs, 3 running in parallel.
xgb_model_cv = HyperparameterTuner(
    estimator=xgb_model,
    objective_metric_name='validation:f1',
    objective_type='Maximize',
    max_jobs=10,
    max_parallel_jobs=3,
    hyperparameter_ranges=tuning_ranges,
)

# Launch the tuning job on the same train/validation channels as the first model.
xgb_model_cv.fit(data_channel, job_name=job_name)

In [None]:
# Block until the tuning job launched above has finished, so the best
# training job can be queried in the next cell.
xgb_model_cv.wait()

In [None]:
# Name of the training job that achieved the best objective value.
new_job_name = xgb_model_cv.best_training_job()

# Fetch that job's artifact from S3 into models/<new_job_name>/.
download_model(
    local_dir=f'models/{new_job_name}/model.tar.gz',
    model_path=f'{output_path}/{new_job_name}/output/model.tar.gz',
)

In [None]:
# Unpack the archive of the best tuning job with the project helper.
decompress_model(
    local_dir=f'models/{new_job_name}/model.tar.gz',
)

In [None]:
# Load the unpacked booster from the working directory.
# NOTE(review): this reads the same relative path as the earlier evaluation
# cell; confirm decompress_model overwrites 'xgboost-model' with the tuned one.
model = joblib.load('xgboost-model')

# Score the preprocessed test split using a 0.5 score threshold.
prediction = prediction_df(
    model,
    file_path=f'{output_path}/test.csv.out',
    score=0.5,
)

# Pair the observed labels with the new predictions for evaluation.
evaluation = ModelEvaluation(
    observed=test_label['label'],
    predicted=prediction['prediction'],
)

In [None]:
# Generate the report for the ModelEvaluation object built in the previous
# cell (the report format is defined in utilities.evaluation).
evaluation.generate_report()

In [None]:
# Compute the metrics first, then print them, then produce the confusion matrix.
# The order matters: print_metrics() presumably relies on calculate_metrics()
# having been called — confirm in utilities.evaluation.
evaluation.calculate_metrics()
evaluation.print_metrics()
evaluation.confusion_matrix()

## Pipeline de inferencia

In [None]:
# Capture 100% of endpoint traffic and store it under the data_capture prefix.
data_capture_config = DataCaptureConfig(
    enable_capture=True,
    sampling_percentage=100,
    destination_s3_uri=s3_capture_upload_path,
)

In [None]:
# First container: the fitted preprocessor, configured to emit text/csv.
preprocessor_env = {"SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT": "text/csv"}
scikit_preprocessor = preprocessor.create_model(env=preprocessor_env)

# Second container: the XGBoost model.
# NOTE(review): this uses `xgb` (the first training job), not the best model
# from the tuning job — confirm that is intended.
xgboost = xgb.create_model()

# The model and the endpoint share the same name.
model_name = 'churn-score-slg'
endpoint_name = model_name

# Chain both containers into a single inference pipeline.
pipeline_model = PipelineModel(
    name=model_name,
    role=role,
    models=[scikit_preprocessor, xgboost],
)

In [None]:
# Deploy the inference pipeline to a single ml.t2.medium instance with data
# capture enabled.
pipeline_model.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium',
    endpoint_name=endpoint_name,
    data_capture_config=data_capture_config,
)

## Consumo del EndPoint

In [None]:
# One raw, ';'-separated record to send to the endpoint.
# NOTE(review): field order presumably matches the raw training CSV columns — confirm.
new_data = ';'.join([
    '4982', '15768137', 'Bray', '667', 'Spain', 'Female',
    '23', '6', '136100.69', '2', '0', '0', '169669.33',
])

In [None]:
# Time the prediction helper on the sample record. %timeit executes the
# statement repeatedly, so this issues many calls (presumably one endpoint
# invocation each, all recorded by data capture — confirm in aws.sagemaker_helper).
%timeit calculate_prediction(data=new_data)