## Carga de módulos

In [1]:
import os
import time
import boto3
import joblib
import warnings
import pandas as pd
from config import region_name
from utilities.sagemaker_functions import create_model, transform, \
                                          download_model, decompress_model, prediction_df
from utilities.evaluation import generate_report, confusion_matrix, \
                                 calculate_metrics, metrics_summary


import sagemaker
from sagemaker.pipeline import PipelineModel
from sagemaker.sklearn.estimator import SKLearn
from sagemaker import get_execution_role, s3_input
from sagemaker.content_types import CONTENT_TYPE_CSV
from sagemaker.predictor import RealTimePredictor, csv_serializer, csv_deserializer, \
                                json_serializer, json_deserializer


warnings.filterwarnings('ignore')

In [2]:
role = get_execution_role()
session = sagemaker.Session()

Primero vamos a instanciar diferentes variables, por ejemplo, el bucket donde se encuentran los datos alojados para el entrenamiento del XGBoost.

In [3]:
bucket = 'banking-data'
prefix = 'churn-modeling'

train_path = f's3://{bucket}/{prefix}/train/train.csv'
test_path = f's3://{bucket}/{prefix}/test/test.csv'
validation_path = f's3://{bucket}/{prefix}/validation/validation.csv'

output_path = f's3://{bucket}/{prefix}/output'

### Preprocesamiento (primer contenedor)

In [4]:
preprocessor = SKLearn(entry_point='scikit_preprocessing.py',
                       role=role,
                       train_instance_type='ml.m4.xlarge',
                       sagemaker_session=session,
                       dependencies=['config.py', 'custom_pipeline.py'],
                       py_version='py3')

preprocessor.fit({'train': s3_input(train_path, content_type='text/csv')}, job_name='scikit-preprocessor')

2020-05-07 18:30:48 Starting - Starting the training job...
2020-05-07 18:30:50 Starting - Launching requested ML instances......
2020-05-07 18:31:54 Starting - Preparing the instances for training...
2020-05-07 18:32:45 Downloading - Downloading input data......
2020-05-07 18:33:39 Training - Training image download completed. Training in progress.[34m2020-05-07 18:33:39,954 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2020-05-07 18:33:39,956 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-05-07 18:33:39,968 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2020-05-07 18:33:53,921 sagemaker-containers INFO     Module scikit_preprocessing does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m2020-05-07 18:33:53,921 sagemaker-containers INFO     Generating setup.cfg[0m
[34m2020-05-07 18:33:53,921 sagemaker-containers INFO     Generating MANIFE

In [5]:
transform(preprocessor, data=train_path, output_path=output_path, instance_type='ml.m4.xlarge')

.....................[34mProcessing /opt/ml/code[0m
[34mBuilding wheels for collected packages: scikit-preprocessing
  Building wheel for scikit-preprocessing (setup.py): started
  Building wheel for scikit-preprocessing (setup.py): finished with status 'done'
  Created wheel for scikit-preprocessing: filename=scikit_preprocessing-1.0.0-py2.py3-none-any.whl size=14801 sha256=5dab848965649d037e65d2f4da45e3048403d97ae3d12eeaea0d7cc524bf65d1
  Stored in directory: /tmp/pip-ephem-wheel-cache-p8vngot6/wheels/35/24/16/37574d11bf9bde50616c67372a334f94fa8356bc7164af8ca3[0m
[34mSuccessfully built scikit-preprocessing[0m
[34mInstalling collected packages: scikit-preprocessing[0m
[34mSuccessfully installed scikit-preprocessing-1.0.0[0m
  import imp[0m
[34m[2020-05-07 18:37:54 +0000] [37] [INFO] Starting gunicorn 19.9.0[0m
[34m[2020-05-07 18:37:54 +0000] [37] [INFO] Listening at: unix:/tmp/gunicorn.sock (37)[0m
[34m[2020-05-07 18:37:54 +0000] [37] [INFO] Using worker: gevent[0m
[

In [6]:
transform(preprocessor, data=validation_path, output_path=output_path, instance_type='ml.m4.xlarge')

Using already existing model: scikit-preprocessor


........................[34mProcessing /opt/ml/code[0m
[34mBuilding wheels for collected packages: scikit-preprocessing
  Building wheel for scikit-preprocessing (setup.py): started[0m
[34m  Building wheel for scikit-preprocessing (setup.py): finished with status 'done'
  Created wheel for scikit-preprocessing: filename=scikit_preprocessing-1.0.0-py2.py3-none-any.whl size=14798 sha256=c76e5a26ed7044cc2ecd46bc9aa576312ace808297919f5790c50365e773e415
  Stored in directory: /tmp/pip-ephem-wheel-cache-3xmnwvbd/wheels/35/24/16/37574d11bf9bde50616c67372a334f94fa8356bc7164af8ca3[0m
[34mSuccessfully built scikit-preprocessing[0m
[34mInstalling collected packages: scikit-preprocessing[0m
[34mSuccessfully installed scikit-preprocessing-1.0.0[0m
  import imp[0m
[34m[2020-05-07 18:43:08 +0000] [38] [INFO] Starting gunicorn 19.9.0[0m
[34m[2020-05-07 18:43:08 +0000] [38] [INFO] Listening at: unix:/tmp/gunicorn.sock (38)[0m
[34m[2020-05-07 18:43:08 +0000] [38] [INFO] Using worker: g

In [7]:
transform(preprocessor, data=test_path, output_path=output_path, instance_type='ml.m4.xlarge')

Using already existing model: scikit-preprocessor


.....................[34mProcessing /opt/ml/code[0m
[34mBuilding wheels for collected packages: scikit-preprocessing
  Building wheel for scikit-preprocessing (setup.py): started[0m
[34m  Building wheel for scikit-preprocessing (setup.py): finished with status 'done'
  Created wheel for scikit-preprocessing: filename=scikit_preprocessing-1.0.0-py2.py3-none-any.whl size=14802 sha256=f862b094e3c198095c016d3b66aa8e1ffe22c2d36ed7408a4fdf95a3ba539028
  Stored in directory: /tmp/pip-ephem-wheel-cache-o5he7xov/wheels/35/24/16/37574d11bf9bde50616c67372a334f94fa8356bc7164af8ca3[0m
[34mSuccessfully built scikit-preprocessing[0m
[34mInstalling collected packages: scikit-preprocessing[0m
[34mSuccessfully installed scikit-preprocessing-1.0.0[0m
  import imp[0m
[34m[2020-05-07 18:47:15 +0000] [38] [INFO] Starting gunicorn 19.9.0[0m
[34m[2020-05-07 18:47:15 +0000] [38] [INFO] Listening at: unix:/tmp/gunicorn.sock (38)[0m
[34m[2020-05-07 18:47:15 +0000] [38] [INFO] Using worker: geve

In [8]:
hyperparams = {
    'objective': 'binary:logistic',
    'early_stopping_rounds': 10,
    'num_round': 200,
    'max_depth': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'scale_pos_weight': 4
}

job_name = 'xgboost'
xgb = create_model(image='xgboost', role=role, hyperparameters=hyperparams,
                   instance_type='ml.m4.xlarge', output_path=output_path,
                   region_name=region_name, s3_train=f'{output_path}/train.csv.out',
                   s3_validation=f'{output_path}/validation.csv.out', job_name=job_name)

2020-05-07 18:48:10 Starting - Starting the training job...
2020-05-07 18:48:12 Starting - Launching requested ML instances......
2020-05-07 18:49:19 Starting - Preparing the instances for training......
2020-05-07 18:50:26 Downloading - Downloading input data...
2020-05-07 18:51:02 Training - Downloading the training image...
2020-05-07 18:51:34 Uploading - Uploading generated training model
2020-05-07 18:51:34 Completed - Training job completed
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Det

## Evaluación del modelo

In [9]:
test_label = pd.read_csv(f's3://{bucket}/{prefix}/test/test_label.csv', sep=';', names=['label'])

In [10]:
download_model(model_path=f'{output_path}/{job_name}/output/model.tar.gz',
               local_dir=f'model/{job_name}/model.tar.gz')
decompress_model(local_dir=f'model/{job_name}/model.tar.gz')

0

In [11]:
model = joblib.load('xgboost-model')
prediction = prediction_df(model, file_path=f'{output_path}/test.csv.out', score=0.6)
metrics = calculate_metrics(test_label['label'], prediction['prediction'])

In [12]:
generate_report(test_label['label'], prediction['prediction'])

Unnamed: 0,f1-score,precision,recall,support
0,0.9,0.91,0.89,956.0
1,0.64,0.61,0.68,244.0
micro avg,0.85,0.85,0.85,1200.0
macro avg,0.77,0.76,0.78,1200.0
weighted avg,0.85,0.85,0.85,1200.0


In [13]:
metrics_summary(metrics)
confusion_matrix(test_label['label'], prediction['prediction'])

The AUC is: 0.78
The accuracy is: 0.85
The precision is: 0.61
The recall is: 0.68
The F1 score is: 0.64


Predicted,0,1
Observed,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.89,0.11
1,0.32,0.68


In [14]:
scikit_preprocessor = preprocessor.create_model(env={"SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT": "text/csv"})
xgboost = xgb.create_model()

model_name = 'churn-score'
endpoint_name = 'churn-score'

model = PipelineModel(name=model_name,
                      role=role,
                      models=[
                          scikit_preprocessor,
                          xgboost
                      ])

In [16]:
model.deploy(initial_instance_count=1, instance_type='ml.t2.medium', endpoint_name=endpoint_name)

Using already existing model: churn-score


-----------------!

In [17]:
def calculate_churn_score(data: str):
    client = boto3.client('sagemaker-runtime', 'us-east-1')
    score_endpoint = 'churn-score'
    response = client.invoke_endpoint(EndpointName=score_endpoint,
                                      Body=data,
                                      ContentType='text/csv',
                                      Accept='text/csv')
    score = round(float(response['Body'].read()), 4) * 100
    return score

In [18]:
new_data = '3799;15621834;Game;700;Spain;Female;43;0;0.0;2;1;0;59475.35;0'

In [19]:
calculate_churn_score(new_data)

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (400) from container-2 with message "Unable to evaluate payload provided: Feature size of csv inference data 12 is not consistent with feature size of trained model 11.". See https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/aws/sagemaker/Endpoints/churn-score in account 512194003765 for more information.