In [63]:
import sklearn
import pandas as pd
import numpy as np

import joblib
import os

from sklearn.ensemble import RandomForestClassifier

# model selection
from sklearn.model_selection import train_test_split
# evaluation
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings("ignore")

In [8]:
data_hdy = pd.read_csv("./heart.csv")
data_hdy.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [10]:
features_name = ['cp', 'thalach', 'slope', 'restecg', 'trestbps', 'age', 'sex', 'thal', 'ca', 'oldpeak', 'exang', 'chol']
features = data_hdy[features_name]
target = data_hdy['target']
# Split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size = 0.2, random_state = 20, stratify=target)
# random state is useful for getting consistent results
print(y_test.value_counts())

1    33
0    28
Name: target, dtype: int64


In [11]:
rfc = RandomForestClassifier(random_state = 20, max_depth = 3) # default criterion is gini
rfc = rfc.fit(x_train,y_train)
y_pred = rfc.predict(x_test)
print(classification_report(y_test, y_pred))

# saving the model
joblib.dump(rfc,"./hdd_rf.joblib")

              precision    recall  f1-score   support

           0       0.85      0.79      0.81        28
           1       0.83      0.88      0.85        33

    accuracy                           0.84        61
   macro avg       0.84      0.83      0.83        61
weighted avg       0.84      0.84      0.84        61



['./hdd_rf.joblib']

In [42]:
hdd_rf.predict(x_test)

array([1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0])

In [57]:
def input_fn(request_body):
    return np.array(request_body).reshape(1, -1)

def predict_fn(input_data, model):
    prediction = model.predict(input_data)
    pred_prob = model.predict_proba(input_data)
    return np.array([prediction, pred_prob[0]])

def output_fn(prediction, content_type):
    return content_type[prediction[0]]

In [64]:
model = joblib.load("./hdd_rf.joblib")
request_body = [1.0, 172.0, 2.0, 1.0, 120.0, 52.0, 1.0, 2.0, 0.0, 0.2, 0.0, 325.0]

input_data = input_fn(request_body)
prediction_arr = predict_fn(input_data, model)

content_type = ['No','Yes']

print(prediction_arr)
print('Heart Disease: ',output_fn(prediction_arr[0], content_type))

[array([1]) array([0.10652913, 0.89347087])]
Heart Disease:  Yes


In [65]:
model.predict(np.array([1.0, 172.0, 2.0, 1.0, 120.0, 52.0, 1.0, 2.0, 0.0, 0.2, 0.0, 325.0]).reshape(1, -1))

array([1])

In [66]:
model.predict(np.array(list(x_test.iloc[2])).reshape(1, -1))

array([0])

## Sagemaker deployment using S3 bucket

In [67]:
import sagemaker
from sagemaker import get_execution_role

In [68]:
sagemaker_session = sagemaker.Session()

In [69]:
role = get_execution_role()

In [70]:
train_input = sagemaker_session.upload_data("data")

In [75]:
conda list scikit

# packages in environment at /home/ec2-user/anaconda3/envs/python3:
#
# Name                    Version                   Build  Channel
scikit-image              0.18.3           py38h43a58ef_0    conda-forge
scikit-learn              1.0.1            py38h1561384_2    conda-forge
scikit-learn-intelex      2021.3.0         py38h578d9bd_1    conda-forge

Note: you may need to restart the kernel to use updated packages.


In [80]:
from sagemaker.sklearn.estimator import SKLearn

script_path = 'heartdis.py'

sklearn = SKLearn(
    entry_point=script_path,
    instance_type="ml.m4.xlarge",
    framework_version="0.20.0",
    py_version="py3",
    role=role,
    sagemaker_session=sagemaker_session)

In [81]:
sklearn.fit({'train': train_input})

2022-10-18 09:18:46 Starting - Starting the training job...
2022-10-18 09:19:13 Starting - Preparing the instances for trainingProfilerReport-1666084726: InProgress
.........
2022-10-18 09:20:38 Downloading - Downloading input data......
2022-10-18 09:21:31 Training - Downloading the training image...
2022-10-18 09:22:11 Training - Training image download completed. Training in progress..[34m2022-10-18 09:22:12,331 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2022-10-18 09:22:12,335 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-10-18 09:22:12,350 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2022-10-18 09:22:13,068 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-10-18 09:22:13,094 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-10-18 09:22:13,113 sag

UnexpectedStatusException: Error for Training job sagemaker-scikit-learn-2022-10-18-09-18-46-458: Failed. Reason: AlgorithmError: framework error: 
Traceback (most recent call last):
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_containers/_trainer.py", line 84, in train
    entrypoint()
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_sklearn_container/training.py", line 39, in main
    train(environment.Environment())
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_sklearn_container/training.py", line 35, in train
    runner_type=runner.ProcessRunnerType)
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/entry_point.py", line 100, in run
    wait, capture_error
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/process.py", line 291, in run
    cwd=environment.code_dir,
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/process.py", line 208, in check_error
    info=extra_info,
sagemaker_training.errors.ExecuteUserScriptError: ExecuteUserScriptError:
ExitCode 1
ErrorMessage ""
Command "/miniconda3/bin/python heartdis.py"

ExecuteUserScriptE

In [82]:
deployment = sklearn.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")

ClientError: An error occurred (ValidationException) when calling the CreateModel operation: Could not find model data at s3://sagemaker-us-west-1-491178551691/sagemaker-scikit-learn-2022-10-18-09-18-46-458/output/model.tar.gz.

In [83]:
deployment.endpoint

NameError: name 'deployment' is not defined

In [None]:
deployment.predict([[1.0, 172.0, 2.0, 1.0, 120.0, 52.0, 1.0, 2.0, 0.0, 0.2, 0.0, 325.0]])

In [84]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image
from IPython.display import display
from time import gmtime, strftime
from sagemaker.predictor import csv_serializer

In [85]:
# Define IAM role
role = get_execution_role()
prefix = 'sagemaker/DEMO-xgboost-dm'
my_region = boto3.session.Session().region_name # set the region of the instance

# this line automatically looks for the XGBoost image URI and builds an XGBoost container.
xgboost_container = sagemaker.image_uris.retrieve("xgboost", my_region, "latest")

print("Success - the MySageMakerInstance is in the " + my_region + " region. You will use the " + xgboost_container + " container for your SageMaker endpoint.")

Success - the MySageMakerInstance is in the us-west-1 region. You will use the 632365934929.dkr.ecr.us-west-1.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


In [86]:
bucket_name = 'xgbsp-s3' # <--- CHANGE THIS VARIABLE TO A UNIQUE NAME FOR YOUR BUCKET
s3 = boto3.resource('s3')
try:
    if  my_region == 'us-east-1':
      s3.create_bucket(Bucket=bucket_name)
    else: 
      s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={ 'LocationConstraint': my_region })
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error: ',e)

S3 bucket created successfully


In [100]:
model_data = pd.read_csv('./heart.csv')
useful_feature_target = ['cp', 'thalach', 'slope', 'restecg', 'trestbps', 'age', 'sex', 'thal', 'ca', 'oldpeak', 'exang', 'chol', 'target']
model_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [101]:
train_data, test_data = np.split(model_data.sample(frac=1, random_state=20), [int(0.8 * len(model_data))])
print(train_data.shape, test_data.shape)

(242, 14) (61, 14)


In [102]:
pd.concat([train_data['target'], train_data.drop(['target'], axis=1)], axis=1).to_csv('train.csv', index=False, header=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

In [104]:
sess = sagemaker.Session()
xgb = sagemaker.estimator.Estimator(xgboost_container,role, instance_count=1, instance_type='ml.m4.xlarge',output_path='s3://{}/{}/output'.format(bucket_name, prefix),sagemaker_session=sess)
xgb.set_hyperparameters(max_depth=4,eta=0.2,gamma=4,min_child_weight=6,subsample=0.8,silent=0,objective='binary:logistic',num_round=100)

In [105]:
xgb.fit({'train': s3_input_train})

2022-10-18 09:49:09 Starting - Starting the training job...
2022-10-18 09:49:35 Starting - Preparing the instances for trainingProfilerReport-1666086549: InProgress
.........
2022-10-18 09:51:00 Downloading - Downloading input data...
2022-10-18 09:51:36 Training - Downloading the training image........[34mArguments: train[0m
[34m[2022-10-18:09:52:55:INFO] Running standalone xgboost training.[0m
[34m[2022-10-18:09:52:55:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2022-10-18:09:52:55:INFO] File size need to be processed in the node: 0.01mb. Available memory size in the node: 8812.0mb[0m
[34m[2022-10-18:09:52:55:INFO] Determined delimiter of CSV input is ','[0m
[34m[09:52:55] S3DistributionType set as FullyReplicated[0m
[34m[09:52:55] 242x13 matrix with 3146 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[09:52:55] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 2 pruned nodes, max_dept

In [106]:
xgb_predictor = xgb.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')

------!

In [109]:
from sagemaker.serializers import CSVSerializer

test_data_array = test_data.drop(['target'], axis=1).values #load the data into an array
xgb_predictor.serializer = CSVSerializer() # set the serializer type
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!
predictions_array = np.fromstring(predictions[1:], sep=',') # and turn the prediction into an array
print(predictions_array.shape)

(61,)


In [117]:
test_data['target'].values

array([1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0])

In [136]:
def predict_cls(predictions_array):
    y_pred=[]
    for i in predictions_array:
        if i<=0.5:
            y_pred.append(0)
        else:
            y_pred.append(1)
    return y_pred

def output_fn(prediction, content_type):
    return content_type[int(prediction[0])]

y_pred = predict_cls(predictions_array)

In [137]:
y_test = test_data['target'].values

In [138]:
print(classification_report(y_test, y_pred)) #y_pred should be np.array()

              precision    recall  f1-score   support

           0       0.89      0.76      0.82        33
           1       0.76      0.89      0.82        28

    accuracy                           0.82        61
   macro avg       0.83      0.83      0.82        61
weighted avg       0.83      0.82      0.82        61



In [139]:
x_test = test_data.drop(['target'], axis=1)

In [142]:
pred_prob = float(xgb_predictor.predict(x_test.iloc[0].values).decode('utf-8'))
print('The predicted probability is', pred_prob)
pred_val = predict_cls([pred_prob])

content_type = ['No','Yes']
print('Heart disease:',output_fn(pred_val, content_type))

The predicted probability is 0.8522401452064514
Heart disease: Yes


In [143]:
# Delete your endpoint
xgb_predictor.delete_endpoint(delete_endpoint_config=True)
# Delete your training artifacts and S3 bucket
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

[{'ResponseMetadata': {'RequestId': 'H2VKX8J5NM25VQYG',
   'HostId': 'TFKD9tiz1Sd5ns5//Ctmgyba5aLRpkG6XQJdNTjjxUDS/sUjk62isNuujKq77JOkE8SSXakX40o=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'TFKD9tiz1Sd5ns5//Ctmgyba5aLRpkG6XQJdNTjjxUDS/sUjk62isNuujKq77JOkE8SSXakX40o=',
    'x-amz-request-id': 'H2VKX8J5NM25VQYG',
    'date': 'Tue, 18 Oct 2022 11:06:45 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'sagemaker/DEMO-xgboost-dm/output/xgboost-2022-10-18-09-40-32-437/output/model.tar.gz'},
   {'Key': 'sagemaker/DEMO-xgboost-dm/output/xgboost-2022-10-18-09-49-09-303/rule-output/ProfilerReport-1666086549/profiler-output/profiler-reports/OverallSystemUsage.json'},
   {'Key': 'sagemaker/DEMO-xgboost-dm/output/xgboost-2022-10-18-09-49-09-303/rule-output/ProfilerReport-1666086549/profiler-output/profiler-reports/Dataloader.json'},
   {'Key': 'sagem