In [1]:
# install xgboost
!conda install -y -c conda-forge xgboost

Solving environment: done


  current version: 4.5.12
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/ec2-user/anaconda3/envs/python3

  added / updated specs: 
    - xgboost


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    xgboost-0.90               |   py36he1b5a44_4          11 KB  conda-forge
    mkl_random-1.0.2           |           py36_0         1.3 MB  conda-forge
    openssl-1.0.2t             |       h14c3975_0         3.1 MB  conda-forge
    tbb-2019.9                 |       hc9558a2_0         1.4 MB  conda-forge
    py-xgboost-0.90            |           py36_4          73 KB  conda-forge
    _py-xgboost-mutex-2.0      |            cpu_0           8 KB  conda-forge
    tbb4py-2019.9              |   py36hc9558a2_0         245 KB  conda-forge
    libxgboost-0.90      

In [2]:
# import the packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import boto3
import re
import sagemaker
from sagemaker import get_execution_role
import xgboost as xgb
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix

## Iris Classification dataset
Input Features:
sepal_length, sepal_width, petal_length, petal_width

Target Feature:
Iris plant class


In [3]:
# Column order used when exporting the CSV files: the encoded target first,
# followed by the four input features.
columns = ['encoded_class', 'sepal_length','sepal_width','petal_length','petal_width']

In [None]:
# Fit a label encoder that maps the three iris class names to integers
le = preprocessing.LabelEncoder()
le.fit([
    'Iris-setosa',
    'Iris-versicolor',
    'Iris-virginica',
])

In [None]:
# Display the class names held by the fitted encoder
le.classes_

In [None]:
# Load the iris data from the local CSV file
df = pd.read_csv('iris_all.csv')

In [None]:
# Count how many rows there are for each value of the 'class' column
df['class'].value_counts()

In [None]:
# Add the integer-encoded class label as a new column using the fitted encoder
df['encoded_class'] = le.transform(df['class'])

## Training and validation set
### The target variable is the first column, followed by the input features:
encoded_class, sepal_length, sepal_width, petal_length, petal_width

### Training and validation files do not have column headers

In [None]:
# Shuffle the rows with a fixed seed so the split below is reproducible.
# Training will use 70% of the data, validation the remaining 30%.
np.random.seed(5)
shuffled_index = list(df.index)
np.random.shuffle(shuffled_index)
df = df.loc[shuffled_index]

In [None]:
# Row counts for the 70/30 training/validation split
rows = len(df)
train = int(rows * 0.7)
test = rows - train

In [None]:
# Write the training dataset: the first `train` shuffled rows, no index, no header
df.iloc[:train].to_csv('iris_train.csv', columns=columns, header=False, index=False)

In [None]:
# Write the validation dataset: the remaining rows, no index, no header
df.iloc[train:].to_csv('iris_validation.csv', columns=columns, header=False, index=False)

In [None]:
# Save the column names as a single comma-separated line
with open('iris_columns_list.txt', 'w') as columns_file:
    columns_file.write(','.join(columns))

## XGBoost 
### Multiclass classification using xgboost in the local mode
This tests the parameters locally before training the model in SageMaker.

In [None]:
# Names of the files generated by the data-preparation cells above
column_list_file = 'iris_columns_list.txt'
train_file = 'iris_train.csv'
validation_file = 'iris_validation.csv'

In [None]:
# Read the comma-separated column names back from the columns list file
columns = ''
with open(column_list_file, 'r') as columns_file:
    columns = columns_file.read().split(',')

In [None]:
# Display the column names that were read from the file
columns

In [None]:
# Load the training and validation data frames.
# The files were written without a header row, so the column names are supplied explicitly.
df_train = pd.read_csv(train_file, header=None, names=columns)
df_validation = pd.read_csv(validation_file, header=None, names=columns)

In [None]:
# Split each data frame into features (all columns after the first) and the
# target (first column). `.values` yields the target as a 1-D NumPy array;
# it replaces Series.ravel(), which is deprecated in current pandas.
X_train = df_train.iloc[:, 1:]
y_train = df_train.iloc[:, 0].values

X_validation = df_validation.iloc[:, 1:]
y_validation = df_validation.iloc[:, 0].values

In [None]:
# Multiclass XGBoost classifier with up to 100 boosting rounds.
# The previous `num_classes=3` keyword is not an XGBoost parameter (the booster
# parameter is `num_class`); the scikit-learn wrapper derives the number of
# classes from the labels passed to fit(), so it is not set here.
classifier = xgb.XGBClassifier(objective="multi:softmax",
                               n_estimators=100)

In [None]:
# Display the classifier and its configured parameters
classifier

In [None]:
# Train the classifier, evaluating multiclass log loss on both the training
# data (first eval_set entry) and the validation data (second entry).
# early_stopping_rounds=10 allows training to stop before all rounds are used
# once the evaluation metric stops improving.
classifier.fit(X_train, y_train,
               eval_set=[(X_train, y_train), (X_validation, y_validation)],
               eval_metric=['mlogloss'],
               early_stopping_rounds=10)

In [None]:
# Metric history recorded during fit, keyed by eval set ('validation_0', 'validation_1')
eval_result = classifier.evals_result()

In [None]:
# One x-axis position per recorded round of the mlogloss history
training_rounds = range(len(eval_result['validation_0']['mlogloss']))

In [None]:
# Show how many rounds were recorded
print(training_rounds)

In [None]:
# Plot the per-round log loss for the training and validation eval sets
for eval_key, series_label in (('validation_0', 'Training Error'),
                               ('validation_1', 'Validation Error')):
    plt.scatter(x=training_rounds,
                y=eval_result[eval_key]['mlogloss'],
                label=series_label)
plt.title('Training vs Validation Error')
plt.xlabel('Iterations')
plt.ylabel('LogLoss')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Plot the feature importance scores of the trained classifier
xgb.plot_importance(classifier)
plt.show()

In [None]:
# Reload the validation file for evaluation.
# Note: this rebinds `df`, which previously held the data read from iris_all.csv.
df = pd.read_csv(validation_file, names=columns)

In [None]:
# Feature columns only: everything after the first (label) column
X_test = df.iloc[:,1:]

In [None]:
# Predict the class for each validation row with the locally trained classifier
result = classifier.predict(X_test)

In [None]:
# Store the predictions next to the actual encoded class
df['predicted_class'] = result

In [None]:
# Visual comparison of the actual and predicted class for every validation sample
sample_positions = df.index
plt.scatter(sample_positions, df['encoded_class'], label='Actual')
plt.scatter(sample_positions, df['predicted_class'], label='Predicted', marker='^')
plt.yticks([0, 1, 2])
plt.xlabel('Sample')
plt.ylabel('Class')
plt.legend(loc='upper right')
plt.show()

In [None]:
# Precision / recall / F1 per class for the local model
report = classification_report(
    df['encoded_class'],
    df['predicted_class'],
    labels=[0, 1, 2],
    target_names=['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'],
)
print(report)

# Sagemaker deployment of the XGBoost model

### Upload data to S3

In [9]:
# S3 bucket and object keys for the uploaded data sets
bucket_name = 'napa-ml-sagemaker'
training_file_key = 'iris/iris_train.csv'
validation_file_key = 'iris/iris_validation.csv'

# Full s3:// URIs for the model output prefix and the two data files
s3_model_output_location = r's3://{0}/iris/model'.format(bucket_name)
s3_training_file_location, s3_validation_file_location = (
    r's3://{0}/{1}'.format(bucket_name, object_key)
    for object_key in (training_file_key, validation_file_key)
)

In [None]:
# Show the resolved S3 locations, one per line
for s3_location in (s3_model_output_location,
                    s3_training_file_location,
                    s3_validation_file_location):
    print(s3_location)

In [None]:
# Upload a local file to S3.
# Files are referred to as objects in S3;
# the file name is referred to as the key name in S3.

def write_to_s3(filename, bucket, key):
    """Upload the local file `filename` to the S3 bucket `bucket` as object `key`.

    The file is opened in binary mode and streamed with `upload_fileobj`;
    the return value of that call is passed back to the caller.
    """
    with open(filename, 'rb') as source:
        s3 = boto3.Session().resource('s3')
        target = s3.Bucket(bucket).Object(key)
        return target.upload_fileobj(source)

In [None]:
# Upload the training and validation CSV files to their S3 keys
for local_file, object_key in (('iris_train.csv', training_file_key),
                               ('iris_validation.csv', validation_file_key)):
    write_to_s3(local_file, bucket_name, object_key)

## Training and validation run inside a Docker container image

In [4]:
# Establish a SageMaker session with AWS; it is reused below for the region
# lookup and by the estimator.
sess = sagemaker.Session()

In [5]:
# Get the execution role that has permission to train and deploy models in SageMaker
role = get_execution_role()

In [6]:
# Look up the XGBoost container image URI for the session's region.
# NOTE(review): the recorded output under this cell mentions
# get_image_uri(region, 'xgboost', '0.90-1'), which looks like a hint to pin a
# specific version instead of "latest" - confirm before changing.
containers = sagemaker.amazon.amazon_estimator.get_image_uri(
              sess.boto_region_name,
              "xgboost",
              "latest")

	get_image_uri(region, 'xgboost', '0.90-1').


In [7]:
# Show which container image and region will be used
print("Using Sagemaker container:\n{} ({})".format(containers, sess.boto_region_name))

Using Sagemaker container:
811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest (us-east-1)


## Build the model

In [10]:
# Configure the training job: the XGBoost container image, the execution role,
# a single ml.m4.xlarge training instance, and the S3 location where the
# output is written. Job names are derived from base_job_name.
estimator = sagemaker.estimator.Estimator(containers, role,
                                          train_instance_count=1,
                                          train_instance_type='ml.m4.xlarge',
                                          output_path=s3_model_output_location,
                                          sagemaker_session=sess,
                                          base_job_name='xgboost-iris-v1')

In [11]:
# XGBoost hyperparameters for the training job: trees of depth up to 5,
# a 3-class softmax objective, at most 50 boosting rounds, and early stopping
# after 10 rounds without improvement.
estimator.set_hyperparameters(max_depth=5,
                              objective="multi:softmax",
                              num_class=3,
                              num_round=50,
                              early_stopping_rounds=10)

In [12]:
# Display the hyperparameters currently set on the estimator
estimator.hyperparameters()

{'max_depth': 5,
 'objective': 'multi:softmax',
 'num_class': 3,
 'num_round': 50,
 'early_stopping_rounds': 10}

## Specify the data channels that point to the training and validation data

In [16]:
# Describe the training and validation inputs: both are CSV objects in S3,
# addressed by their S3 prefix.
training_input_config = sagemaker.session.s3_input(s3_data=s3_training_file_location,
                                                   content_type='csv',
                                                   s3_data_type ='S3Prefix'
                                                   )
validation_input_config = sagemaker.session.s3_input(s3_data=s3_validation_file_location,
                                                     content_type = 'csv',
                                                     s3_data_type ='S3Prefix')

# Channel name -> input configuration, as passed to estimator.fit()
data_channels = {'train':training_input_config, 'validation':validation_input_config}

## Train the model

In [17]:
# Launch the SageMaker training job with the train/validation channels
estimator.fit(data_channels)

2019-11-11 16:01:27 Starting - Starting the training job...
2019-11-11 16:01:29 Starting - Launching requested ML instances......
2019-11-11 16:02:32 Starting - Preparing the instances for training...
2019-11-11 16:03:27 Downloading - Downloading input data......
2019-11-11 16:04:25 Training - Training image download completed. Training in progress.
2019-11-11 16:04:25 Uploading - Uploading generated training model[31mArguments: train[0m
[31m[2019-11-11:16:04:21:INFO] Running standalone xgboost training.[0m
[31m[2019-11-11:16:04:21:INFO] File size need to be processed in the node: 0.0mb. Available memory size in the node: 8582.96mb[0m
[31m[2019-11-11:16:04:21:INFO] Determined delimiter of CSV input is ','[0m
[31m[16:04:21] S3DistributionType set as FullyReplicated[0m
[31m[16:04:21] 105x4 matrix with 420 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[31m[2019-11-11:16:04:21:INFO] Determined delimiter of CSV input is ','[0m
[31m[16:0

## Deploy the model

In [18]:
# Deploy the trained model to an endpoint named 'xgboost-iris-v1'
# backed by a single ml.m4.xlarge instance.
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type='ml.m4.xlarge',
                             endpoint_name = 'xgboost-iris-v1')

-----------------------------------------------------------------------------------------------------------------!

# Run the prediction

In [19]:
# Acquire the real-time endpoint by name.
# This rebinds `predictor`, so this section can also be run against an
# endpoint that already exists, without calling deploy() in the same session.
endpoint_name = 'xgboost-iris-v1'
predictor = sagemaker.predictor.RealTimePredictor(endpoint=endpoint_name)

In [20]:
from sagemaker.predictor import csv_serializer, json_deserializer

# Send requests as CSV text; with no deserializer the response is left as
# returned by the endpoint and is decoded manually in the prediction loop.
predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
predictor.deserializer = None

In [21]:
# Load the validation dataset to test predictions against the endpoint.
# The file has no header row, so the column names are supplied explicitly.
df_all = pd.read_csv('iris_validation.csv',
                     names=columns)

In [22]:
# Confirm the column names of the loaded validation data
df_all.columns

Index(['encoded_class', 'sepal_length', 'sepal_width', 'petal_length',
       'petal_width'],
      dtype='object')

In [23]:
# The endpoint needs a plain array of the four feature columns (no label column).
# DataFrame.as_matrix() is deprecated and removed in pandas 1.0, so the columns
# are selected explicitly and converted with `.values`.
arr_test = df_all[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']].values

  from ipykernel import kernelapp as app


In [73]:
# Send the feature rows to the endpoint in 10 batches and collect the
# predicted class of every row as an integer.
predictions = []
for arr in np.array_split(arr_test, 10):
    # The response is decoded as UTF-8 text holding comma-separated values
    result = predictor.predict(arr).decode("utf-8")
    predictions.extend(int(float(r)) for r in result.split(','))

In [75]:
# Re-create the label encoder with the same three class names used during data
# preparation, so that integer labels can be mapped back to class names here.
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(['Iris-setosa', 'Iris-versicolor','Iris-virginica'])

LabelEncoder()

In [77]:
# Map the actual encoded labels back to class names
df_all['class'] = le.inverse_transform(df_all.encoded_class)

In [78]:
# Map the endpoint's integer predictions back to class names
df_all['predicted_class'] = le.inverse_transform(predictions)

In [79]:
# Preview the first rows with actual and predicted class side by side
df_all.head()

Unnamed: 0,encoded_class,sepal_length,sepal_width,petal_length,petal_width,class,predicted_class
0,1,5.8,2.7,4.1,1.0,Iris-versicolor,Iris-versicolor
1,0,4.8,3.4,1.6,0.2,Iris-setosa,Iris-setosa
2,1,6.0,2.2,4.0,1.0,Iris-versicolor,Iris-versicolor
3,2,6.4,3.1,5.5,1.8,Iris-virginica,Iris-virginica
4,2,6.7,2.5,5.8,1.8,Iris-virginica,Iris-virginica


In [80]:
# Cross-tabulate actual class (rows) against predicted class (columns).
# The printed title previously misspelled "Actual" as "Actula".
print('Confusion matrix - Actual versus Predicted')
pd.crosstab(df_all['class'], df_all['predicted_class'])

Confusion matrix - Actula versus Predicted


predicted_class,Iris-setosa,Iris-versicolor,Iris-virginica
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Iris-setosa,16,0,0
Iris-versicolor,0,10,1
Iris-virginica,0,1,17


In [82]:
# Precision / recall / F1 per class for the endpoint's predictions,
# computed on the decoded class names.
import sklearn.metrics as metrics
print(metrics.classification_report(df_all['class'], df_all['predicted_class']))

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        16
Iris-versicolor       0.91      0.91      0.91        11
 Iris-virginica       0.94      0.94      0.94        18

      micro avg       0.96      0.96      0.96        45
      macro avg       0.95      0.95      0.95        45
   weighted avg       0.96      0.96      0.96        45

