In [1]:
import pandas as pd
import numpy as np

from sklearn import datasets
from sklearn.model_selection import train_test_split

In [2]:
# Load scikit-learn's bundled Iris dataset and show the class names
# followed by the feature (column) names.
floral = datasets.load_iris()

for name_list in (floral.target_names, floral.feature_names):
    print(name_list)

['setosa' 'versicolor' 'virginica']
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [3]:
# Build one DataFrame holding the four measurement columns (taken from the
# columns of floral.data, in order) followed by the integer class label.
measurement_names = ['sepal length', 'sepal width', 'petal length', 'petal width']
data = pd.DataFrame(
    {name: floral.data[:, position] for position, name in enumerate(measurement_names)}
).assign(species=floral.target)
data.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [4]:
# Features: the four measurement columns, selected explicitly by name.
feature_columns = ['sepal length', 'sepal width', 'petal length', 'petal width']
input_data = data.loc[:, feature_columns]

# Labels: the integer-encoded species column.
labels = data.loc[:, 'species']

In [5]:
# Split dataset into training set (70%) and test set (30%).
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split (and therefore the same model and accuracy); stratify keeps the three
# classes in the same proportion in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    input_data,
    labels,
    test_size=0.3,
    random_state=42,
    stratify=labels,
)

# Put the label in the first column, followed by the features. The file is
# later written without a header, and the training log reports
# label_column=0 for the CSV input.
train = pd.concat([y_train.astype('int').rename('species'), X_train], axis=1)

test = pd.concat([y_test.astype('int').rename('species'), X_test], axis=1)

In [6]:
# Persist the training frame as a bare CSV: no header row and no index
# column, so the file contains only label + feature values.
train.to_csv(path_or_buf="train.csv", header=False, index=False)

In [7]:
import os  # kept: other cells outside this view may rely on it

import boto3
import sagemaker

# Default bucket of the current SageMaker session; all artifacts of this
# notebook live under `prefix` inside it.
bucket = sagemaker.Session().default_bucket()
prefix = 'FloralDataset'

# S3 object keys always use '/' as the separator, regardless of the local
# operating system, so the key is built explicitly instead of with
# os.path.join (which would insert '\\' on Windows).
train_key = f"{prefix}/data/train.csv"

# Upload the local train.csv written by the previous cell.
boto3.Session().resource('s3').Bucket(bucket).Object(train_key).upload_file('train.csv')

In [8]:
# Shell out to the AWS CLI and recursively list the objects under
# {bucket}/{prefix}/data to check that the upload is visible in S3.
!aws s3 ls {bucket}/{prefix}/data --recursive

2022-05-27 15:46:24       1890 FloralDataset/data/train.csv


In [9]:
import sagemaker

# Region name reported by a fresh SageMaker session; used later when
# looking up the training container image.
region = sagemaker.Session().boto_region_name
print(f"Aws Region name : {region}")

# Execution role the training job and endpoint will run under.
# NOTE(review): printing the ARN puts the AWS account id into the saved
# notebook output — consider clearing this output before sharing.
role = sagemaker.get_execution_role()
print(f"Role ARN (AWS Resource Name) : {role}")

Aws Region name : us-east-2
Role ARN (AWS Resource Name) : arn:aws:iam::172197024265:role/service-role/AmazonSageMaker-ExecutionRole-20220512T214615


In [10]:
from sagemaker.session import TrainingInput

# Where the trained model artifact will be written in S3.
s3_output_location = 's3://{}/{}/{}'.format(bucket, prefix, 'xgboostModel')

# Look up the built-in XGBoost image URI for this region.
# NOTE(review): "latest" is an unpinned tag, so the resolved image (and its
# output format) is not fixed by this notebook — consider pinning an explicit
# version after confirming the prediction parsing still matches.
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="latest",
)

In [11]:
# Settings for the training job: which image to run, under which role, on
# what hardware, and where to store the resulting model artifact.
estimator_settings = dict(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=s3_output_location,
    sagemaker_session=sagemaker.Session(),
)

xgboostModel = sagemaker.estimator.Estimator(**estimator_settings)


In [12]:
# Three-class problem with hard class-id output (multi:softmax), trained
# for 100 boosting rounds.
xgb_hyperparameters = {
    "objective": "multi:softmax",
    "num_class": 3,
    "num_round": 100,
}
xgboostModel.set_hyperparameters(**xgb_hyperparameters)

In [13]:
from sagemaker.session import TrainingInput

# S3 location of the headerless CSV uploaded earlier.
train_s3_uri = "s3://{}/{}/{}".format(bucket, prefix, 'data/train.csv')
training_input = TrainingInput(s3_data=train_s3_uri, content_type='csv')

# Launch the training job on the 'train' channel and block until it finishes.
xgboostModel.fit(inputs={'train': training_input}, wait=True)

2022-05-27 15:46:26 Starting - Starting the training job...
2022-05-27 15:46:53 Starting - Preparing the instances for trainingProfilerReport-1653666385: InProgress
.........
2022-05-27 15:48:19 Downloading - Downloading input data......
2022-05-27 15:49:13 Training - Downloading the training image...
2022-05-27 15:49:53 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2022-05-27:15:49:54:INFO] Running standalone xgboost training.[0m
[34m[2022-05-27:15:49:54:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2022-05-27:15:49:54:INFO] File size need to be processed in the node: 0.0mb. Available memory size in the node: 8472.46mb[0m
[34m[2022-05-27:15:49:54:INFO] Determined delimiter of CSV input is ','[0m
[34m[15:49:54] S3DistributionType set as FullyReplicated[0m
[34m[15:49:54] 105x4 matrix with 420 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[15:49:54] src/tree/up

In [14]:
import sagemaker
from sagemaker.serializers import CSVSerializer

# Host the trained model behind a single-instance real-time endpoint.
# CSVSerializer makes predict() send its input as CSV text.
deploy_settings = {
    "initial_instance_count": 1,
    "instance_type": 'ml.t2.medium',
    "serializer": CSVSerializer(),
}
xgb_predictor = xgboostModel.deploy(**deploy_settings)

-------!

In [15]:
# Strip the label column so only the feature values are sent to the endpoint.
test_features = test.drop(columns=['species'])
test_data_array = test_features.to_numpy()

# The endpoint replies with bytes; decode them into text for parsing.
raw_response = xgb_predictor.predict(test_data_array)
predictions = raw_response.decode('utf-8')

In [16]:
# Parse the endpoint's text response into a float array of class ids.
# The whole payload is parsed: slicing off the first character (as before)
# would turn a leading "1.0"/"2.0" into ".0" and silently mis-record the
# first prediction as class 0.
# NOTE(review): the raw response is not shown in this notebook; surrounding
# brackets and newline separators are tolerated in case the container wraps
# or line-separates its output — confirm against an actual response.
cleaned_predictions = predictions.strip().strip('[]').replace('\n', ',')
y_pred = np.fromstring(cleaned_predictions, sep=',')

In [17]:
from sklearn.metrics import accuracy_score

# Fraction of held-out samples whose predicted class id matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9333333333333333

In [18]:
# Show the name of the deployed endpoint (the identifier needed to invoke
# or remove it later).
# NOTE(review): no visible cell deletes this endpoint; presumably it keeps
# running until removed (e.g. xgb_predictor.delete_endpoint()) — confirm
# whether cleanup happens elsewhere.
xgb_predictor.endpoint_name

'xgboost-2022-05-27-15-51-08-705'