1. Set `prefix` to the S3 key prefix under which training input and output should be stored.

2. Set `key` to the full S3 object key (the path inside the bucket) of the training data file.

3. You can change the training instance type if you need faster training.

4. You can view and test the endpoint once it has been deployed.

5. Batch transform produces predictions for the input data; specify the S3 prefix of the data to run inference on.

In [9]:
import boto3
import botocore
import os
import sagemaker


# Session-wide configuration shared by the cells below.
bucket = sagemaker.Session().default_bucket()
# Set `prefix` to the S3 key prefix under which training input/output is stored.
prefix = ""
execution_role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# Check that the bucket exists and is accessible before starting any work.
try:
    boto3.Session().client("s3").head_bucket(Bucket=bucket)
except botocore.exceptions.ParamValidationError:
    print(
        "Hey! You either forgot to specify your S3 bucket or you gave your bucket an invalid name!"
    )
except botocore.exceptions.ClientError as e:
    error_code = e.response["Error"]["Code"]
    if error_code == "403":
        print(f"Hey! You don't have permission to access the bucket, {bucket}.")
    elif error_code == "404":
        print(f"Hey! Your bucket, {bucket}, doesn't exist!")
    else:
        # Any other client error is unexpected: surface it instead of
        # silently continuing with an unusable bucket.
        raise
else:
    print(f"Training input/output will be stored in: s3://{bucket}/{prefix}")

In [13]:
# Local file holding the training data; change this if your file is not train.csv.
train_data_file = "train.csv"
# `key` must be the full S3 object key (path inside the bucket) of the training data file.
key = "sagemaker/ipinsight-uae/train/train.csv"
s3_train_data = f"s3://{bucket}/{key}"
print(f"S3 Train Data is {s3_train_data}")
# Upload the file's contents. Passing the file name itself as Body would
# store the literal text "train.csv" in S3 instead of the training data.
with open(train_data_file, "rb") as train_file:
    boto3.resource("s3").Bucket(bucket).Object(key).put(Body=train_file)

# Configure SageMaker IP Insights Input Channels
input_data = {
    "train": sagemaker.session.s3_input(
        s3_train_data, distribution="FullyReplicated", content_type="text/csv"
    )
}


In [11]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# Look up the IP Insights algorithm container image for the session's region.
image = get_image_uri(
    boto3.Session().region_name,
    "ipinsights",
)

In [8]:
# Where the training job writes its output, under the configured prefix.
training_output_path = f"s3://{bucket}/{prefix}/output"

# Algorithm-specific hyperparameters, kept together so they are easy to tune.
ip_insights_hyperparameters = {
    "num_entity_vectors": "20000",
    "random_negative_sampling_rate": "5",
    "vector_dim": "128",
    "mini_batch_size": "1000",
    "epochs": "5",
    "learning_rate": "0.01",
}

# Estimator describing the training job: one ml.m5.2xlarge instance.
ip_insights = sagemaker.estimator.Estimator(
    image,
    execution_role,
    instance_count=1,
    instance_type="ml.m5.2xlarge",
    output_path=training_output_path,
    sagemaker_session=sagemaker.Session(),
)
ip_insights.set_hyperparameters(**ip_insights_hyperparameters)

# Launch the training job on the configured input channels.
# This can take a long time per epoch.
ip_insights.fit(input_data)

In [None]:
# Show the name of the most recent training job started by the estimator.
latest_job_name = ip_insights.latest_training_job.job_name
print(f"Training job name: {latest_job_name}")


In [None]:
# Deploy the trained model to a real-time endpoint on one ml.m5.xlarge instance.
endpoint_settings = {
    "initial_instance_count": 1,
    "instance_type": "ml.m5.xlarge",
}
predictor = ip_insights.deploy(**endpoint_settings)


In [None]:
# Show the name of the endpoint backing the predictor.
deployed_endpoint = predictor.endpoint
print(f"Endpoint name: {deployed_endpoint}")


In [None]:
from sagemaker.predictor import csv_serializer, json_deserializer

# Send requests as CSV and parse responses as JSON.
predictor.serializer, predictor.deserializer = csv_serializer, json_deserializer

In [None]:
import csv
from itertools import islice

# `train_df` is never defined in this notebook, so take the sample rows
# straight from the local training file instead: the first two columns of
# its first five rows.
# NOTE(review): assumes the CSV has no header row -- confirm against your data.
with open(train_data_file, newline="") as sample_file:
    inference_data = [
        (row[0], row[1]) for row in islice(csv.reader(sample_file), 5)
    ]

# Score the sample against the deployed endpoint.
predictor.predict(
    inference_data,
    initial_args={"ContentType": "text/csv", "Accept": "application/json"},
)

In [None]:
# Local file with the data to run inference on, e.g. valid.csv.
validation_data_file = "valid.csv"
# `key` must be the full S3 object key (path inside the bucket) of the validation CSV.
key = "sagemaker/ipinsight-uae/validation/valid.csv"
# Upload the contents of the validation file. The name previously passed as
# Body (`valid_data`) is not defined anywhere in this notebook.
with open(validation_data_file, "rb") as validation_file:
    boto3.resource("s3").Bucket(bucket).Object(key).put(Body=validation_file)
s3_valid_data = f"s3://{bucket}/{key}"

# Configure SageMaker IP Insights Input Channels
input_data = {"train": s3_train_data, "validation": s3_valid_data}

In [None]:
# Batch transform: create a transformer from the trained model on one
# ml.m5.2xlarge instance.
transformer = ip_insights.transformer(
    instance_count=1,
    instance_type="ml.m5.2xlarge",
)

# Run it over the validation data in S3, treating each CSV line as one record.
transform_options = {"content_type": "text/csv", "split_type": "Line"}
transformer.transform(s3_valid_data, **transform_options)

In [None]:
# Estimator.transformer() requires an instance count and instance type;
# use the same values as the batch transform cell above.
test = ip_insights.transformer(instance_count=1, instance_type="ml.m5.2xlarge")

In [None]:
# Block until the batch transform job started on `transformer` has finished.
transformer.wait()

In [None]:
# Report the transformer's output path.
batch_output_location = transformer.output_path
print(f"Batch Transform output is at: {batch_output_location}")
