In [None]:
import boto3
import sagemaker
import pandas as pd


# SageMaker session plus the default bucket, execution role and region that the
# rest of the notebook refers to.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# Low-level boto3 clients, each pinned to the region resolved above.
sm = boto3.Session().client("sagemaker", region_name=region)
iam = boto3.Session().client("iam", region_name=region)
ec2 = boto3.Session().client("ec2", region_name=region)

In [None]:
# Restore processed_train_data_s3_uri from IPython's %store storage.
%store -r processed_train_data_s3_uri

In [None]:
# Guard: a bare reference raises NameError when the variable was never restored,
# in which case the reader is pointed back to the PREPARE notebooks.
try:
    processed_train_data_s3_uri
except NameError:
    print(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++",
        "[ERROR] Please run the notebooks in the PREPARE section before you continue.",
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++",
        sep="\n",
    )

In [None]:
# Show the restored S3 URI of the processed training split.
print(processed_train_data_s3_uri)

In [None]:
# Restore processed_validation_data_s3_uri from IPython's %store storage.
%store -r processed_validation_data_s3_uri

In [None]:
# Guard: a bare reference raises NameError when the variable was never restored,
# in which case the reader is pointed back to the PREPARE notebooks.
try:
    processed_validation_data_s3_uri
except NameError:
    print(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++",
        "[ERROR] Please run the notebooks in the PREPARE section before you continue.",
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++",
        sep="\n",
    )

In [None]:
# Show the restored S3 URI of the processed validation split.
print(processed_validation_data_s3_uri)

In [None]:
# Restore processed_test_data_s3_uri from IPython's %store storage.
%store -r processed_test_data_s3_uri

In [None]:
# Guard: a bare reference raises NameError when the variable was never restored,
# in which case the reader is pointed back to the PREPARE notebooks.
try:
    processed_test_data_s3_uri
except NameError:
    print(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++",
        "[ERROR] Please run the notebooks in the PREPARE section before you continue.",
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++",
        sep="\n",
    )

In [None]:
# Show the restored S3 URI of the processed test split.
print(processed_test_data_s3_uri)

In [None]:
# Restore max_seq_length from IPython's %store storage.
%store -r max_seq_length

In [None]:
# Show the restored value; it is later forwarded to the estimator as a hyperparameter.
print(max_seq_length)

# Specify the Dataset in S3
We are using the train, validation, and test splits created in the previous section.

In [None]:
print(processed_train_data_s3_uri)

# List what is stored under the training prefix; IPython expands $name in `!` commands.
!aws s3 ls $processed_train_data_s3_uri/

In [None]:
print(processed_validation_data_s3_uri)

# List what is stored under the validation prefix; IPython expands $name in `!` commands.
!aws s3 ls $processed_validation_data_s3_uri/

In [None]:
print(processed_test_data_s3_uri)

# List what is stored under the test prefix; IPython expands $name in `!` commands.
!aws s3 ls $processed_test_data_s3_uri/

# Specify S3 `Distribution Strategy`

In [None]:
from sagemaker.inputs import TrainingInput

# One TrainingInput per split, each using the "ShardedByS3Key" distribution.
s3_input_train_data = TrainingInput(
    s3_data=processed_train_data_s3_uri,
    distribution="ShardedByS3Key",
)
s3_input_validation_data = TrainingInput(
    s3_data=processed_validation_data_s3_uri,
    distribution="ShardedByS3Key",
)
s3_input_test_data = TrainingInput(
    s3_data=processed_test_data_s3_uri,
    distribution="ShardedByS3Key",
)

# Print the three generated channel configs, one per line.
print(
    s3_input_train_data.config,
    s3_input_validation_data.config,
    s3_input_test_data.config,
    sep="\n",
)

# Setup Hyper-Parameters for Classification Layer

In [None]:
# Re-display max_seq_length next to the hyperparameters defined below.
print(max_seq_length)

In [None]:
# Optimisation settings.
epochs = 1
learning_rate = 1e-5
epsilon = 1e-8

# Batch sizes and step counts, identical across the three splits.
train_batch_size = validation_batch_size = test_batch_size = 128
train_steps_per_epoch = validation_steps = test_steps = 100

# Training cluster settings (count, instance type, volume size).
train_instance_count = 1
train_instance_type = "ml.c5.9xlarge"
train_volume_size = 1024

# Flags forwarded to the training script as hyperparameters.
use_xla = True
use_amp = True
freeze_bert_layer = False
enable_sagemaker_debugger = True
enable_checkpointing = False
enable_tensorboard = False

# Input mode handed to the estimator ("Pipe" is the alternative noted by the author).
input_mode = "File"

# Which optional phases the training script should run.
run_validation = True
run_test = True
run_sample_predictions = True

In [None]:
# Metric name -> log regex pairs. Every regex is "<log key>: " followed by a
# capture group of digits and dots.
# NOTE(review): this list is not passed to either estimator in this notebook - confirm intent.
metrics_definitions = [
    {"Name": metric_name, "Regex": log_key + ": ([0-9\\.]+)"}
    for metric_name, log_key in (
        ("train:loss", "loss"),
        ("train:accuracy", "accuracy"),
        ("validation:loss", "val_loss"),
        ("validation:accuracy", "val_accuracy"),
    )
]

# Setup Our BERT + TensorFlow Script to Run on SageMaker
Prepare our TensorFlow model to run on the managed SageMaker service

In [None]:
# Trust policy: allows the SageMaker service principal to assume the role
# that is created with this document further down.
assume_role_policy_doc = dict(
    Version="2012-10-17",
    Statement=[
        dict(
            Effect="Allow",
            Principal=dict(Service="sagemaker.amazonaws.com"),
            Action="sts:AssumeRole",
        )
    ],
)

In [None]:
import time

# Whole seconds since the epoch; used as a suffix for the IAM role and policy names below.
timestamp = int(time.time())

In [None]:
# Role name made unique per run by the timestamp suffix.
secure_iam_role_name = "DSOAWS_Secure_Train_VPC_{}".format(timestamp)

In [None]:
import json
import time

from botocore.exceptions import ClientError

# Create the IAM role used for the training job. If a role with this name
# already exists, fetch it instead so that `secure_iam_role` is always bound
# before it is printed (both responses carry the role under the "Role" key).
try:
    secure_iam_role = iam.create_role(
        RoleName=secure_iam_role_name,
        AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),
        Description="DSOAWS Secure Role",
    )
except ClientError as e:
    if e.response["Error"]["Code"] != "EntityAlreadyExists":
        # Any other failure leaves us without a usable role: report it and stop
        # here rather than failing later with an unrelated NameError.
        print("Unexpected error: %s" % e)
        raise
    secure_iam_role = iam.get_role(RoleName=secure_iam_role_name)

print(secure_iam_role)

# Pause before the role is used - presumably to let the IAM change propagate.
time.sleep(30)

In [None]:
# Inline policy granting the role S3 access to the session's default bucket.
# Bucket-level actions (such as listing) match the bucket ARN, whereas
# object-level actions (such as reading or writing objects) only match object
# ARNs, so both resource forms are listed.
iam_policy_allow_s3 = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Sid": "",
            "Effect": "Allow",
            "Action": ["s3:*"],
            "Resource": [
                "arn:aws:s3:::{}".format(bucket),
                "arn:aws:s3:::{}/*".format(bucket),
            ],
        }
    ],
}

In [None]:
# Name of the inline S3 policy, suffixed with the same timestamp as the role.
policy_allow_s3_name = "DSOAWS_Secure_Train_Allow_S3_{}".format(timestamp)

In [None]:
import time

# Attach the S3 policy document to the role as an inline policy.
response = iam.put_role_policy(
    RoleName=secure_iam_role_name, PolicyName=policy_allow_s3_name, PolicyDocument=json.dumps(iam_policy_allow_s3)
)

print(response)

# Pause before continuing - presumably to let the IAM change propagate.
time.sleep(30)

In [None]:
# Placeholder VPC identifiers used in the Deny policy condition below.
# NOTE(review): presumably chosen so that no real training request matches them - confirm.
different_subnet_id = "blah"
different_security_group_ids = ["blah"]

In [None]:
# Inline IAM policy document (not an S3 bucket policy): denies
# sagemaker:CreateTrainingJob on all resources, subject to a StringNotEquals
# condition on the request's VPC security-group IDs and subnets.
# NOTE(review): both condition keys sit under one operator; confirm how IAM
# combines them for these multi-valued keys.
policy_deny_create_training_job = {
    "Version": "2008-10-17",
    "Statement": [
        {
            "Effect": "Deny",
            "Action": [
                "sagemaker:CreateTrainingJob",
            ],
            "Resource": ["*"],
            "Condition": {
                "StringNotEquals": {
                    "sagemaker:VpcSecurityGroupIds": different_security_group_ids,
                    "sagemaker:VpcSubnets": [different_subnet_id],
                }
            },
        }
    ],
}

In [None]:
# Name of the inline Deny policy, suffixed with the same timestamp as the role.
policy_deny_create_training_job_name = "DSOAWS_Secure_Train_Deny_CreateTrainingJob_VPC_{}".format(timestamp)

In [None]:
import time

# Attach the Deny-CreateTrainingJob document to the role as an inline policy.
response = iam.put_role_policy(
    RoleName=secure_iam_role_name,
    PolicyName=policy_deny_create_training_job_name,
    PolicyDocument=json.dumps(policy_deny_create_training_job),
)

print(response)

# Pause before continuing - presumably to let the IAM change propagate.
time.sleep(30)

In [None]:
from sagemaker.tensorflow import TensorFlow

# TensorFlow estimator that runs src/tf_bert_reviews.py under the role created
# above (referenced by name). No subnets or security groups are supplied here.
estimator = TensorFlow(
    entry_point="tf_bert_reviews.py",
    source_dir="src",
    role=secure_iam_role_name,
    instance_count=train_instance_count,
    instance_type=train_instance_type,
    volume_size=train_volume_size,
    py_version="py3",
    framework_version="2.1.0",
    # Every notebook-level setting is forwarded to the script under its own name.
    hyperparameters=dict(
        epochs=epochs,
        learning_rate=learning_rate,
        epsilon=epsilon,
        train_batch_size=train_batch_size,
        validation_batch_size=validation_batch_size,
        test_batch_size=test_batch_size,
        train_steps_per_epoch=train_steps_per_epoch,
        validation_steps=validation_steps,
        test_steps=test_steps,
        use_xla=use_xla,
        use_amp=use_amp,
        max_seq_length=max_seq_length,
        freeze_bert_layer=freeze_bert_layer,
        enable_sagemaker_debugger=enable_sagemaker_debugger,
        enable_checkpointing=enable_checkpointing,
        enable_tensorboard=enable_tensorboard,
        run_validation=run_validation,
        run_test=run_test,
        run_sample_predictions=run_sample_predictions,
    ),
    input_mode=input_mode,
    subnets=None,
    security_group_ids=None,
)

# Verify `CreateTrainingJob: AccessDenied`

In [None]:
# Submit the training job with one input channel per split; wait=False means
# the call is not asked to block until the job finishes.
estimator.fit(
    inputs=dict(
        train=s3_input_train_data,
        validation=s3_input_validation_data,
        test=s3_input_test_data,
    ),
    wait=False,
)

In [None]:
# Name of the job most recently started by this estimator; used in the console links below.
training_job_name = estimator.latest_training_job.name
print("Training Job Name:  {}".format(training_job_name))

In [None]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import HTML, display

# Link to the training job in the SageMaker console. target="_blank" is the
# reserved keyword for "open in a new tab/window"; a bare "blank" would only
# name a browsing context that later links reuse.
display(
    HTML(
        '<b>Review <a target="_blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/jobs/{}">Training Job</a> After About 5 Minutes</b>'.format(
            region, training_job_name
        )
    )
)

In [None]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import HTML, display

# Link to the job's CloudWatch log streams. target="_blank" is the reserved
# keyword for "open in a new tab/window"; a bare "blank" would only name a
# browsing context that later links reuse.
display(
    HTML(
        '<b>Review <a target="_blank" href="https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/TrainingJobs;prefix={};streamFilter=typeLogStreamPrefix">CloudWatch Logs</a> After About 5 Minutes</b>'.format(
            region, training_job_name
        )
    )
)

In [None]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import HTML, display

# Link to the job's prefix in the S3 console. target="_blank" is the reserved
# keyword for "open in a new tab/window"; a bare "blank" would only name a
# browsing context that later links reuse.
display(
    HTML(
        '<b>Review <a target="_blank" href="https://s3.console.aws.amazon.com/s3/buckets/{}/{}/?region={}&tab=overview">S3 Output Data</a> After The Training Job Has Completed</b>'.format(
            bucket, training_job_name, region
        )
    )
)

In [None]:
%%time

# Wait on the most recent training job (logs=False is passed to wait()); %%time reports how long the cell took.
estimator.latest_training_job.wait(logs=False)

# Clean Up Policies and Roles

In [None]:
# Remove the inline Deny-CreateTrainingJob policy from the role.
response = iam.delete_role_policy(RoleName=secure_iam_role_name, PolicyName=policy_deny_create_training_job_name)
print(response)

# Pause before the next IAM call - presumably to let the change propagate.
time.sleep(30)

In [None]:
# Remove the inline S3 policy from the role.
response = iam.delete_role_policy(RoleName=secure_iam_role_name, PolicyName=policy_allow_s3_name)
print(response)

# Pause before the next IAM call - presumably to let the change propagate.
time.sleep(30)

In [None]:
# Delete the role itself, after both of its inline policies were removed above.
iam.delete_role(RoleName=secure_iam_role_name)

# Pause before continuing - presumably to let the change propagate.
time.sleep(30)

In [None]:
import json

notebook_instance_name = None

# Read the notebook's resource metadata file to recover the instance name and
# the region (field 3 of the colon-separated resource ARN). Note that this
# overwrites the notebook-level `region` variable.
try:
    with open("/opt/ml/metadata/resource-metadata.json") as notebook_info:
        data = json.load(notebook_info)
        resource_arn = data["ResourceArn"]
        region = resource_arn.split(":")[3]
        notebook_instance_name = data["ResourceName"]
    print("Notebook Instance Name: {}".format(notebook_instance_name))
except Exception as e:
    # Catch ordinary failures only (missing file, malformed JSON, missing keys),
    # so KeyboardInterrupt/SystemExit still propagate, and report the cause.
    print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR]: COULD NOT RETRIEVE THE NOTEBOOK INSTANCE METADATA.")
    print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("Cause: {!r}".format(e))

In [None]:
# Look up this notebook instance's configuration by the name read from the metadata file.
response = sm.describe_notebook_instance(NotebookInstanceName=notebook_instance_name)

print(response)

In [None]:
# Print the networking and IAM fields of the describe response, one per line.
# Each lookup raises KeyError if the response lacks that field.
print("SubnetId: {}".format(response["SubnetId"]))
print("SecurityGroups: {}".format(response["SecurityGroups"]))
print("IAM Role: {}".format(response["RoleArn"]))
print("NetworkInterfaceId: {}".format(response["NetworkInterfaceId"]))
print("DirectInternetAccess: {}".format(response["DirectInternetAccess"]))

In [None]:
# Subnet of this notebook instance; reused for the Deny condition and the estimator below.
subnet_id = response["SubnetId"]
print(subnet_id)

In [None]:
# Security groups of this notebook instance; reused for the Deny condition and the estimator below.
security_group_ids = response["SecurityGroups"]
print(security_group_ids)

In [None]:
import time

# Fresh timestamp so the second role and its policies get new names.
timestamp = int(time.time())

In [None]:
# Role name made unique per run by the timestamp suffix.
secure_iam_role_name = "DSOAWS_Secure_Train_VPC_{}".format(timestamp)

In [None]:
import json
import time

from botocore.exceptions import ClientError

# Create the IAM role used for the VPC training job. If a role with this name
# already exists, fetch it instead so that `secure_iam_role` is always bound
# before it is printed (both responses carry the role under the "Role" key).
try:
    secure_iam_role = iam.create_role(
        RoleName=secure_iam_role_name,
        AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),
        Description="DSOAWS Secure Role",
    )
except ClientError as e:
    if e.response["Error"]["Code"] != "EntityAlreadyExists":
        # Any other failure leaves us without a usable role: report it and stop
        # here rather than printing a stale or undefined `secure_iam_role`.
        print("Unexpected error: %s" % e)
        raise
    secure_iam_role = iam.get_role(RoleName=secure_iam_role_name)

print(secure_iam_role)

# Pause before the role is used - presumably to let the IAM change propagate.
time.sleep(30)

In [None]:
# Inline policy granting the role S3 access to the session's default bucket.
# Bucket-level actions (such as listing) match the bucket ARN, whereas
# object-level actions (such as reading or writing objects) only match object
# ARNs, so both resource forms are listed.
iam_policy_allow_s3 = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Sid": "",
            "Effect": "Allow",
            "Action": ["s3:*"],
            "Resource": [
                "arn:aws:s3:::{}".format(bucket),
                "arn:aws:s3:::{}/*".format(bucket),
            ],
        }
    ],
}

In [None]:
# Name of the inline S3 policy, suffixed with the same timestamp as the role.
policy_allow_s3_name = "DSOAWS_Secure_Train_Allow_S3_{}".format(timestamp)

In [None]:
import time

# Attach the S3 policy document to the role as an inline policy.
response = iam.put_role_policy(
    RoleName=secure_iam_role_name, PolicyName=policy_allow_s3_name, PolicyDocument=json.dumps(iam_policy_allow_s3)
)

print(response)

# Pause before continuing - presumably to let the IAM change propagate.
time.sleep(30)

In [None]:
# Inline IAM policy document (not an S3 bucket policy): denies
# sagemaker:CreateTrainingJob on all resources, subject to a StringNotEquals
# condition comparing the request's VPC security-group IDs and subnets with
# this notebook instance's own values.
# NOTE(review): both condition keys sit under one operator; confirm how IAM
# combines them for these multi-valued keys.
policy_deny_create_training_job = {
    "Version": "2008-10-17",
    "Statement": [
        {
            "Effect": "Deny",
            "Action": [
                "sagemaker:CreateTrainingJob",
            ],
            "Resource": ["*"],
            "Condition": {
                "StringNotEquals": {
                    "sagemaker:VpcSecurityGroupIds": security_group_ids,
                    "sagemaker:VpcSubnets": [subnet_id],
                }
            },
        }
    ],
}

In [None]:
# Name of the inline Deny policy, suffixed with the same timestamp as the role.
policy_deny_create_training_job_name = "DSOAWS_Secure_Train_Deny_CreateTrainingJob_VPC_{}".format(timestamp)

In [None]:
import time

# Attach the Deny-CreateTrainingJob document to the role as an inline policy.
response = iam.put_role_policy(
    RoleName=secure_iam_role_name,
    PolicyName=policy_deny_create_training_job_name,
    PolicyDocument=json.dumps(policy_deny_create_training_job),
)

print(response)

# Pause before continuing - presumably to let the IAM change propagate.
time.sleep(30)

# Specify the VPC parameters and Verify Successful Training Job

In [None]:
from sagemaker.tensorflow import TensorFlow

# TensorFlow estimator that runs src/tf_bert_reviews.py under the role created
# above (referenced by name), this time with the notebook instance's subnet and
# security groups supplied as the VPC settings.
estimator = TensorFlow(
    entry_point="tf_bert_reviews.py",
    source_dir="src",
    role=secure_iam_role_name,
    instance_count=train_instance_count,
    instance_type=train_instance_type,
    volume_size=train_volume_size,
    py_version="py3",
    framework_version="2.1.0",
    # Every notebook-level setting is forwarded to the script under its own name.
    hyperparameters=dict(
        epochs=epochs,
        learning_rate=learning_rate,
        epsilon=epsilon,
        train_batch_size=train_batch_size,
        validation_batch_size=validation_batch_size,
        test_batch_size=test_batch_size,
        train_steps_per_epoch=train_steps_per_epoch,
        validation_steps=validation_steps,
        test_steps=test_steps,
        use_xla=use_xla,
        use_amp=use_amp,
        max_seq_length=max_seq_length,
        freeze_bert_layer=freeze_bert_layer,
        enable_sagemaker_debugger=enable_sagemaker_debugger,
        enable_checkpointing=enable_checkpointing,
        enable_tensorboard=enable_tensorboard,
        run_validation=run_validation,
        run_test=run_test,
        run_sample_predictions=run_sample_predictions,
    ),
    input_mode=input_mode,
    subnets=[subnet_id],
    security_group_ids=security_group_ids,
)

# Verify Training Starts OK

# TODO: Training works when no VPC is specified, but only because traffic then goes over the public internet, which is not desirable; routing through the VPC is preferred. With the VPC settings specified, the job currently fails with the error below.

```
UnexpectedStatusException: Error for Training job tensorflow-training-2020-12-20-23-13-52-444: Failed. Reason: ClientError: Data download failed:Please ensure that the subnet's route table has a route to an S3 VPC endpoint or a NAT device, and both the security groups and the subnet's network ACL allow outbound traffic to S3.                  
```

In [None]:
# Submit the training job with one input channel per split; wait=False means
# the call is not asked to block until the job finishes.
estimator.fit(
    inputs=dict(
        train=s3_input_train_data,
        validation=s3_input_validation_data,
        test=s3_input_test_data,
    ),
    wait=False,
)

In [None]:
# Name of the job most recently started by this estimator; used in the console links below.
training_job_name = estimator.latest_training_job.name
print("Training Job Name:  {}".format(training_job_name))

In [None]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import HTML, display

# Link to the training job in the SageMaker console. target="_blank" is the
# reserved keyword for "open in a new tab/window"; a bare "blank" would only
# name a browsing context that later links reuse.
display(
    HTML(
        '<b>Review <a target="_blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/jobs/{}">Training Job</a> After About 5 Minutes</b>'.format(
            region, training_job_name
        )
    )
)

In [None]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import HTML, display

# Link to the job's CloudWatch log streams. target="_blank" is the reserved
# keyword for "open in a new tab/window"; a bare "blank" would only name a
# browsing context that later links reuse.
display(
    HTML(
        '<b>Review <a target="_blank" href="https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/TrainingJobs;prefix={};streamFilter=typeLogStreamPrefix">CloudWatch Logs</a> After About 5 Minutes</b>'.format(
            region, training_job_name
        )
    )
)

In [None]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import HTML, display

# Link to the job's prefix in the S3 console. target="_blank" is the reserved
# keyword for "open in a new tab/window"; a bare "blank" would only name a
# browsing context that later links reuse.
display(
    HTML(
        '<b>Review <a target="_blank" href="https://s3.console.aws.amazon.com/s3/buckets/{}/{}/?region={}&tab=overview">S3 Output Data</a> After The Training Job Has Completed</b>'.format(
            bucket, training_job_name, region
        )
    )
)

In [None]:
%%time

# Wait on the most recent training job (logs=False is passed to wait()); %%time reports how long the cell took.
estimator.latest_training_job.wait(logs=False)

# Wait Until the ^^ Training Job ^^ Completes Above!

# [INFO] _Feel free to continue to the next workshop section while this notebook is running._

In [None]:
# Copy the job's model archive from the bucket to the working directory; IPython expands $name in `!` commands.
!aws s3 cp s3://$bucket/$training_job_name/output/model.tar.gz ./model.tar.gz

In [None]:
# Unpack the downloaded archive into ./model/ (mkdir -p tolerates an existing directory).
!mkdir -p ./model/
!tar -xvzf ./model.tar.gz -C ./model/

In [None]:
# Inspect the extracted SavedModel with TensorFlow's saved_model_cli.
!saved_model_cli show --all --dir ./model/tensorflow/saved_model/0/

# Release Resources

In [None]:
%%html

<!-- Renders a notice plus a hidden button bound to the "kernelmenu:shutdown" command, then clicks it from script. -->
<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    // Declared with const so the lookup stays block-scoped instead of leaking an implicit global.
    const els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp: deliberately ignored (e.g. when no matching button is found).
}    
</script>

In [None]:
%%javascript

// Save a checkpoint and then delete the notebook session through the global
// `Jupyter` object. NOTE(review): presumably this targets front-ends that
// expose that object - confirm.
try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp: any failure here is deliberately ignored.
}