# Training with SageMaker

In [1]:
import os
import sagemaker
import boto3 
import botocore
from pprint import pprint

# Upper bound on pooled HTTP connections for the S3 client.
S3_MAX_POOL_CONNECTIONS = 16

# SageMaker session and the execution role later handed to the estimator.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

# S3 client used below for uploading and listing objects.
client_config = botocore.config.Config(
    max_pool_connections=S3_MAX_POOL_CONNECTIONS,
)
s3_client = boto3.client('s3', config=client_config)

In [2]:
# Local folders holding the dataset splits and the embedding files.
data_src_folder = '../data/'
embeddings_src_folder = '../embeddings/'

# S3 destination: bucket name plus the key prefixes for each kind of file.
bucket = 'veriff-sagemaker-data'
bucket_folder = 'datasets/ner_field_extraction/conll-2003-data'
bucket_data_folder = f'{bucket_folder}/data'
bucket_embeddings_folder = f'{bucket_folder}/embeddings'

## Download data

In [3]:
# TODO: find out where train.txt, val.txt and test.txt can be downloaded from, then place them in the ../data/ folder

## Download embeddings to notebook instance

In [4]:
# %cd ../embeddings
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip glove.6B.zip
# %cd -   # return to the previous (notebook) directory; `%cd ..` would end up in the parent of ../embeddings

## Upload data and embeddings to S3 bucket

In [5]:
# Local folders the files are read from.
local_data_folder = '../data/'
local_embeddings_folder = '../embeddings/'

# Files expected in each local folder.
data_file_names = ['train.txt', 'val.txt', 'test.txt']
embeddings_file_names = ['glove.6B.50d.txt', 'glove.6B.100d.txt', 'glove.6B.200d.txt', 'glove.6B.300d.txt']

# S3 destination (same values as the configuration cell near the top).
bucket = 'veriff-sagemaker-data'
bucket_folder = 'datasets/ner_field_extraction/conll-2003-data'
bucket_data_folder = bucket_folder + '/' + 'data'
bucket_embeddings_folder = bucket_folder + '/' + 'embeddings'

# Per file group: where it lives locally, which files it has, and its S3 key prefix.
# The upload loop below adds a 'bucket_file_paths' entry to each group.
files = {
    "data": {
        "local_folder": local_data_folder,
        "file_names": data_file_names,
        "bucket_folder": bucket_data_folder
    },
    "embeddings": {
        "local_folder": local_embeddings_folder,
        "file_names": embeddings_file_names,
        "bucket_folder": bucket_embeddings_folder,
    }
}

# Verify every local file exists BEFORE uploading anything, so a missing file
# cannot leave the bucket half-populated. An explicit exception is used rather
# than `assert`, which is removed under `python -O` and carries no message.
missing_files = [
    os.path.join(info['local_folder'], file_name)
    for info in files.values()
    for file_name in info['file_names']
    if not os.path.isfile(os.path.join(info['local_folder'], file_name))
]
if missing_files:
    raise FileNotFoundError(
        "Missing local file(s), nothing was uploaded: " + ", ".join(missing_files)
    )

for data_type, info in files.items():
    # Maps the file name without its last extension (e.g. 'train') to '<bucket>/<key>'.
    info['bucket_file_paths'] = {}
    for file_name in info['file_names']:
        local_file = os.path.join(info['local_folder'], file_name)
        object_key = info['bucket_folder'] + '/' + file_name
        print(object_key)
        s3_client.upload_file(local_file, bucket, object_key)
        info['bucket_file_paths'][os.path.splitext(file_name)[0]] = bucket + '/' + object_key

pprint(files)

datasets/ner_field_extraction/conll-2003-data/data/train.txt
datasets/ner_field_extraction/conll-2003-data/data/val.txt
datasets/ner_field_extraction/conll-2003-data/data/test.txt
datasets/ner_field_extraction/conll-2003-data/embeddings/glove.6B.50d.txt
datasets/ner_field_extraction/conll-2003-data/embeddings/glove.6B.100d.txt
datasets/ner_field_extraction/conll-2003-data/embeddings/glove.6B.200d.txt
datasets/ner_field_extraction/conll-2003-data/embeddings/glove.6B.300d.txt
{'data': {'bucket_file_paths': {'test': 'veriff-sagemaker-data/datasets/ner_field_extraction/conll-2003-data/data/test.txt',
                                'train': 'veriff-sagemaker-data/datasets/ner_field_extraction/conll-2003-data/data/train.txt',
                                'val': 'veriff-sagemaker-data/datasets/ner_field_extraction/conll-2003-data/data/val.txt'},
          'bucket_folder': 'datasets/ner_field_extraction/conll-2003-data/data',
          'file_names': ['train.txt', 'val.txt', 'test.txt'],
  

In [6]:
# List what is stored under the dataset prefix and show the raw response.
list_request = {"Bucket": bucket, "Prefix": bucket_folder}
objects_in_s3_folder = s3_client.list_objects_v2(**list_request)
pprint(objects_in_s3_folder)

{'Contents': [{'ETag': '"9cdd4db35b65db782ea91c80f62d074e"',
               'Key': 'datasets/ner_field_extraction/conll-2003-data/data/test.txt',
               'LastModified': datetime.datetime(2022, 4, 21, 14, 13, 36, tzinfo=tzlocal()),
               'Size': 748095,
               'StorageClass': 'STANDARD'},
              {'ETag': '"a595d83962c932f2b59fb52227a16086"',
               'Key': 'datasets/ner_field_extraction/conll-2003-data/data/train.txt',
               'LastModified': datetime.datetime(2022, 4, 21, 14, 13, 35, tzinfo=tzlocal()),
               'Size': 3283420,
               'StorageClass': 'STANDARD'},
              {'ETag': '"d9492db1858f07a26cbb1f3b3f98a49f"',
               'Key': 'datasets/ner_field_extraction/conll-2003-data/data/val.txt',
               'LastModified': datetime.datetime(2022, 4, 21, 14, 13, 36, tzinfo=tzlocal()),
               'Size': 827443,
               'StorageClass': 'STANDARD'},
              {'ETag': '"7e5eb4041401f51c93ec36f4e09a4f15

In [7]:
# Map each channel name to an S3 URI: the three dataset splits point at single
# files, while 'embeddings' points at the whole embeddings prefix.
uploaded_data_paths = files['data']['bucket_file_paths']
data_channels = {
    'train': f"s3://{uploaded_data_paths['train']}",
    'validation': f"s3://{uploaded_data_paths['val']}",
    'test': f"s3://{uploaded_data_paths['test']}",
    'embeddings': f"s3://{bucket}/{files['embeddings']['bucket_folder']}",
}
pprint(data_channels)

{'embeddings': 's3://veriff-sagemaker-data/datasets/ner_field_extraction/conll-2003-data/embeddings',
 'test': 's3://veriff-sagemaker-data/datasets/ner_field_extraction/conll-2003-data/data/test.txt',
 'train': 's3://veriff-sagemaker-data/datasets/ner_field_extraction/conll-2003-data/data/train.txt',
 'validation': 's3://veriff-sagemaker-data/datasets/ner_field_extraction/conll-2003-data/data/val.txt'}


## Train with TensorFlow (SageMaker training job by default; set `LOCAL = True` to train on the notebook instance, aka 'local mode')

In [8]:
from sagemaker.tensorflow import TensorFlow

# LOCAL = True selects instance_type 'local' (training on the notebook
# instance) and disables the S3 checkpoint location; LOCAL = False requests an
# ml.p3.2xlarge instance and checkpoints to S3.
LOCAL = False

tf_estimator = TensorFlow(
    entry_point='train.py',
    code_location="s3://veriff-sagemaker-data/source",
    role=role,
    instance_count=1,
    instance_type='local' if LOCAL else 'ml.p3.2xlarge',
    volume_size=200,
    input_mode='File',
    output_path="s3://veriff-sagemaker-data/out/",
    debugger_hook_config=False,
    checkpoint_s3_uri=None if LOCAL else "s3://veriff-sagemaker-data/checkpoints",
    hyperparameters={"epochs": 50, "embedding_size": 50},
    base_job_name="ner-field-extraction-conll-2003",
    #dependencies=['dependencies'],
    framework_version='2.3.0', #'1.15.3',
    py_version='py37',
#     image_uri="public.ecr.aws/bitnami/tensorflow-serving:latest",
    # Regexes are raw strings so `\s` reaches the regex engine without relying
    # on Python's handling of invalid string escapes. Each metric needs its own
    # name: the val_accuracy pattern was previously registered a second time as
    # 'validation:loss'.
    # NOTE(review): 'loss:' and 'accuracy:' are also substrings of 'val_loss:'
    # and 'val_accuracy:' -- confirm the train metrics pick up the intended
    # values from the training log.
    metric_definitions=[
        {'Name': 'train:seconds_per_epoch', 'Regex': r'.*-\s([0-9]*)s\s-.*'},
        {'Name': 'train:loss', 'Regex': r'loss:\s(.*?)\s'},
        {'Name': 'validation:loss', 'Regex': r'val_loss:\s(.*?)\s'},
        {'Name': 'train:accuracy', 'Regex': r'accuracy:\s(.*?)\s'},
        {'Name': 'validation:accuracy', 'Regex': r'val_accuracy:\s(.*?)\s'},
    ],
    script_mode=True,
    max_run=259200,  # presumably seconds (= 72 hours) -- confirm against the SDK docs
    source_dir='.'
)

In [9]:
# Start training on the channels built above. wait=False returns immediately
# instead of blocking until the job completes.
# NOTE(review): `logs` is presumably only honoured when wait=True -- confirm.
fit_options = {
    "inputs": data_channels,
    "job_name": None,
    "experiment_config": None,
    "logs": "All",
    "wait": False,
}
tf_estimator.fit(**fit_options)