# ESM-2 Domain Adaptation with the UniRef100 Dataset

In this notebook, we demonstrate how to perform full-parameter fine-tuning of the ESM-2 protein language model on the UniRef100 dataset.

---
## 0. Install dependencies

In [7]:
%pip install -q --upgrade pip
%pip install -q --upgrade sagemaker boto3 awscli boto3 ipywidgets



[0mNote: you may need to restart the kernel to use updated packages.
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sagemaker-datawrangler 0.4.3 requires ipywidgets<8.0.0, but you have ipywidgets 8.1.1 which is incompatible.
sagemaker-datawrangler 0.4.3 requires sagemaker-data-insights==0.4.0, but you have sagemaker-data-insights 0.3.3 which is incompatible.[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [8]:
import boto3
import os
import sagemaker
from sagemaker.experiments.run import Run
from sagemaker.inputs import TrainingInput
from sagemaker.pytorch import PyTorch
from time import strftime


# Sessions and service clients shared by the rest of the notebook.
boto_session = boto3.session.Session()
sagemaker_session = sagemaker.session.Session(boto_session)
S3_BUCKET = sagemaker_session.default_bucket()
s3 = boto_session.client("s3")
sagemaker_client = boto_session.client("sagemaker")
sagemaker_execution_role = sagemaker.session.get_execution_role(sagemaker_session)
REGION_NAME = sagemaker_session.boto_region_name
print(f"Assumed SageMaker role is {sagemaker_execution_role}")

# Job outputs are written under this prefix of the session's default bucket.
S3_PREFIX = "esm-2-uniref100-benchmarking"
S3_PATH = sagemaker.s3.s3_path_join("s3://", S3_BUCKET, S3_PREFIX)
print(f"S3 path is {S3_PATH}")

# A timestamp suffix keeps every notebook session in its own experiment.
# The "-" separator keeps the model size and the date readable
# (previously they ran together as "...650M2023-10-25-...").
EXPERIMENT_NAME = "esm-2-benchmarking-ref100-650M-" + strftime("%Y-%m-%d-%H-%M-%S")
print(f"Experiment name is {EXPERIMENT_NAME}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
Assumed SageMaker role is arn:aws:iam::111918798052:role/DevelopmentRole
S3 path is s3://sagemaker-us-east-1-111918798052/esm-2-uniref100-benchmarking
Experiment name is esm-2-benchmarking-ref100-650M2023-10-25-00-23-55


In [9]:
# ESM-2 checkpoint identifier passed to the training job as `model_id`.
# Exactly one assignment should be active; the others are kept as alternatives.
# MODEL_ID = "facebook/esm2_t48_15B_UR50D"
# MODEL_ID = "facebook/esm2_t36_3B_UR50D"
MODEL_ID = "facebook/esm2_t33_650M_UR50D"
# MODEL_ID = "facebook/esm2_t30_150M_UR50D"
# MODEL_ID = "facebook/esm2_t12_35M_UR50D"
# MODEL_ID = "facebook/esm2_t6_8M_UR50D"

---
## 1. Pre-tokenize the data

The data is tokenized ahead of time using a Glue script.

In [10]:
# Train and test splits of the pre-tokenized data live side by side under one root prefix.
_uniref100_tokenized_root = "s3://us-east-1-protein-ref-data/uniref100/torkenized-1mb-650m-v1"
train_s3_uri_uniref100 = f"{_uniref100_tokenized_root}/train"
test_s3_uri_uniref100 = f"{_uniref100_tokenized_root}/test"


## 2. Create data map needed for training

Create an index map of the tokenized data using a Glue script.

### 2.1 (Optional) Create a small sample index map for a quick test run

In [22]:
train_index_file = !(aws s3 ls {train_s3_uri_uniref100}/train_index_map/) 
train_index_file = train_index_file[0].split()[-1]
train_index_file_full_path = train_s3_uri_uniref100 + "/train_index_map/" + train_index_file

test_index_file = !(aws s3 ls {test_s3_uri_uniref100}/test_index_map/) 
test_index_file = test_index_file[0].split()[-1]
test_index_file_full_path = test_s3_uri_uniref100 + "/test_index_map/"+ test_index_file
test_index_file_full_path

's3://us-east-1-protein-ref-data/uniref100/torkenized-1mb-650m-v1/test/test_index_map/part-00000-385b3d29-67c5-4ee2-bfe5-0af3e991cd3a-c000.csv'

In [23]:
!mkdir ./tmp
!aws s3 cp {train_index_file_full_path} ./tmp/
!aws s3 cp {test_index_file_full_path} ./tmp/

mkdir: cannot create directory ‘./tmp’: File exists
download: s3://us-east-1-protein-ref-data/uniref100/torkenized-1mb-650m-v1/train/train_index_map/part-00000-e40463a2-f486-44cf-866a-20101d015b10-c000.csv to tmp/part-00000-e40463a2-f486-44cf-866a-20101d015b10-c000.csv
download: s3://us-east-1-protein-ref-data/uniref100/torkenized-1mb-650m-v1/test/test_index_map/part-00000-385b3d29-67c5-4ee2-bfe5-0af3e991cd3a-c000.csv to tmp/part-00000-385b3d29-67c5-4ee2-bfe5-0af3e991cd3a-c000.csv


In [24]:
import pandas as pd

# Load the downloaded training index map; the bare last line renders it as the cell output.
train_index_csv = f"./tmp/{train_index_file}"
train_index_map = pd.read_csv(train_index_csv)
train_index_map

Unnamed: 0,file_name,num_sequences,start_line,end_line
0,part-00000-ee38297b-c02c-41de-a579-e91950fc547...,1898,0,1897
1,part-00001-ee38297b-c02c-41de-a579-e91950fc547...,1898,1898,3795
2,part-00002-ee38297b-c02c-41de-a579-e91950fc547...,1898,3796,5693
3,part-00003-ee38297b-c02c-41de-a579-e91950fc547...,1898,5694,7591
4,part-00004-ee38297b-c02c-41de-a579-e91950fc547...,1898,7592,9489
...,...,...,...,...
149995,part-99995-ee38297b-c02c-41de-a579-e91950fc547...,1894,285426791,285428684
149996,part-99996-ee38297b-c02c-41de-a579-e91950fc547...,1894,285428685,285430578
149997,part-99997-ee38297b-c02c-41de-a579-e91950fc547...,1894,285430579,285432472
149998,part-99998-ee38297b-c02c-41de-a579-e91950fc547...,1894,285432473,285434366


In [31]:
# Keep only the first three index rows as a small training sample.
# NOTE(review): the file name says "100" but three rows are written — confirm the intended sample size.
train_index_map.head(3).to_csv("./tmp/sample_train_100.csv")

In [32]:
# Load the downloaded test index map; the bare last line renders it as the cell output.
test_index_csv = f"./tmp/{test_index_file}"
test_index_map = pd.read_csv(test_index_csv)
test_index_map

Unnamed: 0,file_name,num_sequences,start_line,end_line
0,part-00000-d1c21040-2129-4fe1-a072-bee9e1c2eb9...,2861,0,2860
1,part-00001-d1c21040-2129-4fe1-a072-bee9e1c2eb9...,2861,2861,5721
2,part-00002-d1c21040-2129-4fe1-a072-bee9e1c2eb9...,2861,5722,8582
3,part-00003-d1c21040-2129-4fe1-a072-bee9e1c2eb9...,2861,8583,11443
4,part-00004-d1c21040-2129-4fe1-a072-bee9e1c2eb9...,2861,11444,14304
...,...,...,...,...
24995,part-24995-d1c21040-2129-4fe1-a072-bee9e1c2eb9...,2861,71350359,71353219
24996,part-24996-d1c21040-2129-4fe1-a072-bee9e1c2eb9...,2861,71353220,71356080
24997,part-24997-d1c21040-2129-4fe1-a072-bee9e1c2eb9...,2861,71356081,71358941
24998,part-24998-d1c21040-2129-4fe1-a072-bee9e1c2eb9...,2861,71358942,71361802


In [33]:
# Keep only the first index row as a small test sample.
# NOTE(review): the file name says "100" but a single row is written — confirm the intended sample size.
test_index_map.head(1).to_csv("./tmp/sample_test_100.csv")

In [None]:
# Destination prefix for the sampled training index map.
# (The bare `{...}/path/` text was not valid Python and would break a full re-run.)
f"{train_s3_uri_uniref100}/sample_train_index_map/"

In [34]:
# Upload the sampled index maps next to the full ones. The destination prefix names
# ("sample_train_index_map" / "sample_test_index_map") are the same values the training
# cells pass as the `train_index_file_path` / `test_index_file_path` hyperparameters.
!aws s3 cp ./tmp/sample_train_100.csv {train_s3_uri_uniref100}/sample_train_index_map/
!aws s3 cp ./tmp/sample_test_100.csv {test_s3_uri_uniref100}/sample_test_index_map/


upload: tmp/sample_train_100.csv to s3://us-east-1-protein-ref-data/uniref100/torkenized-1mb-650m-v1/train/sample_train_index_map/sample_train_100.csv
upload: tmp/sample_test_100.csv to s3://us-east-1-protein-ref-data/uniref100/torkenized-1mb-650m-v1/test/sample_test_index_map/sample_test_100.csv


## 3. Train on ml.p3dn.24xlarge

In [48]:
# Capture groups for the numeric value that follows each "<label>: " in the training logs.
_PLAIN_NUMBER = "([0-9.]*)"      # digits and dots only
_SCI_NUMBER = "([0-9.e-]*)"      # additionally allows "e" and "-"

# (metric name, log label, value pattern), in the order the definitions are emitted.
_METRIC_SPECS = (
    ("epoch", "Epoch", _PLAIN_NUMBER),
    ("step", "Step", _PLAIN_NUMBER),
    ("train_loss", "Training Loss", _SCI_NUMBER),
    ("train_perplexity", "Training Perplexity", _SCI_NUMBER),
    ("train_samples_per_second", "Training Samples/sec", _SCI_NUMBER),
    ("train_tokens_per_second", "Training Tokens/sec", _SCI_NUMBER),
    ("eval_loss", "Eval Loss", _SCI_NUMBER),
    ("eval_perplexity", "Eval Perplexity", _SCI_NUMBER),
    ("eval_samples_per_second", "Eval Samples/sec", _SCI_NUMBER),
    ("eval_tokens_per_second", "Eval Tokens/sec", _SCI_NUMBER),
)

# Name/Regex pairs handed to the estimator through `metric_definitions`.
metric_definitions = [
    {"Name": metric_name, "Regex": f"{log_label}: {value_pattern}"}
    for metric_name, log_label, value_pattern in _METRIC_SPECS
]

In [None]:
# NOTE(review): this value is not referenced by any other cell visible in this notebook —
# confirm whether it is still needed or can be removed.
minimum_training_logs_needed = 8



In [73]:
# Hyperparameters forwarded to the training entry point.
# NOTE(review): this cell passes "num_epochs" while the other training cell in this notebook
# passes "num_train_epochs" — confirm which key each entry-point script expects.
# NOTE(review): "bf16" is enabled while the job targets ml.p3dn.24xlarge — confirm that
# instance's GPUs support bfloat16 the way the script uses it.
hyperparameters = {
    "num_epochs": 2,
    "model_id": MODEL_ID,
    "per_device_train_batch_size": 10,
    "per_device_eval_batch_size": 10,
    "bf16": True,
    "logging_steps": 2,
    "optim": "adamw_torch",
    "pretrain": 1,
    "train_sample_count": 10000,
    "train_index_file_path": "sample_train_index_map",
    "test_index_file_path": "sample_test_index_map",
    "gradient_accumulation_steps": 10,
}

# Both data channels are read from S3 in FastFile input mode.
data_channels = {
    "train": TrainingInput(s3_data=train_s3_uri_uniref100, input_mode="FastFile"),
    "test": TrainingInput(s3_data=test_s3_uri_uniref100, input_mode="FastFile"),
}

# SageMaker PyTorch estimator. Despite the `g5_` prefix in the variable name,
# the job is configured for a single ml.p3dn.24xlarge instance.
g5_estimator = PyTorch(
    base_job_name="esm-2-uniref100-p3dn-gacc-fsddp-compile",
    entry_point="cuda-uniref100-pretorkenized-mlm-train-ddp-fsdp.py",
    source_dir="training/cuda/uniref100",
    instance_type="ml.p3dn.24xlarge",
    instance_count=1,
    image_uri="763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.0.1-gpu-py310-cu118-ubuntu20.04-sagemaker",
    output_path=f"{S3_PATH}/output",
    role=sagemaker_execution_role,
    hyperparameters=hyperparameters,
    metric_definitions=metric_definitions,
    sagemaker_session=sagemaker_session,
    distribution={"torch_distributed": {"enabled": True}},
    tags=[{"Key": "project", "Value": "esm-benchmarking"}],
    keep_alive_period_in_seconds=1800,
)

# Submit the job inside an Experiments run; wait=False means the cell does not
# block until the training job finishes.
with Run(experiment_name=EXPERIMENT_NAME, sagemaker_session=sagemaker_session) as run:
    g5_estimator.fit(data_channels, wait=False)

INFO:sagemaker:Creating training-job with name: esm-2-uniref100-p3dn-gacc-fsddp-compile-2023-10-26-03-41-42-601


Using provided s3_resource


In [10]:
from sagemaker import ProfilerConfig, Profiler

# Hyperparameters forwarded to the training entry point.
# NOTE(review): this cell passes "num_train_epochs" while the other training cell in this
# notebook passes "num_epochs" — confirm which key each entry-point script expects.
# NOTE(review): "bf16" is enabled while the job targets ml.p3dn.24xlarge — confirm that
# instance's GPUs support bfloat16 the way the script uses it.
hyperparameters = {
    "num_train_epochs": 2,
    "model_id": MODEL_ID,
    "per_device_train_batch_size": 10,
    "per_device_eval_batch_size": 10,
    "bf16": True,
    "logging_steps": 8,
    "optim": "adamw_torch",
    "pretrain": 1,
    "train_sample_count": 10000,
    "train_index_file_path": "sample_train_index_map",
    "test_index_file_path": "sample_test_index_map",
}

# Profiler settings attached to the job (CPU profiling duration of 3600).
profiler_config = ProfilerConfig(
    profile_params=Profiler(cpu_profiling_duration=3600),
)

# Both data channels are read from S3 in FastFile input mode.
data_channels = {
    "train": TrainingInput(s3_data=train_s3_uri_uniref100, input_mode="FastFile"),
    "test": TrainingInput(s3_data=test_s3_uri_uniref100, input_mode="FastFile"),
}

# SageMaker PyTorch estimator running the profiling variant of the training script.
# Despite the `g5_` prefix in the variable name, the job is configured for a single
# ml.p3dn.24xlarge instance.
g5_estimator = PyTorch(
    base_job_name="esm-2-uniref100-2p3dn24",
    entry_point="cuda-uniref100-pretorkenized-mlm-train-ddp-fsdp-ptprof.py",
    source_dir="training/cuda/uniref100",
    instance_type="ml.p3dn.24xlarge",
    instance_count=1,
    image_uri="763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.0.1-gpu-py310-cu118-ubuntu20.04-sagemaker",
    output_path=f"{S3_PATH}/output",
    role=sagemaker_execution_role,
    hyperparameters=hyperparameters,
    metric_definitions=metric_definitions,
    sagemaker_session=sagemaker_session,
    distribution={"torch_distributed": {"enabled": True}},
    tags=[{"Key": "project", "Value": "esm-benchmarking"}],
    profiler_config=profiler_config,
)

# Submit the job inside an Experiments run; wait=False means the cell does not
# block until the training job finishes.
with Run(experiment_name=EXPERIMENT_NAME, sagemaker_session=sagemaker_session) as run:
    g5_estimator.fit(data_channels, wait=False)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: esm-2-uniref100-2p3dn24-2023-10-21-07-02-16-996


Using provided s3_resource


In [None]:
# NOTE(review): leftover scratch cell with no effect on the workflow — consider deleting it.
1 + 1