In [None]:
!pip install --upgrade pip -q
!pip install -U boto3 sagemaker -q
!pip install seaborn -q

In [None]:
import re
import json
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import time
import os
import numpy as np
import pandas as pd
import sagemaker
import torch
import seaborn as sns
import matplotlib.pyplot as plt

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

In [None]:
# Print the installed SageMaker SDK version, then create the session object and
# look up the execution role; both are reused by the upload, training and
# deployment cells further down.
import sagemaker
print(sagemaker.__version__)
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

In [None]:
# S3 destination for the datasets: the session's default bucket plus a key prefix.
bucket = sagemaker_session.default_bucket()
prefix = "sagemaker/DEMO-pytorch-bert"


In [None]:
# Seed the NumPy and PyTorch global random number generators of this kernel
# with a fixed value.
RANDOM_SEED = 43
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

In [None]:
# Download the DeepLoc FASTA file into ./data (-P: target directory, -q: quiet).
!wget https://services.healthtech.dtu.dk/services/DeepLoc-1.0/deeploc_data.fasta -P ./data -q

In [None]:
!pip install Bio -q
import Bio

In [None]:
def read_fasta(file_path, columns):
    """Parse a FASTA file into a DataFrame with one row per record.

    Each record title is split on whitespace and read as
    ``<id> <location>-<membrane> [extra token]``.

    Args:
        file_path: Path of the FASTA file to read.
        columns: Six column names for the resulting frame, in the order
            id, sequence, sequence length, location, membrane, train flag.

    Returns:
        pandas.DataFrame with one row per FASTA record. The sequence is stored
        with a single space between residues. The train flag is 0 when the
        title carries more than two tokens (presumably a "test" marker -
        verify against the dataset description) and 1 otherwise.

    Raises:
        IndexError: if a title has fewer than two tokens or its second token
            contains no "-".
    """
    from Bio.SeqIO.FastaIO import SimpleFastaParser

    records = []
    # Open the path that was actually passed in; the handle is closed on exit.
    with open(file_path) as fasta_file:
        for title, sequence in SimpleFastaParser(fasta_file):
            title_splits = title.split(None)
            # Second word has the form "<location>-<membrane>".
            location_splits = title_splits[1].split("-")
            sequence = " ".join(sequence)
            records.append([
                title_splits[0],  # First word is the ID
                sequence,
                # NOTE(review): this is the length of the space-separated
                # string (2n - 1 for n residues), kept as-is because the
                # plot and CSV below were built on it - confirm intent.
                len(sequence),
                location_splits[0],
                location_splits[1],
                0 if len(title_splits) > 2 else 1,
            ])
    return pd.DataFrame(records, columns=columns)


# The dataset was downloaded into ./data by the wget cell above.
data = read_fasta("./data/deeploc_data.fasta", columns=["id", "sequence", "sequence_length", "location", "membrane", "is_train"])
data.head()

In [None]:
# Print a summary of the parsed frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# True if any cell in the frame is missing.
data.isna().values.any()

In [None]:
# Distinct values of the location column and how many there are.
unique_classes = data["location"].unique()
print("Number of classes: ", len(unique_classes))
unique_classes

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
ax = sns.distplot(data['sequence_length'].values)
ax.set_xlim(0, 3000)
plt.title(f'sequence length distribution')
plt.grid(True)

In [None]:
# Replace the string labels in `location` with integer category codes and keep
# the code -> label mapping in `class_names`.
# This cell overwrites data['location']; running it a second time would derive
# class_names from the integer codes instead of the original labels.
categories = data["location"].astype("category").cat
class_names = categories.categories
num_classes = len(class_names)
data["location"] = categories.codes
print(class_names)

In [None]:
# Training split: rows whose is_train flag is 1, with the flag column removed.
train_mask = data["is_train"] == 1
df_train = data.loc[train_mask].drop(columns=["is_train"])
len(df_train)

In [None]:
# Test split: rows whose is_train flag is 0, with the flag column removed.
test_mask = data["is_train"] == 0
df_test = data.loc[test_mask].drop(columns=["is_train"])
len(df_test)

In [None]:
# Write both splits to local CSV files (to_csv is called with its defaults, so
# the DataFrame index is written as the first column).
train_dataset_path = './data/deeploc_per_protein_train.csv'
test_dataset_path = './data/deeploc_per_protein_test.csv'
df_train.to_csv(train_dataset_path)
df_test.to_csv(test_dataset_path)
# Upload each CSV to the bucket under `prefix`; the returned values are passed
# to estimator.fit() as the input channels.
inputs_train = sagemaker_session.upload_data(train_dataset_path, bucket=bucket, key_prefix=prefix)
inputs_test = sagemaker_session.upload_data(test_dataset_path, bucket=bucket, key_prefix=prefix)

In [None]:
# Show where the uploaded training and testing files live.
print("S3 location for training data: ", inputs_train )
print("S3 location for testing data: ", inputs_test )

In [None]:
# Display the training entry-point script with syntax highlighting.
!pygmentize code/train.py

In [None]:
# Configure and launch the training job. It takes around 20-25 minutes to execute.

from sagemaker.pytorch import PyTorch


# Job name with a timestamp suffix (month-day-year-hour-minute-second).
TRAINING_JOB_NAME="protbert-training-pytorch-{}".format(time.strftime("%m-%d-%Y-%H-%M-%S")) 
print('Training job name: ', TRAINING_JOB_NAME)

estimator = PyTorch(
    entry_point="train.py",
    source_dir="code",
    role=role,
    framework_version="1.6.0",
    py_version="py36",
    instance_count=1,  # this script supports distributed training only on GPU instances.
    instance_type="ml.p3.16xlarge",
    # Turn on the smdistributed data-parallel option for the job.
    distribution={'smdistributed':{
        'dataparallel':{
            'enabled': True
        }
    }
                 },
    debugger_hook_config=False,
    # Values handed to the entry-point script; num_labels comes from the label
    # encoding cell. NOTE(review): keys mix "-" and "_" separators - presumably
    # they match the argument names in code/train.py; verify.
    hyperparameters={
        "epochs": 3,
        "num_labels": num_classes,
        "batch-size": 4,
        "test-batch-size": 4,
        "log-interval": 100,
        "frozen_layers": 15,
    },
    # Regular expressions that pick metric values out of the job's log lines.
    metric_definitions=[
                   {'Name': 'train:loss', 'Regex': 'Training Loss: ([0-9\\.]+)'},
                   {'Name': 'test:accuracy', 'Regex': 'Validation Accuracy: ([0-9\\.]+)'},
                   {'Name': 'test:loss', 'Regex': 'Validation loss: ([0-9\\.]+)'},
                ]
)
# Start training with the uploaded CSVs as the "training" and "testing" channels.
estimator.fit({"training": inputs_train, "testing": inputs_test}, job_name=TRAINING_JOB_NAME)

In [None]:
# Keep the location of the trained model artifact and persist it with %store so
# it can be restored in a later kernel session.
model_data = estimator.model_data
print("Storing {} as model_data".format(model_data))
%store model_data

In [None]:
# Restore model_data from the %store database (lets deployment start from here
# without re-running training).
%store -r model_data

# If no model was found, set it manually here.
# model_data = 's3://sagemaker-{region}-XXX/protbert-training-pytorch-XX-XX-XXXX-XX-XX-XX/output/model.tar.gz'

print("Using this model: {}".format(model_data))

In [None]:
import sagemaker
from sagemaker.pytorch import PyTorchModel

# Endpoint name with a timestamp suffix (month-day-year-hour-minute-second).
ENDPOINT_NAME = "protbert-inference-pytorch-1-{}".format(time.strftime("%m-%d-%Y-%H-%M-%S"))
print("Endpoint name: ", ENDPOINT_NAME)

# Model object built from the trained artifact and the inference entry point
# found in the local `code` directory.
model = PyTorchModel(
    model_data=model_data,
    source_dir='code',
    entry_point='inference.py',
    role=role,
    framework_version='1.6.0',
    py_version='py3',
)

In [None]:
%%time
# Deploy the model to a single-instance endpoint; %%time (which must remain the
# first line of the cell) reports how long the deployment took.
predictor = model.deploy(initial_instance_count=1, instance_type='ml.m5.2xlarge', endpoint_name=ENDPOINT_NAME)

In [None]:
import boto3

# Low-level boto3 clients; `runtime` is created but not used in this cell.
runtime= boto3.client('runtime.sagemaker')
client = boto3.client('sagemaker')

# Fetch and print the description of the endpoint created above.
endpoint_desc = client.describe_endpoint(EndpointName=ENDPOINT_NAME)
print(endpoint_desc)
print('---'*30)

In [None]:
# Exchange JSON with the endpoint: requests are JSON-serialized and responses
# JSON-deserialized.
predictor.serializer = sagemaker.serializers.JSONSerializer()
predictor.deserializer = sagemaker.deserializers.JSONDeserializer()

In [None]:
# Send one space-separated sequence to the endpoint and print the predicted
# class name next to the expected label.
protein_sequence = 'M G K K D A S T T R T P V D Q Y R K Q I G R Q D Y K K N K P V L K A T R L K A E A K K A A I G I K E V I L V T I A I L V L L F A F Y A F F F L N L T K T D I Y E D S N N'
prediction = predictor.predict(protein_sequence)
print(prediction)
print(f'Protein Sequence: {protein_sequence}')
ground_truth = 'Endoplasmic.reticulum'
predicted_label = class_names[prediction[0]]  # first element indexes class_names
print("Sequence Localization Ground Truth is: {} - prediction is: {}".format(ground_truth, predicted_label))

In [None]:
# Send one space-separated sequence to the endpoint and print the predicted
# class name next to the expected label.
protein_sequence = 'M S M T I L P L E L I D K C I G S N L W V I M K S E R E F A G T L V G F D D Y V N I V L K D V T E Y D T V T G V T E K H S E M L L N G N G M C M L I P G G K P E'
prediction = predictor.predict(protein_sequence)
print(prediction)
print(f'Protein Sequence: {protein_sequence}')
ground_truth = 'Nucleus'
predicted_label = class_names[prediction[0]]  # first element indexes class_names
print("Sequence Localization Ground Truth is: {} - prediction is: {}".format(ground_truth, predicted_label))

In [None]:
# Send one space-separated sequence to the endpoint and print the predicted
# class name next to the expected label.
seq = 'M G G P T R R H Q E E G S A E C L G G P S T R A A P G P G L R D F H F T T A G P S K A D R L G D A A Q I H R E R M R P V Q C G D G S G E R V F L Q S P G S I G T L Y I R L D L N S Q R S T C C C L L N A G T K G M C'
prediction = predictor.predict(seq)
print(prediction)
print(f'Protein Sequence: {seq}')
ground_truth = 'Cytoplasm'
predicted_label = class_names[prediction[0]]  # first element indexes class_names
print("Sequence Localization Ground Truth is: {} - prediction is: {}".format(ground_truth, predicted_label))

In [None]:
# Tear down the endpoint created by model.deploy() once the demo is finished.
predictor.delete_endpoint()