In [None]:
import numpy as np # For mathematical operations
import pandas as pd # For tabular data structures
import os

In [None]:
# Creating directories
# Output locations used by the rest of the notebook.
model_mlm_dir = '/working/mlm_model'
padded_states_dir = '/working/padded_states'
hidden_states_dir = '/working/hidden_states'
model_dir = '/working/model'
tokenizer_dir = '/working/tokenizer_dir'
txt_files_dir = '/working/sequence_files'

# exist_ok=True makes this cell safe to re-run and avoids the check-then-create
# race of testing os.path.exists() before calling os.makedirs().
for _working_dir in (
    txt_files_dir,
    tokenizer_dir,
    model_dir,
    hidden_states_dir,
    padded_states_dir,
    model_mlm_dir,
):
    os.makedirs(_working_dir, exist_ok=True)

In [None]:
# Defining variables
max_position_embeddings = 2048 # For roberta tokenizer and model
# Cannot exceed max_position_embeddings.
# NOTE(review): this is a residue count, while the model limit applies to token positions
# (special tokens included) — confirm the margin between the two values is sufficient.
max_sequence_length = 2000
min_sequence_length = 400
num_sequence = 4000
vocab_size = 50265
gene_ontology_unique = -1 # Will be assigned later
# Proteins are kept only when their assigned gene ontology id occurs more than n times
gene_ontology_filter_threshold = 50

from collections import defaultdict
def def_value():
    """Return the starting count (0) for a gene ontology id that has not been seen yet."""
    return 0
gene_ontology_counts = defaultdict(def_value) # Default dictionary for gene ontology ids
filtered_gene_ontology_counts = defaultdict(def_value) # Default dictionary after gene ontology ids have been filtered

# Fail loudly on an inconsistent configuration. Raising (instead of print + exit())
# stops "Run All" at this cell with a clear traceback and leaves the kernel alive.
if max_sequence_length > max_position_embeddings:
    raise ValueError(
        'Sequence length exceeds position embeddings: '
        'max_sequence_length={} > max_position_embeddings={}'.format(
            max_sequence_length, max_position_embeddings
        )
    )

In [None]:
# Load the protein table. The cells below read its 'Sequence', 'Length',
# 'Organism' and 'Gene Ontology IDs' columns.
uniprot = pd.read_csv('/data.csv')

Filtering data:

In [None]:
# Keep only the columns used later, then restrict to sequences whose length lies
# strictly between min_sequence_length and max_sequence_length.
# Rows with any missing value are removed BEFORE taking the first num_sequence rows,
# and the result is renumbered from 0.
# Note: no filtering on 'Organism' happens here; the column is only carried along.
uniprot = (
    uniprot[['Sequence', 'Length', 'Organism', 'Gene Ontology IDs']]
    .loc[lambda frame: (frame['Length'] < max_sequence_length) & (frame['Length'] > min_sequence_length)]
    .dropna()
    .head(num_sequence)
    .reset_index(drop=True)
)

Preprocessing Gene Ontology IDs for fine-tuning:

In [None]:
def preprocess_gene_ontology(id):
    """Choose a single Gene Ontology id for one protein and count it.

    The first id in the list that has already been counted for an earlier protein is
    preferred, so proteins tend to share labels; if none has been seen yet, the first
    id in the list is used.

    Parameters
    ----------
    id : str
        ';'-separated Gene Ontology ids for one protein. (The name shadows the
        builtin ``id``; it is kept so existing callers keep working.)

    Returns
    -------
    str
        The chosen Gene Ontology id, with surrounding whitespace removed.

    Side effects
    ------------
    Increments ``gene_ontology_counts[chosen_id]`` in the module-level counter.
    """
    # Strip each piece: with a '; ' separator a plain split(';') would yield ' GO:…'
    # (leading space), which would be counted as a different key than 'GO:…'.
    ids = [part.strip() for part in id.split(';')]

    # First already-seen id wins; otherwise fall back to the first listed id.
    gene_ontology_id = next(
        (candidate for candidate in ids if candidate in gene_ontology_counts),
        ids[0],
    )

    gene_ontology_counts[gene_ontology_id] += 1
    return gene_ontology_id

In [None]:
# Start from an empty counter so re-running this cell gives the same result:
# preprocess_gene_ontology both reads and increments gene_ontology_counts, so stale
# counts would inflate the totals and change which id gets picked.
gene_ontology_counts.clear()

# Assign one Gene Ontology id per protein (this also fills gene_ontology_counts).
uniprot['Gene_ontology_id'] = uniprot['Gene Ontology IDs'].apply(preprocess_gene_ontology)
gene_ontology_unique = len(gene_ontology_counts)
print('Number of unique Gene Ontology IDs: {}'.format(gene_ontology_unique))

In [None]:
# Inspect the table after the 'Gene_ontology_id' column has been added.
uniprot

In [None]:
def filter_by_gene_ontology_counts(df, gene_ontology_filter_threshold):
    """Keep only the proteins whose Gene Ontology id is frequent enough.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Gene_ontology_id' column. Any index is accepted.
    gene_ontology_filter_threshold : int
        A row is kept when ``gene_ontology_counts`` holds a count strictly greater
        than this value for the row's id.

    Returns
    -------
    pandas.DataFrame
        The kept rows, with their original index labels (the index is not reset).

    Side effects
    ------------
    Adds one to ``filtered_gene_ontology_counts[id]`` for every kept row and prints
    the number of kept rows.
    """
    go_ids = df['Gene_ontology_id']

    # One boolean per row instead of dropping rows one at a time inside a loop.
    # .get() is used so looking up an id never inserts a new key into the counter.
    keep_mask = go_ids.map(
        lambda go_id: gene_ontology_counts.get(go_id, 0) > gene_ontology_filter_threshold
    ).astype(bool)

    for go_id in go_ids[keep_mask]:
        filtered_gene_ontology_counts[go_id] += 1

    count = int(keep_mask.sum())
    print('Number of Proteins after Gene Ontology Filtering: {}'.format(count))

    return df[keep_mask]

In [None]:
# Drop proteins whose assigned Gene Ontology id does not occur more than
# gene_ontology_filter_threshold times.
uniprot = filter_by_gene_ontology_counts(uniprot, gene_ontology_filter_threshold)

In [None]:
# Show the per-id counts recorded by filter_by_gene_ontology_counts for the kept rows.
print('Filtered Gene Ontology Dictionary:{}'.format(filtered_gene_ontology_counts.items()))

In [None]:
# Inspect the table after Gene Ontology filtering.
uniprot

In [None]:
def preprocess_sequence(df):
    """Rewrite every sequence as space-separated single characters.

    Example: 'MKTA' becomes 'M K T A'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Sequence' column of strings. Any index is accepted.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; its 'Sequence' column is overwritten.
    """
    # A plain join over the characters puts a space between EVERY pair of residues
    # (including the first two) and also handles one-character and empty strings.
    df['Sequence'] = df['Sequence'].map(" ".join)
    return df

In [None]:
# Rewrite the 'Sequence' column in the spaced form produced by preprocess_sequence.
uniprot = preprocess_sequence(uniprot)

In [None]:
# Spot-check the first rows after sequence preprocessing.
uniprot.head()

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Turn each distinct Gene Ontology id into a numeric code stored in a new column.
# The encoder expects a 2-D input, hence the single-column reshape.
go_id_column = uniprot['Gene_ontology_id'].to_numpy().reshape(-1, 1)
gene_ontology_encoder = OrdinalEncoder()
uniprot['Gene_ontology_id_encoded'] = gene_ontology_encoder.fit_transform(go_id_column)

In [None]:
# Inspect the table after the 'Gene_ontology_id_encoded' column has been added.
uniprot

In [None]:
preds = uniprot['Gene_ontology_id_encoded']

# 80% for training, 20% for testing.
# The split point is derived from the rows actually present: after the Gene Ontology
# filter the table can hold fewer than num_sequence rows, and sizing head()/tail()
# from num_sequence would then make the train and test sets overlap.
# Rows are split in their existing order (no shuffling).
train_size = int(len(uniprot) * 0.8)

train = uniprot.iloc[:train_size].reset_index(drop=True)
train_y = preds.iloc[:train_size].reset_index(drop=True)

test = uniprot.iloc[train_size:].reset_index(drop=True)
test_y = preds.iloc[train_size:].reset_index(drop=True)

In [None]:
# Inspect the training rows.
train

In [None]:
# Inspect the encoded labels of the training rows.
train_y

Initializing the language model for MLM:

In [None]:
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM

# Set a configuration for the model
config = RobertaConfig(
    vocab_size=vocab_size,
    max_position_embeddings=max_position_embeddings,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1
)

# Build the model from `config` while loading the 'roberta-base' checkpoint.
# NOTE(review): this is not a from-scratch initialization; from_pretrained loads
# checkpoint weights, and ignore_mismatched_sizes=True presumably lets tensors whose
# shape differs from `config` be re-initialized instead of raising — confirm that
# starting from pretrained weights is what is intended here.
model_mlm = RobertaForMaskedLM.from_pretrained('roberta-base', config=config, ignore_mismatched_sizes=True)
print('Number of parameters: ',model_mlm.num_parameters())

Initializing the tokenizer from the pretrained `roberta-base` checkpoint:

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published with the 'roberta-base' checkpoint.
# NOTE(review): truncation / padding / return_tensors are normally per-call encoding
# options; presumably they have no effect when passed at load time (MLMDataset passes
# its own padding and max_length when encoding) — confirm.
# NOTE(review): `max_len` looks like a legacy spelling of `model_max_length` — confirm
# against the installed transformers version.
tokenizer = AutoTokenizer.from_pretrained('roberta-base', 
    max_len=max_sequence_length,
    truncation=True,
    padding='max_length',
    return_tensors='pt',
    )

In [None]:
# Training hyperparameters for the MLM run.
TRAIN_EPOCHS = 100  # Upper bound on epochs; passed as num_epochs to train_model_mlm
LEARNING_RATE = 1e-4  # Learning rate given to the optimizer
TRAIN_BATCH_SIZE = 2  # Batch size given to the DataLoader

Building the dataset for masked language modelling:

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class MLMDataset(Dataset):
    """Dataset that yields masked-language-modelling examples from text sequences.

    Each item is a dict of equally sized 1-D tensors:

    * ``input_ids``      - token ids padded/truncated to ``max_length``, with a random
      subset of the real tokens replaced by the tokenizer's mask token;
    * ``attention_mask`` - 1 for real tokens, 0 for padding;
    * ``labels``         - the original token id at masked positions, -100 elsewhere
      (-100 is the index ignored by the loss).

    Parameters
    ----------
    sequences : sequence of str
        Texts to encode. May be a list or a pandas Series (accessed by position).
    tokenizer :
        Tokenizer providing ``encode_plus``, ``pad_token_id`` and ``mask_token_id``.
    max_length : int
        Length every example is padded or truncated to.
    mask_prob : float
        Probability with which each real (non-special, non-padding) token is masked.
    """

    def __init__(self, sequences, tokenizer, max_length=max_position_embeddings, mask_prob=0.15):
        self.sequences = sequences
        self.tokenizer = tokenizer
        self.mask_prob = mask_prob
        self.max_length = max_length

    def __len__(self):
        """Return the number of sequences."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Encode and mask the sequence at position ``idx``.

        Raises
        ------
        IndexError
            If ``idx`` is outside ``[0, len(self))``.
        """
        # IndexError (rather than assert) is what Python's iteration protocol expects,
        # and it is not stripped when running with -O.
        if not 0 <= idx < len(self.sequences):
            raise IndexError(f"Index {idx} is out of bounds for sequences")

        # Use positional access for pandas objects so a non-default index cannot
        # turn a valid position into a KeyError.
        if hasattr(self.sequences, 'iloc'):
            sequence = self.sequences.iloc[idx]
        else:
            sequence = self.sequences[idx]

        # Tokenize the sequence and apply random masking
        input_ids, labels = self.mask_and_encode_tokens(sequence)

        # Padding is never masked, so any position still holding the pad id is padding
        # and must not be attended to.
        pad_token_id = self.tokenizer.pad_token_id
        attention_mask = [0 if token_id == pad_token_id else 1 for token_id in input_ids]

        return {
            'input_ids': torch.tensor(input_ids),
            'attention_mask': torch.tensor(attention_mask),
            'labels': torch.tensor(labels)
        }

    def mask_and_encode_tokens(self, tokens):
        """Tokenize ``tokens`` (a text string) and randomly mask real tokens.

        Returns
        -------
        (list[int], list[int])
            The masked input ids and the labels, both of length ``max_length``.
        """
        encoding = self.tokenizer.encode_plus(
            text=tokens,
            padding='max_length',
            truncation=True,  # keep every example at exactly max_length
            max_length=self.max_length,
            return_special_tokens_mask=True,
        )

        token_ids = list(encoding['input_ids'])
        special_tokens_mask = encoding['special_tokens_mask']

        # Initialize labels with -100 (ignore index)
        labels = [-100] * len(token_ids)

        for position, (token_id, is_special) in enumerate(zip(token_ids, special_tokens_mask)):
            # Special tokens and padding are neither masked nor used as targets.
            if is_special:
                continue
            # Randomly decide whether to mask the token
            if torch.rand(1).item() < self.mask_prob:
                labels[position] = token_id
                token_ids[position] = self.tokenizer.mask_token_id

        return token_ids, labels

In [None]:
# Wrap the training sequences in the MLM dataset and serve them in shuffled batches
# of TRAIN_BATCH_SIZE.
mlm_dataset = MLMDataset(train['Sequence'], tokenizer)
mlm_dataloader = DataLoader(mlm_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)

In [None]:
# Set CUDA_LAUNCH_BLOCKING=1 for this process (the in-notebook equivalent of exporting
# CUDA_LAUNCH_BLOCKING=1 in the console before launching).
# NOTE(review): presumably a debugging aid so CUDA errors surface at the failing call —
# confirm it is still wanted for real training runs and that it is set before CUDA is
# first used.
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [None]:
import torch
import sys
from transformers import AdamW

def train_model_mlm(model, train_dataloader, optimizer, model_dir, num_epochs=5, save_every=1,
                    losses_path='/working/losses.npy'):
    """Train a masked-language model with a simple loss-based early stop.

    Parameters
    ----------
    model :
        Model called as ``model(input_ids=..., labels=..., attention_mask=...)`` whose
        output exposes ``.loss``; must also provide ``to``, ``train`` and
        ``save_pretrained``.
    train_dataloader :
        Sized iterable of batches; each batch is a dict with 'input_ids', 'labels'
        and 'attention_mask' tensors.
    optimizer :
        Optimizer already bound to the model's parameters.
    model_dir : str
        Directory passed to ``model.save_pretrained``.
    num_epochs : int
        Maximum number of epochs.
    save_every : int
        Save the model on epochs whose 1-based number is a multiple of this value.
    losses_path : str
        File the per-epoch average losses are written to (NumPy ``.npy``).

    Returns
    -------
    (model, list[float])
        The trained model and the average loss of every epoch that ran.

    Raises
    ------
    ValueError
        If ``train_dataloader`` has no batches.

    Notes
    -----
    Training continues while the epoch loss decreases, or increases by at most 10% of
    the current epoch loss; any other outcome (including an unchanged or NaN loss)
    stops training.
    """
    num_batches = len(train_dataloader)
    if num_batches == 0:
        # Without this the average-loss division below would raise ZeroDivisionError.
        raise ValueError('train_dataloader is empty; nothing to train on')

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model.to(device)
    model.train()

    print('Starting RoBERTa MLM Training')

    losses = []
    prev_loss = sys.float_info.max
    early_stop_threshold = 0.1

    for epoch in range(num_epochs):
        total_loss = 0.0
        for batch in train_dataloader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, labels=labels, attention_mask=attention_mask)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        average_loss = total_loss / num_batches
        losses.append(average_loss)
        print(f'Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss:.4f}')

        # Persist the history BEFORE the early-stop check so the epoch that triggers
        # the stop is written to disk as well.
        np.save(losses_path, np.array(losses))

        # For early stopping
        improved = average_loss < prev_loss
        within_tolerance = (
            prev_loss < average_loss
            and average_loss - prev_loss <= early_stop_threshold * average_loss
        )
        if not (improved or within_tolerance):
            print('Early stopping at Epoch: {}'.format(epoch+1))
            break

        prev_loss = average_loss
        # Save the model every 'save_every' epochs
        if (epoch + 1) % save_every == 0:
            model.save_pretrained(model_dir)
            print('Saved model at Epoch: {}'.format(epoch+1))

    print("Training complete.")
    return model, losses

In [None]:
# Initialize the optimizer. No separate loss function is created here:
# train_model_mlm reads the loss from the model's outputs.
# NOTE(review): AdamW is imported from transformers in this notebook; presumably that
# class is deprecated in favor of torch.optim.AdamW — confirm against the installed
# transformers version before upgrading.
optimizer = AdamW(model_mlm.parameters(), lr=LEARNING_RATE)

# Train for at most TRAIN_EPOCHS epochs, saving the model into model_mlm_dir.
model_mlm, losses = train_model_mlm(model_mlm, mlm_dataloader, optimizer, model_mlm_dir, num_epochs=TRAIN_EPOCHS)

In [None]:
import matplotlib.pyplot as plt

# Plot the average training loss of each epoch on an explicit figure/axes pair.
fig, ax = plt.subplots()
ax.plot(losses, label='Training Loss')
ax.set_title('Training Loss Over Epochs')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

In [None]:
# Save the model returned by training into model_mlm_dir. This is the same directory
# train_model_mlm was given for its periodic saves, so those files are overwritten
# with the model's final state.
model_mlm.save_pretrained(model_mlm_dir)