In [None]:
import numpy as np # For mathematical operations
import pandas as pd # For tabular data structures
import os

In [None]:
# Creating directories
# Output locations used by the notebook (all under /working).
model_mlm_dir = '/working/mlm_model'
padded_states_dir = '/working/padded_states'
hidden_states_dir = '/working/hidden_states'
model_dir = '/working/model'
tokenizer_dir = '/working/tokenizer_dir'
txt_files_dir = '/working/sequence_files'

# exist_ok=True makes the cell idempotent (safe to re-run) and avoids the
# check-then-create race of a separate os.path.exists() test. Unlike the exists()
# check, it raises FileExistsError if a path exists but is not a directory.
for directory in (
    txt_files_dir,
    tokenizer_dir,
    model_dir,
    hidden_states_dir,
    padded_states_dir,
    model_mlm_dir,
):
    os.makedirs(directory, exist_ok=True)

In [None]:
# Defining variables
# Notebook-wide configuration for data selection and model sizing.
max_position_embeddings = 2048 # Maximum number of positions for the tokenizer and model
max_sequence_length = 2000 # Exclusive upper bound on protein 'Length'; cannot exceed max_position_embeddings
min_sequence_length = 400 # Exclusive lower bound on protein 'Length'
num_sequence = 4000 # Maximum number of proteins kept after the length/organism filter
vocab_size = 50265 # Vocabulary size passed to the model configuration
gene_ontology_unique = -1 # Placeholder; set once the Gene Ontology IDs have been counted
gene_ontology_filter_threshold = 50 # Keep only proteins whose assigned Gene Ontology ID occurs more than n times

from collections import defaultdict
# Default factory for the counters below: a missing key starts at 0.
def def_value():
    return 0
gene_ontology_counts = defaultdict(def_value) # Assigned Gene Ontology ID -> number of proteins
filtered_gene_ontology_counts = defaultdict(def_value) # Same counts, restricted to proteins that survive the threshold filter

In [None]:
# Load the protein table. The filtering cell below expects the columns
# 'Sequence', 'Length', 'Organism' and 'Gene Ontology IDs'.
uniprot = pd.read_csv('/data.csv')

Filtering data:

In [None]:
# Keep only the columns used by the rest of the notebook
columns_of_interest = ['Sequence', 'Length', 'Organism', 'Gene Ontology IDs']
uniprot = uniprot[columns_of_interest]

# Keep human proteins whose length lies strictly between the two bounds
is_human = uniprot['Organism'] == 'Homo sapiens (Human)'
length_in_range = (uniprot['Length'] < max_sequence_length) & (uniprot['Length'] > min_sequence_length)

# Drop rows with missing values, take the first n proteins, then renumber from 0
uniprot = (
    uniprot[is_human & length_in_range]
    .dropna()
    .head(num_sequence)
    .reset_index(drop=True)
)

Preprocessing Gene Ontology IDs for fine-tuning:

In [None]:
def preprocess_gene_ontology(id):
    """Pick a single Gene Ontology ID to act as the label of one protein.

    The first listed ID that has already been assigned to an earlier protein is
    preferred, so that labels are shared between proteins; otherwise the first
    listed ID is used. The global ``gene_ontology_counts`` counter of the chosen
    ID is incremented.

    Args:
        id: ';'-separated string of Gene Ontology IDs. (The name shadows the
            builtin but is kept because it is the public parameter name.)

    Returns:
        The selected Gene Ontology ID, without surrounding whitespace.
    """
    # Strip each entry: with a '; ' separator every ID after the first would
    # otherwise keep a leading space and never match an already counted key.
    ids = [part.strip() for part in id.split(';')]
    # Ignore empty entries (e.g. from a trailing ';'); fall back to the raw
    # string so that there is always at least one candidate.
    ids = [part for part in ids if part] or [id.strip()]

    # Membership test on the dict itself does not insert keys into the defaultdict.
    gene_ontology_id = next(
        (candidate for candidate in ids if candidate in gene_ontology_counts),
        ids[0],
    )

    gene_ontology_counts[gene_ontology_id] += 1
    return gene_ontology_id

In [None]:
# Assign one Gene Ontology ID per protein; the function also updates the global counter
uniprot['Gene_ontology_id'] = uniprot['Gene Ontology IDs'].apply(preprocess_gene_ontology)
# Number of distinct IDs that were assigned to at least one protein
gene_ontology_unique = len(gene_ontology_counts)
print('Number of unique Gene Ontology IDs: {}'.format(gene_ontology_unique))

In [None]:
# Inspect the table after the 'Gene_ontology_id' column has been added
uniprot

In [None]:
def filter_by_gene_ontology_counts(df, gene_ontology_filter_threshold):
    """Keep only the proteins whose assigned Gene Ontology ID is frequent enough.

    A row is kept when the global count of its 'Gene_ontology_id' (looked up in
    ``gene_ontology_counts``) is strictly greater than the threshold. Every kept
    row increments the global ``filtered_gene_ontology_counts`` for its ID, and
    the number of kept rows is printed.

    Args:
        df: DataFrame with a 'Gene_ontology_id' column. Any index is accepted.
        gene_ontology_filter_threshold: Exclusive lower bound on the ID count.

    Returns:
        A new DataFrame holding the kept rows with their original index labels
        (the index is not reset). ``df`` itself is left untouched.
    """
    # One boolean mask replaces the row-by-row df.drop(), which copied the whole
    # frame on every dropped row and required a 0..n-1 index for df.at[i].
    keep_mask = np.fromiter(
        (gene_ontology_counts[go_id] > gene_ontology_filter_threshold
         for go_id in df['Gene_ontology_id']),
        dtype=bool,
        count=len(df),
    )
    # .copy() so later column assignments on the result do not hit a slice of df.
    kept = df[keep_mask].copy()

    for go_id in kept['Gene_ontology_id']:
        filtered_gene_ontology_counts[go_id] = filtered_gene_ontology_counts.get(go_id, 0) + 1

    print('Number of Proteins after Gene Ontology Filtering: {}'.format(len(kept)))

    return kept

In [None]:
# Drop proteins whose assigned Gene Ontology ID is too rare; this also fills
# filtered_gene_ontology_counts as a side effect.
uniprot = filter_by_gene_ontology_counts(uniprot, gene_ontology_filter_threshold)

In [None]:
# Show the (ID, count) pairs that survived the threshold filter
print('Filtered Gene Ontology Dictionary:{}'.format(filtered_gene_ontology_counts.items()))

In [None]:
# Inspect the table after the Gene Ontology frequency filter
uniprot

In [None]:
# To add white spaces between all tokens in the sequence
def preprocess_sequence(df):
    """Insert a single space between all residues of every protein sequence.

    For example 'MKVL' becomes 'M K V L'. A one-residue sequence is returned
    unchanged and an empty sequence stays empty.

    Args:
        df: DataFrame with a string 'Sequence' column. The column is
            overwritten in place.

    Returns:
        The same DataFrame object, with the rewritten 'Sequence' column.
    """
    # Joining the characters directly puts exactly one space between every pair
    # of neighbours, including the first two residues, and needs no special case
    # for very short sequences.
    df['Sequence'] = df['Sequence'].map(lambda sequence: " ".join(sequence))
    return df

In [None]:
# Rewrite the 'Sequence' column as space-separated residues for the tokenizer
uniprot = preprocess_sequence(uniprot)

In [None]:
# Spot-check the first rows after sequence preprocessing
uniprot.head()

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Map every assigned Gene Ontology ID to a numeric class index.
gene_ontology_encoder = OrdinalEncoder()
# The encoder expects a 2-D input, hence the single-column reshape.
gene_ontology_column = np.array(uniprot['Gene_ontology_id']).reshape(-1, 1)
uniprot['Gene_ontology_id_encoded'] = gene_ontology_encoder.fit_transform(gene_ontology_column)

In [None]:
# Inspect the table after the 'Gene_ontology_id_encoded' column has been added
uniprot

In [None]:
preds = uniprot['Gene_ontology_id_encoded']

# 80% for training, 20% for testing, in table order (no shuffling).
# The boundary is derived from the number of rows that are actually left after
# filtering: sizing the split from num_sequence would make head()/tail() overlap
# whenever fewer than num_sequence rows remain, leaking training rows into the
# test set.
train_size = int(len(uniprot) * 0.8)

train = uniprot.iloc[:train_size].reset_index(drop=True)
train_y = preds.iloc[:train_size].reset_index(drop=True)

test = uniprot.iloc[train_size:].reset_index(drop=True)
test_y = preds.iloc[train_size:].reset_index(drop=True)

In [None]:
# Inspect the training split
train

In [None]:
# Inspect the encoded labels of the training split
train_y

In [None]:
# Renumber both splits from 0. The split cell above already re-indexes them, so
# these calls do not change the row labels any further.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

Initializing the language model from the already pre-trained MLM model:

In [None]:
import torch
from transformers import RobertaConfig
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Loading the already pretrained MLM model
# The configuration is built locally (it is not downloaded); presumably it has to
# match the architecture of the pre-trained MLM checkpoint — TODO confirm.
config = RobertaConfig(
    vocab_size=vocab_size,
    # Reuse the notebook-level constant instead of repeating the literal 2048
    max_position_embeddings=max_position_embeddings,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
    # NOTE(review): gene_ontology_unique is counted before the frequency filter,
    # while the label encoder is fitted after it, so this may be larger than the
    # number of classes present in the labels — confirm this is intended.
    num_labels=gene_ontology_unique,
)

mlm_model = "" # Add pre-trained MLM model here
model = AutoModelForSequenceClassification.from_pretrained(mlm_model, config=config)

In [None]:
# Persist the sequence-classification model as loaded above (before any fine-tuning)
model.save_pretrained(model_mlm_dir)

Initializing the tokenizer from the previous configuration:

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the pre-trained MLM checkpoint.
# NOTE(review): the dataset cell passes its own max_length / truncation / padding /
# return_tensors on every encode call — confirm whether the load-time keyword
# arguments below have any effect, and whether `max_len` is still an accepted name.
tokenizer = AutoTokenizer.from_pretrained(mlm_model, 
    max_len=max_sequence_length,
    truncation=True,
    padding='max_length',
    return_tensors='pt',
    )

Building the training dataset, and tokenizing sequences:

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """In-memory dataset of tokenized protein sequences and their class labels.

    All sequences are tokenized once in the constructor; every item is a dict
    with the keys 'input_ids', 'attention_mask' and 'labels'.
    """

    def __init__(self, df, tokenizer, max_length=512):
        """Tokenize every row of ``df``.

        Args:
            df: DataFrame with a 'Sequence' column (text to tokenize) and a
                'Gene_ontology_id_encoded' column (numeric class label).
            tokenizer: Object exposing ``encode_plus`` that returns
                'input_ids' and 'attention_mask' tensors with a leading batch
                axis of size 1.
            max_length: Length every sequence is truncated / padded to.
                Defaults to 512, the value that used to be hard-coded.
        """
        self.examples = []

        for example, label in zip(df['Sequence'].values, df['Gene_ontology_id_encoded'].values):
            # Tokenizing:
            x = tokenizer.encode_plus(example, max_length=max_length, truncation=True,
                                      padding='max_length', return_tensors='pt')
            # squeeze(0) removes only the batch axis; a bare squeeze() would also
            # drop the sequence axis when max_length is 1.
            self.examples.append({'input_ids': x['input_ids'].squeeze(0),
                                  'attention_mask': x['attention_mask'].squeeze(0),
                                  'labels': torch.tensor(label, dtype=torch.long)})

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the i-th example dict."""
        return self.examples[i]
      
# Create the train and test datasets (every sequence is tokenized up front in the constructor)
train_dataset = CustomDataset(train, tokenizer)
test_dataset = CustomDataset(test, tokenizer)

# Batches of 2; only the training loader shuffles, the test loader keeps row order
train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=2, shuffle=False)

In [None]:
print('Sampele inputs from the dataloader:')
# Only the first batch is inspected; show the first example of each field.
for batch in train_dataloader:
    for field in ('input_ids', 'attention_mask', 'labels'):
        print(batch[field][0])
    break

Training:

In [None]:
# Defining parameters:
TRAIN_EPOCHS = 200
LEARNING_RATE = 2e-5
TRAIN_BATCH_SIZE = 2
VALID_BATCH_SIZE = 2
WEIGHT_DECAY = 0.01

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration: evaluate and checkpoint once per epoch, keeping only
# the most recent checkpoint in model_dir.
args = TrainingArguments(
    output_dir=model_dir,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    # Use the notebook-level constant instead of repeating the literal 2e-5
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    num_train_epochs=TRAIN_EPOCHS,
    weight_decay=WEIGHT_DECAY,
    save_total_limit=1,
    push_to_hub=False,
)

In [None]:
from sklearn.metrics import accuracy_score
def compute_metrics(p):
    """Compute classification accuracy for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (one row of class scores per
            example) and ``label_ids`` (the true class indices).

    Returns:
        Dict with the single key "accuracy".
    """
    # The predicted class is the index of the highest score in each row
    predicted_classes = np.argmax(p.predictions, axis=1)
    return {"accuracy": accuracy_score(p.label_ids, predicted_classes)}

In [None]:
# Wire the model, training arguments, datasets and metric function together.
# NOTE(review): the evaluation set is the test split, so the per-epoch accuracy
# is measured on the same data that is later used for the final test score.
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
import wandb
# Read the Weights & Biases API key from the environment rather than pasting it
# into the notebook, where it would be saved and shared with the file.
# If WANDB_API_KEY is unset, key is None and wandb presumably falls back to its
# own credential lookup / login prompt — TODO confirm.
key = os.environ.get("WANDB_API_KEY")
wandb.login(key=key)
trainer.train()

In [None]:
# Save the model
model.save_pretrained(model_dir)

Predicting, and evaluating on test dataset:

In [None]:
def predict(test_dataloader, max_sequence_length, model):
    """Run the classifier over a dataloader and measure its accuracy.

    Args:
        test_dataloader: Iterable of batches; each batch is a dict holding the
            tensors 'input_ids', 'attention_mask' and 'labels'.
        max_sequence_length: Unused; kept so existing callers keep working.
        model: Model whose output exposes ``.logits`` with one row of class
            scores per example.

    Returns:
        Tuple ``(accuracy, predicted_labels)``: the fraction of correctly
        classified examples and the list of predicted class indices in
        dataloader order. ``accuracy`` is NaN when the dataloader is empty.
    """
    # Fall back to the CPU instead of failing on machines without a GPU
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)

    predicted_labels = []
    total_correct = 0
    total_labels = 0

    # Inference must run in eval mode (e.g. dropout disabled); the previous mode
    # is restored afterwards so the caller's model state is left as it was.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch in test_dataloader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                # Labels are not passed to the model: only the logits are needed,
                # so there is no reason to compute a loss here.
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                predicted_class = outputs.logits.argmax(dim=1)

                total_correct += (predicted_class == labels).sum().item()
                total_labels += len(labels)

                predicted_labels.extend(predicted_class.cpu().numpy())
    finally:
        model.train(was_training)

    # Accuracy is undefined without examples; avoid a ZeroDivisionError
    if total_labels == 0:
        return float('nan'), predicted_labels

    return total_correct / total_labels, predicted_labels

In [None]:
# Score the test split. The new column holds predicted class indices (the
# encoded labels), not Gene Ontology ID strings.
accuracy, test['Predicted_gene_ontology'] = predict(test_dataloader, max_sequence_length, model)

In [None]:
# Fraction of test proteins whose predicted class matches the encoded label
print('Accuracy on Test Dataset: {}'.format(accuracy))

In [None]:
# Compare encoded labels and predictions on the first 20 test rows
test.head(20)

In [None]:
# Export the test split together with its predictions.
# NOTE(review): this writes under '/kaggle/working' while the directories created
# at the top of the notebook live under '/working' — confirm the intended path.
test.to_csv('/kaggle/working/test.csv')