In [None]:
import os
import gzip
import shutil
import random

import numpy as np
import pandas as pd
import torch

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

from pyteomics import fasta, parser, mass, achrom

from datasets import Dataset
from transformers import EarlyStoppingCallback

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive (Colab-only step). The FASTA inputs are
# read from, and the trained models/logs are written to, paths under
# '/content/drive/My Drive/Thesis/'.
drive.mount('/content/drive')

In [None]:
# Seed every random number generator used in this notebook (PyTorch, NumPy and
# the standard library) with the same value so that outputs are reproducible.
SEED = 42
for _seed_fn in (torch.manual_seed, np.random.seed, random.seed):
    _seed_fn(SEED)

In [None]:
# Stage the input data: copy the gzipped FASTA files from Google Drive into the
# working directory of the current runtime (skipping files that are already there).
drive_paths = [
    '/content/drive/My Drive/Thesis/uniprotkb_ecoli_AND_model_organism_8333_2024_07_03.fasta.gz',
    '/content/drive/My Drive/Thesis/uniprotkb_bacteria_AND_model_organism_2_2024_08_06.fasta.gz',
]

for drive_path in drive_paths:
    file_name = os.path.basename(drive_path)

    # Nothing to do when a local copy is already present
    if os.path.isfile(file_name):
        print(f'{file_name} already exists in the current runtime.')
        continue

    # The source must exist on Drive before it can be copied
    if not os.path.isfile(drive_path):
        print(f'{file_name} not found in Google Drive.')
        continue

    print(f'Copying the FASTA file from Google Drive: {drive_path}...')
    shutil.copy(drive_path, '.')
    print(f'{file_name} copied to the current runtime.')

In [None]:
# Masking techniques definition

# Substitution masking: overwrite randomly chosen residues with a placeholder
def substitute_mask(peptide, num_maskings=1, substitution='X'):
    """Return `peptide` with up to `num_maskings` distinct positions replaced.

    Positions are drawn without replacement via `random.sample`; when the
    peptide is shorter than `num_maskings`, every position is replaced.
    Each chosen residue is swapped for `substitution` (default 'X').
    """
    residues = list(peptide)
    how_many = min(num_maskings, len(residues))
    masked_positions = set(random.sample(range(len(residues)), how_many))
    return ''.join(
        substitution if position in masked_positions else residue
        for position, residue in enumerate(residues)
    )

# Deletion masking: drop randomly chosen residues from the peptide
def delete_mask(peptide, num_maskings=1):
    """Return `peptide` with up to `num_maskings` randomly chosen residues removed.

    One residue is removed per round at a uniformly drawn position; once the
    peptide is empty the remaining rounds have nothing left to delete.
    """
    residues = list(peptide)
    rounds_left = num_maskings
    while rounds_left > 0 and residues:
        residues.pop(random.randint(0, len(residues) - 1))
        rounds_left -= 1
    return ''.join(residues)

# Insertion masking: add random residues at random positions.
# The default alphabet is the 20 standard amino acids (one-letter codes).
def insert_mask(peptide, num_maskings=1, amino_acids="ACDEFGHIKLMNPQRSTVWY"):
    """Return `peptide` with `num_maskings` residues inserted.

    For each insertion a position between 0 and the current length (inclusive)
    is drawn first, then a residue is drawn from `amino_acids`.
    """
    residues = [*peptide]
    inserted = 0
    while inserted < num_maskings:
        position = random.randint(0, len(residues))
        new_residue = random.choice(amino_acids)
        residues[position:position] = [new_residue]
        inserted += 1
    return ''.join(residues)

In [None]:
# Function to plot training and validation loss
def plot_training_validation_loss(trainer):
    """Plot the training and validation loss recorded by a Hugging Face Trainer.

    Parameters
    ----------
    trainer : object exposing `state.log_history`, a list of dicts. Entries
        containing 'loss' are treated as training-loss points and entries
        containing 'eval_loss' as validation-loss points.

    The x coordinate of each point is the 'epoch' value stored in the same log
    entry, so both curves share a true epoch axis even when the training loss
    is logged on a step schedule. If any entry of a series lacks 'epoch', that
    series falls back to 1-based positions (the previous behaviour).
    """
    logs = trainer.state.log_history

    def _series(loss_key):
        # Collect (x, y) values for one loss curve from the log history
        entries = [log for log in logs if loss_key in log]
        losses = [log[loss_key] for log in entries]
        epochs = [log.get('epoch') for log in entries]
        if any(epoch is None for epoch in epochs):
            epochs = list(range(1, len(losses) + 1))
        return epochs, losses

    epochs_train, train_loss = _series('loss')
    epochs_eval, eval_loss = _series('eval_loss')

    # Plot the losses
    plt.figure(figsize=(10, 6))
    plt.plot(epochs_train, train_loss, label='Training Loss')
    plt.plot(epochs_eval, eval_loss, label='Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.legend()
    plt.show()

In [None]:
# Function to plot distribution of protein lengths

def plot_protein_length_distribution(df, bin_size=200, x_max=2000, title='Distribution of Protein Lengths'):
    """Draw a histogram of protein lengths.

    Parameters
    ----------
    df : the protein lengths to plot (the notebook passes the
        'protein_length' column of the protein table).
    bin_size : width of each histogram bin.
    x_max : upper limit of the displayed x-axis; bins beyond it are computed
        but not shown.
    title : figure title.
    """
    # Pad the upper bound by one full bin so the last edge is always >= the
    # longest protein. A fixed padding smaller than bin_size can leave the
    # maximum outside the last bin, silently dropping those proteins.
    bins = np.arange(0, df.max() + bin_size, bin_size)

    plt.figure(figsize=(10, 6))
    sns.histplot(df, bins=bins)
    plt.title(title)
    plt.xlabel('Protein Length')
    plt.ylabel('Frequency')
    plt.xticks(bins)
    plt.xlim(0, x_max)
    plt.show()

In [None]:
# Function to extract protein details by providing file name

def _parse_uniprot_header(description):
    """Extract (protein_name, gene_name, organism_species, strain) from a FASTA header.

    Expects UniProt-style headers such as
    'sp|P0A7G6|RECA_ECOLI Protein RecA OS=Escherichia coli (strain K12) OX=83333 GN=recA PE=1 SV=2'.
    Any field that cannot be found is reported as 'Unknown' instead of raising.
    """
    # Entry name is the third '|'-separated field, up to the first space
    header_fields = description.split('|')
    name_field = header_fields[2] if len(header_fields) > 2 else description
    protein_name = name_field.split(' ')[0]

    gene_name = description.split('GN=')[1].split(' ')[0] if 'GN=' in description else 'Unknown'

    organism_species = 'Unknown'
    strain = 'Unknown'
    if 'OS=' in description:
        # Isolate the OS= value by cutting at the next 'XX=' style tag
        organism_field = description.split('OS=', 1)[1]
        for next_tag in (' OX=', ' GN=', ' PE=', ' SV='):
            organism_field = organism_field.split(next_tag)[0]

        organism_species = organism_field.split('(strain')[0].strip()
        # Look for the strain only inside the organism field, so the word
        # 'strain' in the protein description cannot be picked up by mistake
        if 'strain ' in organism_field:
            strain = organism_field.split('strain ', 1)[1].split(')')[0]

    return protein_name, gene_name, organism_species, strain


def get_protein_details(data_file):
    """Read gzipped FASTA files and return one row per protein.

    Parameters
    ----------
    data_file : list of paths to gzip-compressed FASTA files. A single path
        (str or os.PathLike) is also accepted and treated as a one-item list.

    Returns
    -------
    pandas.DataFrame with columns 'protein_name', 'gene_name', 'strain',
    'organism_species', 'sequence' and 'protein_length'. Sequences containing
    'X' are skipped. Header fields that are missing are filled with 'Unknown'.
    """
    # List of per-protein records; converted to a DataFrame at the end
    protein_details = []

    # A bare string would otherwise be iterated character by character
    if isinstance(data_file, (str, os.PathLike)):
        file_list = [data_file]
    else:
        file_list = data_file

    for file_name in file_list:
        # To extract the FASTA file
        with gzip.open(file_name, mode='rt') as gzfile:
            for description, sequence in fasta.FASTA(gzfile):

                # As 'X' is not a standard amino acid and is not supported by Pyteomics, we exclude sequences containing 'X'
                if 'X' in sequence:
                    continue

                protein_name, gene_name, organism_species, strain = _parse_uniprot_header(description)

                protein_details.append({
                    'protein_name': protein_name,
                    'gene_name': gene_name,
                    'strain': strain,
                    'organism_species': organism_species,
                    'sequence': sequence,
                    'protein_length': len(sequence)
                })

    # Return protein details as DataFrame
    return pd.DataFrame(protein_details)

In [None]:
# Function to extract peptide details by a dataFrame containing protein details

def _peptide_record(protein_row, peptide_sequence):
    """Build one output row: the parent protein's metadata plus a peptide sequence."""
    return {
        'protein_name': protein_row['protein_name'],
        'gene_name': protein_row['gene_name'],
        'strain': protein_row['strain'],
        'organism_species': protein_row['organism_species'],
        'protein_length': len(protein_row['sequence']),
        'peptide_sequence': peptide_sequence,
    }


def get_peptide_details(df_proteins, num_classes = 2001, give_decoy = False, give_masked_data = False,
                        num_of_masks = 2, number_of_samples = 10, masking_techniques = [substitute_mask, delete_mask, insert_mask]):
    """Sample proteins by length and digest them in silico into tryptic peptides.

    Parameters
    ----------
    df_proteins : DataFrame with at least the columns 'protein_name',
        'gene_name', 'strain', 'organism_species', 'sequence' and
        'protein_length' (as produced by get_protein_details).
    num_classes : total number of proteins to sample; `num_classes // 3`
        proteins are drawn from each of three length bins (0-200, 201-400,
        >400) with a fixed random_state of 42.
    give_decoy : when True, also emit the reversed sequence of every peptide,
        labelled with the same protein.
    give_masked_data : when True, also emit masked variants of every peptide.
    num_of_masks : number of maskings passed to each masking function.
    number_of_samples : how many times each masking function is applied per
        peptide.
    masking_techniques : callables `f(peptide, num_of_masks) -> str` used to
        create the masked variants. (Previously this argument was overwritten
        inside the function and therefore ignored.)

    Returns
    -------
    pandas.DataFrame with one row per peptide (original, decoy or masked) and
    the columns 'protein_name', 'gene_name', 'strain', 'organism_species',
    'protein_length' and 'peptide_sequence'.

    Raises
    ------
    ValueError (from DataFrame.sample) if a length bin holds fewer than
    `num_classes // 3` proteins.
    """
    # Split the proteins into the three length bins
    protein_lengths = df_proteins['protein_length']
    length_bins = [
        df_proteins[protein_lengths.between(0, 200)],
        df_proteins[protein_lengths.between(201, 400)],
        df_proteins[protein_lengths > 400],
    ]

    # To get equal samples from each bin
    sample_size = num_classes // 3

    # Randomly sample each bin and merge into the final protein dataset
    df_final_proteins = pd.concat(
        [length_bin.sample(n=sample_size, random_state=42) for length_bin in length_bins]
    ).reset_index(drop=True)

    # Digest the selected proteins into peptides
    peptide_details = []

    for _, row in df_final_proteins.iterrows():
        sequence = row['sequence']

        for _start_pos, peptide in parser.icleave(sequence, parser.expasy_rules['trypsin']):

            # To neglect peptides that have low mass and will not get extracted in trypsin digestion
            if len(peptide) <= 3:
                continue

            # Retention time, mass and end position used to be computed here but
            # were never stored; the calls were dropped because they only cost
            # time and could raise on residues the pyteomics tables do not cover.
            peptide_details.append(_peptide_record(row, peptide))

            # To reverse peptides (Decoy)
            if give_decoy:
                peptide_details.append(_peptide_record(row, peptide[::-1]))

            # To generate masked data; rows carry 'protein_length' like every
            # other row so the column has no missing values
            if give_masked_data:
                for _ in range(number_of_samples):
                    for mask_technique in masking_techniques:
                        masked_peptide = mask_technique(peptide, num_of_masks)
                        peptide_details.append(_peptide_record(row, masked_peptide))

    # Return peptide details in the form of DataFrame
    return pd.DataFrame(peptide_details)

In [None]:
# To give accuracy and classification report

def evaluate_model_on_test(tokenizer, label_encoder, trainer, test_data, batch_size=64):
    """Compute accuracy and a classification report on a test set.

    Two kinds of `test_data` are supported:

    * a pre-tokenized Hugging Face `Dataset` exposing `column_names` that
      include 'input_ids' and 'label' (this is what the data-preparation cells
      of this notebook build) - predictions come from `trainer.predict`, which
      handles batching and device placement;
    * a table with raw 'peptide_sequence' and 'protein_name' columns - the
      sequences are tokenized here and run through `trainer.model` in batches
      on the model's own device.

    Parameters
    ----------
    tokenizer : tokenizer used for the raw-sequence path.
    label_encoder : fitted encoder exposing `classes_` and `transform`.
    trainer : Hugging Face Trainer holding the fine-tuned model.
    test_data : see above.
    batch_size : number of sequences per forward pass on the raw-sequence path.

    Returns
    -------
    (accuracy, report) : float accuracy and the text report from
    `classification_report`, with one row per class in `label_encoder.classes_`.
    """
    column_names = getattr(test_data, 'column_names', None) or []

    if 'input_ids' in column_names and 'label' in column_names:
        # Pre-tokenized dataset: labels are already encoded integers
        prediction_output = trainer.predict(test_data)
        logits = prediction_output.predictions
        if isinstance(logits, tuple):
            logits = logits[0]
        predictions = np.argmax(logits, axis=-1)
        true_labels_encoded = np.asarray(prediction_output.label_ids)
    else:
        test_sequences = list(test_data['peptide_sequence'])
        test_labels = list(test_data['protein_name'])

        # To encode labels to match the prediction format
        true_labels_encoded = label_encoder.transform(test_labels)

        model = trainer.model
        model.eval()
        device = next(model.parameters()).device

        # Predict in batches so large test sets do not exhaust memory, and move
        # the inputs to the device the model lives on
        batch_predictions = []
        with torch.no_grad():
            for start in range(0, len(test_sequences), batch_size):
                tokens = tokenizer(test_sequences[start:start + batch_size], padding=True,
                                   truncation=True, max_length=512, return_tensors="pt")
                tokens = {key: value.to(device) for key, value in tokens.items()}
                outputs = model(**tokens)
                batch_predictions.append(torch.argmax(outputs.logits, dim=-1).cpu().numpy())

        if batch_predictions:
            predictions = np.concatenate(batch_predictions)
        else:
            predictions = np.array([], dtype=int)

    # Calculate accuracy
    accuracy = accuracy_score(true_labels_encoded, predictions)

    # Generate classification report. `labels` is passed explicitly because a
    # test split may not contain every class, and `target_names` must line up
    # with the label list one-to-one.
    report = classification_report(
        true_labels_encoded,
        predictions,
        labels=np.arange(len(label_encoder.classes_)),
        target_names=label_encoder.classes_,
    )

    return accuracy, report

# 1. Plotting frequency distribution of E. coli proteins

In [None]:
# Parse the local copy of the E. coli FASTA archive into a protein table
# (get_protein_details expects a list of file paths) ...
ecoli_data_file = ['uniprotkb_ecoli_AND_model_organism_8333_2024_07_03.fasta.gz']
ecoli_df = get_protein_details(ecoli_data_file)

# ... and plot the histogram of its 'protein_length' column
plot_protein_length_distribution(ecoli_df['protein_length'], bin_size=200, x_max=2000, title='Distribution of Protein Lengths (E. coli K12 Strain)')

# Sampling E. coli proteins to peptides

In [None]:
# Experiment 1 data: parse the E. coli proteins, then digest a length-balanced
# sample into tryptic peptides with get_peptide_details.

ecoli_data_file = ['uniprotkb_ecoli_AND_model_organism_8333_2024_07_03.fasta.gz']
ecoli_df = get_protein_details(ecoli_data_file)

# num_classes = 2001 -> 2001 // 3 = 667 proteins from each of the three length bins;
# no decoy (reversed) and no masked peptides are added
ecoli_peptides_exp1 = get_peptide_details(ecoli_df, num_classes = 2001, give_decoy = False, give_masked_data = False)

# 2. BERT with all sequences of E. coli as peptides

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

In [None]:
# Data preparation for experiment 1: encode labels, tokenize the peptides,
# split into train/validation/test sets and create the classification model.
df = ecoli_peptides_exp1
columns_to_keep = ['peptide_sequence']
# Take an explicit copy so that adding the 'label' column below writes to an
# independent frame instead of a slice of `df` (avoids SettingWithCopyWarning)
sample_df_features = df[columns_to_keep].copy()
sample_df_label = df['protein_name']

# Encode labels: every distinct protein name becomes one integer class id
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(sample_df_label)
sample_df_features['label'] = encoded_labels

# Final features and labels
X = sample_df_features.drop(columns=['label'])
y = sample_df_features['label']

# To Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# To process tokenization into batches else it would lead RAM crash
def tokenize_batch(peptides_batch):
    """Tokenize a list of peptide strings into fixed-length (512) PyTorch tensors."""
    return tokenizer(peptides_batch, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

# Tokenize in batches
batch_size = 500
tokenized_data_list = []

for start in range(0, len(sample_df_features), batch_size):
    end = min(start + batch_size, len(sample_df_features))
    batch_peptides = sample_df_features['peptide_sequence'].iloc[start:end].tolist()
    tokenized_batch = tokenize_batch(batch_peptides)
    tokenized_data_list.append({
        'input_ids': tokenized_batch['input_ids'],
        'attention_mask': tokenized_batch['attention_mask']
    })

# Combine all tokenized data (every batch is padded to the same length, so the
# tensors can be concatenated along the first dimension)
combined_input_ids = torch.cat([batch['input_ids'] for batch in tokenized_data_list])
combined_attention_mask = torch.cat([batch['attention_mask'] for batch in tokenized_data_list])

# Convert to DataFrame
tokenized_df = pd.DataFrame({
    'input_ids': combined_input_ids.tolist(),
    'attention_mask': combined_attention_mask.tolist(),
    'label': y.tolist()
})

# Split data into training (70%), validation (15%), and test (15%) sets
train_df, temp_df = train_test_split(tokenized_df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Converting data into format suitable for Hugging Face
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Prepare dataset features for Hugging Face
# NOTE(review): this mapping is not referenced anywhere in the visible part of the notebook
dataset_features = {
    'input_ids': 'int32',
    'attention_mask': 'int32',
    'label': 'int32'
}

# For padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define the model: one output class per distinct protein name
num_labels = len(sample_df_label.unique())
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)


In [None]:

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # Checkpoints are written to this directory
    num_train_epochs=10,             # Maximum number of training epochs
    per_device_train_batch_size=64,  # Training batch size
    per_device_eval_batch_size=64,   # Batch size for evaluation
    warmup_steps=50,                 # Warmup steps
    weight_decay=0.01,               # Setting weight decay for L2 regularization
    logging_dir='/content/drive/My Drive/Thesis/saved_model_BERT1/logs',  # Save logs
    logging_strategy="epoch",        # Log the training loss exactly once per epoch (a step count derived from
                                     # len(train_dataset)//64 only approximates this and is 0 for tiny datasets)
    evaluation_strategy="epoch",     # Evaluate model at the end of every epoch
    save_strategy="epoch",           # Save checkpoints at the end of each epoch
    load_best_model_at_end=True,     # Keep the best model at the end of training
    fp16=torch.cuda.is_available(),  # Mixed precision needs a CUDA GPU; fall back to fp32 otherwise
    dataloader_num_workers=4
)

# Metric function called by the Trainer after each evaluation
def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max predictions against the labels."""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    return {'accuracy': accuracy}

# Define the Trainer
BERT_trainer_1 = Trainer(
    model=model,
    args=training_args,                # Training arguments
    data_collator=data_collator,       # For batching
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,   # Metrics computation
    # Stop when the monitored metric has not improved for three consecutive evaluations
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the model
BERT_trainer_1.train()

In [None]:
# Plot training and validation loss
plot_training_validation_loss(BERT_trainer_1)

In [None]:
# Persist the fine-tuned model and its tokenizer twice: in the local runtime
# and in Google Drive
save_directory = '/content/drive/My Drive/Thesis/saved_model_BERT1'
os.makedirs(save_directory, exist_ok=True)

for _target_dir in ('./saved_model_BERT1', save_directory):
    model.save_pretrained(_target_dir)
    tokenizer.save_pretrained(_target_dir)

In [None]:
# Score the fine-tuned model on the held-out test split and print the results
test_accuracy, test_report = evaluate_model_on_test(
    tokenizer=tokenizer,
    label_encoder=label_encoder,
    trainer=BERT_trainer_1,
    test_data=test_dataset,
)
print(f"Test Accuracy: {test_accuracy:.4f}", "Test Classification Report:", test_report, sep="\n")

# 3. Plotting frequency distribution of B. subtilis proteins

In [None]:
# Parse the local copy of the B. subtilis FASTA archive into a protein table
# (get_protein_details expects a list of file paths) ...
bsub_data_file = ['uniprotkb_bacteria_AND_model_organism_2_2024_08_06.fasta.gz']
bsub_df = get_protein_details(bsub_data_file)

# ... and plot the histogram of its 'protein_length' column
plot_protein_length_distribution(bsub_df['protein_length'], bin_size=200, x_max=2000, title='Distribution of Protein Lengths (B. subtilis 168 Strain)')

# 4. Fine-tuning BERT with the extracted peptide sequences of E. coli and B. subtilis

In [None]:
# Experiment 2 data: parse both proteomes, then digest a length-balanced sample
# of each into tryptic peptides with get_peptide_details.

ecoli_data_file = ['uniprotkb_ecoli_AND_model_organism_8333_2024_07_03.fasta.gz']
ecoli_df = get_protein_details(ecoli_data_file)

bsub_data_file = ['uniprotkb_bacteria_AND_model_organism_2_2024_08_06.fasta.gz']
bsub_df = get_protein_details(bsub_data_file)


# num_classes = 2001 -> 667 proteins per length bin for each species;
# no decoy (reversed) and no masked peptides are added
ecoli_peptides_exp2 = get_peptide_details(ecoli_df, num_classes = 2001, give_decoy = False, give_masked_data = False)
bsub_peptides_exp2 = get_peptide_details(bsub_df, num_classes = 2001, give_decoy = False, give_masked_data = False)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

# Alias the in-memory peptide tables generated above (nothing is read from disk here)
df1 = ecoli_peptides_exp2
df2 = bsub_peptides_exp2

# To combine data from both species i.e. E. coli and B. subtilis
# (ignore_index=True gives the combined table a fresh 0..n-1 index)
df = pd.concat([df1, df2], ignore_index=True)

In [None]:
# Data preparation for experiment 2: encode labels, tokenize the peptides,
# split into train/validation/test sets and create the classification model.
columns_to_keep = ['peptide_sequence']
# Take an explicit copy so that adding the 'label' column below writes to an
# independent frame instead of a slice of `df` (avoids SettingWithCopyWarning)
sample_df_features = df[columns_to_keep].copy()
sample_df_label = df['protein_name']

# Encode labels: every distinct protein name becomes one integer class id
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(sample_df_label)
sample_df_features['label'] = encoded_labels

# Final features and labels
X = sample_df_features.drop(columns=['label'])
y = sample_df_features['label']

# To Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# To process tokenization into batches else it would lead RAM crash
def tokenize_batch(peptides_batch):
    """Tokenize a list of peptide strings into fixed-length (512) PyTorch tensors."""
    return tokenizer(peptides_batch, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

# Tokenize in batches
batch_size = 500
tokenized_data_list = []

for start in range(0, len(sample_df_features), batch_size):
    end = min(start + batch_size, len(sample_df_features))
    batch_peptides = sample_df_features['peptide_sequence'].iloc[start:end].tolist()
    tokenized_batch = tokenize_batch(batch_peptides)
    tokenized_data_list.append({
        'input_ids': tokenized_batch['input_ids'],
        'attention_mask': tokenized_batch['attention_mask']
    })

# Combine all tokenized data (every batch is padded to the same length, so the
# tensors can be concatenated along the first dimension)
combined_input_ids = torch.cat([batch['input_ids'] for batch in tokenized_data_list])
combined_attention_mask = torch.cat([batch['attention_mask'] for batch in tokenized_data_list])

# Convert to DataFrame
tokenized_df = pd.DataFrame({
    'input_ids': combined_input_ids.tolist(),
    'attention_mask': combined_attention_mask.tolist(),
    'label': y.tolist()
})

# Split data into training (70%), validation (15%), and test (15%) sets
train_df, temp_df = train_test_split(tokenized_df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Converting data into format suitable for Hugging Face
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Prepare dataset features for Hugging Face
# NOTE(review): this mapping is not referenced anywhere in the visible part of the notebook
dataset_features = {
    'input_ids': 'int32',
    'attention_mask': 'int32',
    'label': 'int32'
}

# For padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define the model: one output class per distinct protein name
num_labels = len(sample_df_label.unique())
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)


In [None]:

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # Checkpoints are written to this directory
    num_train_epochs=10,             # Maximum number of training epochs
    per_device_train_batch_size=64,  # Training batch size
    per_device_eval_batch_size=64,   # Batch size for evaluation
    warmup_steps=50,                 # Warmup steps
    weight_decay=0.01,               # Setting weight decay for L2 regularization
    logging_dir='/content/drive/My Drive/Thesis/saved_model_BERT_2/logs',  # Save logs
    logging_strategy="epoch",        # Log the training loss exactly once per epoch (a step count derived from
                                     # len(train_dataset)//64 only approximates this and is 0 for tiny datasets)
    evaluation_strategy="epoch",     # Evaluate model at the end of every epoch
    save_strategy="epoch",           # Save checkpoints at the end of each epoch

    load_best_model_at_end=True,     # Keep the best model at the end of training
    fp16=torch.cuda.is_available(),  # Mixed precision needs a CUDA GPU; fall back to fp32 otherwise
    dataloader_num_workers=5
)

# Metric function called by the Trainer after each evaluation
def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max predictions against the labels."""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    return {'accuracy': accuracy}

# Define the Trainer
BERT_trainer_2 = Trainer(
    model=model,
    args=training_args,                # Training arguments
    data_collator=data_collator,       # For batching
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,   # Metrics computation
    # Stop when the monitored metric has not improved for three consecutive evaluations
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the model
BERT_trainer_2.train()

In [None]:
plot_training_validation_loss(BERT_trainer_2)

In [None]:
# Persist the fine-tuned model and its tokenizer twice: in the local runtime
# and in Google Drive
save_directory = '/content/drive/My Drive/Thesis/saved_model_BERT_2'
os.makedirs(save_directory, exist_ok=True)

for _target_dir in ('./saved_model_BERT_2', save_directory):
    model.save_pretrained(_target_dir)
    tokenizer.save_pretrained(_target_dir)

In [None]:
# Score the fine-tuned model on the held-out test split and print the results
test_accuracy, test_report = evaluate_model_on_test(
    tokenizer=tokenizer,
    label_encoder=label_encoder,
    trainer=BERT_trainer_2,
    test_data=test_dataset,
)
print(f"Test Accuracy: {test_accuracy:.4f}", "Test Classification Report:", test_report, sep="\n")

# 5. Fine-tuning BERT with the Extracted Peptide Sequences of E. coli and Their Decoy Sequences (Reverse Peptides)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

In [None]:
# Experiment 5 data: parse the E. coli proteins, then digest a length-balanced
# sample into tryptic peptides with get_peptide_details.

ecoli_data_file = ['uniprotkb_ecoli_AND_model_organism_8333_2024_07_03.fasta.gz']
ecoli_df = get_protein_details(ecoli_data_file)

# give_decoy = True: every peptide is followed by its reversed sequence,
# stored under the same protein name
ecoli_peptides_exp5 = get_peptide_details(ecoli_df, num_classes = 2001, give_decoy = True, give_masked_data = False)

# `df` is the table consumed by the data-preparation cell that follows
df = ecoli_peptides_exp5

In [None]:
# Data preparation for experiment 5 (peptides + decoys): encode labels, tokenize,
# split into train/validation/test sets and create the classification model.
columns_to_keep = ['peptide_sequence']
# Take an explicit copy so that adding the 'label' column below writes to an
# independent frame instead of a slice of `df` (avoids SettingWithCopyWarning)
sample_df_features = df[columns_to_keep].copy()
sample_df_label = df['protein_name']

# Encode labels: every distinct protein name becomes one integer class id
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(sample_df_label)
sample_df_features['label'] = encoded_labels

# Final features and labels
X = sample_df_features.drop(columns=['label'])
y = sample_df_features['label']

# To Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# To process tokenization into batches else it would lead RAM crash
def tokenize_batch(peptides_batch):
    """Tokenize a list of peptide strings into fixed-length (512) PyTorch tensors."""
    return tokenizer(peptides_batch, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

# Tokenize in batches (the leftover debug print of the column dtype was removed)
batch_size = 500
tokenized_data_list = []

for start in range(0, len(sample_df_features), batch_size):
    end = min(start + batch_size, len(sample_df_features))
    batch_peptides = sample_df_features['peptide_sequence'].iloc[start:end].tolist()
    tokenized_batch = tokenize_batch(batch_peptides)
    tokenized_data_list.append({
        'input_ids': tokenized_batch['input_ids'],
        'attention_mask': tokenized_batch['attention_mask']
    })

# Combine all tokenized data (every batch is padded to the same length, so the
# tensors can be concatenated along the first dimension)
combined_input_ids = torch.cat([batch['input_ids'] for batch in tokenized_data_list])
combined_attention_mask = torch.cat([batch['attention_mask'] for batch in tokenized_data_list])

# Convert to DataFrame
tokenized_df = pd.DataFrame({
    'input_ids': combined_input_ids.tolist(),
    'attention_mask': combined_attention_mask.tolist(),
    'label': y.tolist()
})

# Split data into training (70%), validation (15%), and test (15%) sets
train_df, temp_df = train_test_split(tokenized_df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Converting data into format suitable for Hugging Face
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Prepare dataset features for Hugging Face
# NOTE(review): this mapping is not referenced anywhere in the visible part of the notebook
dataset_features = {
    'input_ids': 'int32',
    'attention_mask': 'int32',
    'label': 'int32'
}

# For padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define the model: one output class per distinct protein name
num_labels = len(sample_df_label.unique())
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # Checkpoints are written to this directory
    num_train_epochs=10,             # Maximum number of training epochs
    per_device_train_batch_size=64,  # Training batch size
    per_device_eval_batch_size=64,   # Batch size for evaluation
    warmup_steps=50,                 # Warmup steps
    weight_decay=0.01,               # Setting weight decay for L2 regularization
    logging_dir='/content/drive/My Drive/Thesis/saved_model_BERT_3/logs',  # Save logs
    logging_strategy="epoch",        # Log the training loss exactly once per epoch (a step count derived from
                                     # len(train_dataset)//64 only approximates this and is 0 for tiny datasets)
    evaluation_strategy="epoch",     # Evaluate model at the end of every epoch
    save_strategy="epoch",           # Save checkpoints at the end of each epoch

    load_best_model_at_end=True,     # Keep the best model at the end of training
    fp16=torch.cuda.is_available(),  # Mixed precision needs a CUDA GPU; fall back to fp32 otherwise
    dataloader_num_workers=5
)

# Metric function called by the Trainer after each evaluation
def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max predictions against the labels."""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    return {'accuracy': accuracy}

# Define the Trainer
BERT_trainer_3 = Trainer(
    model=model,
    args=training_args,                # Training arguments
    data_collator=data_collator,       # For batching
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,   # Metrics computation
    # Stop when the monitored metric has not improved for three consecutive evaluations
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the model
BERT_trainer_3.train()

In [None]:
# Plot training and validation loss
plot_training_validation_loss(BERT_trainer_3)

In [None]:
# Persist the fine-tuned model and its tokenizer twice: in the local runtime
# and in Google Drive
save_directory = '/content/drive/My Drive/Thesis/saved_model_BERT_3'
os.makedirs(save_directory, exist_ok=True)

for _target_dir in ('./saved_model_BERT_3', save_directory):
    model.save_pretrained(_target_dir)
    tokenizer.save_pretrained(_target_dir)

In [None]:
# Score the fine-tuned model on the held-out test split and print the results
test_accuracy, test_report = evaluate_model_on_test(
    tokenizer=tokenizer,
    label_encoder=label_encoder,
    trainer=BERT_trainer_3,
    test_data=test_dataset,
)
print(f"Test Accuracy: {test_accuracy:.4f}", "Test Classification Report:", test_report, sep="\n")

# 6. Generating masked artificial peptides of E. coli

In [None]:
# Experiment 6 data: parse the E. coli proteins, then digest a smaller
# length-balanced sample into tryptic peptides plus masked variants.

ecoli_data_file = ['uniprotkb_ecoli_AND_model_organism_8333_2024_07_03.fasta.gz']
ecoli_df = get_protein_details(ecoli_data_file)

# num_classes = 201 -> 201 // 3 = 67 proteins per length bin.
# give_masked_data = True with the function's defaults (number_of_samples = 10,
# three masking techniques, num_of_masks = 2) adds 30 masked variants per peptide.
ecoli_peptides_exp6 = get_peptide_details(ecoli_df, num_classes = 201, give_decoy = False, give_masked_data = True)

# 7. Fine-tuning BERT with the extracted peptide sequences of E. coli and their masked sequences (artificial data)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

In [None]:
# Data preparation for experiment 7 (peptides + masked variants): encode labels,
# tokenize, make a stratified train/validation/test split and create the model.
df = ecoli_peptides_exp6

columns_to_keep = ['peptide_sequence']
# Take an explicit copy so that the column assignments below write to an
# independent frame instead of a slice of `df` (avoids SettingWithCopyWarning)
sample_df_features = df[columns_to_keep].copy()
sample_df_label = df['protein_name']

# Encode labels: every distinct protein name becomes one integer class id
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(sample_df_label)
sample_df_features['label'] = encoded_labels

# Final features and labels
X = sample_df_features.drop(columns=['label'])
y = sample_df_features['label']

# To Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# To process tokenization into batches else it would lead RAM crash
def tokenize_batch(peptides_batch):
    """Tokenize a list of peptide strings into fixed-length (512) PyTorch tensors."""
    return tokenizer(peptides_batch, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

# Tokenize in batches
batch_size = 500
tokenized_data_list = []


# Make sure every entry is a string before tokenizing: missing values become ''
# (a redundant self-assignment of the column was removed here)
sample_df_features['peptide_sequence'] = sample_df_features['peptide_sequence'].fillna('').astype(str)


for start in range(0, len(sample_df_features), batch_size):
    end = min(start + batch_size, len(sample_df_features))
    batch_peptides = sample_df_features['peptide_sequence'].iloc[start:end].tolist()
    tokenized_batch = tokenize_batch(batch_peptides)
    tokenized_data_list.append({
        'input_ids': tokenized_batch['input_ids'],
        'attention_mask': tokenized_batch['attention_mask']
    })

# Combine all tokenized data (every batch is padded to the same length, so the
# tensors can be concatenated along the first dimension)
combined_input_ids = torch.cat([batch['input_ids'] for batch in tokenized_data_list])
combined_attention_mask = torch.cat([batch['attention_mask'] for batch in tokenized_data_list])

# Convert to DataFrame
tokenized_df = pd.DataFrame({
    'input_ids': combined_input_ids.tolist(),
    'attention_mask': combined_attention_mask.tolist(),
    'label': y.tolist()
})

# Split data into training (70%), validation (15%), and test (15%) sets,
# stratified by label so each split keeps the class proportions
train_df, temp_df = train_test_split(tokenized_df, test_size=0.3, random_state=42, stratify=y)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['label'])

# Converting data into format suitable for Hugging Face
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Prepare dataset features for Hugging Face
# NOTE(review): this mapping is not referenced anywhere in the visible part of the notebook
dataset_features = {
    'input_ids': 'int32',
    'attention_mask': 'int32',
    'label': 'int32'
}

# For padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define the model: one output class per distinct protein name
num_labels = len(sample_df_label.unique())
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',  # Save the outputs in a directory
    num_train_epochs=10,              # Number of training epochs
    per_device_train_batch_size=64,  # Training batch size
    per_device_eval_batch_size=64,   # Batch size for evaluation
    warmup_steps=50,                 # Warmup steps
    weight_decay=0.01,               # Setting weight decay for L2 regularization
    logging_dir='/content/drive/My Drive/Thesis/saved_model_BERT_4/logs',  # Save logs
    logging_steps=len(train_dataset)//64,                # Logging steps
    evaluation_strategy="epoch",     # Evaluate model at the end of every epoch
    save_strategy="epoch",           # Save checkpoints at the end of each epoch
    
    load_best_model_at_end=True,      # Keep the best model at the end of training
    fp16=True,                        # To improve GPU utilization and reduce training time
    dataloader_num_workers=5
)

# Metric function handed to the Trainer for evaluation
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        dict: ``{'accuracy': score}`` where ``score`` is ``accuracy_score``
        of the labels against the arg-max (last axis) of the logits.
    """
    class_scores, true_labels = eval_pred
    predicted_labels = np.argmax(class_scores, axis=-1)
    return {'accuracy': accuracy_score(true_labels, predicted_labels)}

# Define the Trainer
# Early stopping: halt once the monitored evaluation metric has failed to improve
# for three consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

BERT_trainer_4 = Trainer(
    model=model,
    args=training_args,               # Training arguments
    data_collator=data_collator,      # For batching
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # Metrics computation
    callbacks=[early_stopping],
)

# Train the model
BERT_trainer_4.train()

In [None]:
# Plot training and validation loss for the BERT run
# (plot_training_validation_loss is a helper defined outside this section; it receives the trainer object)
plot_training_validation_loss(BERT_trainer_4)

In [None]:
# Persist the fine-tuned model and its tokenizer in two places:
# the current runtime's working directory and Google Drive.
local_directory = './saved_model_BERT_4'
save_directory = '/content/drive/My Drive/Thesis/saved_model_BERT_4'

# Copy in the runtime
model.save_pretrained(local_directory)
tokenizer.save_pretrained(local_directory)

# Copy on Google Drive (create the target folder first if it is missing)
os.makedirs(save_directory, exist_ok=True)
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

In [None]:
# Score the fine-tuned BERT model on the held-out test split.
# evaluate_model_on_test is a helper defined outside this section; its result
# unpacks into an accuracy value and a classification report.
test_accuracy, test_report = evaluate_model_on_test(
    tokenizer, label_encoder, BERT_trainer_4, test_dataset
)
print(f"Test Accuracy: {test_accuracy:.4f}")
print("Test Classification Report:", test_report, sep="\n")

# 8. Fine-tuning RoBERTa with the extracted peptide sequences of E. coli and their masked sequences

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

In [None]:
# Reuse the in-memory peptide DataFrame `ecoli_peptides_exp6` (defined outside this section).
# No file is read here: `df` is only another name for the same object.
df = ecoli_peptides_exp6

In [None]:
# The peptide sequence is the only model input; the protein name is the class to predict.
columns_to_keep = ['peptide_sequence']
# Take an explicit copy so the column assignment below writes to this frame only
# and does not raise pandas' SettingWithCopyWarning on a slice of `df`.
sample_df_features = df[columns_to_keep].copy()
sample_df_label = df['protein_name']

# Encode labels: map each protein name to an integer class id
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(sample_df_label)
sample_df_features['label'] = encoded_labels

# Final features and labels
X = sample_df_features.drop(columns=['label'])
y = sample_df_features['label']

# Initialize the RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenization is done batch by batch; tokenizing everything at once exhausts RAM
def tokenize_batch(peptides_batch):
    """Tokenize one batch of peptide strings with the section's tokenizer.

    Args:
        peptides_batch: The peptide sequences of one batch.

    Returns:
        The tokenizer output for the batch, with every sequence padded or
        truncated to 512 tokens and returned as PyTorch tensors.
    """
    encoding_options = dict(
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt',
    )
    return tokenizer(peptides_batch, **encoding_options)

# Tokenize in batches
batch_size = 500
tokenized_data_list = []

# Replace missing peptides with '' and cast to str so the tokenizer only receives text.
# .assign() builds a new frame rather than writing into what may be a slice of `df`.
sample_df_features = sample_df_features.assign(
    peptide_sequence=sample_df_features['peptide_sequence'].fillna('').astype(str)
)

for start in range(0, len(sample_df_features), batch_size):
    end = min(start + batch_size, len(sample_df_features))
    batch_peptides = sample_df_features['peptide_sequence'].iloc[start:end].tolist()
    tokenized_batch = tokenize_batch(batch_peptides)
    # Keep only the two tensors needed downstream
    tokenized_data_list.append({
        'input_ids': tokenized_batch['input_ids'],
        'attention_mask': tokenized_batch['attention_mask']
    })

# Combine all tokenized data: concatenate the per-batch tensors along the first dimension
combined_input_ids = torch.cat([batch['input_ids'] for batch in tokenized_data_list])
combined_attention_mask = torch.cat([batch['attention_mask'] for batch in tokenized_data_list])

# Convert to DataFrame: one row per peptide with its token ids, attention mask and encoded label
tokenized_df = pd.DataFrame(dict(
    input_ids=combined_input_ids.tolist(),
    attention_mask=combined_attention_mask.tolist(),
    label=y.tolist(),
))

# 70% training / 30% hold-out, then the hold-out is halved into validation and test (15% each).
# Both splits are stratified on the label.
train_df, temp_df = train_test_split(
    tokenized_df, test_size=0.3, random_state=42, stratify=y
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df['label']
)

# Wrap each split as a Hugging Face Dataset
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(split_df) for split_df in (train_df, val_df, test_df)
)

# Prepare dataset features for Hugging Face
dataset_features = dict(input_ids='int32', attention_mask='int32', label='int32')

# For padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define the model: pretrained 'roberta-base' with one output class per distinct label value
num_labels = len(sample_df_label.unique())
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base', num_labels=num_labels
)

In [None]:
# Define training arguments
# Per-device batch size, shared by training, evaluation and the logging interval below
per_device_batch_size = 64

training_args = TrainingArguments(
    output_dir='./results',  # Save the outputs in a directory
    num_train_epochs=20,              # Number of training epochs
    per_device_train_batch_size=per_device_batch_size,  # Training batch size
    per_device_eval_batch_size=per_device_batch_size,   # Batch size for evaluation
    warmup_steps=50,                 # Warmup steps
    weight_decay=0.01,               # Setting weight decay for L2 regularization
    logging_dir='/content/drive/My Drive/Thesis/saved_model_RoBERTa_5/logs',  # Save logs
    # Log about once per epoch. Clamp to 1: a training set smaller than one batch
    # would otherwise yield 0, which TrainingArguments rejects for step-based logging.
    logging_steps=max(1, len(train_dataset) // per_device_batch_size),
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",     # Evaluate model at the end of every epoch
    save_strategy="epoch",           # Save checkpoints at the end of each epoch
    load_best_model_at_end=True,      # Keep the best model at the end of training
    # Mixed precision needs a CUDA device; fall back to full precision on CPU-only runtimes instead of raising
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=5
)

# Metric function handed to the Trainer for evaluation
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        dict: ``{'accuracy': score}`` where ``score`` is ``accuracy_score``
        of the labels against the arg-max (last axis) of the logits.
    """
    class_scores, true_labels = eval_pred
    predicted_labels = np.argmax(class_scores, axis=-1)
    return {'accuracy': accuracy_score(true_labels, predicted_labels)}

# Define the Trainer
# Early stopping: halt once the monitored evaluation metric has failed to improve
# for three consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

RoBERTa_trainer_5 = Trainer(
    model=model,
    args=training_args,               # Training arguments
    data_collator=data_collator,      # For batching
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # Metrics computation
    callbacks=[early_stopping],
)

# Train the model
RoBERTa_trainer_5.train()

In [None]:
# Plot training and validation loss for the RoBERTa run
# (plot_training_validation_loss is a helper defined outside this section; it receives the trainer object)
plot_training_validation_loss(RoBERTa_trainer_5)

In [None]:
# Persist the fine-tuned model and its tokenizer in two places:
# the current runtime's working directory and Google Drive.
local_directory = './saved_model_RoBERTa_5'
save_directory = '/content/drive/My Drive/Thesis/saved_model_RoBERTa_5'

# Copy in the runtime
model.save_pretrained(local_directory)
tokenizer.save_pretrained(local_directory)

# Copy on Google Drive (create the target folder first if it is missing)
os.makedirs(save_directory, exist_ok=True)
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

In [None]:
# Score the fine-tuned RoBERTa model on the held-out test split.
# evaluate_model_on_test is a helper defined outside this section; its result
# unpacks into an accuracy value and a classification report.
test_accuracy, test_report = evaluate_model_on_test(
    tokenizer, label_encoder, RoBERTa_trainer_5, test_dataset
)
print(f"Test Accuracy: {test_accuracy:.4f}")
print("Test Classification Report:", test_report, sep="\n")

# 9. Fine-tuning ALBERT with the extracted peptide sequences of E. coli and their masked sequences

In [None]:
from transformers import AlbertTokenizer, AlbertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

In [None]:
# Reuse the in-memory peptide DataFrame `ecoli_peptides_exp6` (defined outside this section).
# No file is read here: `df` is only another name for the same object.
df = ecoli_peptides_exp6

In [None]:
# The peptide sequence is the only model input; the protein name is the class to predict.
columns_to_keep = ['peptide_sequence']
# Take an explicit copy so the column assignment below writes to this frame only
# and does not raise pandas' SettingWithCopyWarning on a slice of `df`.
sample_df_features = df[columns_to_keep].copy()
sample_df_label = df['protein_name']

# Encode labels: map each protein name to an integer class id
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(sample_df_label)
sample_df_features['label'] = encoded_labels

# Final features and labels
X = sample_df_features.drop(columns=['label'])
y = sample_df_features['label']

# Initialize the ALBERT tokenizer
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

# Tokenization is done batch by batch; tokenizing everything at once exhausts RAM
def tokenize_batch(peptides_batch):
    """Tokenize one batch of peptide strings with the section's tokenizer.

    Args:
        peptides_batch: The peptide sequences of one batch.

    Returns:
        The tokenizer output for the batch, with every sequence padded or
        truncated to 512 tokens and returned as PyTorch tensors.
    """
    encoding_options = dict(
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt',
    )
    return tokenizer(peptides_batch, **encoding_options)

# Tokenize in batches
batch_size = 500
tokenized_data_list = []

# Replace missing peptides with '' and cast to str so the tokenizer only receives text.
# .assign() builds a new frame rather than writing into what may be a slice of `df`.
sample_df_features = sample_df_features.assign(
    peptide_sequence=sample_df_features['peptide_sequence'].fillna('').astype(str)
)

for start in range(0, len(sample_df_features), batch_size):
    end = min(start + batch_size, len(sample_df_features))
    batch_peptides = sample_df_features['peptide_sequence'].iloc[start:end].tolist()
    tokenized_batch = tokenize_batch(batch_peptides)
    # Keep only the two tensors needed downstream
    tokenized_data_list.append({
        'input_ids': tokenized_batch['input_ids'],
        'attention_mask': tokenized_batch['attention_mask']
    })

# Combine all tokenized data: concatenate the per-batch tensors along the first dimension
combined_input_ids = torch.cat([batch['input_ids'] for batch in tokenized_data_list])
combined_attention_mask = torch.cat([batch['attention_mask'] for batch in tokenized_data_list])

# Convert to DataFrame: one row per peptide with its token ids, attention mask and encoded label
tokenized_df = pd.DataFrame(dict(
    input_ids=combined_input_ids.tolist(),
    attention_mask=combined_attention_mask.tolist(),
    label=y.tolist(),
))

# 70% training / 30% hold-out, then the hold-out is halved into validation and test (15% each).
# Both splits are stratified on the label.
train_df, temp_df = train_test_split(
    tokenized_df, test_size=0.3, random_state=42, stratify=y
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df['label']
)

# Wrap each split as a Hugging Face Dataset
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(split_df) for split_df in (train_df, val_df, test_df)
)

# Prepare dataset features for Hugging Face
dataset_features = dict(input_ids='int32', attention_mask='int32', label='int32')

# For padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define the model: pretrained 'albert-base-v2' with one output class per distinct label value
num_labels = len(sample_df_label.unique())
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2', num_labels=num_labels
)

In [None]:
# Define training arguments
# Per-device batch size, shared by training, evaluation and the logging interval below
per_device_batch_size = 32

training_args = TrainingArguments(
    output_dir='./results',  # Save the outputs in a directory
    num_train_epochs=20,              # Number of training epochs
    per_device_train_batch_size=per_device_batch_size,  # Training batch size
    per_device_eval_batch_size=per_device_batch_size,   # Batch size for evaluation
    warmup_steps=50,                 # Warmup steps
    weight_decay=0.01,               # Setting weight decay for L2 regularization
    logging_dir='/content/drive/My Drive/Thesis/saved_model_ALBERT_6/logs',  # Save logs
    # Log about once per epoch. Clamp to 1: a training set smaller than one batch
    # would otherwise yield 0, which TrainingArguments rejects for step-based logging.
    logging_steps=max(1, len(train_dataset) // per_device_batch_size),
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",     # Evaluate model at the end of every epoch
    save_strategy="epoch",           # Save checkpoints at the end of each epoch
    load_best_model_at_end=True,      # Keep the best model at the end of training
    # Mixed precision needs a CUDA device; fall back to full precision on CPU-only runtimes instead of raising
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=4
)

# Metric function handed to the Trainer for evaluation
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        dict: ``{'accuracy': score}`` where ``score`` is ``accuracy_score``
        of the labels against the arg-max (last axis) of the logits.
    """
    class_scores, true_labels = eval_pred
    predicted_labels = np.argmax(class_scores, axis=-1)
    return {'accuracy': accuracy_score(true_labels, predicted_labels)}

# Define the Trainer
# Early stopping: halt once the monitored evaluation metric has failed to improve
# for three consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

ALBERT_trainer_6 = Trainer(
    model=model,
    args=training_args,               # Training arguments
    data_collator=data_collator,      # For batching
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # Metrics computation
    callbacks=[early_stopping],
)

# Train the model
ALBERT_trainer_6.train()

In [None]:
# Plot training and validation loss for the ALBERT run
# (plot_training_validation_loss is a helper defined outside this section; it receives the trainer object)
plot_training_validation_loss(ALBERT_trainer_6)

In [None]:
# Persist the fine-tuned model and its tokenizer in two places:
# the current runtime's working directory and Google Drive.
local_directory = './saved_model_ALBERT_6'
save_directory = '/content/drive/My Drive/Thesis/saved_model_ALBERT_6'

# Copy in the runtime
model.save_pretrained(local_directory)
tokenizer.save_pretrained(local_directory)

# Copy on Google Drive (create the target folder first if it is missing)
os.makedirs(save_directory, exist_ok=True)
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

In [None]:
# Score the fine-tuned ALBERT model on the held-out test split.
# evaluate_model_on_test is a helper defined outside this section; its result
# unpacks into an accuracy value and a classification report.
test_accuracy, test_report = evaluate_model_on_test(
    tokenizer, label_encoder, ALBERT_trainer_6, test_dataset
)
print(f"Test Accuracy: {test_accuracy:.4f}")
print("Test Classification Report:", test_report, sep="\n")

# 10. Fine-tuning ProteinBERT with the extracted protein sequences of E. coli

In [None]:
# The following code was run on local system as Colab doesn't support Tensorflow version 2.4.0
# Used ProteinBERT from https://github.com/nadavbra/protein_bert
import os
import pandas as pd
from IPython.display import display
from tensorflow import keras
from sklearn.model_selection import train_test_split
from proteinbert import OutputType, OutputSpec, FinetuningModelGenerator, load_pretrained_model, finetune, evaluate_by_len
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs

In [None]:
# Load the dataset
# get_protein_details is a helper defined outside this section; it is called with a list of FASTA paths.
# NOTE(review): later cells read 'seq' and 'label' columns from the result - confirm the helper provides them.
ecoli_data_file = ['uniprotkb_ecoli_AND_model_organism_8333_2024_07_03.fasta.gz']
ecoli_df = get_protein_details(ecoli_data_file)

# Alias used by the split below (same object, no copy)
protein_ecoli_data = ecoli_df

In [None]:
# Split the data into training (70%), test (15%) and validation (15%) sets.
# The first call holds out 30%; the second halves that hold-out. No stratification is applied.

train_set, temp_set = train_test_split(protein_ecoli_data, test_size = 0.3, random_state = 42)
test_set, valid_set = train_test_split(temp_set, test_size = 0.5, random_state = 42)

In [None]:
# Report how many records landed in each split
print(f'{len(train_set)} training set records, {len(valid_set)} validation set records, {len(test_set)} test set records.')

# Collect every label that occurs in any of the three splits
all_labels = set(train_set['label']) | set(valid_set['label']) | set(test_set['label'])

# Sorted list of the distinct labels
UNIQUE_LABELS = sorted(all_labels)

# Categorical output over UNIQUE_LABELS.
# NOTE(review): the first OutputType argument (False) is assumed to select a per-protein
# rather than per-residue output - confirm against ProteinBERT's OutputType definition.
OUTPUT_TYPE = OutputType(False, 'categorical')
OUTPUT_SPEC = OutputSpec(OUTPUT_TYPE, UNIQUE_LABELS)


In [None]:
# Load Pre-trained ProteinBERT Model
# The call's result unpacks into a model generator and the input encoder used with it.
pretrained_model_generator, input_encoder = load_pretrained_model()

# Create the Fine-tuning Model
# Wraps the pretrained generator with the output spec defined earlier; dropout rate is 0.5.
model_generator = FinetuningModelGenerator(pretrained_model_generator, OUTPUT_SPEC, 
    pretraining_model_manipulation_function=get_model_with_hidden_layers_as_outputs, dropout_rate=0.5)

# Set up Callbacks for Training
# - ReduceLROnPlateau: scale the learning rate by 0.25 after 1 epoch without improvement, never below 1e-05.
# - EarlyStopping: stop after 2 epochs without improvement and restore the best weights.
training_callbacks = [
    keras.callbacks.ReduceLROnPlateau(patience = 1, factor = 0.25, min_lr = 1e-05, verbose = 1),
    keras.callbacks.EarlyStopping(patience = 2, restore_best_weights = True),
]

# Fine-tune the Model
# Trains on the training split and validates on the validation split ('seq' as input, 'label' as target).
# Fine-tuning starts with the pretrained layers frozen and ends with one final epoch at sequence length 1024.
finetune(model_generator, input_encoder, OUTPUT_SPEC, train_set['seq'], train_set['label'], valid_set['seq'], valid_set['label'], 
    seq_len=512, batch_size=32, max_epochs_per_stage=5, lr=1e-04, begin_with_frozen_pretrained_layers=True, 
    lr_with_frozen_pretrained_layers=1e-02, n_final_epochs=1, final_seq_len=1024, final_lr=1e-05, callbacks=training_callbacks)

# Evaluate the Model on the Test Set
# The result unpacks into a results object and a confusion matrix, both printed below.
results, confusion_matrix = evaluate_by_len(model_generator, input_encoder, OUTPUT_SPEC, test_set['seq'], test_set['label'], start_seq_len=1024, start_batch_size=32)

print('Test-set performance:')
print(results)
print('Confusion matrix:')
print(confusion_matrix)