In [1]:
# Colab-only: mount Google Drive at /content/drive so that files stored on the
# Drive can be reached through ordinary filesystem paths under that directory.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install fair-esm
!pip install biopython

Collecting fair-esm
  Downloading fair_esm-2.0.0-py3-none-any.whl.metadata (37 kB)
Downloading fair_esm-2.0.0-py3-none-any.whl (93 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/93.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fair-esm
Successfully installed fair-esm-2.0.0
Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.84


In [4]:
import os

import esm
import pandas as pd
import torch

# Load the pretrained ESM-2 checkpoint `esm2_t6_8M_UR50D` together with its alphabet.
# Other loaders from the same package (e.g. `esm1v_t33_650M_UR90S_1`) can be
# substituted here; NOTE(review): the representation layer index used further
# down (6) is tied to this checkpoint and must be changed along with it.
# Source: https://github.com/facebookresearch/esm
model, alphabet = esm.pretrained.esm2_t6_8M_UR50D()
# Callable that converts a list of (label, sequence) pairs into
# (labels, sequence strings, token tensor) for the model.
batch_converter = alphabet.get_batch_converter()
model.eval()  # disables dropout for deterministic results

# Read protein sequences from a FASTA file
def read_fasta(file_path):
    """Return the sequences stored in a FASTA file, in file order.

    Every line starting with ``>`` is treated as a record header and closes the
    record collected so far; header text itself is discarded. All other lines
    are stripped of surrounding whitespace and concatenated into the current
    record. Records that end up empty (e.g. two headers in a row) are skipped.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        list[str]: One string per non-empty record.
    """
    records = []
    pieces = []  # stripped lines of the record currently being assembled
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            if not raw_line.startswith('>'):
                pieces.append(raw_line.strip())
                continue
            # Header reached: flush whatever was collected before it.
            joined = ''.join(pieces)
            if joined:
                records.append(joined)
            pieces = []
    # The final record has no header after it, so flush it explicitly.
    tail = ''.join(pieces)
    if tail:
        records.append(tail)
    return records

# Project root on the mounted Drive; the input FASTA and the output CSV are
# both located relative to it.
path = '/content/drive/MyDrive/Watashara_Projects/TIP/'
sequences = read_fasta(path+"Features_extraction/TR_IND_Pos_Neg.fasta")

# Small batches (e.g. 2) keep peak memory of the forward pass low.
batch_size = 2
batches = [sequences[i:i+batch_size] for i in range(0, len(sequences), batch_size)]

# Index of the transformer layer whose hidden states are exported. 6 matches
# the esm2_t6_8M_UR50D checkpoint loaded above; change it together with the model.
repr_layer = 6

# Feed inputs on whatever device the model currently lives on, so this cell
# keeps working unchanged if the model is moved to a GPU.
device = next(model.parameters()).device

# One mean-pooled embedding (1-D CPU tensor) per input sequence, in input order.
all_sequence_representations = []

for batch in batches:
    data = [(f'protein{i}', seq) for i, seq in enumerate(batch)]
    batch_labels, batch_strs, batch_tokens = batch_converter(data)
    # Number of non-padding tokens in each row of the batch.
    batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)

    # Only the hidden states are needed, so contact prediction is not requested
    # (it was computed and discarded before, costing time and memory).
    with torch.no_grad():
        results = model(batch_tokens.to(device), repr_layers=[repr_layer])
    token_representations = results["representations"][repr_layer].cpu()

    # Per-sequence embedding = mean over residue positions.
    # NOTE: token 0 is always a beginning-of-sequence token, so the first
    # residue is token 1; the slice also drops the last non-padding token.
    for i, tokens_len in enumerate(batch_lens):
        all_sequence_representations.append(token_representations[i, 1 : tokens_len - 1].mean(0))

# Convert tensor representations to numpy arrays
all_sequence_representations_numpy = [tensor.numpy() for tensor in all_sequence_representations]

# Convert numpy arrays to list of lists (one row per sequence)
all_sequence_representations_list = [list(arr) for arr in all_sequence_representations_numpy]

# Write to CSV: one row per sequence, no header and no index column.
output_csv = path+'features/sm2_TIP_all.csv'
# Create the target folder if needed so the run is not lost at write time.
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
feat = pd.DataFrame(all_sequence_representations_list)
feat.to_csv(output_csv, index=False, header=False)
