In [4]:
# Mount Google Drive so the notebook can read and write files under
# /content/drive (the paths used in main() live under /content/drive/MyDrive).
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
!pip  install transformers



In [3]:
import os
import re
import sys
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel

def read_peptide_sequences(file):
    """Read peptide sequences from a FASTA file.

    A record starts at every line whose first non-blank character is ``>``.
    All following lines, up to the next header line, are concatenated with
    every whitespace character removed and then upper-cased. Header text is
    discarded, and any text before the first header is ignored.

    Args:
        file: Path to the FASTA file.

    Returns:
        list[str]: One sequence per header line, in file order. A header
        without sequence lines yields an empty string, so the position of
        each entry still matches the position of its record in the file.

    Raises:
        SystemExit: With exit code 1 when the file does not exist or does
            not contain any FASTA header line.
    """
    if not os.path.exists(file):
        print(f'Error: file {file} does not exist.')
        sys.exit(1)

    with open(file) as f:
        content = f.read()

    # Parse line by line instead of splitting the whole text on '>':
    # a '>' inside a header/description must not start a new record.
    records = []
    for raw_line in content.splitlines():
        line = raw_line.strip()
        if line.startswith('>'):
            records.append([])
        elif records:
            # Drop inner/trailing whitespace so it never reaches the tokenizer.
            records[-1].append(''.join(line.split()))

    if not records:
        print(f'Error: the input file {file} seems not in FASTA format!')
        sys.exit(1)

    return [''.join(chunks).upper() for chunks in records]

def extract_features(sequence, tokenizer, model):
    """Embed one sequence as the mean of its last-layer token states.

    The sequence is tokenized (special tokens added, truncated to at most
    512 tokens), passed through ``model`` without gradient tracking, and the
    last hidden states are averaged over the token axis. The average is
    weighted by the attention mask, so only real tokens contribute; padding
    positions, if the tokenizer produced any, are excluded.

    Args:
        sequence: Input text (here: a peptide sequence string).
        tokenizer: Callable tokenizer returning a mapping of PyTorch tensors
            (e.g. a Hugging Face ``BertTokenizer``).
        model: Callable model whose output exposes ``last_hidden_state``
            (e.g. a Hugging Face ``BertModel``).

    Returns:
        numpy.ndarray: 1-D feature vector of length ``hidden_size``.
    """
    # No padding is requested: a single sequence needs none, and padded
    # positions would otherwise have to be masked out of the mean anyway.
    encoded = tokenizer(
        sequence,
        add_special_tokens=True,
        truncation=True,
        max_length=512,
        return_tensors='pt',
    )
    inputs = dict(encoded.items())

    # Run on whatever device the model lives on (CPU or GPU), if it says so.
    device = getattr(model, 'device', None)
    if device is not None:
        inputs = {key: tensor.to(device) for key, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Per the transformers docs: (batch=1, seq_len, hidden_size).
    hidden_states = outputs.last_hidden_state

    attention_mask = inputs.get('attention_mask')
    if attention_mask is None:
        # Tokenizer gave no mask: treat every position as a real token.
        attention_mask = torch.ones(
            hidden_states.shape[:2], device=hidden_states.device
        )

    # Masked mean over the token axis.
    weights = attention_mask.to(hidden_states.device).unsqueeze(-1).to(hidden_states.dtype)
    token_sum = (hidden_states * weights).sum(dim=1)
    token_count = weights.sum(dim=1).clamp(min=1.0)  # guard against division by zero
    pooled_output = (token_sum / token_count).squeeze(0)  # drop only the batch axis

    return pooled_output.cpu().numpy()

def main(file_path=None, output_csv=None, model_name='bert-base-uncased'):
    """Extract BERT features for every sequence in a FASTA file and save them as CSV.

    Args:
        file_path: Path to the input FASTA file. Defaults to the project's
            ``Features_extraction/TR_IND_Pos_Neg.fasta`` on the mounted Drive.
        output_csv: Path of the CSV file to write. Defaults to the project's
            ``features/bert_features_ACE.csv`` on the mounted Drive.
        model_name: Name or path passed to ``from_pretrained`` for both the
            tokenizer and the model.

    The CSV holds one row per sequence, in file order, and one column per
    feature dimension; no index column is written.
    """
    path = '/content/drive/MyDrive/Watashara_Projects/TIP/'
    if file_path is None:
        file_path = path + 'Features_extraction/TR_IND_Pos_Neg.fasta'
    if output_csv is None:
        output_csv = path + 'features/bert_features_ACE.csv'

    # Read the input first so a wrong path fails before the model download.
    peptide_sequences = read_peptide_sequences(file_path)

    # Create the output directory up front: to_csv does not create missing
    # directories, and failing after the whole extraction would waste the run.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Load pre-trained BERT model and tokenizer
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)

    # Extract BERT features for each sequence
    features_list = [
        extract_features(seq, tokenizer, model) for seq in peptide_sequences
    ]

    # Save features to CSV
    features_df = pd.DataFrame(features_list)
    features_df.to_csv(output_csv, index=False)

# Entry point: run the full feature-extraction pipeline when this code is
# executed directly (which includes running the cell in the notebook kernel).
if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]