In [None]:
# Mount Google Drive at /content/drive so files stored there can be read and
# written from this Colab session.
from google.colab import drive
drive.mount('/content/drive')

In [6]:
!pip  install transformers



In [7]:
import os
import re
import sys
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel

def read_peptide_sequences(file):
    """Load every sequence of a FASTA file as an upper-case string.

    The path must exist and the file must contain at least one '>'
    character; otherwise an error message is printed and ``sys.exit(1)`` is
    called. Text before the first '>' is ignored. For each record the first
    line (the header) is discarded and the remaining lines are concatenated.

    Args:
        file: Path of the FASTA file to read.

    Returns:
        A list with one upper-cased sequence string per record, in file order.
    """
    if not os.path.exists(file):
        print(f'Error: file {file} does not exist.')
        sys.exit(1)

    with open(file) as handle:
        content = handle.read()

    if content.find('>') == -1:
        print(f'Error: the input file {file} seems not in FASTA format!')
        sys.exit(1)

    # Each chunk after a '>' is one record: header line first, sequence after.
    return [
        ''.join(record.split('\n')[1:]).upper()
        for record in content.split('>')[1:]
    ]

def extract_features(sequence, tokenizer, model):
    """Embed one sequence as the mean of its final-layer token vectors.

    The sequence is tokenized (special tokens added, truncated to at most
    512 tokens), passed through ``model`` without gradient tracking, and the
    last hidden states are averaged over the token axis. Only positions
    whose attention mask is 1 contribute to the average, so padding tokens
    (if the tokenizer produces any) cannot dilute the embedding.

    Args:
        sequence: A single sequence string.
        tokenizer: Callable tokenizer returning a mapping that contains at
            least ``attention_mask`` (e.g. a Hugging Face ``BertTokenizer``).
        model: Model whose output exposes ``last_hidden_state``
            (e.g. a Hugging Face ``BertModel``).

    Returns:
        numpy.ndarray: 1-D feature vector of length ``hidden_size``.
    """
    # A single sequence needs no padding; padding to 512 only adds [PAD]
    # positions that would then have to be excluded from the mean.
    encoded_input = tokenizer(
        sequence,
        add_special_tokens=True,
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )

    with torch.no_grad():
        outputs = model(**encoded_input)

    hidden_states = outputs.last_hidden_state

    # Mask-weighted mean over the token axis (dim=1).
    mask = encoded_input['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
    token_count = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    pooled_output = ((hidden_states * mask).sum(dim=1) / token_count).squeeze()
    features = pooled_output.numpy()

    return features

def main(file_path='/kaggle/input/ace-dataset/ACE_full_dataset.txt',
         output_csv='/kaggle/working//bert_features_ACE.csv',
         model_name='bert-base-uncased'):
    """Extract BERT features for every sequence in a FASTA file and save them.

    Args:
        file_path: FASTA file to read with ``read_peptide_sequences``.
        output_csv: Destination CSV; one row per sequence, one column per
            feature dimension, written without the index column.
        model_name: Name passed to ``from_pretrained`` for both the
            tokenizer and the model.
    """
    # Load pre-trained BERT model and tokenizer
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)
    model.eval()  # inference only: make sure dropout is disabled

    # Read peptide sequences
    peptide_sequences = read_peptide_sequences(file_path)

    # Extract BERT features for each sequence
    features_list = [
        extract_features(seq, tokenizer, model) for seq in peptide_sequences
    ]

    # Save features to CSV
    features_df = pd.DataFrame(features_list)
    features_df.to_csv(output_csv, index=False)

if __name__ == "__main__":
    main()


In [None]:
import numpy as np
from transformers import BertTokenizer, BertModel
import torch

import pandas as pd
import numpy as np
import re, os, sys
from itertools import product

def read_nucleotide_sequences(file):
    """Read the sequences of a FASTA file as normalized DNA strings.

    The path must exist and the file must contain at least one '>'
    character; otherwise an error message is printed and ``sys.exit(1)`` is
    called. For every record the header line is ignored, the sequence lines
    are concatenated and upper-cased, every character other than
    A, C, G, T, U or '-' is replaced by '-', and finally U is rewritten as T.

    Args:
        file: Path of the FASTA file to read.

    Returns:
        A list with one normalized sequence string per record, in file order.
    """
    if not os.path.exists(file):
        print('Error: file %s does not exist.' % file)
        sys.exit(1)
    with open(file) as f:
        records = f.read()
    if '>' not in records:
        print('Error: the input file %s seems not in FASTA format!' % file)
        sys.exit(1)

    fasta_sequences = []
    for fasta in records.split('>')[1:]:
        lines = fasta.split('\n')
        # lines[0] is the header. Only sequences are returned, so it is not
        # parsed at all; an empty header therefore no longer raises.
        sequence = re.sub('[^ACGTU-]', '-', ''.join(lines[1:]).upper())
        fasta_sequences.append(sequence.replace('U', 'T'))
    return fasta_sequences

#!/usr/bin/env python
#_*_coding:utf-8_*_

import re

def check_fasta_with_equal_length(fastas):
    """Report whether all records have sequences of one common length.

    Each item of ``fastas`` is indexed at position 1 to obtain its sequence.

    Returns:
        True when exactly one distinct sequence length occurs; False
        otherwise, which includes an empty ``fastas``.
    """
    distinct_lengths = {len(record[1]) for record in fastas}
    return len(distinct_lengths) == 1

def get_min_sequence_length(fastas):
    """Return the length of the shortest sequence in ``fastas``.

    Each item of ``fastas`` is indexed at position 1 to obtain its sequence.
    The true minimum is returned even when every sequence is longer than
    10000 characters. For an empty ``fastas`` the historical sentinel value
    10000 is returned.
    """
    return min((len(record[1]) for record in fastas), default=10000)

def get_min_sequence_length_1(fastas):
    """Return the shortest sequence length in ``fastas``, ignoring '-' characters.

    Each item of ``fastas`` is indexed at position 1 to obtain its sequence;
    every '-' is removed before the length is measured. The true minimum is
    returned even when every sequence is longer than 10000 characters. For
    an empty ``fastas`` the historical sentinel value 10000 is returned.
    """
    return min((len(record[1].replace('-', '')) for record in fastas), default=10000)
def readFasta(file):
    """Read a protein FASTA file into a list of ``[name, sequence]`` pairs.

    The path must exist and the file must contain at least one '>'
    character; otherwise an error message is printed and ``sys.exit(1)`` is
    called. For every record the name is the first whitespace-separated
    token of the header line (an empty string when the header is blank).
    The sequence lines are concatenated and upper-cased, and every character
    outside ``ARNDCQEGHILKMFPSTWYV-`` is replaced by '-'.

    Args:
        file: Path of the FASTA file to read.

    Returns:
        A list of ``[name, sequence]`` lists, in file order.
    """
    if not os.path.exists(file):
        print('Error: "' + file + '" does not exist.')
        sys.exit(1)

    with open(file) as f:
        records = f.read()

    if '>' not in records:
        print('The input file seems not in fasta format.')
        sys.exit(1)

    myFasta = []
    for fasta in records.split('>')[1:]:
        lines = fasta.split('\n')
        # A blank header has no tokens; fall back to '' instead of raising
        # IndexError on [0].
        header_tokens = lines[0].split()
        name = header_tokens[0] if header_tokens else ''
        sequence = re.sub('[^ARNDCQEGHILKMFPSTWYV-]', '-', ''.join(lines[1:]).upper())
        myFasta.append([name, sequence])
    return myFasta


# Load the pre-trained BERT tokenizer and model once, at notebook level.
# `tokenizer` and `model` are module-level names that extract_features
# (defined below) reads when it encodes a sequence.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)


def extract_features(sequence, tokenizer=None, model=None):
    """Embed one sequence as the mean of its final-layer token vectors.

    The sequence is tokenized (special tokens added, truncated to at most
    512 tokens), passed through the model without gradient tracking, and the
    last hidden states are averaged over the token axis. Only positions
    whose attention mask is 1 contribute to the average, so padding tokens
    (if the tokenizer produces any) cannot dilute the embedding.

    Args:
        sequence: A single sequence string; call once per sequence.
        tokenizer: Optional tokenizer. Defaults to the notebook-level
            ``tokenizer``.
        model: Optional model. Defaults to the notebook-level ``model``.

    Returns:
        numpy.ndarray: 1-D feature vector of length ``hidden_size``.
    """
    # The parameters shadow the notebook-level names, so the fallback has to
    # go through globals(). Accepting them explicitly keeps this definition
    # call-compatible with the three-argument form used elsewhere.
    if tokenizer is None:
        tokenizer = globals()['tokenizer']
    if model is None:
        model = globals()['model']

    # Tokenize the input sequence. A single sequence needs no padding;
    # padding to 512 only adds [PAD] positions that would then have to be
    # excluded from the mean.
    encoded_input = tokenizer(
        sequence,
        add_special_tokens=True,
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )

    # Pass the tokenized input through the BERT model
    with torch.no_grad():
        outputs = model(**encoded_input)

    # Extract the hidden states from BERT's output
    hidden_states = outputs.last_hidden_state

    # Mask-weighted mean over the token axis (dim=1) gives a fixed-size vector
    mask = encoded_input['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
    token_count = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    pooled_output = ((hidden_states * mask).sum(dim=1) / token_count).squeeze()

    # Convert the feature vector to a numpy array
    features = pooled_output.numpy()

    return features
    

# Example usage
# Small in-memory sample; it is not used by the statements below.
dna_sequences = ['ATCGATCGATCG', 'CGATCGATCGATCG']

# NOTE(review): read_nucleotide_sequences replaces every character other than
# A/C/G/T/U/- with '-'; confirm this file really holds nucleotide sequences.
fastas = read_nucleotide_sequences('/content/drive/MyDrive/Smile_feature/AMPylation_data.fasta')

# extract_features is written for a single sequence string, so call it once
# per record to get one feature row per sequence.
features = np.array([extract_features(seq) for seq in fastas])

# NOTE(review): the file name says "FastText" but the values are BERT
# embeddings; confirm the intended output name.
data_csv = pd.DataFrame(features)
data_csv.to_csv('/content/drive/MyDrive/Smile_feature/FastText_feature_dna.csv')

print(features)
