In [3]:
# Mount Google Drive at /content/drive (Colab-only); main() reads and writes under this mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [1]:
!pip install transformers



In [5]:
import os
import re
import sys
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from collections import Counter


def read_peptide_sequences(file):
    """Read peptide sequences from a FASTA file.

    Every record starts with a ``>`` header line. The remaining lines of the
    record are stripped of all whitespace, joined into a single string and
    upper-cased. Header lines are discarded.

    Args:
        file: Path to the FASTA file.

    Returns:
        list[str]: One sequence per record, in file order. A record that has
        a header but no sequence lines yields an empty string, so the list
        keeps one entry per record.

    Exits:
        Prints an error message and calls ``sys.exit(1)`` when the file does
        not exist or contains no ``>`` character at all.
    """
    if not os.path.exists(file):
        print(f'Error: file {file} does not exist.')
        sys.exit(1)

    with open(file) as f:
        records = f.read()

    if '>' not in records:
        print(f'Error: the input file {file} seems not in FASTA format!')
        sys.exit(1)

    peptide_sequences = []
    # Anything before the first '>' belongs to no record, hence the [1:].
    for record in records.split('>')[1:]:
        # The first line of a record is its header; the rest is sequence data.
        sequence_lines = record.split('\n')[1:]
        # Remove every whitespace character (trailing spaces, tabs, ...) so
        # that it cannot end up inside the returned sequence string.
        sequence = ''.join(''.join(line.split()) for line in sequence_lines).upper()
        peptide_sequences.append(sequence)

    return peptide_sequences

def extract_features(peptide_sequences, vector_size=100, window=5, min_count=1, **word2vec_kwargs):
    """Build combined bag-of-words + averaged Word2Vec features per sequence.

    Each sequence is split into single-character tokens and a Word2Vec model
    is trained on all tokenized sequences. For every sequence the feature row
    is the concatenation of:

    * the count of each vocabulary token in the sequence (bag of words), and
    * the sum of the Word2Vec vectors of its in-vocabulary tokens divided by
      the sequence length.

    Args:
        peptide_sequences: Iterable of sequence strings.
        vector_size: Passed to ``Word2Vec``; also the length of the embedding
            part of each feature row.
        window: Passed to ``Word2Vec``.
        min_count: Passed to ``Word2Vec``.
        **word2vec_kwargs: Extra keyword arguments forwarded unchanged to
            ``Word2Vec`` (e.g. ``seed=..., workers=1`` when reproducible
            training is needed — see the gensim documentation).

    Returns:
        tuple: ``(features, vocabulary)``. ``features`` is a NumPy array with
        one row per sequence and ``len(vocabulary) + vector_size`` columns;
        ``vocabulary`` is the list of tokens in the model's
        ``wv.index_to_key`` order, which is also the order of the
        bag-of-words columns.
    """
    # Prepare data for Word2Vec: one token per character.
    tokenized_sequences = [list(sequence) for sequence in peptide_sequences]

    # Train Word2Vec model
    model = Word2Vec(
        tokenized_sequences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        **word2vec_kwargs,
    )
    keyed_vectors = model.wv

    # Vocabulary order defines the bag-of-words column order.
    vocabulary = list(keyed_vectors.index_to_key)

    features = []
    for tokens in tokenized_sequences:
        # Bag of Words representation
        counts = Counter(tokens)
        bow_vector = [counts[token] for token in vocabulary]

        # Word2Vec representation: accumulate vectors of known tokens.
        embedding = np.zeros(vector_size)
        for token in tokens:
            if token in keyed_vectors:
                embedding += keyed_vectors[token]
        # An empty sequence would compute 0 / 0 and fill the row with NaN;
        # keep the all-zero vector for it instead.
        if tokens:
            embedding /= len(tokens)

        # Combine BoW and Word2Vec vectors
        features.append(np.concatenate([bow_vector, embedding]))

    return np.array(features), vocabulary

def main():
    """Extract BoW + Word2Vec features from the project FASTA file and save them as CSV.

    Reads the sequences from the hard-coded Google Drive input path, computes
    the combined features, writes them to the hard-coded output CSV (one row
    per sequence, no index column) and prints the feature array.
    """
    # File paths
    path = '/content/drive/MyDrive/Watashara_Projects/TIP/'
    file_path = path + 'Features_extraction/TR_IND_Pos_Neg.fasta'
    # NOTE(review): the file name says "Fasttext" but the features written are
    # BoW + Word2Vec — confirm the intended name before renaming.
    output_csv = path + 'features/Fasttext_features_TIP.csv'

    # Read peptide sequences
    peptide_sequences = read_peptide_sequences(file_path)

    # Extract features using BoW + Word2Vec
    features, vocabulary = extract_features(peptide_sequences)

    # Column names: one per vocabulary token, then one per embedding dimension
    # (every column that is not a bag-of-words column).
    bow_columns = [f'bow_{token}' for token in vocabulary]
    embedding_width = features.shape[1] - len(vocabulary)
    word2vec_columns = [f'word2vec_{i}' for i in range(embedding_width)]

    features_df = pd.DataFrame(features, columns=bow_columns + word2vec_columns)

    # to_csv does not create missing parent folders, so make sure the output
    # directory exists before writing.
    os.makedirs(os.path.dirname(output_csv), exist_ok=True)
    features_df.to_csv(output_csv, index=False)

    print("Features extracted and saved to CSV successfully.")
    print(features)

# Run the feature-extraction pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


NameError: name 'output_file' is not defined