In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.84


In [5]:
from gensim.models import Word2Vec
from Bio import SeqIO

def preprocess_sequences(sequences, window_size=3):
    """
    Generate overlapping k-mers ("words") for each sequence in the list.

    Every residue yields exactly one k-mer of length ``window_size``; the
    sequence is flanked with ``window_size // 2`` "-" characters on each side
    so that border residues also get a full-length k-mer. With the default
    ``window_size=3`` this is one "-" per side.

    The residues U, Z, O and B are replaced by "X" before k-mers are built.
    This is the same substitution the Word2VecEmbedder applies at lookup time
    (``[UZOB]`` -> "X"), so the trained vocabulary matches the k-mers that are
    later looked up.

    :param sequences: List of sequences (strings).
    :param window_size: Size of k-mers; must be >= 1.
    :return: List with one list of k-mers per input sequence.
    :raises ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    # Same replacement set as the embedder's regex, applied in one pass.
    rare_to_x = str.maketrans("UZOB", "XXXX")
    # Flank width scales with the window so every k-mer has full length.
    flank = "-" * (window_size // 2)

    k_mer_list = []
    for seq in sequences:
        seq = seq.translate(rare_to_x)
        padded_seq = flank + seq + flank
        k_mer_list.append(
            [padded_seq[i:i + window_size] for i in range(len(seq))]
        )
    return k_mer_list


def train_word2vec_model(k_mer_list, vector_size=512, window=5, min_count=1, workers=4):
    """
    Fit a gensim Word2Vec model, treating each sequence's k-mers as one sentence.

    :param k_mer_list: List of k-mer lists (one inner list per sequence).
    :param vector_size: Dimensionality of the learned k-mer vectors.
    :param window: Maximum distance between the current and predicted k-mer.
    :param min_count: K-mers occurring fewer times than this are ignored.
    :param workers: Number of worker threads used for training.
    :return: The trained Word2Vec model.
    """
    # All arguments are forwarded unchanged to the Word2Vec constructor.
    training_options = {
        "sentences": k_mer_list,
        "vector_size": vector_size,
        "window": window,
        "min_count": min_count,
        "workers": workers,
    }
    return Word2Vec(**training_options)


# Load sequences from a FASTA file
def load_sequences_from_fasta(file_path):
    """
    Read a FASTA file and return its sequences as plain strings.

    :param file_path: Path to the FASTA file.
    :return: List of sequence strings, in the order the records are parsed.
    """
    return [str(entry.seq) for entry in SeqIO.parse(file_path, "fasta")]


# Main workflow
def main():
    """Train a Word2Vec model on the k-mers of the project FASTA file and save it."""
    base_dir = '/content/drive/MyDrive/Watashara_Projects/IL6/'
    fasta_path = base_dir + 'Features_extraction/IND_Pos_IND_Neg.txt'  # Update with the path to your FASTA file
    model_path = base_dir + "Features_extraction/trained_word2vec.model"

    # FASTA records -> per-sequence k-mer lists -> trained model
    raw_sequences = load_sequences_from_fasta(fasta_path)
    trained = train_word2vec_model(preprocess_sequences(raw_sequences))

    # Persist the model next to the input data
    trained.save(model_path)
    print("Model saved as 'trained_word2vec.model'")


# Entry point: run the training workflow when this cell (or script) is executed directly.
if __name__ == "__main__":
    main()


Model saved as 'trained_word2vec.model'


In [10]:
!pip install bio-embeddings[all]

Collecting bio-embeddings[all]
  Using cached bio_embeddings-0.1.6-py3-none-any.whl.metadata (10 kB)
Collecting appdirs<2.0.0,>=1.4.4 (from bio-embeddings[all])
  Using cached appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting gensim<4.0.0,>=3.8.2 (from bio-embeddings[all])
  Using cached gensim-3.8.3.tar.gz (23.4 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting h5py<3.0.0,>=2.10.0 (from bio-embeddings[all])
  Using cached h5py-2.10.0.tar.gz (301 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting importlib_metadata<2.0.0,>=1.7.0 (from bio-embeddings[all])
  Using cached importlib_metadata-1.7.0-py2.py3-none-any.whl.metadata (2.1 kB)
INFO: pip is looking at multiple versions of bio-embeddings[all] to determine which version is compatible with other requirements. This could take a while.
Collecting bio-embeddings[all]
  Using cached bio_embeddings-0.1.5-py3-none-any.whl.metadata (11 kB)
  Using cached bio_embeddings-0.1.4-py3-none-any.whl.

In [20]:
!pip install torch==1.10.0



[31mERROR: Could not find a version that satisfies the requirement torch==1.10.0 (from versions: 1.11.0, 1.12.0, 1.12.1, 1.13.0, 1.13.1, 2.0.0, 2.0.1, 2.1.0, 2.1.1, 2.1.2, 2.2.0, 2.2.1, 2.2.2, 2.3.0, 2.3.1, 2.4.0, 2.4.1, 2.5.0, 2.5.1)[0m[31m
[0m[31mERROR: No matching distribution found for torch==1.10.0[0m[31m
[0m

In [21]:
!pip install -U "bio-embeddings[all] @ git+https://github.com/sacdallago/bio_embeddings.git"

Collecting bio-embeddings@ git+https://github.com/sacdallago/bio_embeddings.git (from bio-embeddings[all]@ git+https://github.com/sacdallago/bio_embeddings.git)
  Cloning https://github.com/sacdallago/bio_embeddings.git to /tmp/pip-install-hilqtvtj/bio-embeddings_437d1127795745aaaed632d44602fa4e
  Running command git clone --filter=blob:none --quiet https://github.com/sacdallago/bio_embeddings.git /tmp/pip-install-hilqtvtj/bio-embeddings_437d1127795745aaaed632d44602fa4e
  Resolved https://github.com/sacdallago/bio_embeddings.git to commit efb9801f0de9b9d51d19b741088763a7d2d0c3a2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting appdirs<2.0.0,>=1.4.4 (from bio-embeddings@ git+https://github.com/sacdallago/bio_embeddings.git->bio-embeddings[all]@ git+https://github.com/sacdallago/bio_embeddings.git)
  Using cached appdirs-1.4.4-py2.py3-none-any.whl.metadata 

In [25]:
!pip uninstall -y torch


Found existing installation: torch 1.11.0
Uninstalling torch-1.11.0:
  Successfully uninstalled torch-1.11.0


In [43]:
!pip install torch==1.11.0+cpu torchvision==0.12.0+cpu torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cpu


Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu


In [26]:
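# Install a PyTorch version compatible with bio-embeddings (NOTE: pip reports no torch==1.10.0 build for this runtime; the oldest version it lists is 1.11.0)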
!pip install torch==1.10.0

# Install bio-embeddings
!pip install git+https://github.com/sacdallago/bio_embeddings.git


[31mERROR: Could not find a version that satisfies the requirement torch==1.10.0 (from versions: 1.11.0, 1.12.0, 1.12.1, 1.13.0, 1.13.1, 2.0.0, 2.0.1, 2.1.0, 2.1.1, 2.1.2, 2.2.0, 2.2.1, 2.2.2, 2.3.0, 2.3.1, 2.4.0, 2.4.1, 2.5.0, 2.5.1)[0m[31m
[0m[31mERROR: No matching distribution found for torch==1.10.0[0m[31m
[0mCollecting git+https://github.com/sacdallago/bio_embeddings.git
  Cloning https://github.com/sacdallago/bio_embeddings.git to /tmp/pip-req-build-og0jgb69
  Running command git clone --filter=blob:none --quiet https://github.com/sacdallago/bio_embeddings.git /tmp/pip-req-build-og0jgb69
  Resolved https://github.com/sacdallago/bio_embeddings.git to commit efb9801f0de9b9d51d19b741088763a7d2d0c3a2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting appdirs<2.0.0,>=1.4.4 (from bio-embeddings==0.2.3)
  Using cached appdirs-1.4.4-py2.py3-none-any.w

In [40]:
!pip install torch==1.11.0+cpu torchvision==0.12.0+cpu torchaudio==0.11.0 -f https://download.pytorch.org/whl/torch_stable.html



Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.11.0+cpu
  Using cached https://download.pytorch.org/whl/cpu/torch-1.11.0%2Bcpu-cp310-cp310-linux_x86_64.whl (169.2 MB)
Collecting torchvision==0.12.0+cpu
  Downloading https://download.pytorch.org/whl/cpu/torchvision-0.12.0%2Bcpu-cp310-cp310-linux_x86_64.whl (14.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.7/14.7 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio==0.11.0
  Downloading https://download.pytorch.org/whl/rocm4.5.2/torchaudio-0.11.0%2Brocm4.5.2-cp310-cp310-linux_x86_64.whl (2.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch, torchvision, torchaudio
  Attempting uninstall: torchvision
    Found existing installation: torchvision 0.20.1+cu121
    Uninstalling torchvision-0.20.1+cu121:
      Successfully uninstalled torchvisio

In [44]:
!pip install git+https://github.com/sacdallago/bio_embeddings.git


Collecting git+https://github.com/sacdallago/bio_embeddings.git
  Cloning https://github.com/sacdallago/bio_embeddings.git to /tmp/pip-req-build-qu3tlfk7
  Running command git clone --filter=blob:none --quiet https://github.com/sacdallago/bio_embeddings.git /tmp/pip-req-build-qu3tlfk7
  Resolved https://github.com/sacdallago/bio_embeddings.git to commit efb9801f0de9b9d51d19b741088763a7d2d0c3a2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting appdirs<2.0.0,>=1.4.4 (from bio-embeddings==0.2.3)
  Using cached appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting atomicwrites<2.0.0,>=1.4.0 (from bio-embeddings==0.2.3)
  Using cached atomicwrites-1.4.1.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting celery<6.0.0,>=5.2.7 (from bio-embeddings==0.2.3)
  Using cached celery-5.4.0-py3-none-any.whl.metadata (21 kB)
Collecting

In [45]:
import re
import numpy as np
from gensim.models.keyedvectors import KeyedVectors
from numpy import ndarray
from bio_embeddings.embed.embedder_interfaces import EmbedderInterface
from Bio import SeqIO
import pandas as pd


class Word2VecEmbedder(EmbedderInterface):
    """
    Per-residue embedder backed by a gensim Word2Vec model trained on 3-mers.

    Each residue is represented by the vector of the 3-mer centred on it. The
    sequence is flanked with "-" so that the first and last residue also get a
    3-mer. K-mers that are missing from the model's vocabulary are represented
    by an all-zero vector.
    """

    # Class-level metadata; presumably consumed by EmbedderInterface, which is
    # not visible in this file — TODO confirm.
    name = "word2vec"
    embedding_dimension = 512
    number_of_layers = 1
    necessary_files = ["model_file"]

    def __init__(self, **kwargs):
        """
        :param model_file: path of the saved gensim model file. The original
            note here says it is downloaded when not supplied; that would be
            handled by EmbedderInterface, which is not visible here — TODO confirm.
        """
        super().__init__(**kwargs)

        # self._options is expected to be populated by the base-class __init__.
        self._model_file = self._options.get("model_file")
        self._model = KeyedVectors.load(str(self._model_file), mmap="r")
        self._vector_size = 512
        self._zero_vector = np.zeros(self._vector_size, dtype=np.float32)
        self._window_size = 3

    def embed(self, sequence: str) -> ndarray:
        """
        Embed one sequence residue by residue.

        :param sequence: amino-acid sequence; U, Z, O and B are mapped to "X".
        :return: float32 array of shape (len(sequence), 512), one row per residue.
        """
        sequence = re.sub(r"[UZOB]", "X", sequence)
        # pad sequence with special character (only 3-mers are considered)
        padded_sequence = "-" + sequence + "-"

        # container
        embedding = np.zeros((len(sequence), self._vector_size), dtype=np.float32)

        # For each residue, look up the k-mer that starts at its padded index.
        # String slicing never raises IndexError, so no guard is needed here.
        for index in range(len(sequence)):
            k_mer = padded_sequence[index : index + self._window_size]
            embedding[index, :] = self._get_kmer_representation(k_mer)

        return embedding

    def _get_kmer_representation(self, k_mer):
        """
        Return the vector for ``k_mer``, degrading gracefully when it is unknown.

        Lookup order:
        1. the k-mer itself;
        2. for a padded border k-mer (contains "-"), its centre residue;
        3. otherwise the zero vector.

        Always returns an array; never ``None``.
        """
        # try to retrieve embedding for k-mer
        try:
            return self._model.wv[k_mer]
        # in case of padded or out-of-vocab character
        except KeyError:
            # handle border cases at start/end of seq
            if len(k_mer) > 1 and "-" in k_mer:
                return self._get_kmer_representation(k_mer[len(k_mer) // 2])
            # Unknown single residue OR unknown interior k-mer. The interior
            # case previously fell through and returned None, which corrupted
            # the embedding row it was assigned to.
            return self._zero_vector

    @staticmethod
    def reduce_per_protein(embedding: ndarray) -> ndarray:
        """Average the per-residue rows into one fixed-size vector per protein."""
        return embedding.mean(axis=0)


# # Load sequences from a FASTA file
# def load_sequences_from_fasta(file_path):
#     sequences = []
#     for record in SeqIO.parse(file_path, "fasta"):
#         sequences.append(str(record.seq))
#     return sequences


# # Load sequences from a plain text file
# def load_sequences_from_txt(file_path):
#     with open(file_path, "r") as file:
#         sequences = [line.strip() for line in file if line.strip()]
#     return sequences
# Load sequences from a specific column in a CSV file
def load_sequences_from_csv(file_path, column_name):
    """
    Read one column of a CSV file and return its non-missing values as strings.

    :param file_path: Path to the CSV file.
    :param column_name: Name of the column containing the sequences.
    :return: List of sequences (missing entries are dropped).
    :raises ValueError: If the column does not exist in the file.
    """
    table = pd.read_csv(file_path)
    if column_name in table.columns:
        present = table[column_name].dropna()
        return present.astype(str).tolist()
    raise ValueError(f"Column '{column_name}' not found in the CSV file.")

# Extract features for all sequences
def extract_features(embedder, sequences):
    """
    Compute one fixed-size feature vector per sequence.

    :param embedder: Object providing ``embed(sequence)`` and
        ``reduce_per_protein(embedding)``.
    :param sequences: Iterable of sequence strings.
    :return: NumPy array built from the per-protein vectors, in input order.
    """
    per_protein = [
        embedder.reduce_per_protein(embedder.embed(seq))  # Reduce to single vector
        for seq in sequences
    ]
    return np.array(per_protein)


# Save features to a CSV file
def save_features_to_csv(features, output_path):
    """
    Write the feature matrix to CSV, one row per protein.

    :param features: 2-D array-like of feature vectors.
    :param output_path: Destination path (or writable buffer) for the CSV.
    """
    # No index column and no header row: the file contains values only.
    pd.DataFrame(features).to_csv(output_path, index=False, header=False)


# Main workflow
def main():
    """Embed every sequence of the project FASTA file and write the per-protein features to CSV."""
    base_dir = '/content/drive/MyDrive/Watashara_Projects/IL6/'
    fasta_path = base_dir + 'Features_extraction/IND_Pos_IND_Neg.txt'
    model_path = base_dir + "Features_extraction/trained_word2vec.model"  # Path to your pre-trained Word2Vec model
    output_csv = "extracted_features.csv"

    # NOTE: load_sequences_from_fasta is only present as commented-out code in
    # this cell; the call relies on a definition that is already in scope.
    # To read from a CSV column instead, use
    # load_sequences_from_csv(csv_file, column_name).
    sequences = load_sequences_from_fasta(fasta_path)

    # Initialize the embedder, then embed, reduce and store in one pass
    embedder = Word2VecEmbedder(model_file=model_path)
    save_features_to_csv(extract_features(embedder, sequences), output_csv)

    print(f"Features saved to {output_csv}")


# Entry point: run the feature-extraction workflow (the main() defined in this cell)
# when the cell or script is executed directly.
if __name__ == "__main__":
    main()


ModuleNotFoundError: No module named 'bio_embeddings'