In [1]:
# Mount Google Drive at /content/drive so that files stored there can be
# accessed through the local filesystem. `google.colab` is used for this,
# so the cell is tied to the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m52.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m56.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [5]:
import numpy as np
from gensim.models import FastText
import numpy as np
import torch
from transformers import BertModel, BertTokenizer

import pandas as pd
import numpy as np
import re, os, sys
from itertools import product

# Mount Google Drive so the FASTA input and CSV output paths used further
# down in this cell resolve. The notebook's first cell performs the same
# mount; when the drive is already mounted this call only prints a notice.
from google.colab import drive
drive.mount('/content/drive')

def read_nucleotide_sequences(file):
    """Read a FASTA file and return its nucleotide sequences.

    For every ``>`` record the sequence lines are concatenated and
    upper-cased, each character outside ``ACGTU-`` is replaced by ``-``,
    and ``U`` is then rewritten to ``T``. Header lines are ignored, so a
    record with an empty header is still read.

    Args:
        file: Path of the FASTA file to read.

    Returns:
        list[str]: One normalised sequence per record, in file order.

    Raises:
        SystemExit: With exit code 1 (after printing a message) when the
            file does not exist or contains no ``>`` character.
    """
    if not os.path.exists(file):
        print('Error: file %s does not exist.' % file)
        sys.exit(1)
    with open(file) as f:
        records = f.read()
    if '>' not in records:
        print('Error: the input file %s seems not in FASTA format!' % file)
        sys.exit(1)

    fasta_sequences = []
    # Anything before the first '>' is not part of a record, hence the [1:].
    for record in records.split('>')[1:]:
        # The first line of a record is its header; the rest is sequence data.
        sequence_lines = record.split('\n')[1:]
        sequence = re.sub('[^ACGTU-]', '-', ''.join(sequence_lines).upper())
        fasta_sequences.append(sequence.replace('U', 'T'))
    return fasta_sequences

#!/usr/bin/env python
#_*_coding:utf-8_*_

import re

def check_fasta_with_equal_length(fastas):
    """Tell whether all records carry sequences of one common length.

    Each item of ``fastas`` is indexed at position 1 to obtain its sequence.
    Returns True when exactly one distinct length occurs, False otherwise
    (including when ``fastas`` is empty).
    """
    distinct_lengths = {len(record[1]) for record in fastas}
    return len(distinct_lengths) == 1

def get_min_sequence_length(fastas):
    """Return the length of the shortest sequence in ``fastas``.

    Each item of ``fastas`` is indexed at position 1 to obtain its sequence.
    The true minimum is returned even when every sequence is longer than
    10000 characters. For an empty ``fastas`` the historical fallback value
    10000 is returned.
    """
    return min((len(record[1]) for record in fastas), default=10000)

def get_min_sequence_length_1(fastas):
    """Return the shortest sequence length in ``fastas``, not counting ``-``.

    Each item of ``fastas`` is indexed at position 1 to obtain its sequence;
    every ``-`` character is removed before the length is measured. The true
    minimum is returned even when every sequence is longer than 10000
    characters. For an empty ``fastas`` the historical fallback value 10000
    is returned.
    """
    return min((len(record[1].replace('-', '')) for record in fastas), default=10000)
def readFasta(file):
    """Read a protein FASTA file into a list of ``[name, sequence]`` pairs.

    The name is the first whitespace-delimited field of the header line, or
    an empty string when the header line is blank. The sequence lines are
    concatenated and upper-cased, and every character outside
    ``ARNDCQEGHILKMFPSTWYV-`` is replaced by ``-``.

    Args:
        file: Path of the FASTA file to read.

    Returns:
        list[list[str]]: ``[name, sequence]`` for each record, in file order.

    Raises:
        SystemExit: With exit code 1 (after printing a message) when the
            file does not exist or contains no ``>`` character.
    """
    if not os.path.exists(file):
        print('Error: "' + file + '" does not exist.')
        sys.exit(1)

    with open(file) as f:
        records = f.read()

    if '>' not in records:
        print('The input file seems not in fasta format.')
        sys.exit(1)

    myFasta = []
    # Anything before the first '>' is not part of a record, hence the [1:].
    for record in records.split('>')[1:]:
        lines = record.split('\n')
        # A blank header would make split() return [], so guard the index.
        header_fields = lines[0].split()
        name = header_fields[0] if header_fields else ''
        sequence = re.sub('[^ARNDCQEGHILKMFPSTWYV-]', '-', ''.join(lines[1:]).upper())
        myFasta.append([name, sequence])
    return myFasta


def extract_features(dna_sequences, vector_size=100, window=5, min_count=1):
    """Embed each sequence as the mean of its per-character FastText vectors.

    A FastText model is trained on ``dna_sequences`` with every character
    treated as one token. Each sequence is then represented by the average
    of the vectors of its characters.

    Args:
        dna_sequences: Sized collection of sequence strings.
        vector_size: Embedding dimensionality, forwarded to ``FastText``.
        window: Context window, forwarded to ``FastText``.
        min_count: Minimum token frequency, forwarded to ``FastText``.

    Returns:
        numpy.ndarray: Array of shape ``(len(dna_sequences), vector_size)``.
        The row of an empty sequence is all zeros. If there is no token at
        all (no sequences, or only empty ones) the all-zero array is
        returned without training a model.
    """
    # Prepare data for FastText: one token per character.
    tokenized_sequences = [list(sequence) for sequence in dna_sequences]
    features = np.zeros((len(tokenized_sequences), vector_size))

    # With no tokens there is nothing to build a vocabulary from, so skip
    # training instead of letting the model constructor fail.
    if not any(tokenized_sequences):
        return features

    # Train FastText model
    model = FastText(tokenized_sequences, vector_size=vector_size, window=window, min_count=min_count)

    # The alphabet is tiny compared with the sequence lengths, so look each
    # token's vector up once and reuse it.
    token_vectors = {}
    for i, tokens in enumerate(tokenized_sequences):
        if not tokens:
            # Avoid 0/0 (a NaN row); keep the zero vector for empty sequences.
            continue
        for token in tokens:
            vector = token_vectors.get(token)
            if vector is None:
                vector = token_vectors[token] = model.wv[token]
            features[i] += vector
        features[i] /= len(tokens)

    return features

# Example usage
# Small inline sample of sequences; it is not referenced by the statements
# below, which work on the sequences read from the FASTA file instead.
dna_sequences = ['ATCGATCGATCG', 'CGATCGATCGATCG']

# Read the training sequences from Drive, embed them with FastText, and
# save the resulting feature matrix as a CSV file next to the input.
fastas = read_nucleotide_sequences('/content/drive/MyDrive/Research Work/Arabidopsis_train.fasta')
features = extract_features(fastas)
data_csv=pd.DataFrame(features)
data_csv.to_csv('/content/drive/MyDrive/Research Work/FastText_Arabidopsis_train.csv')

# Show the feature matrix in the cell output.
print(features)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[[-0.13529807  0.06433191  0.03500054 ...  0.01327619 -0.11958317
   0.10745705]
 [-0.13797145  0.06834678  0.03556608 ...  0.0139749  -0.13021139
   0.11581534]
 [-0.0670877   0.048177    0.00020939 ...  0.000347    0.06653122
  -0.09902307]
 ...
 [-0.08067543  0.05662995  0.00589048 ...  0.00258245  0.03493547
  -0.06475744]
 [-0.10267424  0.05997989  0.01753549 ...  0.00598394 -0.01219003
  -0.00587908]
 [-0.09141966  0.06659426  0.00959213 ...  0.00379118  0.01915838
  -0.04591381]]
