<a href="https://colab.research.google.com/github/skovz99/DNATokenizer-2/blob/main/DNA_tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tokenizers

Collecting tokenizers
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.13.3


In [None]:
# Let's build the tokenizer stage of a DNA transformer: each window of 3 nucleotides is treated as a single string to tokenize.
# First step: split the DNA sequence into strings of length 3.
DNAseq = 'ATGATGGGATCGGCGCTAGCTAGCTAGCTGGGTCAT'

# Split a DNA sequence (real inputs will be much longer than the example above)
# into equal parts of a given size, prior to separating into windows of three.
def equal_parts(DNA, part_size):
  """Split ``DNA`` into consecutive slices of ``part_size`` items.

  Args:
    DNA: Sequence to split (a nucleotide string, or any sliceable sequence).
    part_size: Number of items per part.

  Returns:
    The slices in order, as a list. The last slice is shorter when
    ``len(DNA)`` is not a multiple of ``part_size``; an empty ``DNA``
    gives an empty list.

  Raises:
    ValueError: If ``part_size`` is 0 (raised by ``range``).
  """
  # Measure the argument itself, not the module-level example sequence,
  # so the function works for any input.
  return [DNA[start:start + part_size] for start in range(0, len(DNA), part_size)]

# Cut the DNA sequence into consecutive nucleotide windows of width k
def string(DNA, k):
  """Split ``DNA`` into consecutive, non-overlapping windows of ``k`` letters.

  Args:
    DNA: Nucleotide string (or a sequence of one-letter strings).
    k: Window width.

  Returns:
    List of window strings in order. The final window is shorter when
    ``len(DNA)`` is not a multiple of ``k``.

  Raises:
    ValueError: If ``k`` is 0 (raised by ``range``).
  """
  # The slice width must follow k; a fixed width of 3 only works for k == 3.
  return ["".join(DNA[start:start + k]) for start in range(0, len(DNA), k)]

# Split the example sequence into nucleotide windows (k = 3)
three_string = string(DNAseq, k = 3)

# Once the DNA sequence has been split into 3-nucleotide strings, work out the size of the vocabulary
def vocab(initial_DNA, token_size):
  """Count how many distinct tokens of ``token_size`` letters are possible.

  Args:
    initial_DNA: Sequence whose distinct characters form the alphabet.
    token_size: Number of letters per token.

  Returns:
    The number of distinct characters in ``initial_DNA`` raised to the
    power ``token_size``.
  """
  return len(set(initial_DNA)) ** token_size

# Number of possible 3-letter tokens over the distinct characters of the example sequence
vocabulary = vocab(DNAseq, 3)

# Create a vocabulary and tokenize the DNA windows. BOS = 0 and EOS = 1; there is no padding and no
# unknown token because all sequences input into the transformer are the same size and the initial
# DNA sequence contains no unknown nucleotides.
# The token table is generated for whatever token_size is passed, so changing the size of the
# nucleotide window needs no change to this function.
def combo_library(DNA, token_size, three_string, vocabulary):
  """Map each nucleotide window to an integer token id, wrapped in BOS/EOS.

  Every string of ``token_size`` letters drawn from the distinct characters
  of ``DNA`` gets an id, numbered from 2 upward (0 and 1 are reserved for
  BOS and EOS). The letters are sorted before the table is built so that
  the ids are identical on every run instead of following set iteration
  order.

  Args:
    DNA: Sequence whose distinct characters form the alphabet.
    token_size: Number of letters per token; should equal the window width
      used to produce ``three_string``.
    three_string: Iterable of window strings to tokenize.
    vocabulary: Number of possible tokens (see ``vocab``). It is not needed
      to build the table and is accepted only so existing calls keep working.

  Returns:
    A list of ints: 0 (BOS), one id per recognised window in input order,
    then 1 (EOS). Windows that are not in the table, such as a shorter
    trailing window, are skipped.
  """
  from itertools import product

  alphabet = sorted(set(DNA))
  # One dict lookup per window replaces the scan over every (combo, token) pair.
  token_ids = {
    "".join(letters): token
    for token, letters in enumerate(product(alphabet, repeat=token_size), start=2)
  }
  body = [token_ids[window] for window in three_string if window in token_ids]
  return [0, *body, 1]

# Tokenize the example windows; the result starts with BOS (0) and ends with EOS (1)
DNA_tokens = combo_library(DNAseq, 3, three_string, vocabulary) # token_size should be the same as the nucleotide window width

# Convert the DNA tokens to a PyTorch tensor
import torch
import torch.nn as nn

def tensor_tokens(DNA_tokens):
  """Return ``DNA_tokens`` as a ``torch.long`` tensor.

  Args:
    DNA_tokens: Integer token ids (for example the list built by
      ``combo_library``).

  Returns:
    A new tensor holding the same values with dtype ``torch.long``.
  """
  return torch.tensor(DNA_tokens, dtype=torch.long)

# Tensor form of the example token ids
DNA_tensor = tensor_tokens(DNA_tokens)

# Create embeddings for the DNA tensor
def DNA_embeddings(dimension, vocabulary, DNA_tensor, special_tokens=2):
  """Look up a freshly initialised embedding vector for every token id.

  Args:
    dimension: Size of each embedding vector.
    vocabulary: Number of possible nucleotide-window tokens (see ``vocab``).
    DNA_tensor: Integer tensor of token ids.
    special_tokens: Number of reserved ids on top of ``vocabulary``.
      Defaults to 2 for BOS (0) and EOS (1).

  Returns:
    The output of a new ``nn.Embedding`` layer applied to ``DNA_tensor``:
    one ``dimension``-sized vector per token id. The layer is created on
    every call, so its weights are not shared between calls.
  """
  # Window tokens are numbered from 2, so ids run up to vocabulary + 1. A
  # table with only `vocabulary` rows raises IndexError on the two highest ids.
  embedding_layer = nn.Embedding(vocabulary + special_tokens, dimension)
  return embedding_layer(DNA_tensor)

# Embed the example tokens as 100-dimensional vectors.
# NOTE(review): combo_library numbers window tokens from 2, so ids can exceed vocabulary - 1;
# confirm the embedding table is large enough for every id in DNA_tensor.
DNA_embed = DNA_embeddings(100, vocabulary, DNA_tensor)