# Create Token Embeddings

In [1]:
import sys

# Add the parent directory to the import search path; presumably this is what
# lets the `adna` package imported in the next cell resolve — confirm against
# the repository layout.
sys.path.append('..')

In [2]:
import sqlite3

import pandas as pd
from gensim import models, utils
from tokenizers import Tokenizer
from tqdm import tqdm

from adna.pylib import consts

In [3]:
# Input: SQLite database; the corpus iterator below reads `seq` from its `seqs` table.
SQL = consts.DATA_DIR / 'UF46992.sqlite'
# Input: serialized tokenizer, loaded below with `Tokenizer.from_file`.
TOKENIZER = consts.DATA_DIR / 'UF46992.tokenizer.json'
# Output: destination file for the trained word vectors.
EMBEDDINGS = consts.DATA_DIR / 'UF46992.gensim.emb'

In [4]:
VECTOR_SIZE = 128  # Embedding dimensionality passed to Word2Vec; keeping things small for now
WORKERS = 6  # Passed to Word2Vec as `workers` (number of training worker threads)

## Create an iterator for feeding the trainer

In [5]:
class DnaCorpus:
    """Restartable corpus of tokenized sequences for the Word2Vec trainer.

    Every call to ``iter()`` re-reads the ``seq`` column of the ``seqs`` table
    and re-encodes each sequence, so a single instance can be scanned more
    than once (gensim walks the corpus once to build the vocabulary and then
    again for each training epoch).
    """

    def __iter__(self):
        """Yield ``enc.tokens`` for every sequence stored in the database."""
        tokenizer = Tokenizer.from_file(str(TOKENIZER))

        # `with sqlite3.connect(...)` only commits/rolls back the transaction;
        # it does NOT close the connection. Because this iterator is consumed
        # several times, close explicitly so each pass does not leak a handle.
        cxn = sqlite3.connect(SQL)
        try:
            df = pd.read_sql('select seq from seqs', cxn)
        finally:
            cxn.close()

        # The frame is fully in memory at this point, so the (slow) encoding
        # loop runs without holding the database connection open.
        for seq in df.seq:
            yield tokenizer.encode(seq).tokens

## Train the embedding model

In [6]:
# Fit Word2Vec on the token lists streamed by DnaCorpus.
# min_count=0 asks gensim not to drop any token for being too rare.
model = models.Word2Vec(
    min_count=0,
    workers=WORKERS,
    vector_size=VECTOR_SIZE,
    sentences=DnaCorpus(),
)

## Save the embedding model

In [7]:
# Write the word vectors (`model.wv`) to the EMBEDDINGS path; `save` is called
# on `model.wv`, not on `model` itself.
model.wv.save(str(EMBEDDINGS))

In [8]:
# Sanity check: number of entries (tokens) in the trained vector table.
len(model.wv)

4998

In [9]:
# Spot-check: look up the embedding vector stored for the token 'A'.
model.wv['A']

array([-3.148573  ,  3.0176463 ,  3.944402  ,  2.0478578 , -1.2311232 ,
       -4.968159  ,  1.5306941 , -1.0481359 ,  0.7687681 ,  0.2744158 ,
       -5.0154777 , -3.35976   , -2.1089144 , -1.4445684 ,  3.1185875 ,
       -0.10967441,  0.46813488, -0.34797794, -1.4066111 , -0.7476404 ,
        1.1855088 , -2.3770523 , -4.4823785 ,  1.0867794 ,  1.1143287 ,
        0.11890692,  0.3796609 ,  3.216489  , -0.63060194,  5.165122  ,
       -0.506519  ,  2.2266257 , -1.1080312 ,  0.9464418 , -0.2013732 ,
       -3.1642244 , -1.9638917 ,  2.5215325 ,  2.840156  , -2.6347342 ,
        1.9930321 , -0.4166729 ,  0.25781706, -2.0545137 , -1.8403842 ,
        2.607237  ,  0.16682309,  1.8484675 ,  6.2648244 ,  2.8217394 ,
       -2.550083  ,  3.015338  ,  1.4118621 , -2.4448884 , -5.9083395 ,
       -2.419133  , -2.702717  , -3.5714824 , -2.6342323 , -3.5725007 ,
        3.4027436 , -1.2430289 , -2.10639   ,  1.3110543 ,  0.8194913 ,
        0.36082622, -1.2697253 ,  0.566665  , -3.3379893 , -2.31