In [1]:
import glob
import os

import floret
import more_itertools
import spacy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# glob.glob() returns matches in arbitrary, filesystem-dependent order. Sort
# them so every later cell that iterates over `files` sees the same order on
# every machine.
files = sorted(glob.glob("euk_data/*.txt"))
len(files)

4

In [26]:
# Show the matched paths to confirm which read sets were picked up.
files

['euk_data\\Acropora_millepora_unclassified_reads_illumina_top_1.txt',
 'euk_data\\Acropora_millepora_unclassified_reads_pacbio_top.txt',
 'euk_data\\Aurelia_aurita_unclassified_reads_pacbio_top.txt',
 'euk_data\\Conus_ventricosus_unclassified_reads_illumina_top_1.txt']

In [27]:
# Gather every line of the Illumina read files into one flat list; paths
# without "illumina" in them are skipped.
parts = []
for file in files:
    if "illumina" not in file:
        continue
    with open(file, "r") as f:
        data = f.read()
    # extend() appends in place; `parts = parts + ...` rebuilt (re-copied) the
    # whole accumulated list for every file.
    parts.extend(data.splitlines())
len(parts)

6000000

In [28]:
# Keep only sequence lines: drop lines that start with ">" and blank lines.
# str.startswith() is safe on "", whereas indexing dna[0] raises IndexError
# as soon as the input contains an empty line.
dna_samples = [dna for dna in parts if dna and not dna.startswith(">")]
# Length of every retained read, in the same order as dna_samples.
lengths = [len(sample) for sample in dna_samples]
len(dna_samples)

3679790

In [29]:
# Reduce the per-read lengths to their distinct values, in ascending order.
lengths = sorted(set(lengths))
len(lengths)

101

In [30]:
# `lengths` was sorted ascending in the previous cell, so the last entry is the
# longest read length.
lengths[-1]

101

In [32]:
def create_kmer(dna, k):
    """Split a sequence into its overlapping k-mers.

    A window of width ``k`` slides over ``dna`` one character at a time, so
    ``"ACGTA"`` with ``k=3`` gives ``["ACG", "CGT", "GTA"]``.

    Args:
        dna: Sequence string to split.
        k: Window width; must be a positive integer.

    Returns:
        List of the ``len(dna) - k + 1`` substrings of length ``k`` in
        left-to-right order, or an empty list when ``dna`` is shorter
        than ``k``.

    Raises:
        ValueError: If ``k`` is less than 1.
    """
    if k < 1:
        # A non-positive width used to yield empty or ragged slices instead of
        # k-mers; fail loudly rather than emit junk tokens.
        raise ValueError(f"k must be a positive integer, got {k!r}")
    # range() is empty when len(dna) < k, so short reads yield no k-mers.
    return [dna[start:start + k] for start in range(len(dna) - k + 1)]
# Tokenise every read into overlapping 3-mers: one list of k-mers per read, in
# the same order as dna_samples.
kmers = [create_kmer(dna, 3) for dna in dna_samples]
len(kmers)

3679790

In [33]:
# One space-separated line of k-mers per read.
text = list(map(" ".join, kmers))

In [34]:
# Write the corpus with one read per line; the same path is passed to
# floret.train_unsupervised in the next cell.
os.makedirs("data/dna", exist_ok=True)  # the directory may not exist yet on a fresh checkout
with open("data/dna/test.txt", "w", encoding="utf-8") as f:
    # Stream the lines instead of "\n".join(text), which would build a second
    # full copy of the corpus in memory. The newline is placed *before* every
    # line except the first, so the file content matches the joined form
    # exactly (no trailing newline).
    f.writelines(
        line if index == 0 else "\n" + line
        for index, line in enumerate(text)
    )

In [35]:
# Keyword arguments for the unsupervised training run, kept together so they
# can be read and adjusted in one place.
floret_params = {
    "model": "cbow",
    "mode": "floret",
    "hashCount": 2,
    "bucket": 100000,
    "minn": 2,
    "maxn": 4,
}
# Train on the k-mer corpus file written by the previous cell.
model = floret.train_unsupervised("data/dna/test.txt", **floret_params)

In [36]:
# Sanity check: ask the trained model for the nearest neighbours of the 3-mer
# "ACG"; the result is displayed as (score, token) pairs.
model.get_nearest_neighbors("ACG")

[(0.7077134251594543, 'GCG'),
 (0.4824638366699219, 'CGA'),
 (0.47026094794273376, 'GAC'),
 (0.42510664463043213, 'CGG'),
 (0.4234273135662079, 'ACA'),
 (0.4070490002632141, 'TCG'),
 (0.40543290972709656, 'GCA'),
 (0.39913803339004517, 'CCG'),
 (0.35598939657211304, 'GGC'),
 (0.339338093996048, 'AGC')]

In [37]:
# Export the trained vectors to dna.floret, the file that the
# `spacy init vectors` command in the following cell reads.
model.save_floret_vectors("dna.floret")

In [38]:
# Convert the exported dna.floret table into a spaCy pipeline saved in the
# `dna` output directory (blank "en" language object, floret vector mode).
!python -m spacy init vectors en dna.floret dna --mode floret

[i] Creating blank nlp object for language 'en'
[+] Successfully converted 100000 vectors
[+] Saved nlp object with vectors to output directory. You can now use the path
to it in your config as the 'vectors' setting in [initialize].
C:\Users\wma22\OneDrive\Documents\GitHub\dna-spacy\dna


[2023-01-26 20:30:37,889] [INFO] Reading vectors from dna.floret

0it [00:00, ?it/s]
4496it [00:00, 44633.03it/s]
9340it [00:00, 46665.86it/s]
14155it [00:00, 47184.07it/s]
18909it [00:00, 47188.49it/s]
23628it [00:00, 47065.60it/s]
28335it [00:00, 46474.13it/s]
33001it [00:00, 46429.94it/s]
37800it [00:00, 46810.38it/s]
42482it [00:00, 46266.94it/s]
47111it [00:01, 45346.94it/s]
51650it [00:01, 43827.19it/s]
56236it [00:01, 44321.87it/s]
60960it [00:01, 45083.20it/s]
65762it [00:01, 45852.15it/s]
70533it [00:01, 46305.81it/s]
75325it [00:01, 46684.01it/s]
80078it [00:01, 46828.63it/s]
84907it [00:01, 47161.75it/s]
89708it [00:01, 47312.38it/s]
94455it [00:02, 47260.09it/s]
99183it [00:02, 45936.32it/s]
100000it [00:02, 46118.54it/s]
[2023-01-26 20:30:40,065] [INFO] Loaded vectors from dna.floret


In [39]:
# Load the pipeline from the `dna` directory written by `spacy init vectors`.
nlp = spacy.load("dna")

In [40]:
# Peek at one prepared corpus line (the space-separated 3-mers of one read).
text[100]

'TAT ATG TGT GTC TCA CAG AGT GTC TCT CTT TTC TCT CTG TGT GTC TCG CGA GAA AAC ACA CAA AAG AGA GAA AAA AAC ACA CAC ACC CCT CTG TGG GGT GTC TCG CGA'

In [41]:
# Run one corpus line through the loaded pipeline and display the vector that
# spaCy reports for the resulting Doc.
test = text[100]
doc = nlp(test)
doc.vector

array([-0.10669075,  0.03422643, -0.217521  , -0.00553851, -0.03192304,
        0.00903709,  0.14453909, -0.07844248,  0.09435757,  0.07674704,
        0.19162507,  0.07498356,  0.48662388,  0.08764622, -0.01794419,
       -0.01768433,  0.0544269 ,  0.09571764, -0.22966401, -0.1569899 ,
       -0.01242326, -0.3275781 , -0.12189689, -0.00231595, -0.20053759,
       -0.15646413, -0.10512868, -0.25012103,  0.01763087, -0.01026774,
       -0.04373965, -0.406593  , -0.21207842,  0.10584654,  0.15437946,
       -0.12949775, -0.14768505, -0.08978966, -0.14477102,  0.10527462,
       -0.07602212,  0.11838529, -0.14083941,  0.18513161,  0.07500212,
        0.04502864, -0.02104128,  0.15704128,  0.14762227,  0.1312362 ,
       -0.09392218, -0.25147855, -0.09225856,  0.22476405,  0.17672923,
       -0.07478819, -0.00717883,  0.06669654, -0.27066502, -0.01748163,
        0.11110567, -0.21152176, -0.05286286,  0.07872289,  0.41071954,
        0.29264286,  0.16258956, -0.33279118,  0.01269514,  0.26