In [1]:
import pandas as pd
import numpy as np
from utils import read_file, tokenize_latex
df = read_file("data/1601/*")


error with: data/1601/1601.06597.json


In [2]:
import re
def clean_exp(exp):
    exp = re.sub("\\\\begin{equation[^}]*}","" ,exp , flags=re.IGNORECASE)
    exp = re.sub("\\\\end{equation[^}]*}","" ,exp , flags=re.IGNORECASE)
    exp = re.sub("\\\\begin{split[^}]*}","\\\\begin{aligned}" ,exp , flags=re.IGNORECASE)
    exp = re.sub("\\\\end{split[^}]*}","\\\\end{aligned}" ,exp , flags=re.IGNORECASE)
    exp = re.sub("\\\\begin{gather[^}]*}","\\\\begin{aligned}" ,exp , flags=re.IGNORECASE)
    exp = re.sub("\\\\end{gather[^}]*}","\\\\end{aligned}" ,exp , flags=re.IGNORECASE)
    exp = re.sub("\\\\begin{align[^}]*}","\\\\begin{aligned}" ,exp , flags=re.IGNORECASE)
    exp = re.sub("\\\\end{align[^}]*}","\\\\end{aligned}" ,exp, flags=re.IGNORECASE)
    exp = re.sub("\\\\label{[^}]*}","" ,exp , flags=re.IGNORECASE)
    exp = re.sub("\\\\n", "",exp, flags=re.IGNORECASE)
    exp = re.sub("\\$", "",exp , flags=re.IGNORECASE)
    return exp

In [3]:
df["processed"] = df["text"].apply(lambda x: clean_exp(x)).apply(lambda x: tokenize_latex(x))

In [4]:
from gensim import corpora
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
equations = df["processed"].tolist()
dictionary = corpora.Dictionary(equations)

corpus = [dictionary.doc2bow(eq) for eq in equations]
corpora.MmCorpus.serialize('/tmp/equations.mm', corpus)
# our vector space model
print(corpus[1:5])

2017-11-30 09:44:50,477 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-11-30 09:44:51,029 : INFO : adding document #10000 to Dictionary(15769 unique tokens: [u'\\xi\\leq', u'4E+6F', u'0.000004', u'-P\\cdot\\frac', u'i|S']...)
2017-11-30 09:44:51,849 : INFO : adding document #20000 to Dictionary(25840 unique tokens: [u'\\xi\\leq', u'eF', u'q-1', u'q-2', u'+i+1']...)
2017-11-30 09:44:52,454 : INFO : adding document #30000 to Dictionary(35023 unique tokens: [u'\\xi\\leq', u'eF', u'-5AB\\sqrt', u'q-1', u'q-2']...)
2017-11-30 09:44:52,950 : INFO : adding document #40000 to Dictionary(43511 unique tokens: [u'\\xi\\leq', u'eF', u'-5AB\\sqrt', u'Model~III.~~~~~~~c', u'q-1']...)
2017-11-30 09:44:53,437 : INFO : adding document #50000 to Dictionary(53030 unique tokens: [u'\\xi\\leq', u'eF', u'-5AB\\sqrt', u'Model~III.~~~~~~~c', u'q-1']...)
2017-11-30 09:44:53,731 : INFO : built Dictionary(57545 unique tokens: [u'\\xi\\leq', u'eF', u'-5AB\\sqrt', u'Model~III.~~~~~~~c', u'q-1'

[[(5, 1), (6, 1), (8, 3), (12, 4), (13, 3), (14, 4), (15, 4), (17, 1), (18, 3), (19, 3), (21, 1), (22, 1), (23, 1)], [(0, 1), (3, 1), (5, 1), (6, 1), (8, 5), (9, 1), (10, 4), (12, 3), (13, 2), (14, 5), (15, 5), (17, 2), (18, 3), (19, 5), (20, 1), (22, 2), (24, 1), (25, 1), (26, 1)], [(2, 1), (4, 1), (5, 1), (6, 1), (12, 2), (14, 6), (15, 6), (17, 1), (19, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1)], [(1, 1), (2, 1), (4, 1), (5, 3), (6, 3), (7, 3), (8, 1), (14, 6), (15, 6), (17, 1), (19, 2), (20, 2), (29, 1), (40, 1), (41, 2), (42, 1), (43, 2)]]


In [5]:
from gensim import corpora, models, similarities, matutils
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
from gensim import similarities
index = similarities.Similarity("index/", corpus_tfidf, len(dictionary.keys()), num_best = 100)

2017-11-30 09:45:05,757 : INFO : collecting document frequencies
2017-11-30 09:45:05,763 : INFO : PROGRESS: processing document #0
2017-11-30 09:45:05,870 : INFO : PROGRESS: processing document #10000
2017-11-30 09:45:05,997 : INFO : PROGRESS: processing document #20000
2017-11-30 09:45:06,089 : INFO : PROGRESS: processing document #30000
2017-11-30 09:45:06,179 : INFO : PROGRESS: processing document #40000
2017-11-30 09:45:06,366 : INFO : PROGRESS: processing document #50000
2017-11-30 09:45:06,437 : INFO : calculating IDF weights for 56379 documents and 57544 features (1222940 matrix non-zeros)
2017-11-30 09:45:06,501 : INFO : starting similarity index under index/
2017-11-30 09:45:12,063 : INFO : PROGRESS: fresh_shard size=10000
2017-11-30 09:45:16,092 : INFO : PROGRESS: fresh_shard size=20000
2017-11-30 09:45:19,773 : INFO : PROGRESS: fresh_shard size=30000
2017-11-30 09:45:20,786 : INFO : creating sparse index
2017-11-30 09:45:20,787 : INFO : creating sparse matrix from corpus
201

In [6]:
docs = ["".join(eq) for eq in df["processed"].tolist()]

In [7]:
class Query:
    '''
    The important thing here is that the labels on the data coincide with the labels on the columns.
    '''
    def __init__(self, index, docs, dictionary, tokenize_latex, cols):
        self.index = index
        self.docs = docs
        self.dictionary = dictionary
        self.tokenize_latex = tokenize_latex
        self.columns = cols
    def _convert_query(self, query):
        query = self.dictionary.doc2bow(self.tokenize_latex(query))
        sims = self.index[query]
        neighbors = sorted(sims, key=lambda item: -item[1])
        neighbors = {"neighbors":[{self.columns[0]: {"data": self.docs[n[0]], "fmt": "math"}, self.columns[1]: {"data": float(n[1])}} for n in neighbors]} if neighbors else {"neighbors": []}
        return neighbors
    def query(self, q):
        return self._convert_query(q)
        



In [8]:
index

<gensim.similarities.docsim.Similarity at 0x109585650>

In [9]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x109616550>

In [10]:
q = Query(index, docs, dictionary, tokenize_latex, ["neighbor", "similarity_score"])
q.query("\\frac")

2017-11-30 09:45:32,947 : INFO : creating sparse index
2017-11-30 09:45:32,953 : INFO : creating sparse matrix from corpus
2017-11-30 09:45:32,958 : INFO : PROGRESS: at document #0/23611
2017-11-30 09:45:33,760 : INFO : PROGRESS: at document #10000/23611
2017-11-30 09:45:34,391 : INFO : PROGRESS: at document #20000/23611
2017-11-30 09:45:34,631 : INFO : created <23611x57545 sparse matrix of type '<type 'numpy.float32'>'
	with 515393 stored elements in Compressed Sparse Row format>
2017-11-30 09:45:34,632 : INFO : creating sparse shard #1
2017-11-30 09:45:34,633 : INFO : saving index shard to index/.1
2017-11-30 09:45:34,635 : INFO : saving SparseMatrixSimilarity object under index/.1, separately None
2017-11-30 09:45:34,658 : INFO : saved index/.1
2017-11-30 09:45:34,662 : INFO : loading SparseMatrixSimilarity object from index/.1
2017-11-30 09:45:34,675 : INFO : loaded index/.1


{'neighbors': [{'neighbor': {'data': u'c=c_{1}+c_{2}', 'fmt': 'math'},
   'similarity_score': {'data': 0.6769928336143494}},
  {'neighbor': {'data': u'v_{c}=\\frac{R_{c}}{\\tau_{c}}', 'fmt': 'math'},
   'similarity_score': {'data': 0.6608393788337708}},
  {'neighbor': {'data': u'A_{c_{1}c_{2}}=\\frac{N(c_{1}c_{2}>0)-N(c_{1}c_{2}<0)}{N(c_{1}c_{2}>0)+N(c_{1}c_{2}<0)},',
    'fmt': 'math'},
   'similarity_score': {'data': 0.5799717903137207}},
  {'neighbor': {'data': u'\\frac{u_{c}}{T_{c}}\\geq1', 'fmt': 'math'},
   'similarity_score': {'data': 0.5707259774208069}},
  {'neighbor': {'data': u'\\Phi(c)=\\frac{2}{c}-\\phi(c)\\,.', 'fmt': 'math'},
   'similarity_score': {'data': 0.5399872064590454}},
  {'neighbor': {'data': u'T_d\\approx\\frac{FL_{c}}{\\pic}=\\frac{1}{2\\pi\\Deltau_c},',
    'fmt': 'math'},
   'similarity_score': {'data': 0.5264416933059692}},
  {'neighbor': {'data': u'v_c=\\left(\\frac{F_c}{\\rho}\\right)^{\\frac{1}{3}};',
    'fmt': 'math'},
   'similarity_score': {'data': 

In [11]:
from mathviz_hopper.src.table import Table
t = Table(q)
t.print_ipython()

# TODO: lambda function to convert each column
# Schema:
# also, enable non-autocomplete selections (the models should be able to handle this)
# also, add a warning if the settings are still loading, and other notifications.
# Data: [{Column A: {name: A, index: [], filter_func: lambda x : f(x)}} , {Column B: {name: B, index: [], filter_func: lambda x : f(x)}} 

Bottle v0.13-dev server starting up (using MyWSGIRefServer())...
Listening on http://localhost:8081/
Hit Ctrl-C to quit.



127.0.0.1 - - [30/Nov/2017 09:45:37] "GET /settings HTTP/1.1" 200 11824016


{'neighbors': [{'similarity_score': {'data': 0.9574124813079834}, 'neighbor': {'fmt': 'math', 'data': u'c=c_{1}+c_{2}'}}, {'similarity_score': {'data': 0.8170486092567444}, 'neighbor': {'fmt': 'math', 'data': u'v_{c}=\\frac{R_{c}}{\\tau_{c}}'}}, {'similarity_score': {'data': 0.7862759828567505}, 'neighbor': {'fmt': 'math', 'data': u'A_{c_{1}c_{2}}=\\frac{N(c_{1}c_{2}>0)-N(c_{1}c_{2}<0)}{N(c_{1}c_{2}>0)+N(c_{1}c_{2}<0)},'}}, {'similarity_score': {'data': 0.723685085773468}, 'neighbor': {'fmt': 'math', 'data': u's_n(c_0,c_1,\\ldots,c_{n-1})=(c_{n-1},c_0,\\ldots,c_{n-2}),'}}, {'similarity_score': {'data': 0.679811954498291}, 'neighbor': {'fmt': 'math', 'data': u'd(p_q,c_j)-m_j<2\\left(\\min_{c_i\\inC,c_iec_j}d(c_i,c_j)\\right)'}}, {'similarity_score': {'data': 0.6731098890304565}, 'neighbor': {'fmt': 'math', 'data': u'\\mathbf{c}(t)=(c_{1}(t),c_{2}(t),...,c_{N}(t)),'}}, {'similarity_score': {'data': 0.6695380806922913}, 'neighbor': {'fmt': 'math', 'data': u'\\lambda_{c}^{2}=z_{c}^{\\gamma

127.0.0.1 - - [30/Nov/2017 09:45:41] "POST /query HTTP/1.1" 200 18252


{'neighbors': [{'similarity_score': {'data': 0.2996787428855896}, 'neighbor': {'fmt': 'math', 'data': u'\\frac{0.5R_{\\rmB}}{v_{\\rm\\infty}}\\sim\\frac{10^{11}\\,cm}{1100\\,km/s}\\sim1000\\,s'}}, {'similarity_score': {'data': 0.21038341522216797}, 'neighbor': {'fmt': 'math', 'data': u'{\\frac{{\\eta}}{{s}}}={\\frac{{1}}{{4\\pi}}}\\,.'}}, {'similarity_score': {'data': 0.20868271589279175}, 'neighbor': {'fmt': 'math', 'data': u'{C}=\\left\\{{\\mathbf{{e}}}_{{1}},\\ldots,{\\mathbf{{e}}}_{{k}},{{{\\mathbf{{p}}}_{{0}}}|_{{{S}_{{1,1}}}}},\\ldots,{{{\\mathbf{{p}}}_{{0}}}|_{{{S}_{{1,{l_{{1}}}}}}}},{{{\\mathbf{{p}}}_{{0}}}|_{{{S}_{{2,1}}}}},\\ldots,{{{\\mathbf{{p}}}_{{0}}}|_{{{S}_{{2,{l_{{2}}}}}}}},\\ldots,{{{\\mathbf{{p}}}_{{0}}}|_{{{S}_{{m,1}}}}},\\ldots,{{{\\mathbf{{p}}}_{{0}}}|_{{{S}_{{m,{l_{{m}}}}}}}},{\\mathbf{{p}}}_{{1}},\\ldots,{\\mathbf{{p}}}_{{d-2}}\\right\\}.'}}, {'similarity_score': {'data': 0.20802809298038483}, 'neighbor': {'fmt': 'math', 'data': u'\\delta_{u}\\,=\\,-\\frac{1}{\\

127.0.0.1 - - [30/Nov/2017 09:45:52] "POST /query HTTP/1.1" 200 27735


{'neighbors': [{'similarity_score': {'data': 0.38589873909950256}, 'neighbor': {'fmt': 'math', 'data': u'\\frac{1}{1+\\sigma_i}+\\beta(\\sigma_{i}-\\sigma_{i,A})=0\\hspace{0.1cm}s.t.\\hspace{0.1cm}\\sigma_i\\ge0.'}}, {'similarity_score': {'data': 0.3413463234901428}, 'neighbor': {'fmt': 'math', 'data': u'W_i=\\frac{1}{\\sigma_i^2}.'}}, {'similarity_score': {'data': 0.3395869731903076}, 'neighbor': {'fmt': 'math', 'data': u'\\xi^{(4)i}=(A^{i}{_{a}},\\beta^{i},e^{i}{_{a}},\\alpha^{i},\\lambda^{i}{_{a}},\\Gamma^{i},\\varphi,\\rho_{i},\\omega_{i},\\tau_{i},\\sigma),'}}, {'similarity_score': {'data': 0.3220563232898712}, 'neighbor': {'fmt': 'math', 'data': u'\\sum_{i=1}^{2}I_{A_{i}C_{i}}\\leq\\sum_{i=1}^{2}H(q_{i}^{t}).'}}, {'similarity_score': {'data': 0.3080526292324066}, 'neighbor': {'fmt': 'math', 'data': u'\\[\\sum_{i=1}^n\\frac{(y_1(t_i)-V(t_i))^2}{\\sigma_1^2}+\\frac{(y_2(t_i)-R(t_i))^2}{\\sigma_2^2}.\\]'}}, {'similarity_score': {'data': 0.3079829812049866}, 'neighbor': {'fmt': 'math

127.0.0.1 - - [30/Nov/2017 09:45:55] "POST /query HTTP/1.1" 200 21301


In [10]:
import requests
r = requests.post('http://localhost:8081/query', json={"query": "\\begin{equation}\n\\Phi_{z}(L) = \\sum_{i=1}^{N} \\frac{1}{C_{i} \\times V_{\\rm max, i}} ,\n\\label{EQ1}\n\\end{equation}\n"})

{'neighbors': [{'similarity_score': {'data': 0.2762615382671356}, 'neighbor': {'fmt': 'math', 'data': u'\\begin{aligned}\\min_{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\gamma\\|L\\|_{1}+\\lambda\\|L\\|^{2}_{F}\\\\\\end{aligned}'}}, {'similarity_score': {'data': 0.2758598029613495}, 'neighbor': {'fmt': 'math', 'data': u'\\begin{aligned}\\min_{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\mu\\sum_{k=1}^{K}\\sum_{g=1}^{G}\\|s_{k}^{g}\\|_{2}\\\\\\end{aligned}'}}, {'similarity_score': {'data': 0.2687106728553772}, 'neighbor': {'fmt': 'math', 'data': u'\\begin{aligned}\\min_{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\mu\\sum_{k=1}^{K}\\sum_{g=1}^{G}\\|s_{k}^{g}\\|_{2}+\\gamma\\|L\\|_{1}+\\lambda\\|L\\|^{2}_{F}\\\\\\end{aligned}'}}]}


127.0.0.1 - - [28/Nov/2017 14:12:21] "POST /query HTTP/1.1" 200 914


In [11]:
r.json()

{u'neighbors': [{u'neighbor': {u'data': u'\\begin{aligned}\\min_{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\gamma\\|L\\|_{1}+\\lambda\\|L\\|^{2}_{F}\\\\\\end{aligned}',
    u'fmt': u'math'},
   u'similarity_score': {u'data': 0.2762615382671356}},
  {u'neighbor': {u'data': u'\\begin{aligned}\\min_{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\mu\\sum_{k=1}^{K}\\sum_{g=1}^{G}\\|s_{k}^{g}\\|_{2}\\\\\\end{aligned}',
    u'fmt': u'math'},
   u'similarity_score': {u'data': 0.2758598029613495}},
  {u'neighbor': {u'data': u'\\begin{aligned}\\min_{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\mu\\sum_{k=1}^{K}\\sum_{g=1}^{G}\\|s_{k}^{g}\\|_{2}+\\gamma\\|L\\|_{1}+\\lambda\\|L\\|^{2}_{F}\\\\\\end{aligned}',
    u'fmt': u'math'},
   u'similarity_score': {u'data': 0.2687106728553772}}]}

In [12]:
import requests
r = requests.get('http://localhost:8081/settings')

127.0.0.1 - - [28/Nov/2017 14:12:21] "GET /settings HTTP/1.1" 200 1007


In [13]:
r.json()

{u'columns': [{u'Header': u'neighbor', u'accessor': u'neighbor'},
  {u'Header': u'similarity_score', u'accessor': u'similarity_score'}],
 u'docs': {u'\\': {u'b': {u'e': {u'g': {u'i': {u'n': {u'{': {u'a': {u'l': {u'i': {u'g': {u'n': {u'e': {u'd': {u'}': {u'\\': {u'm': {u'i': {u'n': {u'_': {u'{': {},
                      u'{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\gamma\\|L\\|_{1}+\\lambda\\|L\\|^{2}_{F}\\\\\\end{aligned}': {u'full_word': 1},
                      u'{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\mu\\sum_{k=1}^{K}\\sum_{g=1}^{G}\\|s_{k}^{g}\\|_{2}+\\gamma\\|L\\|_{1}+\\lambda\\|L\\|^{2}_{F}\\\\\\end{aligned}': {u'full_word': 1},
                      u'{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\mu\\sum_{k=1}^{K}\\sum_{g=1}^{G}\\|s_{k}^{g}\\|_{2}\\\\\\end{aligned}': {u'full_word': 1}}}}}},
   

In [14]:

def insert(st, trie):
    i = 0
    for s in st:
        if s not in trie.keys(): 
            trie[s] = {}
        if i == 20:
            break
        trie = trie[s]
        i+=1
        
    if i == 20: 
        trie[st[i:]] = {}
        trie = trie[st[i:]]
    trie["full_word"] = 1
        
    
def construct_trie(list_of_str):
    trie = {}
    for st in list_of_str:
        insert(st, trie)
    return trie
    

In [15]:
docs_trie = construct_trie(clean_docs)


In [16]:

opts = {
    "columns": [
            {
                "Header": "Word",
                "accessor": "word"
            },
            {
                "Header": "Similarity",
                "accessor": "sim"
            }
   ],
    "port": "8081",
    "docs": docs_trie
}

In [17]:
import json
with open("mathviz_hopper/webpage/mathviz-js-components/public/settings.json", "w+") as f:
    json.dump(opts, f)

127.0.0.1 - - [28/Nov/2017 14:12:21] "GET /settings HTTP/1.1" 200 1007


IOError: [Errno 2] No such file or directory: 'mathviz_hopper/webpage/mathviz-js-components/public/settings.json'

{'neighbors': [{'similarity_score': {'data': 0.32032012939453125}, 'neighbor': {'fmt': 'math', 'data': u'\\begin{aligned}\\min_{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\mu\\sum_{k=1}^{K}\\sum_{g=1}^{G}\\|s_{k}^{g}\\|_{2}\\\\\\end{aligned}'}}, {'similarity_score': {'data': 0.31747668981552124}, 'neighbor': {'fmt': 'math', 'data': u'\\begin{aligned}\\min_{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\mu\\sum_{k=1}^{K}\\sum_{g=1}^{G}\\|s_{k}^{g}\\|_{2}+\\gamma\\|L\\|_{1}+\\lambda\\|L\\|^{2}_{F}\\\\\\end{aligned}'}}, {'similarity_score': {'data': 0.2810024917125702}, 'neighbor': {'fmt': 'math', 'data': u'\\begin{aligned}\\min_{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\gamma\\|L\\|_{1}+\\lambda\\|L\\|^{2}_{F}\\\\\\end{aligned}'}}]}


127.0.0.1 - - [28/Nov/2017 14:12:44] "POST /query HTTP/1.1" 200 916


In [None]:
def word_exists(trie, word):
    
    for i, s in enumerate(word):        
        if s in trie.keys(): 
            trie = trie[s]
        else: return False
            
        if i == 19: 
            break
            
    if i == 19:
        s = word[i+1:]
        if s not in trie.keys(): return False
        trie = trie[s]

    return True if 'full_word' in trie.keys() else False
        
        

In [None]:
word_exists(trie, "fdakfkajshdfjkahsdfkjhadfdfdfdffdfdsdjkfhaskj")

In [None]:
docs = ["".join(eq) for eq in df["processed"].tolist()]
docs_trie = construct_trie(docs)