In [9]:
# settings

# Fraction of a document's distinct tokens that preprocessing() treats as
# "most frequent" and removes.
THRESHOLD = 0.01
# Number of tokens per chunk when preprocessing() re-slices a document.
TOKEN_LENGTH = 300
# File preprocessing() writes its result to; the next cell reads it back.
JSON_FILE = "database.json"
# File the gensim dictionary is saved to.
DICT_NAME = "vedabase.dict"

#imports

import os
import subprocess
import pandas as pd

from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import FreqDist

import re
import string

import json

#preprocessing

def _normalize_lines(raw_text, stop_words, stemmer, lemmatizer):
    """Tokenize ``raw_text`` line by line and normalize every token.

    Each line goes through, in order: NLTK word tokenization, punctuation
    stripping (tokens that become empty are dropped), lowercasing, stopword
    removal, Porter stemming and WordNet lemmatization.

    Returns a list with one token list per input line.
    """
    punctuation = re.compile('[{}]'.format(re.escape(string.punctuation)))
    normalized = []
    for raw_line in raw_text.split('\n'):
        stripped = (punctuation.sub('', token) for token in word_tokenize(raw_line))
        lowered = (token.lower() for token in stripped if token)
        kept = (token for token in lowered if token not in stop_words)
        normalized.append([lemmatizer.lemmatize(stemmer.stem(token)) for token in kept])
    return normalized


def _drop_most_frequent(lines, threshold):
    """Remove the most frequent tokens of a document from every line.

    The number of removed token types is ``int(distinct_tokens * threshold)``,
    so small documents may have nothing removed.
    """
    fdist = FreqDist(token for line in lines for token in line)
    # A set makes the membership test below constant-time.
    frequent = {word for word, _ in fdist.most_common(int(len(fdist) * threshold))}
    return [[token for token in line if token not in frequent] for line in lines]


def _chunk_and_label(lines, token_length):
    """Re-slice a document into fixed-size chunks and label each source line.

    Returns ``(chunks, labels)`` where ``chunks`` are consecutive slices of
    ``token_length`` tokens (the last one may be shorter) and ``labels[i]`` is
    ``(start, end)``: the cumulative token count integer-divided by
    ``token_length`` before and after line ``i``.
    """
    all_tokens = [token for line in lines for token in line]
    chunks = [all_tokens[start:start + token_length]
              for start in range(0, len(all_tokens), token_length)]

    labels = []
    seen = 0
    previous = 0
    for line in lines:
        seen += len(line)
        current = seen // token_length
        # NOTE(review): when `seen` lands exactly on a multiple of
        # token_length, `current` names the chunk *after* this line's last
        # token (possibly one past the final chunk) -- confirm consumers
        # expect that.
        labels.append((previous, current))
        previous = current
    return chunks, labels


def preprocessing():
    """Preprocess every ``.txt`` file in the working directory into JSON_FILE.

    For each file the text is normalized line by line, the document's most
    frequent tokens (controlled by THRESHOLD) are removed, and the remaining
    tokens are cut into chunks of TOKEN_LENGTH tokens. One record per file is
    written to JSON_FILE with the keys ``'labels'``, ``'tokens'`` and
    ``'source'``.

    Returns None; the result is the JSON file.
    """
    # The notebook imports individual names from nltk but never binds the
    # `nltk` package itself, so `nltk.corpus.stopwords` raised NameError.
    # Import the corpus reader directly.
    from nltk.corpus import stopwords

    # Match on the extension instead of the substring 'txt', and sort because
    # os.listdir() order is unspecified while later cells use the position of
    # a document in the JSON list as its id.
    files = sorted(name for name in os.listdir() if name.endswith('.txt'))

    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    documents = []
    for fname in files:
        with open(fname, 'r') as f:
            raw_text = f.read()

        lines = _normalize_lines(raw_text, stop_words, stemmer, lemmatizer)
        lines = _drop_most_frequent(lines, THRESHOLD)
        chunks, labels = _chunk_and_label(lines, TOKEN_LENGTH)

        documents.append({
            'labels' : labels,
            'tokens' : chunks,
            'source' : fname
        })

    with open(JSON_FILE, 'w') as f:
        json.dump(documents, f)

#preprocessing()

In [10]:
from gensim import corpora

# Read back the records written by preprocessing().
# The handle is deliberately not called `f`: a later cell stores the Annoy
# vector dimension in the global `f`, and re-running this cell would
# otherwise replace it with a closed file object.
with open(JSON_FILE, 'r') as db_file:
    db = json.load(db_file)
    

In [11]:
# Flatten the per-document chunk lists into one list of chunks; the order
# (documents in `db` order, chunks in document order) is relied on later.
texts = [chunk for record in db for chunk in record['tokens']]

# Build the token <-> id mapping over all chunks and persist it.
dictionary = corpora.Dictionary(texts)
dictionary.save(DICT_NAME)

In [12]:
# One bag-of-words vector per chunk, in the same order as `texts`.
corpus = list(map(dictionary.doc2bow, texts))

In [14]:
from gensim.models.hdpmodel import HdpModel

# HDP training is stochastic; fix the seed so a Restart & Run All reproduces
# the same topics (and therefore the same topic vectors downstream).
HDP_RANDOM_SEED = 0

hdp_model = HdpModel(corpus, dictionary, random_state=HDP_RANDOM_SEED)
# LDA model derived from the fitted HDP model; used below to get a topic
# distribution for every chunk.
lda_model = hdp_model.suggested_lda_model()

In [34]:
# Width of the dense vectors comes from the model instead of a hard-coded
# 150, so it stays correct if the HDP truncation level changes.
num_topics = lda_model.num_topics

topic_vectors = []

for doc_topics in lda_model[corpus]:
    # The model returns a sparse list of (topic_id, probability) pairs;
    # topics it omits keep probability 0.0 in the dense vector.
    distribution = [0.0] * num_topics
    for topic_id, probability in doc_topics:
        distribution[topic_id] = probability
    topic_vectors.append(distribution)

In [36]:
# Label every chunk with (chunk index inside its document, document index),
# walking `db` in the same order that `texts` was built in.
topic_labels = [
    (chunk_idx, doc_idx)
    for doc_idx, record in enumerate(db)
    for chunk_idx, _ in enumerate(record['tokens'])
]

# NOTE(review): this rebinds `topic_vectors` from a list of vectors to a list
# of (label, vector) pairs, so the cell must not be run twice in a row.
topic_vectors = list(zip(topic_labels, topic_vectors))


35406
35406


In [45]:
from annoy import AnnoyIndex
import random

# File the Annoy index is saved to / loaded from.
ANN_FILE = 'test.ann'

# Filled by the query cell below: one neighbour list per chunk.
nearest_neighbors = []
# Drop the labels again and keep only the dense topic vectors.
vec = [vector for _, vector in topic_vectors]

# Dimensionality of the vectors stored in the index.
f = len(vec[0])

def build_ann(n_trees=2000):
    """Build an Annoy index over the global ``vec`` and save it to ANN_FILE.

    Item ``i`` of the index is ``vec[i]``; the index dimension is the global
    ``f``.

    Parameters
    ----------
    n_trees : int, optional
        Number of trees passed to ``AnnoyIndex.build`` (default 2000, the
        value this notebook always used).
    """
    # Pass the metric explicitly: relying on the implicit default is
    # deprecated in Annoy. 'angular' is what the implicit default selected,
    # so indexes built here stay compatible with existing readers.
    index = AnnoyIndex(f, 'angular')  # Length of item vector that will be indexed
    for item_id, vector in enumerate(vec):
        index.add_item(item_id, vector)
    index.build(n_trees)
    index.save(ANN_FILE)

#build_ann()

In [58]:
%%time

# Number of neighbours requested per chunk.
N_NEIGHBORS = 200

# Metric given explicitly (the implicit default is deprecated); 'angular'
# matches what the index was built with when no metric was passed.
u = AnnoyIndex(f, 'angular')
u.load(ANN_FILE) # super fast, will just mmap the file

# Rebuild the list from scratch instead of appending to the global created in
# another cell, so re-running this cell does not duplicate every entry.
nearest_neighbors = [u.get_nns_by_item(i, N_NEIGHBORS) for i in range(len(vec))]

In [59]:
# Pair every chunk label with that chunk's neighbour list.
# NOTE(review): `topic_vectors` now holds neighbour ids rather than topic
# vectors; the name is kept because the export cell below dumps it.
topic_vectors = [
    (label, neighbors)
    for label, neighbors in zip(topic_labels, nearest_neighbors)
]

In [62]:
# File the (label, neighbours) pairs are exported to.
VECTORS = 'vectors.json'

# The handle is deliberately not called `f`: the global `f` holds the Annoy
# vector dimension, and rebinding it to a file object here makes any later
# re-run of `AnnoyIndex(f)` fail.
with open(VECTORS, 'w') as vectors_file:
    json.dump(topic_vectors, vectors_file)