In [33]:
import json
import gensim
import numpy as np
import re
import string
from tqdm import tqdm
from scipy import spatial

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [34]:
#Utils
def readjson(filename): #Reads JSON data
    """Load and return the parsed contents of the JSON file at `filename`.

    The file is decoded as UTF-8 (the JSON interchange encoding) instead of
    the platform default, so the result does not depend on the host locale.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the contents are not valid JSON
            (json.JSONDecodeError is a ValueError subclass).
    """
    # The context manager closes the handle even when json.load raises.
    with open(filename, encoding='utf-8') as reader:
        return json.load(reader)

def writejson(data,filename):
    """Serialise `data` as JSON into `filename`, overwriting any existing file.

    Raises:
        OSError: if the file cannot be opened for writing.
        TypeError: if `data` contains objects json cannot serialise.
    """
    # The context manager flushes and closes the handle even when json.dump raises.
    with open(filename, 'w', encoding='utf-8') as writer:
        json.dump(data, writer)

def textclean(text): #Cleans text
    """Return a lower-cased, ASCII-only copy of `text` with noise stripped.

    The substitutions run in the order listed below. Only runs of two or
    more whitespace characters are collapsed to a single space; an isolated
    whitespace character (including a lone newline) is left as it is, and
    the result is not trimmed.
    """
    # Lower-case first, then drop every character that has no ASCII encoding.
    cleaned = text.lower().encode('ascii', 'ignore').decode()

    substitutions = (
        (r'https*\S+', ' '),                                # URLs
        (r'@\S+', ' '),                                     # @mentions
        (r'#\S+', ' '),                                     # #hashtags
        (r'\'\w+', ''),                                     # apostrophe suffixes such as 's
        ('[%s]' % re.escape(string.punctuation), ' '),      # remaining punctuation
        (r'\w*\d+\w*', ''),                                 # any word containing a digit
        (r'\s{2,}', ' '),                                   # runs of 2+ whitespace characters
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned

def corpusort(text): #Sorts input into documents and titles
    """Split corpus entries into (documents, titles).

    Each entry's first element is inspected: when its first three
    characters are 'doc' the entry goes to the documents list, otherwise to
    the titles list. Entries are kept whole and in their original order.
    """
    doc, title = [], []
    for entry in tqdm(text):
        bucket = doc if entry[0][0:3] == 'doc' else title
        bucket.append(entry)
    return doc, title

def cleandocs(docs, wrap_tags=False): #Tokenises and preps the document data
    """Turn raw document entries into gensim TaggedDocument objects.

    Args:
        docs: sequence of entries where entry[0] is the document id and
            entry[1] is the raw text.
        wrap_tags: when True, the id is passed to TaggedDocument as a
            one-element list ``[id]``. When False (the default, kept for
            backward compatibility) the id is passed as a bare string.

    Returns:
        A list with one TaggedDocument per input entry, in input order.

    NOTE(review): gensim documents `tags` as a list of tags. With the default
    bare string, the training log in this notebook reports only 13 tags and
    most_similar returns single-character keys, i.e. each id appears to be
    split into characters. Pass wrap_tags=True to get one tag per document;
    code that compares the stored tag directly against an id string must then
    read tags[0] instead.
    """
    clean_docs = []
    for entry in tqdm(docs):
        tokens = gensim.utils.simple_preprocess(textclean(entry[1]))
        tags = [entry[0]] if wrap_tags else entry[0]
        clean_docs.append(gensim.models.doc2vec.TaggedDocument(tokens, tags))

    return clean_docs

def cleantitles(titles): #Tokenises and preps the title data
    """Tokenise title entries.

    Each input entry holds the title id at index 0 and the raw text at
    index 1. The result holds one ``[tokens, id]`` pair per entry, in input
    order, where tokens come from simple_preprocess applied to the cleaned
    text.
    """
    tokenise = gensim.utils.simple_preprocess
    return [[tokenise(textclean(entry[1])), entry[0]] for entry in tqdm(titles)]

def cosinerank(primary,array):
    """Rank candidates by cosine distance to `primary`, closest first.

    Args:
        primary: token list for the query text.
        array: sequence of candidates where item[0] is a token list and
            item[1] is the candidate's label.

    Returns:
        A list of ``[distance, label]`` pairs sorted by ascending distance.
        Candidates with equal distance keep their input order.

    Vectors are inferred with the notebook-level `model`, which must exist
    before this function is called.
    """
    vector1 = model.infer_vector(primary)
    dist = [
        [spatial.distance.cosine(vector1, model.infer_vector(item[0])), item[1]]
        for item in array
    ]

    # scipy's cosine() is a distance (1 - cosine similarity), so the best
    # match has the SMALLEST value; sorting in descending order would put the
    # least similar candidate first.
    return sorted(dist, key=lambda pair: pair[0])

In [3]:
#Main
# Load the raw corpus and split its entries into documents and titles.
corpus = readjson('data/corpus.json')
docs, titles = corpusort(corpus)

# Tokenise both sets. `docs` and `titles` are rebound here from the raw
# entries to their cleaned forms, so later cells see only the cleaned data.
docs = cleandocs(docs)
titles = cleantitles(titles)

# Create the Doc2Vec model (50-dimensional vectors, words seen fewer than
# 2 times ignored, 40 epochs) and build its vocabulary from the cleaned docs.
# NOTE(review): cleandocs passes each document id to TaggedDocument as a bare
# string; the build_vocab log reports only 13 tags and most_similar later
# returns single-character keys, which suggests ids are being split into
# characters. Confirm whether tags should be one-element lists.
model = gensim.models.doc2vec.Doc2Vec(vector_size = 50, min_count = 2, epochs = 40)
model.build_vocab(docs)

100%|███████████████████████████████████████████████████████████████████████| 35080/35080 [00:00<00:00, 3347046.96it/s]
100%|███████████████████████████████████████████████████████████████████████████| 17540/17540 [00:24<00:00, 723.72it/s]
100%|█████████████████████████████████████████████████████████████████████████| 17540/17540 [00:00<00:00, 35132.42it/s]
2021-08-28 00:29:06,546 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3)', 'datetime': '2021-08-28T00:29:06.545233', 'gensim': '4.0.1', 'python': '3.8.9 (tags/v3.8.9:a743f81, Apr  2 2021, 11:10:41) [MSC v.1928 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}
2021-08-28 00:29:06,547 : INFO : collecting all words and their counts
2021-08-28 00:29:06,547 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2021-08-28 00:29:07,510 : INFO : PROGRESS: at example #10000, processed 6123051 words (6359149/s), 167252 word types, 13 tags
2021-08-28 00:29

In [4]:
#Model Training
# Train on the same tagged documents that were passed to build_vocab; the
# example count and epoch count are read back from the model itself.
model.train(docs, total_examples=model.corpus_count, epochs=model.epochs)

2021-08-28 00:29:10,268 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 125583 vocabulary and 50 features, using sg=0 hs=0 sample=0.001 negative=5 window=5', 'datetime': '2021-08-28T00:29:10.268637', 'gensim': '4.0.1', 'python': '3.8.9 (tags/v3.8.9:a743f81, Apr  2 2021, 11:10:41) [MSC v.1928 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'train'}
2021-08-28 00:29:11,270 : INFO : EPOCH 1 - PROGRESS: at 15.59% examples, 1295294 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:29:12,274 : INFO : EPOCH 1 - PROGRESS: at 31.13% examples, 1290699 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:29:13,278 : INFO : EPOCH 1 - PROGRESS: at 46.67% examples, 1291765 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:29:14,285 : INFO : EPOCH 1 - PROGRESS: at 61.98% examples, 1294796 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:29:15,296 : INFO : EPOCH 1 - PROGRESS: at 76.98% examples, 1285953 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:29:16,306 : INFO : 

2021-08-28 00:29:58,641 : INFO : EPOCH 8 - PROGRESS: at 47.45% examples, 1312469 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:29:59,642 : INFO : EPOCH 8 - PROGRESS: at 63.02% examples, 1315849 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:30:00,646 : INFO : EPOCH 8 - PROGRESS: at 78.75% examples, 1316215 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:30:01,652 : INFO : EPOCH 8 - PROGRESS: at 94.45% examples, 1313936 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:30:01,993 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-08-28 00:30:01,994 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-08-28 00:30:01,995 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-08-28 00:30:01,995 : INFO : EPOCH - 8 : training on 10752561 raw words (8380655 effective words) took 6.4s, 1316440 effective words/s
2021-08-28 00:30:03,003 : INFO : EPOCH 9 - PROGRESS: at 15.95% examples, 1317772 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:30:0

2021-08-28 00:30:48,384 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-08-28 00:30:48,386 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-08-28 00:30:48,386 : INFO : EPOCH - 15 : training on 10752561 raw words (8380850 effective words) took 6.6s, 1276749 effective words/s
2021-08-28 00:30:49,388 : INFO : EPOCH 16 - PROGRESS: at 15.30% examples, 1273784 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:30:50,390 : INFO : EPOCH 16 - PROGRESS: at 30.43% examples, 1262793 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:30:51,395 : INFO : EPOCH 16 - PROGRESS: at 46.01% examples, 1272651 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:30:52,404 : INFO : EPOCH 16 - PROGRESS: at 61.40% examples, 1281203 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:30:53,408 : INFO : EPOCH 16 - PROGRESS: at 77.08% examples, 1288609 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:30:54,408 : INFO : EPOCH 16 - PROGRESS: at 93.02% examples, 1293071 words/s, in_qsize 5, 

2021-08-28 00:31:36,606 : INFO : EPOCH 23 - PROGRESS: at 46.67% examples, 1288851 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:31:37,609 : INFO : EPOCH 23 - PROGRESS: at 61.73% examples, 1287945 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:31:38,611 : INFO : EPOCH 23 - PROGRESS: at 77.17% examples, 1290306 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:31:39,613 : INFO : EPOCH 23 - PROGRESS: at 92.53% examples, 1285479 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:31:40,095 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-08-28 00:31:40,104 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-08-28 00:31:40,110 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-08-28 00:31:40,110 : INFO : EPOCH - 23 : training on 10752561 raw words (8381572 effective words) took 6.5s, 1285753 effective words/s
2021-08-28 00:31:41,116 : INFO : EPOCH 24 - PROGRESS: at 16.15% examples, 1335754 words/s, in_qsize 5, out_qsize 0
2021-08-28 0

2021-08-28 00:32:27,308 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-08-28 00:32:27,311 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-08-28 00:32:27,322 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-08-28 00:32:27,322 : INFO : EPOCH - 30 : training on 10752561 raw words (8380979 effective words) took 6.5s, 1297360 effective words/s
2021-08-28 00:32:28,334 : INFO : EPOCH 31 - PROGRESS: at 13.82% examples, 1144413 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:32:29,351 : INFO : EPOCH 31 - PROGRESS: at 29.28% examples, 1199721 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:32:30,352 : INFO : EPOCH 31 - PROGRESS: at 44.15% examples, 1211470 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:32:31,356 : INFO : EPOCH 31 - PROGRESS: at 58.53% examples, 1216053 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:32:32,364 : INFO : EPOCH 31 - PROGRESS: at 73.19% examples, 1218680 words/s, in_qsize 5, out_qsize 0
2021-08-28 0

2021-08-28 00:33:15,544 : INFO : EPOCH 38 - PROGRESS: at 31.95% examples, 1316846 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:33:16,547 : INFO : EPOCH 38 - PROGRESS: at 45.85% examples, 1262508 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:33:17,548 : INFO : EPOCH 38 - PROGRESS: at 59.37% examples, 1235943 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:33:18,551 : INFO : EPOCH 38 - PROGRESS: at 71.80% examples, 1198757 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:33:19,558 : INFO : EPOCH 38 - PROGRESS: at 84.59% examples, 1174994 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:33:20,558 : INFO : EPOCH 38 - PROGRESS: at 98.29% examples, 1171820 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:33:20,677 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-08-28 00:33:20,678 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-08-28 00:33:20,685 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-08-28 00:33:20,686 : INFO : EPOC

In [13]:
#Test the model
# Sanity check: infer a vector for the first title's tokens and list every
# stored document tag (topn=len(model.dv)) with its similarity to it.
print(titles[0][0])
vector = model.infer_vector(titles[0][0])
print (model.dv.most_similar([vector],topn=len(model.dv)))

['virginia', 'mixson', 'geraty']
[('9', 0.8143532276153564), ('8', 0.813854992389679), ('1', 0.8121998906135559), ('0', 0.8088366985321045), ('3', 0.8060170412063599), ('6', 0.8046064376831055), ('7', 0.8023767471313477), ('4', 0.8017450571060181), ('2', 0.7966170310974121), ('5', 0.7813290953636169), ('o', -0.5379925966262817), ('d', -0.5474048852920532), ('c', -0.699870228767395)]


In [36]:
#Read the test JSON and rank stuff
to_tag = readjson('data/test_q.json')

# Index tokens by id once, instead of converting docs/titles to object arrays
# and scanning them with np.where for every lookup. np.array() on these ragged
# rows raises on recent NumPy releases, and each scan was linear in the corpus
# size. setdefault keeps the first occurrence of an id, matching the old
# "first match" behaviour.
doc_tokens_by_id = {}
for entry in docs:
    # A tag may be stored as a bare string or as a list; use the first tag as the id.
    doc_id = entry[1] if isinstance(entry[1], str) else entry[1][0]
    doc_tokens_by_id.setdefault(doc_id, entry[0])

title_tokens_by_id = {}
for entry in titles:
    title_tokens_by_id.setdefault(entry[1], entry[0])

# Build a fresh list so the parsed input in to_tag is not mutated. Each result
# is a copy of the query with 'candidates' replaced by the ranked ids.
# An unknown candidate or title id raises KeyError.
results = []
for query in tqdm(to_tag):
    doc_tokens = [[doc_tokens_by_id[key], key] for key in query['candidates']]
    title_token = title_tokens_by_id[query['title_id']]

    sorted_docs = cosinerank(title_token, doc_tokens)

    results.append({**query, 'candidates': [pair[1] for pair in sorted_docs]})


100%|██████████████████████████████████████████████████████████████████████████████| 3508/3508 [17:44<00:00,  3.30it/s]


FileNotFoundError: [Errno 2] No such file or directory: '/data/yeet.json'

In [38]:
# Persist the queries with their re-ordered candidate lists.
writejson(results,'data/suggestion.json')