In [69]:
import json
import gensim
import numpy as np
import re
import string
from tqdm import tqdm
from scipy import spatial

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [110]:
#Utils
def readjson(filename): #Reads JSON data
    """Load and return the JSON content of ``filename``.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # The context manager closes the handle even when json.load raises,
    # which the manual open()/close() pair did not guarantee.
    with open(filename) as reader:
        return json.load(reader)

def writejson(data,filename):
    """Serialise ``data`` as JSON into ``filename``, overwriting any existing file.

    Args:
        data: Any value ``json.dump`` can serialise.
        filename: Destination path; opened in text write mode.

    Raises:
        OSError: If the file cannot be opened for writing.
        TypeError: If ``data`` contains a value that is not JSON serialisable.
    """
    # The context manager flushes and closes the handle even when json.dump
    # raises part-way through, which the manual open()/close() pair did not.
    with open(filename,'w') as writer:
        json.dump(data,writer)

def textclean(text): #Cleans text
    """Strip URLs, mentions, hashtags, punctuation and digit-bearing words from ``text``.

    The substitutions run in the order listed below; case is left untouched.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Runs of two or more whitespace characters are
        collapsed to a single space; leading/trailing whitespace is not trimmed.
    """
    substitutions = (
        (r'https*\S+', ' '),                                # "http..." up to the next whitespace
        (r'@\S+', ' '),                                     # @mentions
        (r'#\S+', ' '),                                     # #hashtags
        (r'\'\w+', ''),                                     # apostrophe suffixes such as 's
        ('[%s]' % re.escape(string.punctuation), ' '),      # any remaining punctuation
        (r'\w*\d+\w*', ''),                                 # words containing a digit
        (r'\s{2,}', ' '),                                   # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text

def corpusort(text): #Sorts input into documents and titles
    """Split corpus records into documents and titles.

    A record goes to the document list when the first three characters of its
    first element equal 'doc'; every other record goes to the title list.

    Args:
        text: Indexable sequence of records; each record's first element is
            its identifier.

    Returns:
        A ``(doc, title)`` pair of lists, each preserving input order.
    """
    documents, headings = [], []
    for idx in tqdm(range(len(text))):
        record = text[idx]
        bucket = documents if record[0][0:3] == 'doc' else headings
        bucket.append(record)
    return documents, headings

def cleandocs(docs): #Tokenises and preps the document data
    """Turn raw document records into gensim ``TaggedDocument`` objects.

    Each record's second element is cleaned with ``textclean`` and tokenised
    with ``gensim.utils.simple_preprocess``; its first element becomes the
    single tag of the resulting ``TaggedDocument``.

    Args:
        docs: Indexable sequence of records indexed as ``[0]`` (tag) and
            ``[1]`` (text).

    Returns:
        A list of ``TaggedDocument`` objects in input order.
    """
    tagged = []
    for idx in tqdm(range(len(docs))):
        record = docs[idx]
        words = gensim.utils.simple_preprocess(textclean(record[1]))
        tagged.append(gensim.models.doc2vec.TaggedDocument(words, [record[0]]))

    return tagged

def cleantitles(titles): #Tokenises and preps the title data
    """Tokenise title records into ``[tokens, tag]`` pairs.

    Each record's second element is cleaned with ``textclean`` and tokenised
    with ``gensim.utils.simple_preprocess``; the record's first element is
    carried along as the tag.

    Args:
        titles: Indexable sequence of records indexed as ``[0]`` (tag) and
            ``[1]`` (text).

    Returns:
        A list of two-element lists ``[tokens, tag]`` in input order.
    """
    prepared = []
    for idx in tqdm(range(len(titles))):
        record = titles[idx]
        tokens = gensim.utils.simple_preprocess(textclean(record[1]))
        prepared.append([tokens, record[0]])

    return prepared

def cosinerank(primary,array):
    """Rank entries of ``array`` by cosine distance to ``primary``.

    Vectors are inferred with the module-level ``model``: once for
    ``primary`` and once for the first element of every entry in ``array``.

    Args:
        primary: Token list passed to ``model.infer_vector``.
        array: Sequence of entries indexed as ``[0]`` (tokens) and ``[1]``
            (label).

    Returns:
        A list of ``[distance, label]`` pairs sorted by ascending distance.
    """
    reference = model.infer_vector(primary)
    scored = []
    for entry in array:
        candidate = model.infer_vector(entry[0])
        distance = spatial.distance.cosine(reference, candidate)
        scored.append([distance, entry[1]])

    # list.sort is stable, like sorted(), so ties keep their input order.
    scored.sort(key=lambda pair: pair[0])
    return scored

def docsearch(query):
    """Find the first entry of the module-level ``docs`` whose tag equals ``query``.

    Args:
        query: Value compared against the second element of each entry.

    Returns:
        ``[text, tag]`` for the first match, or the string "oof" when no
        entry matches.
    """
    matches = ([text, tag] for text, tag in docs if query == tag)
    return next(matches, "oof")

def titlesearch(query):
    """Find the first entry of the module-level ``titles`` whose tag equals ``query``.

    Args:
        query: Value compared against the second element of each entry.

    Returns:
        ``[text, tag]`` for the first match, or the string "oof" when no
        entry matches.
    """
    matches = ([text, tag] for text, tag in titles if query == tag)
    return next(matches, "oof")

def docmatch(ranks, recdocs):
    """Return the ids from ``ranks`` that are also in ``recdocs``, in rank order.

    The previous version kept a counter that was never incremented and, when
    ``recdocs`` was empty, fell into a branch calling a misspelled helper
    (NameError). For every input that did not crash, it returned the matching
    ids in rank order; that is the behaviour kept here, and an empty
    ``recdocs`` now simply yields an empty list.

    Args:
        ranks: Sequence of entries whose first element is an id (for example
            the ``(tag, similarity)`` pairs used by the ranking cell).
        recdocs: Collection of candidate ids to keep; ids must be hashable.

    Returns:
        A list of the ids found in both inputs, ordered as in ``ranks``.
    """
    # A set makes each membership test O(1) instead of scanning the candidate
    # list once per ranked entry.
    wanted = set(recdocs)
    return [entry[0] for entry in ranks if entry[0] in wanted]

def scramble(res):
    """Reorder ``res`` by swapping its first and last quarter-sized chunks.

    With ``q = len(res) // 4`` the result is
    ``res[3*q:] + res[q:3*q] + res[:q]``. Inputs shorter than four elements
    have ``q == 0`` and come back in their original order.

    Args:
        res: A list (or other sliceable, concatenable sequence).

    Returns:
        A new sequence with the same elements in the reordered layout.
    """
    # Floor division: in Python 3 ``size/4`` is a float and cannot be used as
    # a slice index (TypeError).
    quarter = len(res) // 4
    three_quarters = quarter * 3
    return res[three_quarters:] + res[quarter:three_quarters] + res[:quarter]

In [95]:
#Main
# Load the corpus and split its records into documents and titles.
corpus = readjson('data/corpus.json')
docs, titles = corpusort(corpus)

# Rebind the same names to the tokenised forms: `docs` becomes a list of
# TaggedDocument objects and `titles` a list of [tokens, tag] pairs.
docs = cleandocs(docs)
titles = cleantitles(titles)

# Create the Doc2Vec model and build its vocabulary from the tagged documents;
# the actual training call is in a separate cell.
model = gensim.models.doc2vec.Doc2Vec(vector_size = 1500, min_count = 1, epochs = 10)
model.build_vocab(docs)

100%|███████████████████████████████████████████████████████████████████████| 35080/35080 [00:00<00:00, 1237197.48it/s]
100%|███████████████████████████████████████████████████████████████████████████| 17540/17540 [00:24<00:00, 709.19it/s]
100%|█████████████████████████████████████████████████████████████████████████| 17540/17540 [00:00<00:00, 21622.66it/s]
2021-08-28 05:24:13,592 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec(dm/m,d1500,n5,w5,s0.001,t3)', 'datetime': '2021-08-28T05:24:13.592303', 'gensim': '4.0.1', 'python': '3.8.9 (tags/v3.8.9:a743f81, Apr  2 2021, 11:10:41) [MSC v.1928 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}
2021-08-28 05:24:13,627 : INFO : collecting all words and their counts
2021-08-28 05:24:13,628 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2021-08-28 05:24:14,698 : INFO : PROGRESS: at example #10000, processed 6130913 words (5729801/s), 167489 word types, 10000 tags
2021-08-28 05:2

In [96]:
#Model Training
# Train on the tagged documents; the example count and the number of epochs
# are both read back from the model object.
model.train(docs, total_examples=model.corpus_count, epochs=model.epochs)

2021-08-28 05:24:20,735 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 234838 vocabulary and 1500 features, using sg=0 hs=0 sample=0.001 negative=5 window=5', 'datetime': '2021-08-28T05:24:20.735162', 'gensim': '4.0.1', 'python': '3.8.9 (tags/v3.8.9:a743f81, Apr  2 2021, 11:10:41) [MSC v.1928 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'train'}
2021-08-28 05:24:21,740 : INFO : EPOCH 1 - PROGRESS: at 2.16% examples, 180116 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:24:22,778 : INFO : EPOCH 1 - PROGRESS: at 4.58% examples, 187171 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:24:23,798 : INFO : EPOCH 1 - PROGRESS: at 6.94% examples, 190879 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:24:24,802 : INFO : EPOCH 1 - PROGRESS: at 9.30% examples, 193235 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:24:25,861 : INFO : EPOCH 1 - PROGRESS: at 11.98% examples, 195971 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:24:26,882 : INFO : EPOCH 1

2021-08-28 05:25:30,469 : INFO : EPOCH 2 - PROGRESS: at 67.30% examples, 196183 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:25:31,512 : INFO : EPOCH 2 - PROGRESS: at 69.78% examples, 196395 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:25:32,525 : INFO : EPOCH 2 - PROGRESS: at 72.38% examples, 196956 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:25:33,542 : INFO : EPOCH 2 - PROGRESS: at 74.74% examples, 197029 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:25:34,574 : INFO : EPOCH 2 - PROGRESS: at 77.31% examples, 197476 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:25:35,584 : INFO : EPOCH 2 - PROGRESS: at 79.93% examples, 197796 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:25:36,613 : INFO : EPOCH 2 - PROGRESS: at 82.51% examples, 197968 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:25:37,698 : INFO : EPOCH 2 - PROGRESS: at 85.19% examples, 198268 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:25:38,739 : INFO : EPOCH 2 - PROGRESS: at 87.71% examples, 198202 words/s, in_qsiz

2021-08-28 05:26:38,906 : INFO : EPOCH 4 - PROGRESS: at 27.33% examples, 183236 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:26:39,920 : INFO : EPOCH 4 - PROGRESS: at 29.55% examples, 183349 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:26:40,952 : INFO : EPOCH 4 - PROGRESS: at 32.05% examples, 184901 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:26:41,999 : INFO : EPOCH 4 - PROGRESS: at 34.62% examples, 186002 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:26:43,070 : INFO : EPOCH 4 - PROGRESS: at 36.98% examples, 186226 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:26:44,090 : INFO : EPOCH 4 - PROGRESS: at 39.62% examples, 187916 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:26:45,101 : INFO : EPOCH 4 - PROGRESS: at 42.08% examples, 188731 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:26:46,109 : INFO : EPOCH 4 - PROGRESS: at 44.74% examples, 190142 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:26:47,123 : INFO : EPOCH 4 - PROGRESS: at 47.17% examples, 191051 words/s, in_qsiz

2021-08-28 05:27:50,196 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-08-28 05:27:50,212 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-08-28 05:27:50,227 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-08-28 05:27:50,228 : INFO : EPOCH - 5 : training on 10765634 raw words (8387502 effective words) took 41.5s, 202127 effective words/s
2021-08-28 05:27:51,247 : INFO : EPOCH 6 - PROGRESS: at 1.71% examples, 140538 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:27:52,344 : INFO : EPOCH 6 - PROGRESS: at 4.13% examples, 163088 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:27:53,427 : INFO : EPOCH 6 - PROGRESS: at 6.53% examples, 171232 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:27:54,498 : INFO : EPOCH 6 - PROGRESS: at 8.61% examples, 169920 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:27:55,505 : INFO : EPOCH 6 - PROGRESS: at 10.74% examples, 171699 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:27:56,548 : I

2021-08-28 05:28:59,749 : INFO : EPOCH 7 - PROGRESS: at 64.94% examples, 203088 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:29:00,800 : INFO : EPOCH 7 - PROGRESS: at 67.55% examples, 203512 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:29:01,838 : INFO : EPOCH 7 - PROGRESS: at 70.21% examples, 203776 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:29:02,840 : INFO : EPOCH 7 - PROGRESS: at 72.55% examples, 203649 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:29:03,873 : INFO : EPOCH 7 - PROGRESS: at 75.23% examples, 204144 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:29:04,897 : INFO : EPOCH 7 - PROGRESS: at 77.75% examples, 204405 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:29:05,929 : INFO : EPOCH 7 - PROGRESS: at 80.35% examples, 204194 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:29:06,937 : INFO : EPOCH 7 - PROGRESS: at 82.82% examples, 204068 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:29:07,949 : INFO : EPOCH 7 - PROGRESS: at 85.29% examples, 204166 words/s, in_qsiz

2021-08-28 05:30:08,752 : INFO : EPOCH 9 - PROGRESS: at 30.60% examples, 190309 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:30:09,783 : INFO : EPOCH 9 - PROGRESS: at 33.02% examples, 190789 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:30:10,788 : INFO : EPOCH 9 - PROGRESS: at 35.49% examples, 191514 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:30:11,792 : INFO : EPOCH 9 - PROGRESS: at 37.94% examples, 192679 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:30:12,818 : INFO : EPOCH 9 - PROGRESS: at 40.52% examples, 193518 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:30:13,827 : INFO : EPOCH 9 - PROGRESS: at 43.06% examples, 194070 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:30:14,894 : INFO : EPOCH 9 - PROGRESS: at 45.74% examples, 194996 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:30:15,951 : INFO : EPOCH 9 - PROGRESS: at 48.26% examples, 195867 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:30:16,968 : INFO : EPOCH 9 - PROGRESS: at 50.78% examples, 196682 words/s, in_qsiz

2021-08-28 05:31:17,474 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-08-28 05:31:17,475 : INFO : EPOCH - 10 : training on 10765634 raw words (8388310 effective words) took 40.8s, 205502 effective words/s
2021-08-28 05:31:17,475 : INFO : Doc2Vec lifecycle event {'msg': 'training on 107656340 raw words (83881687 effective words) took 416.7s, 201281 effective words/s', 'datetime': '2021-08-28T05:31:17.475847', 'gensim': '4.0.1', 'python': '3.8.9 (tags/v3.8.9:a743f81, Apr  2 2021, 11:10:41) [MSC v.1928 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'train'}


In [111]:
#Read the test JSON and rank stuff
to_tag = readjson('data/test_q.json')
# Rebinds the module-level `docs` and `titles` to NumPy arrays. titlesearch()
# iterates the module-level `titles`, so it sees the array form from here on;
# `docs` is not read again inside this cell.
docs = np.array(docs)
titles = np.array(titles)
# `results` is the same list object as `to_tag` (no copy), so the loop below
# overwrites the 'candidates' field of the loaded queries in place.
results = to_tag

for i in tqdm(range(len(to_tag))):
    to_check = to_tag[i]['title_id']
    # NOTE(review): titlesearch returns the string "oof" when no title matches;
    # that case is not handled here, so to_check[0] would be the character "o".
    to_check = titlesearch(to_check)
    
    # Infer a vector from the matched title's tokens and request a similarity
    # ranking with topn=len(model.dv).
    title_vector = model.infer_vector(to_check[0])
    ranking = model.dv.most_similar([title_vector],topn=len(model.dv))
    # Keep only this query's candidates, ordered as they appear in the ranking.
    results[i]['candidates'] = docmatch(ranking,to_tag[i]['candidates'])

100%|██████████████████████████████████████████████████████████████████████████████| 3508/3508 [02:30<00:00, 23.35it/s]


In [112]:
# Disabled alternative ranking approach (per-candidate cosine ranking via
# cosinerank). It is wrapped in a bare string literal so none of it executes;
# the notebook only echoes the string back as the cell's output.
'''
#Read the test JSON and rank stuff
to_tag = readjson('data/test_q.json')
docs = np.array(docs)
titles = np.array(titles)
results = to_tag

for i in tqdm(range(len(to_tag))):
    taglist = to_tag[i]['candidates']
    
    doc_tokens = []
    for key in taglist:
        val = np.where(docs==key)
        doc_tokens.append(docs[(val[0][0])])
        
    val = np.where(titles==to_tag[i]['title_id'])
    title_token = titles[(val[0][0])][0]
    
    sorted_docs = cosinerank(title_token,doc_tokens)
    
    sorted_docs_column = []
    for j in range(len(sorted_docs)):
        sorted_docs_column.append(sorted_docs[j][1])
    results[i]['candidates'] = sorted_docs_column
'''

"\n#Read the test JSON and rank stuff\nto_tag = readjson('data/test_q.json')\ndocs = np.array(docs)\ntitles = np.array(titles)\nresults = to_tag\n\nfor i in tqdm(range(len(to_tag))):\n    taglist = to_tag[i]['candidates']\n    \n    doc_tokens = []\n    for key in taglist:\n        val = np.where(docs==key)\n        doc_tokens.append(docs[(val[0][0])])\n        \n    val = np.where(titles==to_tag[i]['title_id'])\n    title_token = titles[(val[0][0])][0]\n    \n    sorted_docs = cosinerank(title_token,doc_tokens)\n    \n    sorted_docs_column = []\n    for j in range(len(sorted_docs)):\n        sorted_docs_column.append(sorted_docs[j][1])\n    results[i]['candidates'] = sorted_docs_column\n"

In [113]:
# Persist the re-ranked queries (the `results` list built by the ranking cell).
writejson(results,'data/suggestion.json')

In [101]:

# Ad-hoc check: infer a vector for a hand-written two-token query and print
# the similarity ranking. topn=len(model.dv) makes the printed list very long.
title_vector = model.infer_vector(['Anne','Walmsley'])
print(model.dv.most_similar([title_vector],topn=len(model.dv)))

[('doc49117', 0.07463841885328293), ('doc60415', 0.07226265966892242), ('doc35514', 0.06894759833812714), ('doc88319', 0.06696894019842148), ('doc50388', 0.06633293628692627), ('doc14974', 0.0656101182103157), ('doc79005', 0.06539274007081985), ('doc83477', 0.06469575315713882), ('doc53399', 0.06282904744148254), ('doc02915', 0.06197584420442581), ('doc40431', 0.061269380152225494), ('doc82870', 0.06119124963879585), ('doc96915', 0.06082453578710556), ('doc06172', 0.060056742280721664), ('doc20839', 0.05931582301855087), ('doc88318', 0.058572541922330856), ('doc35932', 0.058262988924980164), ('doc90484', 0.0582599975168705), ('doc60344', 0.05786819010972977), ('doc31235', 0.057792920619249344), ('doc35119', 0.05756232514977455), ('doc88684', 0.05732475966215134), ('doc24507', 0.05727162957191467), ('doc46543', 0.057016197592020035), ('doc25663', 0.05697520822286606), ('doc24564', 0.05680861696600914), ('doc17431', 0.05632809177041054), ('doc71255', 0.05629352480173111), ('doc63166', 0.