In [1]:
import json
import gensim
import numpy as np
import re
import string
from tqdm import tqdm
from scipy import spatial

import logging
# Configure root logging at INFO level with a "time : level : message" format;
# the gensim progress lines shown in the cell outputs below use this format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



In [10]:
#Utils
def readjson(filename): #Reads JSON data
    """Load and return the JSON content stored in ``filename``.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # The context manager closes the handle even when json.load raises,
    # which the manual open()/close() pairing did not guarantee.
    with open(filename) as reader:
        return json.load(reader)

def writejson(data,filename):
    """Serialise ``data`` as JSON into ``filename``, overwriting the file.

    Args:
        data: Any JSON-serialisable value.
        filename: Path of the file to create or truncate.

    Raises:
        OSError: If the file cannot be opened for writing.
        TypeError: If ``data`` contains values ``json.dump`` cannot serialise.
    """
    # The context manager flushes and closes the handle even if json.dump
    # fails part-way through, so no file descriptor is leaked.
    with open(filename,'w') as writer:
        json.dump(data,writer)

def textclean(text): #Cleans text
    """Strip noise from ``text`` with an ordered series of regex substitutions.

    Applied in order:
      1. tokens starting with ``http`` (URLs) -> space
      2. ``@mention`` tokens -> space
      3. ``#hashtag`` tokens -> space
      4. an apostrophe followed by word characters (e.g. ``'s``) -> removed
      5. every ``string.punctuation`` character -> space
      6. any word containing a digit -> removed
      7. runs of two or more whitespace characters -> single space

    Case is left untouched and non-ASCII characters are kept.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    punctuation_class = '[%s]' % re.escape(string.punctuation)

    # Order matters: e.g. URLs must go before punctuation is blanked out.
    substitutions = (
        (r'https*\S+', ' '),
        (r'@\S+', ' '),
        (r'#\S+', ' '),
        (r'\'\w+', ''),
        (punctuation_class, ' '),
        (r'\w*\d+\w*', ''),
        (r'\s{2,}', ' '),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned

def corpusort(text): #Sorts input into documents and titles
    """Split corpus entries into documents and titles.

    An entry goes to the document list when the first three characters of
    its first element are ``'doc'``; every other entry goes to the title list.

    Args:
        text: Indexable sequence of entries; each entry's first element is
            the header inspected here.

    Returns:
        A ``(doc, title)`` tuple of lists, each preserving input order.
    """
    doc, title = [], []
    for idx in tqdm(range(len(text))):
        entry = text[idx]
        # Route the entry to whichever list its header prefix selects
        bucket = doc if entry[0][0:3] == 'doc' else title
        bucket.append(entry)
    return doc, title

def cleandocs(docs): #Tokenises and preps the document data
    """Turn raw document entries into gensim ``TaggedDocument`` objects.

    For every entry, element ``[1]`` is passed through ``textclean`` and
    ``gensim.utils.simple_preprocess``; element ``[0]`` becomes the single tag.

    Args:
        docs: Indexable sequence of entries with the tag at index 0 and the
            text at index 1.

    Returns:
        A list of ``TaggedDocument`` objects in input order.
    """
    make_tagged = gensim.models.doc2vec.TaggedDocument
    clean_docs = []
    for idx in tqdm(range(len(docs))):
        entry = docs[idx]
        words = gensim.utils.simple_preprocess(textclean(entry[1]))
        clean_docs.append(make_tagged(words, [entry[0]]))
    return clean_docs

def cleantitles(titles): #Tokenises and preps the title data
    """Tokenise title entries into ``[tokens, tag]`` pairs.

    For every entry, element ``[1]`` is passed through ``textclean`` and
    ``gensim.utils.simple_preprocess``; element ``[0]`` is carried over as
    the tag.

    Args:
        titles: Indexable sequence of entries with the tag at index 0 and
            the text at index 1.

    Returns:
        A list of two-element lists ``[tokens, tag]`` in input order.
    """
    tokenise = gensim.utils.simple_preprocess
    clean_titles = []
    for idx in tqdm(range(len(titles))):
        entry = titles[idx]
        tokens = tokenise(textclean(entry[1]))
        clean_titles.append([tokens, entry[0]])
    return clean_titles

def cosinerank(primary,array):
    """Rank document tags by cosine distance to a query vector.

    Looks each tag up in the notebook-level ``model.dv`` and measures its
    cosine distance to ``primary``.

    Args:
        primary: The query vector (already inferred by the caller).
        array: Sequence of document tags present in ``model.dv``.

    Returns:
        A list of ``[distance, tag]`` pairs sorted by ascending distance;
        ties keep their input order.
    """
    scored = [
        [spatial.distance.cosine(primary, model.dv[tag]), tag]
        for tag in array
    ]
    # list.sort is stable, matching the ordering sorted() would give
    scored.sort(key=lambda pair: pair[0])
    return scored

def docsearch(query):
    """Find the first entry of the notebook-level ``docs`` whose tag equals ``query``.

    Args:
        query: The tag to look for.

    Returns:
        ``[text, tag]`` for the first match, or the string ``"oof"`` when no
        entry matches.
    """
    hits = ([text, tag] for text, tag in docs if query == tag)
    found = next(hits, None)
    return "oof" if found is None else found

def titlesearch(query):
    """Find the first entry of the notebook-level ``titles`` whose tag equals ``query``.

    Args:
        query: The tag to look for.

    Returns:
        ``[text, tag]`` for the first match, or the string ``"oof"`` when no
        entry matches.
    """
    hits = ([text, tag] for text, tag in titles if query == tag)
    found = next(hits, None)
    return "oof" if found is None else found

def docmatch(ranks, recdocs):
    """Filter a ranked list down to the requested documents, keeping rank order.

    The original version declared a ``count`` variable for an early exit but
    never incremented it, so the whole of ``ranks`` was always scanned. The
    scan now stops as soon as as many matches as ``recdocs`` has entries
    have been collected.

    Args:
        ranks: Sequence of entries whose element ``[0]`` is a document tag,
            ordered best-first. Tags are assumed to appear at most once each.
        recdocs: Collection of tags to keep.

    Returns:
        The tags from ``ranks`` that are in ``recdocs``, in rank order.
    """
    res = []
    wanted = len(recdocs)
    if wanted == 0:
        # Nothing can match; skip the scan entirely
        return res

    for entry in ranks:
        tag = entry[0]
        if tag in recdocs:
            res.append(tag)
            # Every requested document has been found; the rest of the
            # ranking cannot contribute anything.
            if len(res) == wanted:
                return res
    return res

def scramble(res):
    """Reorder ``res`` by swapping its first and last quarters.

    The result is: last quarter(s) from index ``3 * (len // 4)`` onward,
    then the middle half, then the first quarter.

    Args:
        res: A sliceable sequence supporting ``+`` concatenation (e.g. a list).

    Returns:
        A new sequence with the same elements in the scrambled order.
    """
    # Floor division: in Python 3 `size/4` is a float, and float slice
    # indices raise TypeError.
    quarter = len(res) // 4
    three_quarters = quarter * 3
    new_res = res[three_quarters:] + res[quarter:three_quarters] + res[:quarter]
    return new_res

In [3]:
#Main
# Load the corpus and split its entries into documents and titles
corpus = readjson('data/corpus.json')
docs, titles = corpusort(corpus)

# Replace the raw entries with their tokenised forms
# (docs become TaggedDocuments, titles become [tokens, tag] pairs)
docs = cleandocs(docs)
titles = cleantitles(titles)

# Create the Doc2Vec model and build its vocabulary from the tagged docs;
# the actual training call lives in the next cell.
model = gensim.models.doc2vec.Doc2Vec(vector_size = 150, min_count = 1, epochs = 10)
model.build_vocab(docs)

100%|███████████████████████████████████████████████████████████████████████| 35080/35080 [00:00<00:00, 3045604.20it/s]
100%|███████████████████████████████████████████████████████████████████████████| 17540/17540 [00:24<00:00, 706.74it/s]
100%|█████████████████████████████████████████████████████████████████████████| 17540/17540 [00:00<00:00, 37208.28it/s]
2021-08-28 05:55:30,804 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec(dm/m,d150,n5,w5,s0.001,t3)', 'datetime': '2021-08-28T05:55:30.803429', 'gensim': '4.0.1', 'python': '3.8.9 (tags/v3.8.9:a743f81, Apr  2 2021, 11:10:41) [MSC v.1928 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}
2021-08-28 05:55:30,805 : INFO : collecting all words and their counts
2021-08-28 05:55:30,805 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2021-08-28 05:55:31,778 : INFO : PROGRESS: at example #10000, processed 6130913 words (6307286/s), 167489 word types, 10000 tags
2021-08-28 05:55

In [4]:
#Model Training
# Train on the tagged docs, reusing the example count and epoch setting stored on the model
model.train(docs, total_examples=model.corpus_count, epochs=model.epochs)

2021-08-28 05:55:36,350 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 234838 vocabulary and 150 features, using sg=0 hs=0 sample=0.001 negative=5 window=5', 'datetime': '2021-08-28T05:55:36.350376', 'gensim': '4.0.1', 'python': '3.8.9 (tags/v3.8.9:a743f81, Apr  2 2021, 11:10:41) [MSC v.1928 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'train'}
2021-08-28 05:55:37,367 : INFO : EPOCH 1 - PROGRESS: at 11.98% examples, 989477 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:55:38,375 : INFO : EPOCH 1 - PROGRESS: at 23.73% examples, 971054 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:55:39,381 : INFO : EPOCH 1 - PROGRESS: at 36.10% examples, 991380 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:55:40,386 : INFO : EPOCH 1 - PROGRESS: at 48.10% examples, 994942 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:55:41,391 : INFO : EPOCH 1 - PROGRESS: at 59.98% examples, 997913 words/s, in_qsize 6, out_qsize 0
2021-08-28 05:55:42,391 : INFO : EPOC

2021-08-28 05:56:27,114 : INFO : EPOCH 7 - PROGRESS: at 12.26% examples, 1025884 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:56:28,117 : INFO : EPOCH 7 - PROGRESS: at 24.97% examples, 1032889 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:56:29,119 : INFO : EPOCH 7 - PROGRESS: at 37.26% examples, 1031372 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:56:30,123 : INFO : EPOCH 7 - PROGRESS: at 49.66% examples, 1034130 words/s, in_qsize 6, out_qsize 0
2021-08-28 05:56:31,132 : INFO : EPOCH 7 - PROGRESS: at 61.94% examples, 1036027 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:56:32,134 : INFO : EPOCH 7 - PROGRESS: at 74.40% examples, 1037693 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:56:33,136 : INFO : EPOCH 7 - PROGRESS: at 87.09% examples, 1038571 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:56:34,148 : INFO : EPOCH 7 - PROGRESS: at 99.50% examples, 1038572 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:56:34,173 : INFO : worker thread finished; awaiting finish of 2 more threa

In [13]:
#Read the test JSON and rank stuff
# Each query entry is read for its 'title_id' and its list of 'candidates'.
to_tag = readjson('data/test_q.json')

# `docs` and `titles` are intentionally left as plain Python lists here.
# Nothing in this cell needs them as NumPy arrays, and converting the ragged
# [tokens, tag] entries with np.array() is rejected by recent NumPy releases.

# Build results as fresh dicts so the loaded queries in `to_tag` are not
# mutated while they are still being read.
results = []

for i in tqdm(range(len(to_tag))):
    query = to_tag[i]
    to_check = titlesearch(query['title_id'])
    # NOTE(review): titlesearch returns the string "oof" when the title id is
    # unknown, in which case to_check[0] is not a token list — confirm every
    # title_id in the test file exists in the corpus.

    # Embed the title tokens and rank this query's candidates by cosine distance
    title_vector = model.infer_vector(to_check[0])
    ranking = cosinerank(title_vector, query['candidates'])

    # Keep every original field, replacing only the candidate order (best first)
    ranked_query = dict(query)
    ranked_query['candidates'] = [tag for _, tag in ranking]
    results.append(ranked_query)

    #ranking = model.dv.most_similar([title_vector],topn=len(model.dv))
    #results[i]['candidates'] = docmatch(ranking,to_tag[i]['candidates'])
      
    
    

100%|█████████████████████████████████████████████████████████████████████████████| 3508/3508 [00:23<00:00, 148.90it/s]


In [112]:
# Dead code: an earlier ranking approach kept inside a string literal, so none
# of it executes; the cell merely echoes the string as its output.
'''
#Read the test JSON and rank stuff
to_tag = readjson('data/test_q.json')
docs = np.array(docs)
titles = np.array(titles)
results = to_tag

for i in tqdm(range(len(to_tag))):
    taglist = to_tag[i]['candidates']
    
    doc_tokens = []
    for key in taglist:
        val = np.where(docs==key)
        doc_tokens.append(docs[(val[0][0])])
        
    val = np.where(titles==to_tag[i]['title_id'])
    title_token = titles[(val[0][0])][0]
    
    sorted_docs = cosinerank(title_token,doc_tokens)
    
    sorted_docs_column = []
    for j in range(len(sorted_docs)):
        sorted_docs_column.append(sorted_docs[j][1])
    results[i]['candidates'] = sorted_docs_column
'''

"\n#Read the test JSON and rank stuff\nto_tag = readjson('data/test_q.json')\ndocs = np.array(docs)\ntitles = np.array(titles)\nresults = to_tag\n\nfor i in tqdm(range(len(to_tag))):\n    taglist = to_tag[i]['candidates']\n    \n    doc_tokens = []\n    for key in taglist:\n        val = np.where(docs==key)\n        doc_tokens.append(docs[(val[0][0])])\n        \n    val = np.where(titles==to_tag[i]['title_id'])\n    title_token = titles[(val[0][0])][0]\n    \n    sorted_docs = cosinerank(title_token,doc_tokens)\n    \n    sorted_docs_column = []\n    for j in range(len(sorted_docs)):\n        sorted_docs_column.append(sorted_docs[j][1])\n    results[i]['candidates'] = sorted_docs_column\n"

In [14]:
# Persist the re-ranked queries held in `results` as JSON
writejson(results,'data/suggestion.json')

In [6]:
# Sanity check: show the vector the model stores under the tag 'doc49117'
print(model.dv['doc49117'])
# Disabled experiment: infer a vector for ad-hoc tokens and list all docs by similarity
#title_vector = model.infer_vector(['Anne','Walmsley'])
#print(model.dv.most_similar([title_vector],topn=len(model.dv)))

[-2.20003039e-01 -5.67086041e-02 -3.41204330e-02  8.65936056e-02
 -1.69310346e-01  6.33304358e-01  1.55121922e-01  1.51920462e+00
 -2.07499325e-01 -4.98444378e-01  3.87282133e-01 -1.14784867e-01
 -2.58946389e-01  1.42954409e+00 -4.82909739e-01  6.45391792e-02
 -2.71909852e-02  5.84906816e-01  1.56846782e-03  6.94743395e-01
 -1.26823202e-01  1.75164521e-01  9.90388632e-01 -1.66007541e-02
  5.14289021e-01  1.08363010e-01  3.61230299e-02  1.12642936e-01
  7.50880480e-01 -6.28488064e-01 -5.25296748e-01  2.00667453e+00
 -2.70202667e-01  1.73684388e-01  1.68475711e+00 -6.98680580e-01
  5.26426435e-01  3.13155539e-02 -1.04095154e-01 -3.08611274e-01
  1.07599473e+00 -7.66604483e-01  1.09134614e+00 -8.74592960e-01
 -5.16031325e-01 -5.51534474e-01 -2.67924756e-01 -8.79276991e-01
 -8.37159514e-01  6.91151440e-01 -1.02591538e+00  7.73221493e-01
  5.19679725e-01 -1.07962632e+00 -4.26729053e-01 -4.66520160e-01
  2.93620855e-01 -1.11001976e-01 -2.31866643e-01 -2.75087655e-01
 -1.61305532e-01  4.48993