In [69]:
import json
import gensim
import numpy as np
import re
import string
from tqdm import tqdm
from scipy import spatial

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [87]:
#Utils
def readjson(filename): #Reads JSON data
    reader = open(filename)
    data = json.load(reader)
    reader.close()

    return data

def writejson(data,filename):
    writer = open(filename,'w')
    json.dump(data,writer)
    writer.close()

def textclean(text): #Cleans text
    #Converts to lowercase, removes punctuation, unicode and newlines
    #text = text.lower()
    #text = text.encode('ascii', 'ignore').decode()
    text = re.sub(r'https*\S+', ' ', text)
    text = re.sub(r'@\S+', ' ', text)
    text = re.sub(r'#\S+', ' ', text)
    text = re.sub(r'\'\w+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'\w*\d+\w*', '', text)
    text = re.sub(r'\s{2,}', ' ', text)
    
    return text

def corpusort(text): #Sorts input into documents and titles
    doc = []
    title = []
    for i in tqdm(range(len(text))):
        header = text[i][0]
        
        if header[0:3] == 'doc':
            doc.append(text[i])
        else:
            title.append(text[i])
    return doc,title

def cleandocs(docs): #Tokenises and preps the document data
    clean_docs = []
    for i in tqdm(range(len(docs))):
        tokens = gensim.utils.simple_preprocess(textclean(docs[i][1]))
        clean_docs.append(gensim.models.doc2vec.TaggedDocument(tokens,[docs[i][0]]))

    return clean_docs

def cleantitles(titles): #Tokenises and preps the title data
    clean_titles = []
    for i in tqdm(range(len(titles))):
        clean_titles.append([gensim.utils.simple_preprocess(textclean(titles[i][1])),titles[i][0]])
    
    return clean_titles

def cosinerank(primary,array):
    dist = []
    vector1 = model.infer_vector(primary)
    for i in range(len(array)):
        vector2 = model.infer_vector(array[i][0])
        dist.append([spatial.distance.cosine(vector1,vector2),array[i][1]])
    
    return (sorted(dist, key = lambda tup: tup[0]))

def docsearch(query):
    for text,tag in docs:
        if query == tag:
            return [text,tag]
        else:
            continue
    return "oof"

def titlesearch(query):
    for text,tag in titles:
        if query == tag:
            return [text,tag]
        else:
            continue
    return "oof"

def docmatch(ranks, recdocs):
    res = []
    count = 0
    
    for i in range(len(ranks)):
        if ranks[i][0] in recdocs:
            res.append(ranks[i][0])
        if count==len(recdocs):
            return res
    return res

In [88]:
#Main
corpus = readjson('data/corpus.json')
docs, titles = corpusort(corpus)

docs = cleandocs(docs)
titles = cleantitles(titles)

model = gensim.models.doc2vec.Doc2Vec(vector_size = 50, min_count = 1, epochs = 10)
model.build_vocab(docs)

100%|███████████████████████████████████████████████████████████████████████| 35080/35080 [00:00<00:00, 1349526.58it/s]
100%|███████████████████████████████████████████████████████████████████████████| 17540/17540 [00:27<00:00, 635.76it/s]
100%|█████████████████████████████████████████████████████████████████████████| 17540/17540 [00:00<00:00, 41954.45it/s]
2021-08-28 05:05:44,685 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec(dm/m,d50,n5,w5,s0.001,t3)', 'datetime': '2021-08-28T05:05:44.685206', 'gensim': '4.0.1', 'python': '3.8.9 (tags/v3.8.9:a743f81, Apr  2 2021, 11:10:41) [MSC v.1928 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}
2021-08-28 05:05:44,735 : INFO : collecting all words and their counts
2021-08-28 05:05:44,736 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2021-08-28 05:05:46,759 : INFO : PROGRESS: at example #10000, processed 6130913 words (3030552/s), 167489 word types, 10000 tags
2021-08-28 05:05:

In [89]:
#Model Training
model.train(docs, total_examples=model.corpus_count, epochs=model.epochs)

2021-08-28 05:05:52,263 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 234838 vocabulary and 50 features, using sg=0 hs=0 sample=0.001 negative=5 window=5', 'datetime': '2021-08-28T05:05:52.263025', 'gensim': '4.0.1', 'python': '3.8.9 (tags/v3.8.9:a743f81, Apr  2 2021, 11:10:41) [MSC v.1928 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'train'}
2021-08-28 05:05:53,270 : INFO : EPOCH 1 - PROGRESS: at 13.31% examples, 1112326 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:05:54,271 : INFO : EPOCH 1 - PROGRESS: at 27.57% examples, 1141457 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:05:55,283 : INFO : EPOCH 1 - PROGRESS: at 41.37% examples, 1141741 words/s, in_qsize 6, out_qsize 0
2021-08-28 05:05:56,293 : INFO : EPOCH 1 - PROGRESS: at 55.52% examples, 1155217 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:05:57,305 : INFO : EPOCH 1 - PROGRESS: at 69.53% examples, 1158526 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:05:58,320 : INFO : 

2021-08-28 05:06:41,839 : INFO : EPOCH 7 - PROGRESS: at 85.75% examples, 1192938 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:06:42,837 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-08-28 05:06:42,841 : INFO : EPOCH 7 - PROGRESS: at 99.97% examples, 1193234 words/s, in_qsize 1, out_qsize 1
2021-08-28 05:06:42,842 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-08-28 05:06:42,848 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-08-28 05:06:42,849 : INFO : EPOCH - 7 : training on 10765634 raw words (8389144 effective words) took 7.0s, 1192377 effective words/s
2021-08-28 05:06:43,853 : INFO : EPOCH 8 - PROGRESS: at 13.59% examples, 1136358 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:06:44,857 : INFO : EPOCH 8 - PROGRESS: at 28.40% examples, 1174634 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:06:45,863 : INFO : EPOCH 8 - PROGRESS: at 43.06% examples, 1188467 words/s, in_qsize 5, out_qsize 0
2021-08-28 05:06:4

In [90]:
#Read the test JSON and rank stuff
to_tag = readjson('data/test_q.json')
docs = np.array(docs)
titles = np.array(titles)
results = to_tag

for i in tqdm(range(len(to_tag))):
    to_check = to_tag[i]['title_id']
    to_check = titlesearch(to_check)
    
    title_vector = model.infer_vector(to_check[0])
    ranking = model.dv.most_similar([title_vector],topn=len(model.dv))
    results[i]['candidates'] = docmatch(ranking,to_tag[i]['candidates'])

  docs = np.array(docs)
  titles = np.array(titles)
100%|██████████████████████████████████████████████████████████████████████████████| 3508/3508 [02:53<00:00, 20.25it/s]


In [91]:
'''
#Read the test JSON and rank stuff
to_tag = readjson('data/test_q.json')
docs = np.array(docs)
titles = np.array(titles)
results = to_tag

for i in tqdm(range(len(to_tag))):
    taglist = to_tag[i]['candidates']
    
    doc_tokens = []
    for key in taglist:
        val = np.where(docs==key)
        doc_tokens.append(docs[(val[0][0])])
        
    val = np.where(titles==to_tag[i]['title_id'])
    title_token = titles[(val[0][0])][0]
    
    sorted_docs = cosinerank(title_token,doc_tokens)
    
    sorted_docs_column = []
    for j in range(len(sorted_docs)):
        sorted_docs_column.append(sorted_docs[j][1])
    results[i]['candidates'] = sorted_docs_column
'''

"\n#Read the test JSON and rank stuff\nto_tag = readjson('data/test_q.json')\ndocs = np.array(docs)\ntitles = np.array(titles)\nresults = to_tag\n\nfor i in tqdm(range(len(to_tag))):\n    taglist = to_tag[i]['candidates']\n    \n    doc_tokens = []\n    for key in taglist:\n        val = np.where(docs==key)\n        doc_tokens.append(docs[(val[0][0])])\n        \n    val = np.where(titles==to_tag[i]['title_id'])\n    title_token = titles[(val[0][0])][0]\n    \n    sorted_docs = cosinerank(title_token,doc_tokens)\n    \n    sorted_docs_column = []\n    for j in range(len(sorted_docs)):\n        sorted_docs_column.append(sorted_docs[j][1])\n    results[i]['candidates'] = sorted_docs_column\n"

In [92]:
writejson(results,'data/suggestion.json')