In [1]:
import json
import gensim
import numpy as np
import re
import string
from tqdm import tqdm

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



In [2]:
#Utils
def readjson(filename): #Reads JSON data
    reader = open(filename)
    data = json.load(reader)
    reader.close()

    return data

def textclean(text): #Cleans text
    #Converts to lowercase, removes punctuation, unicode and newlines
    text = text.lower()
    text = text.encode('ascii', 'ignore').decode()
    text = re.sub(r'https*\S+', ' ', text)
    text = re.sub(r'@\S+', ' ', text)
    text = re.sub(r'#\S+', ' ', text)
    text = re.sub(r'\'\w+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'\w*\d+\w*', '', text)
    text = re.sub(r'\s{2,}', ' ', text)
    
    return text

def corpusort(text): #Sorts input into documents and titles
    doc = []
    title = []
    for i in tqdm(range(len(text))):
        header = text[i][0]
        
        if header[0:3] == 'doc':
            doc.append(text[i])
        else:
            title.append(text[i])
    return doc,title

def cleandocs(docs): #Tokenises and preps the document data
    clean_docs = []
    for i in tqdm(range(len(docs))):
        tokens = gensim.utils.simple_preprocess(textclean(docs[i][1]))
        clean_docs.append(gensim.models.doc2vec.TaggedDocument(tokens,docs[i][0]))

    return clean_docs

def cleantitles(titles): #Tokenises and preps the title data
    clean_titles = []
    for i in tqdm(range(len(titles))):
        clean_titles.append([gensim.utils.simple_preprocess(textclean(titles[i][1])),titles[i][0]])
    
    return clean_titles

In [3]:
#Main
corpus = readjson('data/corpus.json')
docs, titles = corpusort(corpus)

docs = cleandocs(docs)
titles = cleantitles(titles)

model = gensim.models.doc2vec.Doc2Vec(vector_size = 50, min_count = 2, epochs = 40)
model.build_vocab(docs)

100%|███████████████████████████████████████████████████████████████████████| 35080/35080 [00:00<00:00, 3347046.96it/s]
100%|███████████████████████████████████████████████████████████████████████████| 17540/17540 [00:24<00:00, 723.72it/s]
100%|█████████████████████████████████████████████████████████████████████████| 17540/17540 [00:00<00:00, 35132.42it/s]
2021-08-28 00:29:06,546 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3)', 'datetime': '2021-08-28T00:29:06.545233', 'gensim': '4.0.1', 'python': '3.8.9 (tags/v3.8.9:a743f81, Apr  2 2021, 11:10:41) [MSC v.1928 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}
2021-08-28 00:29:06,547 : INFO : collecting all words and their counts
2021-08-28 00:29:06,547 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2021-08-28 00:29:07,510 : INFO : PROGRESS: at example #10000, processed 6123051 words (6359149/s), 167252 word types, 13 tags
2021-08-28 00:29

In [None]:
#Model Training
model.train(docs, total_examples=model.corpus_count, epochs=model.epochs)

2021-08-28 00:29:10,268 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 125583 vocabulary and 50 features, using sg=0 hs=0 sample=0.001 negative=5 window=5', 'datetime': '2021-08-28T00:29:10.268637', 'gensim': '4.0.1', 'python': '3.8.9 (tags/v3.8.9:a743f81, Apr  2 2021, 11:10:41) [MSC v.1928 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'train'}
2021-08-28 00:29:11,270 : INFO : EPOCH 1 - PROGRESS: at 15.59% examples, 1295294 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:29:12,274 : INFO : EPOCH 1 - PROGRESS: at 31.13% examples, 1290699 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:29:13,278 : INFO : EPOCH 1 - PROGRESS: at 46.67% examples, 1291765 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:29:14,285 : INFO : EPOCH 1 - PROGRESS: at 61.98% examples, 1294796 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:29:15,296 : INFO : EPOCH 1 - PROGRESS: at 76.98% examples, 1285953 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:29:16,306 : INFO : 

2021-08-28 00:29:58,641 : INFO : EPOCH 8 - PROGRESS: at 47.45% examples, 1312469 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:29:59,642 : INFO : EPOCH 8 - PROGRESS: at 63.02% examples, 1315849 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:30:00,646 : INFO : EPOCH 8 - PROGRESS: at 78.75% examples, 1316215 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:30:01,652 : INFO : EPOCH 8 - PROGRESS: at 94.45% examples, 1313936 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:30:01,993 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-08-28 00:30:01,994 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-08-28 00:30:01,995 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-08-28 00:30:01,995 : INFO : EPOCH - 8 : training on 10752561 raw words (8380655 effective words) took 6.4s, 1316440 effective words/s
2021-08-28 00:30:03,003 : INFO : EPOCH 9 - PROGRESS: at 15.95% examples, 1317772 words/s, in_qsize 5, out_qsize 0
2021-08-28 00:30:0