IMPORTING REQUIRED PACKAGES

In [None]:
import numpy as np
import pandas as pd
import json
import logging
import re

from tqdm import tqdm
from sklearn.utils.extmath import randomized_svd
from gensim.models import word2vec
from matplotlib import pyplot as plt
from collections import defaultdict

In [None]:
# Read the whole corpus into memory; `with` guarantees the handle is closed
# even if the read fails. `corpus` stays bound (to the closed handle) as before.
with open('CovCorpus.txt','r') as corpus:
    CovCorpus = corpus.read()

# One document per line.
docs = CovCorpus.split('\n')
# A file ending in '\n' yields one trailing empty string; drop only that,
# so a final line without a newline terminator is not silently discarded.
if docs and docs[-1] == '':
    docs.pop()

''

RETURN THE VOCABULARY WORD FREQUENCIES AS A DICT SORTED BY DESCENDING FREQUENCY

In [None]:
def returnvocab(corpus):
    """Count lower-cased, whitespace-separated tokens over all documents.

    Parameters
    ----------
    corpus : iterable of str
        One document (e.g. one line of text) per element.

    Returns
    -------
    dict
        Maps each word to its number of occurrences, ordered by descending
        count. Words with equal counts keep their first-seen order because
        the sort is stable.
    """
    if getattr(corpus, 'closed', False):
        # Backward compatibility: this function used to ignore its argument and
        # read the notebook-level `docs`, so an existing call site passes the
        # already-closed corpus file handle. Keep that call working, but say so.
        import warnings
        warnings.warn("returnvocab() received a closed file handle; falling back to "
                      "the global `docs`. Pass the list of documents instead.")
        corpus = docs

    vocabFreqDict = defaultdict(int)
    for doc in tqdm(corpus):
        for word in doc.split():
            vocabFreqDict[word.lower()] += 1

    return dict(sorted(vocabFreqDict.items(), key = lambda x : x[1], reverse = True))

In [None]:
# Count words over the list of documents. `corpus` at this point is only the
# closed file handle from the loading cell, so it must not be passed here.
wordRank = returnvocab(docs)
vocab = list(wordRank.keys())

# The 15000 most frequent words and their rank (0 = most frequent).
top15k = set(vocab[:15000])
FreqDict = {word: i for i, word in enumerate(vocab[:15000])}

100%|████████████████████████████████████████████████████████████████████| 1613013/1613013 [00:07<00:00, 228737.99it/s]


In [None]:
IMPLEMENTING THE COALS ALGORITHM

In [None]:
def _coals_correlation(co_matrix):
    """Convert raw co-occurrence counts to COALS square-rooted correlations.

    Each entry becomes (T*w_ab - r_a*c_b) / sqrt(r_a*(T-r_a)*c_b*(T-c_b)),
    where T is the grand total, r_a the row sum and c_b the column sum.
    Negative correlations and entries with a zero denominator are set to 0;
    positive ones are replaced by their square root.
    """
    total = np.sum(co_matrix)
    rs = np.sum(co_matrix, axis = 1)
    cs = np.sum(co_matrix, axis = 0)
    col_term = cs * (total - cs)

    corr_matrix = np.zeros_like(co_matrix)
    # One vectorised row at a time: fast, without full-matrix temporaries.
    for i in tqdm(range(co_matrix.shape[0])):
        denom = rs[i] * (total - rs[i]) * col_term
        numer = total * co_matrix[i] - rs[i] * cs
        keep = (denom > 0) & (numer > 0)
        corr_matrix[i, keep] = np.sqrt(numer[keep] / np.sqrt(denom[keep]))
    return corr_matrix


def COALS(corpus, vocab_size = 15000, window = 4, n_components = 50):
    """Compute COALS word vectors for the most frequent words of `corpus`.

    Parameters
    ----------
    corpus : sequence of str
        One document per element. Each document is split on whitespace and
        lower-cased, matching `returnvocab`. Context windows do not cross
        document boundaries.
    vocab_size : int, default 15000
        Number of most frequent words that get a row/column in the matrix.
    window : int, default 4
        Ramped window size: a neighbour at distance d (1 <= d <= window) on
        either side contributes a weight of window + 1 - d.
    n_components : int, default 50
        Number of SVD components, i.e. the embedding dimension.

    Returns
    -------
    numpy.ndarray of shape (vocab_size, n_components)
        Left singular vectors of the normalised matrix. Row i belongs to the
        i-th most frequent word as ranked by `returnvocab`.
    """
    wordRank = returnvocab(corpus)
    vocab = list(wordRank.keys())[:vocab_size]
    FreqDict = {word: i for i, word in enumerate(vocab)}

    co_matrix = np.zeros((vocab_size, vocab_size), dtype = 'float64')
    for doc in tqdm(corpus):
        # Vocabulary index of every token; -1 marks out-of-vocabulary words,
        # which still occupy a position in the window.
        ids = np.array([FreqDict.get(word.lower(), -1) for word in doc.split()],
                       dtype = np.intp)
        for offset in range(1, min(window, len(ids) - 1) + 1):
            weight = window + 1 - offset
            left, right = ids[:-offset], ids[offset:]
            keep = (left >= 0) & (right >= 0)
            left, right = left[keep], right[keep]
            # np.add.at accumulates repeated index pairs (plain fancy-index
            # `+=` would count each distinct pair only once).
            np.add.at(co_matrix, (left, right), weight)
            np.add.at(co_matrix, (right, left), weight)

    corr_matrix = _coals_correlation(co_matrix)

    # Fixed seed so the randomised SVD gives the same embedding on every run.
    U, D, V = randomized_svd(corr_matrix, n_components = n_components, n_iter = 7,
                             random_state = 0)
    word_embedding_mat = U

    return word_embedding_mat

In [None]:
# Build the COALS embedding from the document list and down-cast it to
# single precision in one step.
wordVectors = COALS(docs).astype('float32')

100%|████████████████████████████████████████████████████████████████████| 1613013/1613013 [00:07<00:00, 224086.31it/s]
1613013it [00:00, 3366877.13it/s]
100%|████████████████████████████████████████████████████████████████████████████| 15000/15000 [06:58<00:00, 35.81it/s]


Computing SVD of the COALS matrix




In [None]:
# Sanity check: one row per vocabulary word, one column per SVD component.
print(wordVectors.shape)

(15000, 50)


TRAINING THE MODEL. SENTENCES ARE READ ONE AT A TIME TO AVOID MEMORY ISSUES

In [None]:
# LineSentence rewinds the handle for every pass over the data, so the file
# must stay open for both build_vocab and train; `with` closes it afterwards.
with open('CovCorpus.txt', 'r') as corpus:
    stream = word2vec.LineSentence(corpus)
    model = word2vec.Word2Vec(vector_size = wordVectors.shape[1], hs = 1, sg = 1, window = 4, workers = 8, max_final_vocab=15000)
    model.build_vocab(stream)

    # Seed the word2vec vectors with the COALS vectors, matching rows by word.
    # Assigning the whole array would assume gensim's vocabulary has exactly
    # the same size and order as the COALS vocabulary; `max_final_vocab` may
    # keep fewer words, and gensim keys are case-sensitive whereas the COALS
    # vocabulary is lower-cased. Words without a COALS row keep gensim's
    # default initialisation.
    for word, row in model.wv.key_to_index.items():
        coals_row = FreqDict.get(word.lower())
        if coals_row is not None:
            model.wv.vectors[row] = wordVectors[coals_row]

    model.train(stream, total_examples = model.corpus_count, epochs = 10)

model.save("wordvector.model")

2022-05-19 14:39:22,050 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=50, alpha=0.025>', 'datetime': '2022-05-19T14:39:22.050092', 'gensim': '4.2.0', 'python': '3.9.12 (tags/v3.9.12:b28265d, Mar 23 2022, 23:52:46) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19043-SP0', 'event': 'created'}
2022-05-19 14:39:22,053 : INFO : collecting all words and their counts
2022-05-19 14:39:22,054 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-05-19 14:39:22,102 : INFO : PROGRESS: at sentence #10000, processed 155202 words, keeping 15875 word types
2022-05-19 14:39:22,146 : INFO : PROGRESS: at sentence #20000, processed 308892 words, keeping 24232 word types
2022-05-19 14:39:22,193 : INFO : PROGRESS: at sentence #30000, processed 464179 words, keeping 30722 word types
2022-05-19 14:39:22,240 : INFO : PROGRESS: at sentence #40000, processed 620677 words, keeping 36450 word types
2022-05-19 14:39:22,285 : INFO : PROGRESS: at s

2022-05-19 14:39:25,230 : INFO : PROGRESS: at sentence #690000, processed 9708767 words, keeping 174267 word types
2022-05-19 14:39:25,278 : INFO : PROGRESS: at sentence #700000, processed 9839923 words, keeping 176965 word types
2022-05-19 14:39:25,321 : INFO : PROGRESS: at sentence #710000, processed 9970003 words, keeping 180450 word types
2022-05-19 14:39:25,365 : INFO : PROGRESS: at sentence #720000, processed 10102689 words, keeping 182971 word types
2022-05-19 14:39:25,409 : INFO : PROGRESS: at sentence #730000, processed 10232479 words, keeping 185616 word types
2022-05-19 14:39:25,454 : INFO : PROGRESS: at sentence #740000, processed 10363710 words, keeping 188182 word types
2022-05-19 14:39:25,497 : INFO : PROGRESS: at sentence #750000, processed 10497095 words, keeping 191395 word types
2022-05-19 14:39:25,542 : INFO : PROGRESS: at sentence #760000, processed 10627693 words, keeping 194067 word types
2022-05-19 14:39:25,585 : INFO : PROGRESS: at sentence #770000, processed 1

2022-05-19 14:39:28,290 : INFO : PROGRESS: at sentence #1400000, processed 19025224 words, keeping 331972 word types
2022-05-19 14:39:28,332 : INFO : PROGRESS: at sentence #1410000, processed 19158756 words, keeping 334585 word types
2022-05-19 14:39:28,374 : INFO : PROGRESS: at sentence #1420000, processed 19290073 words, keeping 336282 word types
2022-05-19 14:39:28,419 : INFO : PROGRESS: at sentence #1430000, processed 19420549 words, keeping 338347 word types
2022-05-19 14:39:28,463 : INFO : PROGRESS: at sentence #1440000, processed 19551960 words, keeping 340858 word types
2022-05-19 14:39:28,504 : INFO : PROGRESS: at sentence #1450000, processed 19683130 words, keeping 342699 word types
2022-05-19 14:39:28,552 : INFO : PROGRESS: at sentence #1460000, processed 19813006 words, keeping 344615 word types
2022-05-19 14:39:28,596 : INFO : PROGRESS: at sentence #1470000, processed 19943988 words, keeping 346095 word types
2022-05-19 14:39:28,638 : INFO : PROGRESS: at sentence #1480000,

2022-05-19 14:39:55,644 : INFO : EPOCH 0 - PROGRESS: at 75.33% examples, 469666 words/s, in_qsize 7, out_qsize 7
2022-05-19 14:39:56,670 : INFO : EPOCH 0 - PROGRESS: at 78.44% examples, 469386 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:39:57,671 : INFO : EPOCH 0 - PROGRESS: at 81.51% examples, 468991 words/s, in_qsize 16, out_qsize 0
2022-05-19 14:39:58,705 : INFO : EPOCH 0 - PROGRESS: at 84.63% examples, 468415 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:39:59,735 : INFO : EPOCH 0 - PROGRESS: at 87.78% examples, 468316 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:40:00,740 : INFO : EPOCH 0 - PROGRESS: at 90.81% examples, 467899 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:40:01,788 : INFO : EPOCH 0 - PROGRESS: at 93.87% examples, 467155 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:40:02,795 : INFO : EPOCH 0 - PROGRESS: at 96.46% examples, 464861 words/s, in_qsize 15, out_qsize 1
2022-05-19 14:40:03,796 : INFO : EPOCH 0 - PROGRESS: at 99.07% examples, 462714 words/s, 

2022-05-19 14:41:07,793 : INFO : EPOCH 2 - PROGRESS: at 76.47% examples, 442519 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:41:08,823 : INFO : EPOCH 2 - PROGRESS: at 79.48% examples, 442651 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:41:09,844 : INFO : EPOCH 2 - PROGRESS: at 82.87% examples, 444549 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:41:10,859 : INFO : EPOCH 2 - PROGRESS: at 86.04% examples, 445466 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:41:11,890 : INFO : EPOCH 2 - PROGRESS: at 89.11% examples, 445559 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:41:12,902 : INFO : EPOCH 2 - PROGRESS: at 92.31% examples, 446688 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:41:13,917 : INFO : EPOCH 2 - PROGRESS: at 95.51% examples, 447758 words/s, in_qsize 16, out_qsize 0
2022-05-19 14:41:14,920 : INFO : EPOCH 2 - PROGRESS: at 98.55% examples, 448080 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:41:15,333 : INFO : EPOCH 2: training on 21819825 raw words (15697852 effec

2022-05-19 14:42:19,674 : INFO : EPOCH 4 - PROGRESS: at 80.20% examples, 443640 words/s, in_qsize 12, out_qsize 0
2022-05-19 14:42:20,704 : INFO : EPOCH 4 - PROGRESS: at 83.06% examples, 442728 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:42:21,721 : INFO : EPOCH 4 - PROGRESS: at 86.09% examples, 442963 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:42:22,806 : INFO : EPOCH 4 - PROGRESS: at 89.11% examples, 442162 words/s, in_qsize 13, out_qsize 2
2022-05-19 14:42:23,829 : INFO : EPOCH 4 - PROGRESS: at 92.17% examples, 442558 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:42:24,841 : INFO : EPOCH 4 - PROGRESS: at 95.23% examples, 443152 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:42:25,847 : INFO : EPOCH 4 - PROGRESS: at 97.99% examples, 442359 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:42:26,455 : INFO : EPOCH 4: training on 21819825 raw words (15699045 effective words) took 35.4s, 443296 effective words/s
2022-05-19 14:42:27,476 : INFO : EPOCH 5 - PROGRESS: at 2.12% exa

2022-05-19 14:43:32,026 : INFO : EPOCH 6 - PROGRESS: at 88.06% examples, 456088 words/s, in_qsize 14, out_qsize 0
2022-05-19 14:43:33,074 : INFO : EPOCH 6 - PROGRESS: at 91.04% examples, 455224 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:43:34,078 : INFO : EPOCH 6 - PROGRESS: at 94.15% examples, 455755 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:43:35,105 : INFO : EPOCH 6 - PROGRESS: at 97.12% examples, 455324 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:43:36,010 : INFO : EPOCH 6: training on 21819825 raw words (15700378 effective words) took 34.4s, 455899 effective words/s
2022-05-19 14:43:37,020 : INFO : EPOCH 7 - PROGRESS: at 2.20% examples, 393750 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:43:38,038 : INFO : EPOCH 7 - PROGRESS: at 4.75% examples, 423505 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:43:39,044 : INFO : EPOCH 7 - PROGRESS: at 7.20% examples, 427986 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:43:40,062 : INFO : EPOCH 7 - PROGRESS: at 9.76% exampl

2022-05-19 14:44:44,596 : INFO : EPOCH 8 - PROGRESS: at 88.72% examples, 415395 words/s, in_qsize 16, out_qsize 0
2022-05-19 14:44:45,604 : INFO : EPOCH 8 - PROGRESS: at 91.38% examples, 414872 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:44:46,614 : INFO : EPOCH 8 - PROGRESS: at 94.34% examples, 415838 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:44:47,623 : INFO : EPOCH 8 - PROGRESS: at 97.31% examples, 416754 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:44:48,463 : INFO : EPOCH 8: training on 21819825 raw words (15700034 effective words) took 37.5s, 418172 effective words/s
2022-05-19 14:44:49,467 : INFO : EPOCH 9 - PROGRESS: at 2.08% examples, 374933 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:44:50,470 : INFO : EPOCH 9 - PROGRESS: at 4.52% examples, 406752 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:44:51,476 : INFO : EPOCH 9 - PROGRESS: at 6.96% examples, 416954 words/s, in_qsize 15, out_qsize 0
2022-05-19 14:44:52,480 : INFO : EPOCH 9 - PROGRESS: at 9.48% exampl

In [None]:
# Root words whose nearest neighbours are inspected below.
keywords = [
    'patient',
    'study',
    'cell',
    'virus',
    'data',
    'reported',
    'result',
    'disease',
    'protein',
    'treatment',
]

In [None]:
# Show the nearest neighbours of every root word in the trained embedding.
for keyword in keywords:
    print()
    print(f'Top 5 most similar words to the root word: {keyword}')
    # `topn` must be given by keyword: the second positional parameter of
    # most_similar is `negative`, so a bare 5 would be treated as a negative
    # example (key index 5) and the default of 10 results would be returned.
    print(model.wv.most_similar(keyword, topn = 5))


Top 5 most similar words to the root word: patient
[('patients', 0.6027148365974426), ('hospitalized', 0.5945053696632385), ('case-patients', 0.5756993889808655), ('hospitalised', 0.556251049041748), ('hemodialysis', 0.553633987903595), ('underwent', 0.5492505431175232), ('whom', 0.5476383566856384), ('diagnosed', 0.5311722159385681), ('ill', 0.5255144238471985), ('deceased', 0.5075977444648743)]

Top 5 most similar words to the root word: study
[('observational', 0.611547589302063), ('meta-analysis', 0.571100652217865), ('retrospective', 0.5594677329063416), ('corroborated', 0.5318753123283386), ('cohort', 0.5280941128730774), ('cross-sectional', 0.5224155187606812), ('metaanalysis', 0.5207840204238892), ('case-control', 0.510164201259613), ('single-center', 0.5059807896614075), ('meta-analyses', 0.5051824450492859)]

Top 5 most similar words to the root word: cell
[('co-cultured', 0.687863290309906), ('hela', 0.5977769494056702), ('thp-1', 0.5945456624031067), ('mhv-infected', 0.572

  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]
