<a href="https://colab.research.google.com/github/s-k-sharma/Machine-learning-Models/blob/main/word2vec_covid_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
%matplotlib inline

In [2]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE: this also hides library deprecation notices (e.g. from gensim / pandas).
warnings.filterwarnings('ignore')
# Do not truncate long text columns when a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)

In [3]:
# Location of the metadata CSV. The path looks like a Colab-mounted Google Drive,
# so the Drive must already be mounted before this cell runs (no mount call is
# visible in this notebook -- TODO confirm).
METADATA_CSV = "/content/drive/My Drive/Data Set/metadata.csv"

# Read the whole metadata table into memory.
df = pd.read_csv(METADATA_CSV)

In [4]:
# Build one text field per row: "<title> <abstract>".
# With the default na_rep=None, a row where either the title or the abstract is
# missing yields NaN in 'conc' (pandas `Series.str.cat` semantics).
df['conc'] = df['title'].str.cat(others=df['abstract'], sep=" ")

In [5]:
# Stringify every combined text and drop the ones that are missing
# (a NaN value stringifies to 'nan').
tmp_corpus = [text for text in map(str, df['conc']) if text != 'nan']

In [6]:
# Number of documents kept after filtering out rows whose combined text was missing.
len(tmp_corpus)

237458

In [7]:
# Whitespace-tokenise every document; tqdm reports progress over the documents.
# No lower-casing or punctuation stripping is applied, so tokens keep their
# original case and any attached punctuation.
corpus = [document.split() for document in tqdm(tmp_corpus)]

100%|██████████| 237458/237458 [00:07<00:00, 31033.81it/s]


In [8]:
# Corpus size summary: one "sentence" per document, tokens summed over all of them.
num_of_sentences = len(corpus)
num_of_words = sum(map(len, corpus))

print('Num of sentences - %s' % num_of_sentences)
print('Num of words - %s' % num_of_words)

Num of sentences - 237458
Num of words - 50227159


In [9]:
# Collect collocation statistics over the tokenised corpus.
#   min_count=25 -> ignore words/bigrams seen fewer than 25 times in total
#   threshold=50 -> score cut-off for accepting a bigram (higher = fewer phrases)
phrases = Phrases(corpus, min_count=25, threshold=50)

# Wrap the trained detector in a Phraser, which is what gets applied to the
# corpus in the next step.
bigram = Phraser(phrases)

2020-11-07 05:47:57,065 : INFO : collecting all words and their counts
2020-11-07 05:47:57,067 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2020-11-07 05:48:00,876 : INFO : PROGRESS: at sentence #10000, processed 2173949 words and 1079294 word types
2020-11-07 05:48:03,995 : INFO : PROGRESS: at sentence #20000, processed 3875297 words and 1800385 word types
2020-11-07 05:48:07,978 : INFO : PROGRESS: at sentence #30000, processed 6118705 words and 2595890 word types
2020-11-07 05:48:12,231 : INFO : PROGRESS: at sentence #40000, processed 8380303 words and 3293449 word types
2020-11-07 05:48:16,321 : INFO : PROGRESS: at sentence #50000, processed 10658599 words and 3938392 word types
2020-11-07 05:48:20,481 : INFO : PROGRESS: at sentence #60000, processed 12953118 words and 4551687 word types
2020-11-07 05:48:24,714 : INFO : PROGRESS: at sentence #70000, processed 15239154 words and 5130379 word types
2020-11-07 05:48:28,990 : INFO : PROGRESS: at sentence #80000,

In [10]:
# Replace each document, in place, with its bigram-merged token list
# (detected pairs are joined with '_', e.g. 'corona_virus').
# Done one document at a time so the old token list can be released as we go.
# NOTE: this overwrites `corpus`; re-running the cell re-applies the phraser
# to already-merged tokens.
for position in range(len(corpus)):
    corpus[position] = bigram[corpus[position]]

In [11]:
# Word2Vec hyper-parameters.
# Each training "sentence" is the full token list of one title+abstract document
# (bigrams already merged), not a grammatical sentence.
size = 100        # dimensionality of the word vectors
window_size = 2   # max distance between the current word and a context word
epochs = 50       # number of passes over the corpus
min_count = 2     # drop tokens whose total frequency is below this
workers = 4       # number of training threads

# Train a skip-gram model with gensim. The keyword names `size` and `iter` are
# the gensim 3.x spellings (renamed `vector_size` / `epochs` in gensim 4.0).
model = Word2Vec(
    sentences=corpus,
    sg=1,                 # 1 = skip-gram (0 would be CBOW)
    window=window_size,
    size=size,
    min_count=min_count,
    workers=workers,
    iter=epochs,
    sample=0.01,          # down-sampling threshold for high-frequency words
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
2020-11-07 07:21:52,630 : INFO : EPOCH 26 - PROGRESS: at 36.90% examples, 221010 words/s, in_qsize 7, out_qsize 0
2020-11-07 07:21:53,680 : INFO : EPOCH 26 - PROGRESS: at 37.40% examples, 220939 words/s, in_qsize 7, out_qsize 0
2020-11-07 07:21:54,716 : INFO : EPOCH 26 - PROGRESS: at 37.93% examples, 221152 words/s, in_qsize 7, out_qsize 0
2020-11-07 07:21:55,781 : INFO : EPOCH 26 - PROGRESS: at 38.44% examples, 221045 words/s, in_qsize 7, out_qsize 0
2020-11-07 07:21:56,781 : INFO : EPOCH 26 - PROGRESS: at 38.97% examples, 221234 words/s, in_qsize 7, out_qsize 0
2020-11-07 07:21:57,799 : INFO : EPOCH 26 - PROGRESS: at 39.45% examples, 221134 words/s, in_qsize 8, out_qsize 1
2020-11-07 07:21:58,808 : INFO : EPOCH 26 - PROGRESS: at 39.97% examples, 221294 words/s, in_qsize 7, out_qsize 0
2020-11-07 07:21:59,814 : INFO : EPOCH 26 - PROGRESS: at 40.46% examples, 221247 words/s, in_qsize 7, out_qsize 0
2020-11-07 07:22:00,852

In [12]:
# Persist the trained model under the name 'w2v_model' in the current working
# directory; as the log output shows, large arrays are written to sibling
# 'w2v_model.*.npy' files.
model.save('w2v_model')

2020-11-07 08:46:31,756 : INFO : saving Word2Vec object under w2v_model, separately None
2020-11-07 08:46:31,758 : INFO : storing np array 'vectors' to w2v_model.wv.vectors.npy
2020-11-07 08:46:32,361 : INFO : not storing attribute vectors_norm
2020-11-07 08:46:32,364 : INFO : storing np array 'syn1neg' to w2v_model.trainables.syn1neg.npy
2020-11-07 08:46:33,129 : INFO : not storing attribute cum_table
2020-11-07 08:46:34,624 : INFO : saved w2v_model


In [13]:
# Reload the saved Word2Vec model from 'w2v_model', rebinding `model`
# (presumably so the notebook can be resumed from here without retraining).
model = Word2Vec.load('w2v_model')

2020-11-07 08:46:38,281 : INFO : loading Word2Vec object from w2v_model
2020-11-07 08:46:42,596 : INFO : loading wv recursively from w2v_model.wv.* with mmap=None
2020-11-07 08:46:42,597 : INFO : loading vectors from w2v_model.wv.vectors.npy with mmap=None
2020-11-07 08:46:42,737 : INFO : setting ignored attribute vectors_norm to None
2020-11-07 08:46:42,738 : INFO : loading vocabulary recursively from w2v_model.vocabulary.* with mmap=None
2020-11-07 08:46:42,742 : INFO : loading trainables recursively from w2v_model.trainables.* with mmap=None
2020-11-07 08:46:42,744 : INFO : loading syn1neg from w2v_model.trainables.syn1neg.npy with mmap=None
2020-11-07 08:46:42,898 : INFO : setting ignored attribute cum_table to None
2020-11-07 08:46:42,899 : INFO : loaded w2v_model


In [14]:
# Top-10 nearest neighbours of 'coronavirus' by cosine similarity.
# Queried through `model.wv`: calling `most_similar` directly on the Word2Vec
# object is deprecated in gensim 3.x and removed in gensim 4.0, and the
# deprecation warning is invisible here because warnings are globally ignored.
model.wv.most_similar('coronavirus')

2020-11-07 08:46:46,340 : INFO : precomputing L2-norms of word weight vectors


[('corona_virus', 0.9339419603347778),
 ('Coronavirus', 0.8549025058746338),
 ('coronavirus_(CoV)', 0.829695463180542),
 ('CoV', 0.8095064163208008),
 ('coronavirus_disease-19', 0.7678813934326172),
 ('coronavirus_disease-2019', 0.754365086555481),
 ('coronavirus-2019', 0.7485929727554321),
 ('coronavirus,', 0.738055944442749),
 ('coronavirus.', 0.738009512424469),
 ('Coronavirus‐19', 0.732457160949707)]

In [15]:
# Top-10 nearest neighbours of 'covid-19' by cosine similarity.
# Uses the KeyedVectors API (`model.wv`) instead of the deprecated
# `Word2Vec.most_similar`, which no longer exists in gensim 4.0.
# The many spelling/case variants in the result reflect that the corpus was
# only whitespace-tokenised (no case folding or punctuation stripping).
model.wv.most_similar('covid-19')

[('Covid-19', 0.9127538800239563),
 ('COVID-19', 0.8781176805496216),
 ('COVID19', 0.8758118152618408),
 ('COVID_19', 0.8594430685043335),
 ('COVID‐19', 0.8479489088058472),
 ('Covid‐19', 0.7971652150154114),
 ('CoViD-19', 0.7750253677368164),
 ('COVID', 0.7747471332550049),
 ('Covid_19', 0.7707716822624207),
 ('covid19', 0.756084680557251)]