# Generate Word2vec Model with COVID-19 documents

In [1]:
import numpy as np
import re
import ast
import nltk
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer



# Set hyperparameters for generating the word2vec model

- vector_size (int, optional) – Dimensionality of the word vectors.
- window (int, optional) – Maximum distance between the current and predicted word within a sentence.
- min_count (int, optional) – Ignores all words with total frequency lower than this.
- sg ({0, 1}, optional) – Training algorithm: 1 for skip-gram; otherwise CBOW.
- negative (int, optional) – If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used.

In [2]:
# Filesystem locations: where the preprocessed corpus is read from and
# where the trained embeddings are written.
INPUT_DIR = "../Data/preprocessed_data/"
SAVE_DIR = "../Data/covid_word2vec/"

# Word2Vec training hyperparameters.
VECTOR_SIZE = 100  # dimensionality of each word vector
WINDOW_SIZE = 5    # max distance between the current and predicted word
MIN_COUNT = 20     # ignore words whose total frequency is below this
SG = 1             # 1 = skip-gram; any other value = CBOW
NEGATIVE = 20      # number of "noise words" drawn for negative sampling

## Train Word2vec model

In [3]:
# Load the preprocessed corpus (tab-separated, UTF-8) into a DataFrame.
input_df = pd.read_csv(
    f"{INPUT_DIR}preprocessed_data.tsv",
    sep="\t",
    encoding="utf-8",
)
# Last expression of the cell: display the (rows, columns) tuple.
input_df.shape

(470382, 2)

In [4]:
def read_corpus(df):
    """Split each document in ``df['text']`` into a list of tokens.

    Args:
        df: DataFrame with a ``'text'`` column whose values are strings of
            space-separated tokens.

    Returns:
        A 1-D ``numpy.ndarray`` of dtype ``object`` with one entry per row
        of ``df``; each entry is the ``list`` of tokens for that row.

    Raises:
        AttributeError: If a ``'text'`` value is not a string (it has no
            ``split`` method).
    """
    # Iterate the column directly; iterrows() builds a Series per row and
    # is needlessly slow for a single-column read.
    token_lists = [text.split(' ') for text in df['text']]

    # Documents have different token counts, so np.asarray(token_lists)
    # would build a ragged array: deprecated (with a warning) in older NumPy
    # and a ValueError in newer releases. Fill an object array element by
    # element so the result is always 1-D, one token list per slot.
    training_docs = np.empty(len(token_lists), dtype=object)
    for position, tokens in enumerate(token_lists):
        training_docs[position] = tokens

    return training_docs

In [5]:
# Tokenize the 'text' column of the loaded data; the result is the
# training corpus passed to Word2Vec below.
corpus = read_corpus(input_df)

  return array(a, dtype, copy=False, order=order)


In [6]:
# Output file names embed the vector dimensionality (VECTOR_SIZE).
filename = f"covid_{VECTOR_SIZE}d.txt"
model_filename = f"covid_{VECTOR_SIZE}d.model"

# Train Word2Vec on the tokenized corpus with the configured hyperparameters.
model = Word2Vec(
    sentences=corpus,
    vector_size=VECTOR_SIZE,
    window=WINDOW_SIZE,
    min_count=MIN_COUNT,
    sg=SG,
    negative=NEGATIVE,
)

In [7]:
# Persist the training result in two forms: the word vectors alone in
# plain-text word2vec format, then the full gensim model.
model.wv.save_word2vec_format(f"{SAVE_DIR}/{filename}", binary=False)
model.save(f"{SAVE_DIR}/{model_filename}")

## Verify the trained model

In [8]:
# Rebuild the saved model's file name and load it back from disk, so the
# checks that follow run against the persisted artifact.
model_filename = f"covid_{VECTOR_SIZE}d.model"
wv_model = Word2Vec.load(f"{SAVE_DIR}/{model_filename}")

In [9]:
# Print the vocabulary size of the *loaded* word2vec model. Reading it from
# wv_model (not the in-memory `model`) checks the saved artifact and keeps
# this cell runnable when the notebook is resumed from the load step.
print("vocabulary size to be embedded: {0}".format(len(wv_model.wv)))

# Verify the model with a word that is expected to be in the vocabulary.
word = 'vaccination'
if word in wv_model.wv:
    print(word + ' exist')
# Show the most similar words (kept as the cell's last expression so the
# notebook displays the result).
# NOTE: gensim raises KeyError here if `word` is not in the vocabulary.
wv_model.wv.most_similar(word)

vocabulary size to be embedded: 32451
vaccination exist


[('vaccine', 0.8372259736061096),
 ('immunization', 0.8313582539558411),
 ('booster', 0.8280298709869385),
 ('two_dose', 0.8003955483436584),
 ('third_dose', 0.779162585735321),
 ('vaccinate', 0.751685380935669),
 ('pfizer_biontech', 0.7484267950057983),
 ('coronavac', 0.7439802289009094),
 ('post_vaccination', 0.7432931661605835),
 ('one_dose', 0.7317822575569153)]