# Generate Word2vec Model with COVID-19 documents

In [3]:
import ast
import os
import re

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer



# Set hyperparameters for generating the word2vec model

- vector_size (int, optional) – Dimensionality of the word vectors.
- window (int, optional) – Maximum distance between the current and predicted word within a sentence.
- min_count (int, optional) – Ignores all words with total frequency lower than this.
- sg ({0, 1}, optional) – Training algorithm: 1 for skip-gram; otherwise CBOW.
- negative (int, optional) – If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used.

In [4]:
# --- Input / output locations -------------------------------------------
INPUT_DIR = '../data/outputs/articles/'   # where the preprocessed TSV is read from
SAVE_DIR = '../data/covid_word2vec/'      # where the trained vectors/model are written

# --- Word2Vec hyperparameters --------------------------------------------
VECTOR_SIZE = 100   # dimensionality of the word vectors
WINDOW_SIZE = 5     # max distance between current and predicted word (`window`)
MIN_COUNT = 20      # words with a lower total frequency are ignored
SG = 1              # training algorithm: 1 = skip-gram, otherwise CBOW
NEGATIVE = 20       # number of "noise words" drawn for negative sampling

## Train Word2vec model

In [5]:
# Load the preprocessed articles (tab-separated, UTF-8) into a DataFrame.
input_df = pd.read_csv(
    INPUT_DIR + "preprocessed_data(all_data_type).tsv",
    encoding='utf-8',
    sep='\t',
)
# Last expression of the cell: the notebook displays (rows, columns).
input_df.shape

  interactivity=interactivity, compiler=compiler, result=result)


(413915, 73)

In [21]:
def read_corpus(df):
    """Tokenize the ``text`` column of *df* into Word2Vec training documents.

    Each cell of ``df['text']`` is split on single spaces, yielding one list
    of tokens per row.

    Args:
        df: DataFrame with a ``text`` column holding space-separated tokens.

    Returns:
        A 1-D ``numpy.ndarray`` of dtype ``object`` with one element per row
        of *df*; each element is that row's list of string tokens. Rows whose
        ``text`` is not a string (e.g. NaN for an empty field) produce an
        empty token list so the output stays aligned with the input rows.

    Raises:
        KeyError: if *df* has no ``text`` column.
    """
    # Iterate the column directly: iterrows() materializes a Series for every
    # row, which is very slow on a wide frame and unnecessary here.
    token_list = [
        text.split(' ') if isinstance(text, str) else []
        for text in df['text']
    ]

    # Documents have different lengths, so the token lists are ragged.
    # np.asarray() on ragged nested lists raises ValueError on NumPy >= 1.24
    # (and would silently build a 2-D string array if all lengths happened to
    # match). Filling a pre-allocated object array element by element always
    # yields a 1-D array of token lists.
    training_docs = np.empty(len(token_list), dtype=object)
    for position, tokens in enumerate(token_list):
        training_docs[position] = tokens

    return training_docs

In [22]:
# Tokenize every article's text into the training corpus for Word2Vec.
corpus = read_corpus(df=input_df)

In [24]:
# Output file names encode the embedding dimensionality.
filename = f"covid_{VECTOR_SIZE}d.txt"            # word2vec-format vectors
model_filename = f"covid_{VECTOR_SIZE}d.model"    # gensim model file

# Train the Word2Vec model on the tokenized corpus using the
# hyperparameters configured above.
model = Word2Vec(
    sentences=corpus,
    sg=SG,
    negative=NEGATIVE,
    vector_size=VECTOR_SIZE,
    window=WINDOW_SIZE,
    min_count=MIN_COUNT,
)

In [25]:
# save the trained model
# Create the output directory first so a missing folder does not make the
# save fail after the (long) training run has already completed.
os.makedirs(SAVE_DIR, exist_ok=True)

# os.path.join avoids the doubled separator that SAVE_DIR + "/" produces
# when SAVE_DIR already ends with a slash.
# 1) word vectors only, in the word2vec text format
model.wv.save_word2vec_format(os.path.join(SAVE_DIR, filename), binary=False)
# 2) the full gensim model, reloadable with Word2Vec.load
model.save(os.path.join(SAVE_DIR, model_filename))

## Verify the trained model

In [26]:
# Reload the saved gensim model from disk so the checks below run against
# the persisted artifact.
model_filename = f"covid_{VECTOR_SIZE}d.model"
wv_model = Word2Vec.load(f"{SAVE_DIR}/{model_filename}")

In [27]:
# print the vocabulary size of the loaded word2vec model
# (read it from the reloaded `wv_model`, not the in-memory `model`, so this
# cell checks what was actually saved and still runs after a kernel restart
# where only the load cell has been executed)
print("vocabulary size to be embedded: {0}".format(len(wv_model.wv)))

# verify the model with a word that is expected to be in the vocabulary
word = 'vaccination'
if word in wv_model.wv:
    print(word + ' exist')
# print the most similar words (last expression, so the notebook displays it)
wv_model.wv.most_similar(word)

vocabulary size to be embedded: 29446
vaccination exist


[('vaccine', 0.8469165563583374),
 ('immunization', 0.8436818718910217),
 ('booster', 0.8193477988243103),
 ('post_vaccination', 0.779912531375885),
 ('two_dose', 0.767334520816803),
 ('pfizer_biontech', 0.7570767998695374),
 ('vaccinate', 0.7438457608222961),
 ('first_dose', 0.7429644465446472),
 ('revaccination', 0.7308782339096069),
 ('immunisation', 0.7266660928726196)]