# Generate Word2vec Model with COVID-19 documents

In [13]:
import numpy as np
import re
import ast
import pickle
from os.path import exists
import os
import nltk
from gensim import corpora
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# Set hyperparameters for generating the Word2Vec model

- vector_size (int, optional) – Dimensionality of the word vectors.
- window (int, optional) – Maximum distance between the current and predicted word within a sentence.
- min_count (int, optional) – Ignores all words with total frequency lower than this.
- sg ({0, 1}, optional) – Training algorithm: 1 for skip-gram; otherwise CBOW.
- negative (int, optional) – If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used.


# Set hyperparameters for extracting features
**Extracting features**
- We use a TF-IDF vectorizer instead of a count vectorizer for extracting features. 

**Parameters**
- We set the minimum document frequency (`MIN_DF`) to 20 to discard extremely rare words that appear in fewer than 20 documents, and the maximum document frequency (`MAX_DF`) to 80% to discard words that appear in more than 80% of the documents. 


In [14]:
# Word2Vec training hyperparameters (forwarded to gensim's Word2Vec).
VECTOR_SIZE = 100  # dimensionality of the word vectors
WINDOW_SIZE = 5  # maximum distance between the current and predicted word within a sentence
MIN_COUNT = 20  # ignore all words with a total frequency lower than this
SG = 1  # training algorithm: 1 = skip-gram, otherwise CBOW
NEGATIVE = 20  # number of "noise words" drawn for negative sampling (0 disables it)
# Feature-extraction settings (forwarded to data_vectorization).
MIN_DF =  20  # minimum document frequency: drop words that appear in fewer than 20 documents
MAX_DF = 0.8  # maximum document frequency, expressed as a fraction of the documents (80%)
NORM_FUNCTION = 'l1'  # passed as `norm` to TfidfVectorizer; not used on the CountVectorizer path
WEIGHT = 'tfidf'  # 'tfidf' selects TfidfVectorizer; any other value selects CountVectorizer

In [16]:
# Output directory for the gensim dictionary and the trained Word2Vec files.
# The trailing slash matters: the dictionary path is built by plain string concatenation.
SAVE_DIR = '../coronavirus_twenty_years_of_research/covid_word2vec/'
# Directory holding the input pickle. It is also used as the string prefix of the
# cached vectorizer / matrix pickles, so the trailing slash matters here too.
INPUT_DIR = '../coronavirus_twenty_years_of_research/technical_validation/'

## Train Word2vec model

In [None]:
# Load the merged articles pickle that every later cell works from.
input_df = pd.read_pickle(os.path.join(INPUT_DIR, "merged_covid_articles.pkl"))

In [18]:
def read_corpus(df):
    """Tokenize the ``text`` column of ``df`` into one token list per document.

    Each document is split on single space characters (``str.split(' ')``), so
    consecutive spaces yield empty-string tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column containing strings.

    Returns
    -------
    numpy.ndarray
        One-dimensional object array with ``len(df)`` entries; entry ``i`` is
        the list of tokens of the ``i``-th row.
    """
    token_lists = [text.split(' ') for text in df['text']]

    # Fill a pre-allocated 1-D object array element by element. Handing the
    # nested list straight to np.asarray yields a 2-D array whenever every
    # document happens to have the same number of tokens (including a frame
    # with a single row), in which case the rows are no longer token lists.
    training_docs = np.empty(len(token_lists), dtype=object)
    for position, tokens in enumerate(token_lists):
        training_docs[position] = tokens

    return training_docs

In [19]:
# Tokenize the articles and build the gensim dictionary from the token lists.
corpus = read_corpus(input_df)
dictionary = corpora.Dictionary(corpus)

# SAVE_DIR is otherwise only created by the model-saving cell further down, so
# on a fresh run the save below would fail without creating the directory here.
os.makedirs(SAVE_DIR, exist_ok=True)
dictionary.save(SAVE_DIR+'covid_dict.dict')

In [20]:
# Output file names carry the embedding dimensionality, e.g. "covid_100d.txt".
filename = "covid_{}d.txt".format(VECTOR_SIZE)
model_filename = "covid_{}d.model".format(VECTOR_SIZE)

# Train Word2Vec on the tokenized corpus with the hyperparameters set above.
model = Word2Vec(
    sentences=corpus,
    vector_size=VECTOR_SIZE,
    window=WINDOW_SIZE,
    min_count=MIN_COUNT,
    sg=SG,
    negative=NEGATIVE,
)

In [21]:
# Save the trained model twice: the word vectors in the text word2vec format
# (binary=False) and the model object itself via model.save.
# exist_ok=True avoids the check-then-create race of a separate exists() test.
os.makedirs(SAVE_DIR, exist_ok=True)
# os.path.join avoids the doubled separator that SAVE_DIR + "/" + name produced,
# since SAVE_DIR already ends with a slash.
model.wv.save_word2vec_format(os.path.join(SAVE_DIR, filename), binary=False)
model.save(os.path.join(SAVE_DIR, model_filename))

## Extract features

In [22]:
def write_object(obj, output_fname):
    """Pickle ``obj`` to the file ``output_fname``.

    The file is opened in binary write mode (overwriting any existing file)
    and the object is serialized with ``pickle.HIGHEST_PROTOCOL``.

    Parameters
    ----------
    obj : object
        Any picklable object.
    output_fname : str
        Path of the file to write.
    """
    # The context manager closes the handle even if pickle.dump raises.
    with open(output_fname, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [23]:
def data_vectorization(weight, input_dir, corpus, _min_df, _max_df, norm_function):
    """Fit a text vectorizer on ``corpus``, or load a previously cached result.

    The fitted vectorizer and the resulting matrix are cached as
    ``<input_dir>_V_<weight>.pkl`` and ``<input_dir>_D_<weight>.pkl``. The
    cache file names depend only on ``input_dir`` and ``weight``: when cached
    files are found, ``corpus``, ``_min_df``, ``_max_df`` and ``norm_function``
    are ignored, so delete the files to force a refit with new settings.

    Parameters
    ----------
    weight : str
        ``'tfidf'`` selects ``TfidfVectorizer``; any other value selects
        ``CountVectorizer``.
    input_dir : str
        String prefix for the cache file names (concatenated as-is, not joined
        as a directory).
    corpus : iterable of str
        Documents passed to the vectorizer's ``fit_transform``.
    _min_df, _max_df
        Forwarded as ``min_df`` / ``max_df`` to the vectorizer.
    norm_function
        Forwarded as ``norm`` to ``TfidfVectorizer``; unused for counts.

    Returns
    -------
    tuple
        ``(V, D)``: the vectorizer and the result of ``fit_transform``.
    """
    V_fname = input_dir + "_V_{}.pkl".format(weight)
    D_fname = input_dir + "_D_{}.pkl".format(weight)

    # Use the cache only when BOTH pickles are present. Checking the vectorizer
    # file alone would crash on a missing matrix file instead of recomputing.
    if exists(V_fname) and exists(D_fname):
        print("File {} already exist".format(V_fname))
        # NOTE(security): unpickling executes arbitrary code; only load cache
        # files that were produced by this notebook.
        with open(V_fname, "rb") as v_file:
            V = pickle.load(v_file)
        with open(D_fname, "rb") as d_file:
            D = pickle.load(d_file)
    else:
        if weight == 'tfidf':
            print('TfidfVectorizer is proceed')
            # Term frequency times inverse document frequency.
            V = TfidfVectorizer(analyzer='word', min_df=_min_df, norm=norm_function, max_df=_max_df, encoding='utf-8')
            D = V.fit_transform(corpus)
        else:
            print('CountVectorizer is proceed')
            V = CountVectorizer(analyzer='word', min_df=_min_df, max_df=_max_df, encoding='utf-8')
            D = V.fit_transform(corpus)

        # Write the vectorizer and the matrix so the next run can reuse them.
        write_object(V, V_fname)
        write_object(D, D_fname)

    return V, D


In [24]:

# Raw article strings (not token lists) are what the vectorizer consumes.
corpus = input_df['text'].tolist()
V, D = data_vectorization(
    weight=WEIGHT,
    input_dir=INPUT_DIR,
    corpus=corpus,
    _min_df=MIN_DF,
    _max_df=MAX_DF,
    norm_function=NORM_FUNCTION,
)
print("Matrix shape:", D.shape)

TfidfVectorizer is proceed
Matrix shape: (557956, 10989)
