# Generating Word Embeddings using Word2Vec

#### The Hackers: Mustajab Khawer, Katherine LaFever, Salvatore Pistone, Samuel Ramirez

#### Library Import Section

In [1]:
pip install -U gensim

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install Unidecode




In [3]:
import nltk
import gensim
import pickle
import warnings
import unidecode
import pandas as pd
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
warnings.filterwarnings('ignore')
from nltk.corpus import stopwords
from sklearn.manifold import TSNE
from bokeh.io import output_notebook
from bokeh.plotting import show, figure
from gensim.models.word2vec import Word2Vec
from nltk import sent_tokenize, word_tokenize

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sammy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sammy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 1. The PubMed abstracts sample is a binary file containing 132,935 abstracts. It holds a list of strings that can be read as follows:

In [4]:
# Load the pickled list of abstract texts from a custom file path.
# NOTE: pickle can execute arbitrary code while loading; only open files from a trusted source.
abstracts_path = "/content/temp/pubmed_abstracts_assignment_2.bin"
with open(abstracts_path, mode='rb') as abstracts_file:
    abstract_sample = pickle.load(abstracts_file)

#### Here we convert the list of texts to lower-case and drop any accent markers before tokenization, so that all the embeddings will be uncased.

In [5]:
# Lower-case every abstract, overwriting each entry of the list in place
for position, text in enumerate(abstract_sample):
    abstract_sample[position] = text.lower()

In [6]:
# Drop any accent markers in list of abstracts
for i in range(len(abstract_sample)):
    abstract_sample[i] = unidecode.unidecode(abstract_sample[i])

#### Abstracts of research publications in the life sciences, made available by the National Institutes of Health.

In [7]:
# Preview only the first two abstracts; echoing the whole list would dump all 132,935 of them
# into the cell output.
abstract_sample[:2]

['intervirology. extensively degraded rna was isolated from virions of influenza virus which had been oxidized with sodium m-periodate. similarly, although to a lesser extent, rna isolated from periodate-treated ribonucleoprotein of influenza virus was also degraded. in contrast, influenza virus rna, if first freed from other virion components, was not degraded by periodate oxidation.',
 'journal of general microbiology. some mutants and stock strains of escherichia coli k12 were sensitive to acriflavine in the presence of inorganic phosphate but were resistant to acriflavine in its absence. they mutated spontaneously to resistance to acriflavine plus phosphate. the synergistic effect of phosphate on acriflavine sensitivity was increased at high ph values. genetic analysis suggested that the mutations occurred in the gene acra. electron microscopic observation suggested that the presence of acriflavine plus phosphate affected the structure of the plasma membrane and the cytoplasm under

### 2. Each abstract in the abstract_sample list must be tokenized into words.

In [8]:
# One list of word tokens per abstract
tokenized_sents = list(map(word_tokenize, abstract_sample))

In [9]:
# Show the tokens of the first abstract only
print(tokenized_sents[:1])

[['intervirology', '.', 'extensively', 'degraded', 'rna', 'was', 'isolated', 'from', 'virions', 'of', 'influenza', 'virus', 'which', 'had', 'been', 'oxidized', 'with', 'sodium', 'm-periodate', '.', 'similarly', ',', 'although', 'to', 'a', 'lesser', 'extent', ',', 'rna', 'isolated', 'from', 'periodate-treated', 'ribonucleoprotein', 'of', 'influenza', 'virus', 'was', 'also', 'degraded', '.', 'in', 'contrast', ',', 'influenza', 'virus', 'rna', ',', 'if', 'first', 'freed', 'from', 'other', 'virion', 'components', ',', 'was', 'not', 'degraded', 'by', 'periodate', 'oxidation', '.']]


### 3. Pre-processing is applied to improve the embedding model by removing punctuation, removing stopwords such as "and" and "the", and applying lemmatization, which groups together the inflected forms of a word.

In [10]:
def remove_punctuation(abstract):
    """Drop tokens that consist solely of punctuation characters.

    Parameters
    ----------
    abstract : iterable of str
        The tokens of one abstract.

    Returns
    -------
    list of str
        The tokens of ``abstract`` in their original order, without any token
        made up entirely of the punctuation characters defined below. The empty
        string is dropped as well.
    """
    # Same character set as before, written without the invalid escape sequences
    # (the backslash itself is still part of the set).
    punctuations = frozenset(".,\"-\\/#!?$%^&*;:{}=_'~()")
    # Testing every character, rather than substring membership in one long string,
    # also removes multi-character punctuation tokens such as "''", "..." or "--".
    return [
        token for token in abstract
        if not all(char in punctuations for char in token)
    ]

def apply_stopwording(abstract, min_len, stop_words=None):
    """Lower-case the tokens and drop stopwords and short tokens.

    Parameters
    ----------
    abstract : iterable of str
        The tokens of one abstract.
    min_len : int
        A token is kept only when it is strictly longer than ``min_len`` characters.
    stop_words : iterable of str, optional
        Words to remove, compared against the lower-cased token. When omitted,
        ``stopwords.words('english')`` from NLTK is used.

    Returns
    -------
    list of str
        The surviving tokens, lower-cased, in their original order.
    """
    if stop_words is None:
        # Fetch the list once per call instead of once per token.
        stop_words = stopwords.words('english')
    stop_words = frozenset(stop_words)

    kept = []
    for token in abstract:
        lowered = token.lower()
        # Compare the lower-cased form so capitalised stopwords ("The") are removed too.
        if len(token) > min_len and lowered not in stop_words:
            kept.append(lowered)
    return kept

def apply_lemmatization(abstract):
    """Lemmatize every token of ``abstract`` with NLTK's WordNetLemmatizer.

    Each token is passed to ``lemmatize`` with its default arguments; the
    results are returned as a list in the original token order.
    """
    lemmatize = nltk.WordNetLemmatizer().lemmatize
    return list(map(lemmatize, abstract))

In [11]:
# Clean every tokenized abstract: strip punctuation, drop stopwords and tokens of
# three characters or fewer, then lemmatize what is left.
g_sentences = [
    apply_lemmatization(apply_stopwording(remove_punctuation(tokens), 3))
    for tokens in tokenized_sents
]

In [12]:
# Show the first five pre-processed abstracts
print(g_sentences[:5])

[['intervirology', 'extensively', 'degraded', 'isolated', 'virion', 'influenza', 'virus', 'oxidized', 'sodium', 'm-periodate', 'similarly', 'although', 'lesser', 'extent', 'isolated', 'periodate-treated', 'ribonucleoprotein', 'influenza', 'virus', 'also', 'degraded', 'contrast', 'influenza', 'virus', 'first', 'freed', 'virion', 'component', 'degraded', 'periodate', 'oxidation'], ['journal', 'general', 'microbiology', 'mutant', 'stock', 'strain', 'escherichia', 'coli', 'sensitive', 'acriflavine', 'presence', 'inorganic', 'phosphate', 'resistant', 'acriflavine', 'absence', 'mutated', 'spontaneously', 'resistance', 'acriflavine', 'plus', 'phosphate', 'synergistic', 'effect', 'phosphate', 'acriflavine', 'sensitivity', 'increased', 'high', 'value', 'genetic', 'analysis', 'suggested', 'mutation', 'occurred', 'gene', 'acra', 'electron', 'microscopic', 'observation', 'suggested', 'presence', 'acriflavine', 'plus', 'phosphate', 'affected', 'structure', 'plasma', 'membrane', 'cytoplasm', 'struct

### 4/5. Embeddings of window sizes 2, 5, 10, and 20 are generated for both Skip-Gram and CBOW methods with an embedding size of 2048 for a total of eight embeddings.

#### Skip-Gram method embeddings

In [13]:
# Skip-gram (sg=1), context window of 2.
# NOTE(review): gensim documents that a fixed seed only gives a fully reproducible run
# with a single worker thread; with workers=10 results may vary slightly between runs.
w2v_model = Word2Vec(
    sentences=g_sentences,
    sg=1,
    vector_size=2048,
    window=2,
    min_count=5,
    workers=10,
    seed=20,
)

In [14]:
# Skip-gram (sg=1), context window of 5
w2v_model2 = Word2Vec(
    sentences=g_sentences,
    sg=1,
    vector_size=2048,
    window=5,
    min_count=5,
    workers=10,
    seed=20,
)

In [15]:
# Skip-gram (sg=1), context window of 10
w2v_model3 = Word2Vec(
    sentences=g_sentences,
    sg=1,
    vector_size=2048,
    window=10,
    min_count=5,
    workers=10,
    seed=20,
)

In [16]:
# Skip-gram (sg=1), context window of 20
w2v_model4 = Word2Vec(
    sentences=g_sentences,
    sg=1,
    vector_size=2048,
    window=20,
    min_count=5,
    workers=10,
    seed=20,
)

#### CBOW method embeddings

In [17]:
# CBOW, context window of 2. sg=0 (gensim's default) is what selects CBOW;
# cbow_mean=1 only chooses mean over sum for combining the context vectors.
w2v_model5 = Word2Vec(
    sentences=g_sentences,
    sg=0,
    cbow_mean=1,
    vector_size=2048,
    window=2,
    min_count=5,
    workers=10,
    seed=20,
)

In [18]:
# CBOW, context window of 5. sg=0 (gensim's default) is what selects CBOW;
# cbow_mean=1 only chooses mean over sum for combining the context vectors.
w2v_model6 = Word2Vec(
    sentences=g_sentences,
    sg=0,
    cbow_mean=1,
    vector_size=2048,
    window=5,
    min_count=5,
    workers=10,
    seed=20,
)

In [19]:
# CBOW, context window of 10. sg=0 (gensim's default) is what selects CBOW;
# cbow_mean=1 only chooses mean over sum for combining the context vectors.
w2v_model7 = Word2Vec(
    sentences=g_sentences,
    sg=0,
    cbow_mean=1,
    vector_size=2048,
    window=10,
    min_count=5,
    workers=10,
    seed=20,
)

In [20]:
# CBOW, context window of 20. sg=0 (gensim's default) is what selects CBOW;
# cbow_mean=1 only chooses mean over sum for combining the context vectors.
w2v_model8 = Word2Vec(
    sentences=g_sentences,
    sg=0,
    cbow_mean=1,
    vector_size=2048,
    window=20,
    min_count=5,
    workers=10,
    seed=20,
)

### 6. Using the generated word embeddings, we create four meaningful examples of extrinsic testing.

In [100]:
# Analogy query on w2v_model: obese + healthy - sick, five nearest words
print(
    w2v_model.wv.most_similar(
        positive=['obese', 'healthy'],
        negative=['sick'],
        topn=5,
    )
)

[('lean', 0.5054928660392761), ('nonobese', 0.5050557255744934), ('non-obese', 0.5045276284217834), ('nondiabetic', 0.49842312932014465), ('non-diabetic', 0.4945414364337921)]


In [101]:
# Analogy query on w2v_model4: social + introvert - extravert, five nearest words
print(
    w2v_model4.wv.most_similar(
        positive=['social', 'introvert'],
        negative=['extravert'],
        topn=5,
    )
)

[('non-social', 0.4722740948200226), ('cognitive-behavioural', 0.4501905143260956), ('hiv/aids-related', 0.44599470496177673), ("'social", 0.44397109746932983), ('self-related', 0.44346728920936584)]


In [102]:
# Analogy query on w2v_model5: negative + proton - electron, five nearest words
print(
    w2v_model5.wv.most_similar(
        positive=['negative', 'proton'],
        negative=['electron'],
        topn=5,
    )
)

[('positive', 0.5243296027183533), ('inotropic', 0.43952125310897827), ('dromotropic', 0.4283335208892822), ('chronotropic', 0.4239177107810974), ('allosteric', 0.41455790400505066)]


In [103]:
# Analogy query on w2v_model8: ailment + cure - disease, five nearest words
print(
    w2v_model8.wv.most_similar(
        positive=['ailment', 'cure'],
        negative=['disease'],
        topn=5,
    )
)

[('remedy', 0.5459733605384827), ('palliation', 0.5424662828445435), ('treating', 0.5284415483474731), ('burch', 0.509563148021698), ('suppository', 0.46416959166526794)]


#### Here we generate a .emb file for each of the eight embeddings.

In [105]:
# Write the vectors of each of the eight models to its own .emb file (word2vec format).
# Each pair is (trained model, output path); files are written in the order listed.
emb_exports = (
    (w2v_model, './The_Hackers_pubmed_w2v_uncased_2048_2_skip.emb'),
    (w2v_model2, './The_Hackers_pubmed_w2v_uncased_2048_5_skip.emb'),
    (w2v_model3, './The_Hackers_pubmed_w2v_uncased_2048_10_skip.emb'),
    (w2v_model4, './The_Hackers_pubmed_w2v_uncased_2048_20_skip.emb'),
    (w2v_model5, './The_Hackers_pubmed_w2v_uncased_2048_2_cbow.emb'),
    (w2v_model6, './The_Hackers_pubmed_w2v_uncased_2048_5_cbow.emb'),
    (w2v_model7, './The_Hackers_pubmed_w2v_uncased_2048_10_cbow.emb'),
    (w2v_model8, './The_Hackers_pubmed_w2v_uncased_2048_20_cbow.emb'),
)

for trained_model, emb_path in emb_exports:
    trained_model.wv.save_word2vec_format(emb_path)