#Implmentation of CBOW & GloVe

In [50]:
#imports
import nltk
import os
import json
import numpy as np
import gensim.downloader as api
import os
import os.path
import io
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import spearmanr
from nltk.corpus import stopwords
from stanfordcorenlp import StanfordCoreNLP
from scipy import stats

#CBOW Implementation 

In [None]:
#Downloads

#Downloaded the simplewiki-latest-pages-articles.xml.bg
# Dated: 20-Feb-2023 16:13
#Source: https://dumps.wikimedia.org/simplewiki/latest/simplewiki-latest-pages-articles.xml.bz2 

#Used Wiki Extactor from and concatenated all text.
#https://github.com/attardi/wikiextractor

#Download the stopwords of English from nltk
nltk.download('stopwords')

In [84]:
#Contents of the local folder
print(os.listdir())

['extracted_data', 'misc', 'stanford-corenlp-4.5.1', 'simplewiki-latest-pages-articles.txt', '.DS_Store', 'GloVe_model.model.vectors.npy', 'wikiextractor-master', 'cbow_model.model', 'simplewiki-latest-pages-articles.xml', 'stanford-corenlp-latest.zip', 'preprocessed_text.txt', 'wordsim_goldstandard.txt', 'wikiextractor-master.zip', 'glove-wiki-gigaword-100.txt', '.ipynb_checkpoints', 'GloVe_model.model', 'tokenized_text.txt', 'code_wiki.ipynb']


In [13]:
# load the text
with open("simplewiki-latest-pages-articles.txt", "r",encoding="utf-16") as f:
    text = f.read()

In [14]:
#Peek the loaded text file.
text[0:50]

'{"id": "1", "revid": "440121", "url": "https://simple.wikipedia.org/wiki?curid=1", "title": "April", "text": "April is the fourth month of the year in the Julian and Gregorian calendars, and comes between March and May. It is one of four months to have 30 days.\\nApril always begins on the same day of week as July, and additionally, January in leap years. April always ends on the same day of the week as December.\\nApril\'s flowers are the Sweet Pea and Daisy. Its birthstone is the diamond. The mea'

In [16]:
# Get the set of English stop words
stop_words = set(stopwords.words('english')) 

In [17]:
# Split the input text into lines and parse each line as a JSON object
json_list = [json.loads(line) for line in text.split('\n') if line.strip()]

In [18]:
# Extract the text field from each dictionary and preprocess it
preprocessed_text = []
for j in json_list:
    tokens = nltk.word_tokenize(j['text'])
    filtered_tokens = [token.lower() for token in tokens if token.isalpha() and token.lower() not in stop_words]
    preprocessed_text.append(filtered_tokens)

In [19]:
# Save preprocessed_text to a file
with open('preprocessed_text.txt', 'w') as f:
    for line in preprocessed_text:
        f.write(' '.join(line) + '\n')

In [20]:
#Check the type of processed text
type(preprocessed_text)

list

In [21]:
#Using Standford Tokenizer
nlp = StanfordCoreNLP('http://localhost',port=9000)

In [22]:
# Tokenize the text using the Stanford tokenizer
tokenized_text = [nlp.word_tokenize(" ".join(sentence)) for sentence in preprocessed_text]

In [23]:
type(tokenized_text)

list

In [24]:
# Save preprocessed_text to a file
with open('tokenized_text.txt', 'w') as f:
    for line in tokenized_text:
        f.write(' '.join(line) + '\n')

In [25]:
# Train a CBOW model on the tokenized sentences
#Default parameters are used.
cbow_model = Word2Vec(tokenized_text,vector_size=100,window=5,min_count=5,workers=4,sg=0)

In [75]:
# Number of tokens available
print("Number of Tokens available : "+str(cbow_model.corpus_count))

# Dimensions of the word vector 
print("Dimension of the word vector: "+str(cbow_model.vector_size))

Number of Tokens available : 314722
Dimension of the word vector: 100


In [43]:
#Check top 5 most related words to queen
cbow_model.wv.most_similar('queen', topn=5)

[('elizabeth', 0.6839427947998047),
 ('crown', 0.6766826510429382),
 ('princess', 0.6750682592391968),
 ('king', 0.672218382358551),
 ('monarch', 0.652546226978302)]

In [68]:
#Check relation between queen and king
cbow_model.wv.similarity('queen','king')


0.6722183

In [45]:
#Save cbow model
cbow_model.save('cbow_model.model')

In [60]:
#Evaluate CBOW model and used final goldstandard of relatedness from
#wordsim353(Finkelstein, et al., January 2002) collection
#located at: http://alfonseca.org/eng/research/wordsim353.html 
Wordsim353_eval_file="wordsim_goldstandard.txt"
CBOW_model_scores_wordsim353 = []
wordA=[]
wordB=[]
Wordsim353_Evalutaion_Filescore=[]
with open(Wordsim353_eval_file, "r") as f:
    for line in f:
        line = line.lower()
        word1, word2, score = line.strip().split("\t")
        if word1 in cbow_model.wv.index_to_key and word2 in cbow_model.wv.index_to_key:
            relatedness_scores = cbow_model.wv.similarity(word1, word2)
            wordA.append(word1)
            wordB.append(word2)
            Wordsim353_Evalutaion_Filescore.append(float(score))
            CBOW_model_scores_wordsim353.append(relatedness_scores)
        else:
            print("Word not found, remove word from the Evaluation or increase Corpus")

In [62]:
#Pearson correlation score
stats.pearsonr(CBOW_model_scores_wordsim353,Wordsim353_Evalutaion_Filescore)[0]

0.709419689091693

In [63]:
stats.spearmanr(CBOW_model_scores_wordsim353,Wordsim353_Evalutaion_Filescore)[0]

0.6970069308160629

#GloVe Implementation 

In [51]:
# load pre-trained word-vectors from local file if avialable else download it from gensim-data
try:
    GloVe_model = KeyedVectors.load_word2vec_format('glove-wiki-gigaword-100.txt', binary=False)
except:
    GloVe_model = api.load("glove-wiki-gigaword-100")
    GloVe_model.save_word2vec_format('glove-wiki-gigaword-100.txt', binary=False)

In [52]:
# Printing out number of tokens available
print("Number of Tokens: "+str(GloVe_model.vectors.shape[0]))

# Printing out the dimension of a word vector 
print("Dimension of a word vector: "+str(GloVe_model.vectors.shape[1]))

Number of Tokens: 400000
Dimension of a word vector: 100


In [78]:
#Check top 5 most related words to queen
GloVe_model.most_similar_cosmul('queen', topn=5)

[('princess', 0.8973613977432251),
 ('king', 0.8753836750984192),
 ('elizabeth', 0.8677847385406494),
 ('royal', 0.8532505035400391),
 ('lady', 0.8522390127182007)]

In [80]:
#Performing relationship test for king queen
GloVe_model.similarity('queen', 'king')

0.7507691

In [83]:
#Save GloVe model
GloVe_model.save('GloVe_model.model')

In [64]:
#Evaluate GloVe model and used final goldstandard of relatedness from
#wordsim353(Finkelstein, et al., January 2002) collection
#located at: http://alfonseca.org/eng/research/wordsim353.html 
Wordsim353_eval_file="wordsim_goldstandard.txt"
GloVe_model_scores_wordsim353 = []
wordA=[]
wordB=[]
Wordsim353_Evalutaion_Filescore=[]
with open(Wordsim353_eval_file, "r") as f:
    for line in f:
        line = line.lower()
        word1, word2, score = line.strip().split("\t")
        if word1 in GloVe_model and word2 in GloVe_model:
            relatedness_scores = GloVe_model.similarity(word1, word2)
            wordA.append(word1)
            wordB.append(word2)
            Wordsim353_Evalutaion_Filescore.append(float(score))
            GloVe_model_scores_wordsim353.append(relatedness_scores)
        else:
            print("Word not found, remove word from the Evaluation or increase Corpus")

In [65]:
#Pearson correlation score
stats.pearsonr(GloVe_model_scores_wordsim353,Wordsim353_Evalutaion_Filescore)[0]


0.5941813212764435

In [66]:
stats.spearmanr(GloVe_model_scores_wordsim353,Wordsim353_Evalutaion_Filescore)[0]

0.6035208452876207