In [1]:
"""
Gensim Library - one of many core NLP libraries
Used for:
  1. Retrieval
  2. Topic modelling
  3. Representation Learning (word2vec and doc2vec)

  """

# Import libraries

import os

import gensim
import numpy as np
import pandas as pd

import gensim.downloader as api
from gensim import utils
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
from gensim.models import Word2Vec

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import LabelEncoder

from scipy.stats import pearsonr, spearmanr

import nltk
from nltk.corpus import stopwords

import torch
import torch.nn as nn

import plotly.express as px



In [2]:
"""
Through gensim's downloader API we choose which pretrained embeddings we want
by passing their name as a string to api.load.

output:

The object we get is of type KeyedVectors,
which is a map from each word to its embedding;
here we are downloading an embedding table.

"""

# Download and load the embedding table identified by this name; the returned object is a KeyedVectors
word_emb = api.load('word2vec-google-news-300')

In [3]:
"""
Let's show what those embeddings look like, in 2 ways:

1. Access the embeddings with word lookup (the easiest way)

   using dictionary notation: pass a word and get its embedding

2. Access the embeddings with index lookup

   get the embedding by index,
   e.g. when you want the embedding of the word at index 10 of the vocabulary

"""

# First way: look up the embedding by word (dictionary-style key)

print(word_emb["apple"])

# Second way: look up the embedding by vocabulary index (here index 10)

print(word_emb[10])

[-0.06445312 -0.16015625 -0.01208496  0.13476562 -0.22949219  0.16210938
  0.3046875  -0.1796875  -0.12109375  0.25390625 -0.01428223 -0.06396484
 -0.08056641 -0.05688477 -0.19628906  0.2890625  -0.05151367  0.14257812
 -0.10498047 -0.04736328 -0.34765625  0.35742188  0.265625    0.00188446
 -0.01586914  0.00195312 -0.35546875  0.22167969  0.05761719  0.15917969
  0.08691406 -0.0267334  -0.04785156  0.23925781 -0.05981445  0.0378418
  0.17382812 -0.41796875  0.2890625   0.32617188  0.02429199 -0.01647949
 -0.06494141 -0.08886719  0.07666016 -0.15136719  0.05249023 -0.04199219
 -0.05419922  0.00108337 -0.20117188  0.12304688  0.09228516  0.10449219
 -0.00408936 -0.04199219  0.01409912 -0.02111816 -0.13476562 -0.24316406
  0.16015625 -0.06689453 -0.08984375 -0.07177734 -0.00595093 -0.00482178
 -0.00089264 -0.30664062 -0.0625      0.07958984 -0.00909424 -0.04492188
  0.09960938 -0.33398438 -0.3984375   0.05541992 -0.06689453 -0.04467773
  0.11767578 -0.13964844 -0.26367188  0.17480469 -0.

In [4]:
"""
Let's check the vocabulary.

Two important attributes:

1. key_to_index: maps a word to its vocabulary index
2. index_to_key: maps a vocabulary index to corresponding word

output:

      sorted list based on the frequency of the pre-trained words
"""

# key_to_index maps word -> vocabulary index; index_to_key is the inverse (index -> word),
# so looking up index 5947 below gives back 'cat'

print(f"Vocabulary length {len(word_emb.key_to_index)}")
print(f"Index of cat {word_emb.key_to_index['cat']}")
print(f"Word at position 5947 {word_emb.index_to_key[5947]}")


Vocabulary length 3000000
Index of cat 5947
Word at position 5947 cat


In [5]:
"""
Compute similarity and distance

Given a list of word pairs, compute the cosine similarity (and distance) of each pair;
the first word is kept fixed across all pairs

"""

# Pair the fixed word 'car' with each comparison word
pairs = [('car', other) for other in ('minivan', 'bicycle', 'airplane', 'cereal', 'communism')]

print("w1   w2   cos_sim   cos_dist")
for w1, w2 in pairs:
    pair_sim = word_emb.similarity(w1, w2)
    pair_dist = word_emb.distance(w1, w2)
    print(f"{w1}  {w2}  {pair_sim:.3f}  {pair_dist:.3f}")


w1   w2   cos_sim   cos_dist
car  minivan  0.691  0.309
car  bicycle  0.536  0.464
car  airplane  0.424  0.576
car  cereal  0.139  0.861
car  communism  0.058  0.942


In [6]:
"""
Nearest Neighbour Retrieval

"""    

def retrieve_most_similar(query_words, all_word_emb, restrict_vocab = 10000):
    """Rank vocabulary words by cosine similarity to each query word.

    Parameters
    ----------
    query_words : str or sequence of str
        Word(s) to search neighbours for. A single string is treated as a
        one-element list. Every word must be a key of ``all_word_emb``.
    all_word_emb : KeyedVectors-like
        Object exposing ``vectors``, ``index_to_key``, ``key_to_index`` and
        key-based ``__getitem__``. All lookups go through this argument (the
        previous version read the notebook-global ``word_emb`` in the last step).
    restrict_vocab : int or None
        If an int, only the first ``restrict_vocab + 1`` rows of
        ``all_word_emb.vectors`` are searched; ``None`` searches every row.

    Returns
    -------
    numpy.ndarray of str, shape (n_queries, n_candidates - 1)
        Row ``i`` holds candidate words sorted from most to least similar to
        ``query_words[i]``. The query word itself is never returned; when the
        query is not among the candidates, the least similar candidate is
        dropped instead so that every row has the same length.
    """
    # A bare string would otherwise be looked up as one 1-D vector instead of a (1, dim) matrix
    if isinstance(query_words, str):
        query_words = [query_words]

    # Step 1: get full or restricted vocabulary embeddings
    if restrict_vocab is None:
        vocab_emb = all_word_emb.vectors
    else:
        vocab_emb = all_word_emb.vectors[:restrict_vocab + 1, :]
    n_candidates = vocab_emb.shape[0]

    # Step 2: get the word embeddings for the query words
    query_emb = all_word_emb[query_words]

    # Step 3: cosine similarity matrix, queries along the rows and candidate words along the columns
    # (copied to a float64 array so the masking below can write -inf into it)
    cos_sim = np.array(cosine_similarity(query_emb, vocab_emb), dtype=float)

    # Step 4: mask self-similarity by index. Blindly dropping the top hit is wrong when the
    # query word lies outside the restricted vocabulary: the top hit is then a real neighbour.
    for row, word in enumerate(query_words):
        self_idx = all_word_emb.key_to_index.get(word)
        if self_idx is not None and self_idx < n_candidates:
            cos_sim[row, self_idx] = -np.inf

    # Step 5: sort in descending order of similarity; masked entries end up in the last
    # column, which is then removed
    ranked_idx = np.argsort(-cos_sim, axis=1)
    ranked_idx = ranked_idx[:, :-1]

    # Step 6: map indices back to words, materialising only the candidate slice of the vocabulary
    vocab_words = np.array(all_word_emb.index_to_key[:n_candidates])
    return vocab_words[ranked_idx]

In [7]:
# test the function

queries = ["king","queen","italy","Italy","nurse"]
res = retrieve_most_similar(queries, word_emb, restrict_vocab=10000)
top_k = 10
# Basic slicing returns a view that keeps the whole `res` buffer alive, so copy the
# top-k columns; otherwise `del res` below would not actually release the memory.
res_k = res[:, :top_k].copy()
del res
print(res_k)

[['royal' 'King' 'prime_minister' 'legend' 'Queen' 'Prince' 'superstar'
  'hero' 'champion' 'premier']
 ['Queen' 'royal' 'lady' 'Miss' 'ladies' 'actress' 'she' 'her' 'herself'
  'girl']
 ['i' 'Milan' 'Real_Madrid' 'Italian' 'Spain' 'Portugal' 'Tunisia'
  'France' 'Argentina' 'Romania']
 ['Italian' 'Spain' 'France' 'Milan' 'Romania' 'Germany' 'Portugal'
  'Argentina' 'Austria' 'Rome']
 ['nurses' 'nursing' 'doctor' 'physician' 'patient' 'doctors' 'hospital'
  'teacher' 'worker' 'mother']]


In [8]:
# Dimensionality reduction and plotting
# Flatten the (|Q|, k) neighbour matrix and look up one embedding per neighbour word

all_res_words = res_k.flatten()
res_word_emb = word_emb[all_res_words]
print("(|Q| x k) x word_emb_size")
print(res_word_emb.shape)

(|Q| x k) x word_emb_size
(50, 300)


In [9]:
# Project the neighbour embeddings onto their first three principal components
pca = PCA(n_components = 3)
word_emb_pca = pca.fit_transform(res_word_emb)

# Each query owns top_k consecutive rows of the flattened neighbour list
labels = np.repeat(queries, top_k)

pca_df = pd.DataFrame(word_emb_pca, columns=["pca_x","pca_y","pca_z"]).assign(
    word=res_k.flatten(),
    query=labels,
)

print(pca_df.head())

      pca_x     pca_y     pca_z            word query
0 -0.576351  1.649626 -1.527970           royal  king
1 -0.622732  0.791007 -0.940520            King  king
2 -0.470575  0.624666 -0.521788  prime_minister  king
3 -0.757105  0.773507 -0.094908          legend  king
4 -0.600329  1.351069 -0.741312           Queen  king


In [10]:
# Interactive 3-D scatter: one point per neighbour word, coloured by the query it was retrieved for
px.scatter_3d(
    pca_df,
    x='pca_x',
    y='pca_y',
    z='pca_z',
    color="query",
    text="word",
    opacity=0.7,
    title="3d-PCA representation of word embeddings",
)



Word embedding evaluation:

1. intrinsic evaluation: evaluate embeddings without a downstream task

   a. word similarity benchmarks
   b. word analogy benchmarks

2. extrinsic evaluation: evaluate word embeddings on a downstream task



In [11]:
"""
Word similarity benchmarks, such as WS353, contain word pairs and human-given similarity scores

"""

# Load the WS353 pairs; skiprows=1 skips the file's first line and the first column is renamed to "Word 1"
ws353_df = (
    pd.read_csv(datapath('wordsim353.tsv'), sep="\t", skiprows=1)
    .rename(columns={"# Word 1": "Word 1"})
)
# Fixed seed so that the displayed sample is the same on every run
ws353_df.sample(5, random_state=2023)

Unnamed: 0,Word 1,Word 2,Human (mean)
331,country,citizen,7.31
42,football,soccer,9.03
111,tiger,animal,7.0
172,territory,surface,5.34
284,seven,series,3.56



Three steps to evaluate word embeddings:

1. For every pair in our dataset we get the embeddings
2. For each pair we compute the cosine similarity between its word embeddings
   (we call the similarity function)
3. Then we simply compute a correlation score, either Pearson's r or Spearman's ρ,
   between the human-given score h and the cosine similarity s

!! Gensim provides us with a function for this: evaluate_word_pairs


In [12]:
# Returns a 3-tuple: (Pearson r, p-value), the Spearman result, and a third value (0.0 in the output below)
# NOTE(review): the third value is presumably the ratio of out-of-vocabulary pairs — confirm in the gensim docs
word_emb.evaluate_word_pairs(datapath('wordsim353.tsv'), case_insensitive = False)

((0.6525349647301872, 3.3734146147370187e-44),
 SpearmanrResult(correlation=0.7000166486272194, pvalue=2.86866666051422e-53),
 0.0)

Word analogy benchmarks

man : king = woman : x

The word2vec paper shows that word2vec embeddings can solve (some of) these equations by algebraic operations:

Get $e_x = e_{king} - e_{man} + e_{woman}$
Check if $NN_V(e_x) = \text{queen}$, i.e. whether the nearest neighbour of $e_x$ in the vocabulary $V$ is "queen"

Gensim provides us with a most_similar function.
It has several arguments; the most important are:
positive : list of words whose vectors should be summed together
negative : list of words whose vectors should be subtracted

In [13]:
# man is to king as woman is to x  ->  king - man + woman
print(word_emb.most_similar(positive=["king", "woman"], negative=["man"], restrict_vocab=100000))

# Apple is to iPod as Sony is to x  ->  iPod - Apple + Sony

print(word_emb.most_similar(positive=["iPod", "Sony"], negative=["Apple"], restrict_vocab=100000))


[('queen', 0.7118193507194519), ('monarch', 0.6189674139022827), ('princess', 0.5902431011199951), ('crown_prince', 0.5499460697174072), ('prince', 0.5377321839332581), ('kings', 0.5236844420433044), ('queens', 0.5181134343147278), ('sultan', 0.5098593831062317), ('monarchy', 0.5087411999702454), ('royal_palace', 0.5087166428565979)]
[('Walkman', 0.5814809203147888), ('MP3_player', 0.5763883590698242), ('MP3', 0.552082359790802), ('Panasonic', 0.5468561053276062), ('Blu_ray_disc', 0.5435828566551208), ('JVC', 0.5259768962860107), ('camcorder', 0.5257487297058105), ('Sony_PSP', 0.5226278305053711), ('PlayStation_Portable', 0.5171500444412231), ('Blu_ray', 0.5171388983726501)]


In [14]:
# Peek at the first 15 lines of the analogy benchmark.
# The context manager closes the file even if printing fails, and zip() stops as soon as
# range(15) is exhausted, so the rest of the file is never read into memory.
with open(datapath('questions-words.txt')) as f:
    print("".join(line for _, line in zip(range(15), f)))



: capital-common-countries
Athens Greece Baghdad Iraq
Athens Greece Bangkok Thailand
Athens Greece Beijing China
Athens Greece Berlin Germany
Athens Greece Bern Switzerland
Athens Greece Cairo Egypt
Athens Greece Canberra Australia
Athens Greece Hanoi Vietnam
Athens Greece Havana Cuba
Athens Greece Helsinki Finland
Athens Greece Islamabad Pakistan
Athens Greece Kabul Afghanistan
Athens Greece London England
Athens Greece Madrid Spain



In [15]:
# evaluate_word_analogies returns an overall accuracy plus a list of per-section results;
# each section dict has the keys 'section', 'correct' and 'incorrect', so you can inspect
# the individual analogies and do some error analysis

accuracy, results = word_emb.evaluate_word_analogies(datapath('questions-words.txt'))
print(f"Accuracy {accuracy}")
print(results[0].keys())
print(f"Correct {results[0]['correct'][:5]}")
print(f"Incorrect {results[0]['incorrect'][:5]}")

Accuracy 0.7401448525607863
dict_keys(['section', 'correct', 'incorrect'])
Correct [('ATHENS', 'GREECE', 'BANGKOK', 'THAILAND'), ('ATHENS', 'GREECE', 'BEIJING', 'CHINA'), ('ATHENS', 'GREECE', 'BERLIN', 'GERMANY'), ('ATHENS', 'GREECE', 'BERN', 'SWITZERLAND'), ('ATHENS', 'GREECE', 'CAIRO', 'EGYPT')]
Inorrect [('ATHENS', 'GREECE', 'BAGHDAD', 'IRAQ'), ('ATHENS', 'GREECE', 'HANOI', 'VIETNAM'), ('ATHENS', 'GREECE', 'KABUL', 'AFGHANISTAN'), ('ATHENS', 'GREECE', 'LONDON', 'ENGLAND'), ('BAGHDAD', 'IRAQ', 'BERN', 'SWITZERLAND')]


Implement word similarity benchmark evaluation (intrinsic evaluation)

In [16]:
# reload the data

ws353_df = pd.read_csv(datapath('wordsim353.tsv'), sep = "\t", skiprows=1).rename(columns ={"# Word 1": "Word 1"})

# Get embeddings: one row per benchmark pair, w1 for the first word and w2 for the second

w1 = word_emb[ws353_df["Word 1"]]
w2 = word_emb[ws353_df["Word 2"]]

# compute cosine similarities
# cos_mat holds the similarity of every first word against every second word; only its diagonal
# (first word i vs second word i) is the similarity of the actual benchmark pairs
# NOTE(review): the full pairwise matrix is computed just to keep its diagonal — a row-wise cosine would avoid the extra work

cos_mat = cosine_similarity(w1, w2)
cos_pairs = np.diag(cos_mat)

# compute correlations between the model's similarities and the human scores

print(pearsonr(cos_pairs, ws353_df["Human (mean)"]))
print(spearmanr(cos_pairs, ws353_df["Human (mean)"]))


(0.6525349483670781, 3.3734367166422075e-44)
SpearmanrResult(correlation=0.7000166486272194, pvalue=2.86866666051422e-53)


Pretraining your own embeddings

In [17]:
# get a dataset
# Read only the first line; the context manager closes the file afterwards
# (the previous version left the handle open for the rest of the session).

with open(datapath('lee_background.cor')) as corpus:
    sample = corpus.readline()
print(sample, utils.simple_preprocess(sample))

Hundreds of people have been forced to vacate their homes in the Southern Highlands of New South Wales as strong winds today pushed a huge bushfire towards the town of Hill Top. A new blaze near Goulburn, south-west of Sydney, has forced the closure of the Hume Highway. At about 4:00pm AEDT, a marked deterioration in the weather as a storm cell moved east across the Blue Mountains forced authorities to make a decision to evacuate people from homes in outlying streets at Hill Top in the New South Wales southern highlands. An estimated 500 residents have left their homes for nearby Mittagong. The New South Wales Rural Fire Service says the weather conditions which caused the fire to burn in a finger formation have now eased and about 60 fire units in and around Hill Top are optimistic of defending all properties. As more than 100 blazes burn on New Year's Eve in New South Wales, fire crews have been called to new fire at Gunning, south of Goulburn. While few details are available at this

In [18]:
class MyCorpus:
    """Restartable iterable over the Lee background corpus.

    Every call to ``__iter__`` re-opens the file and returns a fresh generator,
    so the same instance can be iterated more than once.
    """

    def __iter__(self):
        """Yield each line of ``lee_background.cor`` as a list of preprocessed tokens."""
        corpus_path = datapath('lee_background.cor')
        # `with` closes the handle when iteration finishes (or the generator is discarded)
        # instead of leaving that to the garbage collector
        with open(corpus_path) as corpus_file:
            for line in corpus_file:
                # we assume there's one document per line, tokens separated by whitespace
                yield utils.simple_preprocess(line)



In [19]:
# pretrain our own embeddings
# we will use the Word2Vec class from gensim.models
# Vocabulary building + training happen in this single constructor call
# NOTE(review): MyCorpus is presumably iterated several times (vocabulary pass + one pass per epoch),
# which is why it is an iterable class rather than a one-shot generator — confirm in the gensim docs

model = Word2Vec(sentences = MyCorpus(),
                 min_count = 3, # during vocabulary building ignore all words with freq < min_count
                 vector_size = 200, # dimensionality of the vectors
                 sg = 1, # set to 1 for skip-gram; if 0 will be CBOW style
                 epochs = 10,
                 alpha = 0.025, # initial learning rate
                 batch_words = 10000, # batch size
                 window = 5, # window size for context words
                 negative = 10, # number of negatives for negative sampling
                 ns_exponent = 0.75 # exponent of the sampling distribution
                )

print(model)
word_emb_lee = model.wv # wv attribute contains word embeddings; wv stands for word vector

Word2Vec<vocab=2747, vector_size=200, alpha=0.025>


Saving and loading embeddings

Saving and loading the full model (embeddings plus hyperparameters)
allows us to resume training

If you save only the word embeddings, training cannot be resumed

In [20]:
# Full model: save it and load it back as a Word2Vec object
save_path = "word2vec_lee.model"
model.save(save_path)
model_reloaded = Word2Vec.load(save_path)

# Word vectors only: save model.wv and load it back as a KeyedVectors object
# NOTE(review): the file name below is spelled "word2vee" (not "word2vec") — presumably a typo;
# it is harmless here because the same path is used for saving and loading
save_path = "word2vee_lee.emb"
model.wv.save(save_path)
emb_reloaded = KeyedVectors.load(save_path)

Extrinsic evaluation of word embeddings
In this example, we will use them to solve a spam classification task

In [21]:
# The dataset location is machine-specific: set the SMS_SPAM_PATH environment variable to point
# at your copy of SMSSpamCollection.tsv. The original author's path is kept as the fallback so
# that existing setups keep working.
spam_path = os.environ.get("SMS_SPAM_PATH", "C:\\Users\\roven\\NLP\\data\\SMSSpamCollection.tsv")

# Tab-separated file without a header row: first column is the label, second the message text
spam_df = pd.read_csv(spam_path, sep="\t", header=None, names=["label", "text"])
spam_df

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [22]:
# integer-encode the labels (this is label encoding, not one-hot): in the output below ham -> 0, spam -> 1
# NOTE(review): this overwrites spam_df["label"] in place, so re-running the cell re-fits the encoder on the
# already-encoded integers — presumably label_encoder.classes_ then no longer holds "ham"/"spam"; confirm

label_encoder = LabelEncoder()
spam_df["label"] = label_encoder.fit_transform(spam_df['label'])
spam_df

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


Building a classification model
Standard ML approach:
1. Preprocess the text
   a. lowercasing
   b. tokenization
   c. stopword removal
2. Create a sentence embedding for each SMS as the average of the
   word embeddings in that sentence


In [23]:
# Fetch the NLTK 'stopwords' package (the log below shows nothing is re-downloaded when it is already up to date)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\roven\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
# lowercase, tokenize and stopwords removal

stop_words = set(stopwords.words('english'))


def _remove_stopwords(sentence):
    """Tokenise ``sentence`` with gensim's simple_preprocess and drop English stop words."""
    return [word for word in utils.simple_preprocess(sentence) if word not in stop_words]


spam_df["preprocessed_text"] = spam_df["text"].apply(_remove_stopwords)
spam_df

Unnamed: 0,label,text,preprocessed_text
0,0,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, g..."
1,0,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,0,U dun say so early hor... U c already then say...,"[dun, say, early, hor, already, say]"
4,0,"Nah I don't think he goes to usf, he lives aro...","[nah, think, goes, usf, lives, around, though]"
...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,"[nd, time, tried, contact, pound, prize, claim..."
5568,0,Will ü b going to esplanade fr home?,"[going, esplanade, fr, home]"
5569,0,"Pity, * was in mood for that. So...any other s...","[pity, mood, suggestions]"
5570,0,The guy did some bitching but I acted like i'd...,"[guy, bitching, acted, like, interested, buyin..."


In [25]:
# create sentence embeddings
# a sentence embedding is the mean of the embeddings of its tokens, using only tokens found in the vocabulary


def _mean_embedding(tok_sentence):
    """Return the mean embedding of the in-vocabulary tokens, or NaN when there is none.

    ``np.mean`` of an empty list also yields NaN but emits a "Mean of empty slice"
    RuntimeWarning; returning NaN explicitly keeps the same value (so a later
    ``dropna`` still removes these rows) without the warning.
    """
    vectors = [word_emb[word] for word in tok_sentence if word in word_emb.key_to_index]
    if not vectors:
        return np.nan
    return np.mean(vectors, axis = 0)


spam_df["sent_emb"] = spam_df["preprocessed_text"].apply(_mean_embedding)



Mean of empty slice.



In [26]:
# drop NA: removes every row containing a missing value, i.e. the messages whose
# sentence embedding is NaN because none of their tokens had an embedding

spam_df = spam_df.dropna()

In [27]:
# train-test split: 20% held out for testing, stratified on the label, fixed seed

all_features = spam_df.drop(columns = "label")
features_train, features_test, y_train, y_test = train_test_split(
    all_features,
    spam_df["label"],
    test_size = 0.2,
    random_state = 2023,
    stratify = spam_df["label"],
)

print(features_train.shape, features_test.shape)

(4444, 3) (1111, 3)


Logistic regression classifier

In [28]:
# Stack the per-message embedding vectors into one 2-D feature matrix
sent_emb_train = np.stack(features_train["sent_emb"])

# L2-penalised logistic regression with 5-fold cross-validation over 10 values of C
logreg_model = LogisticRegressionCV(
    Cs=10,
    cv = 5,
    penalty='l2',
    max_iter = 1000,
)
logreg_model.fit(sent_emb_train, y_train)

In [29]:
# Build the test feature matrix the same way as the training one, then score the fitted model on it
sent_emb_test = np.stack(features_test["sent_emb"])
print(f"Accuracy of the model {logreg_model.score(sent_emb_test, y_test)}")

Accuracy of the model 0.9567956795679567


In [30]:
# Per-class precision/recall/F1; label_encoder.classes_ supplies the row names shown in the report (ham, spam)
print(classification_report(y_test, logreg_model.predict(sent_emb_test), target_names=label_encoder.classes_))

              precision    recall  f1-score   support

         ham       0.97      0.98      0.98       962
        spam       0.87      0.79      0.83       149

    accuracy                           0.96      1111
   macro avg       0.92      0.89      0.90      1111
weighted avg       0.96      0.96      0.96      1111



Using gensim embeddings in PyTorch

In [31]:
# from_pretrained is a classmethod that builds the layer directly from the weight matrix.
# Calling it on a freshly constructed nn.Embedding(vocab_size, 300) first allocated and
# initialised a throw-away table of the same size as the pretrained one.
embs = nn.Embedding.from_pretrained(torch.from_numpy(word_emb.vectors))

# Map words to their vocabulary indices, then look the rows up through the embedding layer
idx = torch.LongTensor([word_emb.key_to_index[word] for word in ['soccer', 'tennis', 'football']])
embs(idx)

tensor([[-7.6660e-02,  1.1035e-01,  3.5352e-01, -7.9102e-02, -5.0049e-02,
         -2.9688e-01,  1.0938e-01, -3.5938e-01, -8.7402e-02, -7.0312e-02,
          2.0801e-01, -2.4512e-01, -5.5664e-02,  2.4219e-01,  2.3560e-02,
         -8.6670e-03,  2.6855e-02,  4.0234e-01,  1.7480e-01, -1.6602e-02,
         -2.0410e-01,  5.0000e-01, -8.9844e-02, -1.4355e-01,  5.5420e-02,
          9.0820e-02,  1.1426e-01,  1.5430e-01,  1.3477e-01,  2.2656e-01,
          4.4189e-02,  3.7109e-02, -1.1621e-01, -1.1328e-01, -1.6479e-02,
         -1.2695e-01,  2.9883e-01, -1.2598e-01,  1.0303e-01,  3.1641e-01,
          1.2665e-03, -8.8379e-02,  1.2695e-01,  1.5820e-01, -7.1777e-02,
         -2.1094e-01,  3.1641e-01, -2.0801e-01,  9.6893e-04,  3.2422e-01,
          7.1289e-02,  7.1289e-02, -9.2285e-02, -2.2705e-02,  9.5703e-02,
         -2.9883e-01, -6.5918e-02, -7.0801e-02, -7.4219e-02, -2.3535e-01,
         -2.8320e-01, -2.0215e-01, -1.6211e-01,  2.2070e-01, -2.3682e-02,
         -1.0645e-01,  1.9653e-02, -5.

In [35]:
# plot 3D PCA representation of the sentence embeddings (ham and spam different colors)
# NOTE: this reuses the name pca_df, replacing the word-level frame built earlier in the notebook

pca_sent_emb = PCA(n_components=3).fit_transform(np.stack(spam_df["sent_emb"]))
pca_df = pd.DataFrame(pca_sent_emb, columns = ["pca_x","pca_y","pca_z"])
# turn the encoded labels back into their original strings for the legend
pca_df["label"] = label_encoder.inverse_transform(spam_df["label"])
px.scatter_3d(
    pca_df,
    x = "pca_x",
    y = "pca_y",
    z = "pca_z",
    color = "label",
    opacity = 0.7,
    title = "3d-PCA representation of sentence embeddings",
)


Implement a new classifier

In [36]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Two sub-grids: an RBF kernel (gamma and C) and a linear kernel (C only)
tuned_parameters = [
    {
        "kernel": ["rbf"],
        "gamma": [1e-3, 1e-4],
        "C": [1, 10, 100, 1000],
    },
    {
        "kernel": ["linear"],
        "C": [1, 10, 100, 1000],
    },
]

# find best hyperparameter combination according to F1 score and then refit the best estimator on the entire training set

grid_search = GridSearchCV(
    SVC(),
    tuned_parameters,
    cv = 5,
    scoring = "f1",
    refit = "f1",
)
grid_search.fit(sent_emb_train, y_train)

In [37]:
from sklearn.metrics import accuracy_score 

# Predict once and reuse the result: the previous version ran grid_search.predict on the
# whole test set twice, once per print
svc_test_pred = grid_search.predict(sent_emb_test)
print(f"SVC accuracy on test set {accuracy_score(y_test, svc_test_pred)}")
print(classification_report(y_test, svc_test_pred, target_names=label_encoder.classes_))

SVC accuracy on test set 0.963996399639964
              precision    recall  f1-score   support

         ham       0.97      0.99      0.98       962
        spam       0.90      0.82      0.86       149

    accuracy                           0.96      1111
   macro avg       0.94      0.90      0.92      1111
weighted avg       0.96      0.96      0.96      1111

