##### Imports:

In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize
import string
from nltk.corpus import stopwords
import numpy as np
import pandas as pd

##### Custom class of Word2Vec Skip-gram model:

In [2]:
# Class Word2Vec Skip-gram model
class Word2VecSkipGram():
    
    # Constructor
    def __init__(self, sentences, vector_size, window, learning_rate, epochs, verbose=False):
        self.h = None
        self.u = None
        self.y = None
        self.loss = 0
        self.vector_size = vector_size # Number of neurons
        self.window = window
        self.epochs = epochs;
        self.verbose = verbose
        self.learning_rate = learning_rate
        self.training_samples, self.vocabulary, self.vocabulary_size = self.create_vectors(sentences)
        self.word_vector_weights = np.random.uniform(-0.9, 0.9, (self.vocabulary_size, self.vector_size))
        self.context_vector_weights = np.random.uniform(-0.9, 0.9, (self.vector_size, self.vocabulary_size))
        self.train()
        
    # Method to create vectors (One-hot encoded embeddings) of words and their context words
    def create_vectors(self, sentences):
        
        # Creating a vocabulary (Dictionary with word and its index)
        all_words = []
        for sentence in sentences:
            for word in sentence:
                all_words.append(word)
        unique_words = sorted(list(set(all_words)))
        vocabulary = {}
        for i in range(len(unique_words)):
            vocabulary[unique_words[i]] = i
        vocabulary_size = len(vocabulary)

        # Creating vectors (Training samples) for all words and their context words
        training_samples = []
        for sentence in sentences:
            for i in range(len(sentence)):

                # Creating a target word vector
                word_vector = [0 for _ in range(vocabulary_size)]
                word_vector[vocabulary[sentence[i]]] = 1

                # Creating context words vector
                context_vector = [0 for _ in range(vocabulary_size)]
                for j in range(i - self.window, i + self.window):
                    if j != i and j >= 0 and j < len(sentence):
                        context_vector[vocabulary[sentence[j]]] = 1
                        
                # Creating training sample tuple for current word (word_vector, context_vector)
                training_samples.append((word_vector, context_vector))                                
        
        return training_samples, vocabulary, vocabulary_size
        
    # Softmax activation function
    def softmax(self, vector):
        e_x = np.exp(vector - np.max(vector))
        return e_x / e_x.sum()
        
    # Method to perform forward pass
    def forward_pass(self, word_vector):
        self.h = np.dot(self.word_vector_weights.T, word_vector).reshape(self.vector_size, 1)
        self.u = np.dot(self.context_vector_weights.T, self.h)  
        self.y = self.softmax(self.u)
        return self.y
    
    # Method to perform back propagation
    def back_propagation(self, word_vector, context_vector):
        
        # Computing error for context words
        e = self.y - np.asarray(context_vector).reshape(self.vocabulary_size, 1)
        
        # Calculating new weights
        new_context_vector_weights = np.dot(self.h, e.T)
        new_word_vector_weights = np.dot(np.array(word_vector).reshape(self.vocabulary_size, 1), np.dot(self.context_vector_weights, e).T)
        
        # Changing weights using back propagation
        self.context_vector_weights -= self.learning_rate * new_context_vector_weights
        self.word_vector_weights -= self.learning_rate * new_word_vector_weights
        
    # Method to calculate loss
    def calculate_loss(self, context_vector):
        C = 0
        for i in range(self.vocabulary_size):
            if context_vector[i]:
                self.loss += -1 * self.u[i][0]
                C += 1
        self.loss += C * np.log(np.sum(np.exp(self.u)))
        
    # Method to perform training of model
    def train(self):
        for i in range(0, self.epochs):
            self.loss = 0
            for word_vector, context_vector in self.training_samples:
                
                # Performing forward pass
                self.forward_pass(word_vector)
                
                # Performing back propagation
                self.back_propagation(word_vector, context_vector)
                
                # Calculating loss
                self.calculate_loss(context_vector)
                
            # Adaptive learning rate
            self.learning_rate *= 1 / (1 + self.learning_rate * i)
            
            # Displaying loss after every epoch
            if self.verbose:
                print("Epoch: ", i, ", Loss: ", self.loss, sep="")
                
    # Method to get most similar words for a given word
    def most_similar(self, word, top=10):
        if word in list(self.vocabulary.keys()):
            
            # Create word vector of given word
            word_vector = [0 if i != self.vocabulary[word] else 1 for i in range(self.vocabulary_size)]
            
            # Performing forward pass
            prediction = self.forward_pass(word_vector)
            
            # Creating a dictionary of all words and their scores
            vocabulary = list(self.vocabulary)
            word_scores = {}
            for i in range(len(vocabulary)):
                if vocabulary[i] != word:
                    word_scores[vocabulary[i]] = prediction[i][0]
                
            # Sorting dictionary of all words and their scores
            word_scores = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)
            
            return word_scores[:top]
        return None
    
    # Method to get weight vector (Embedding) of given word
    def get_embedding(self, word):
        if word in list(self.vocabulary.keys()):
            return self.word_vector_weights[self.vocabulary[word]]
        return None

##### Utility functions:

In [3]:
# Function to calculate cosine similarity
def cosine_similarity(word_embedding1, word_embedding2):
    a = word_embedding1
    b= word_embedding2
    return np.dot(word_embedding1, word_embedding2) / np.sqrt(np.dot(word_embedding1, word_embedding1) * np.dot(word_embedding2, word_embedding2))

##### Loading dataset:

In [4]:
# Reading data from file
with open("data.txt", "r") as in_file:
    data = in_file.read()
data

"Today we will be learning about the fundamentals of data science and\nstatistics. Data Science and statistics are hot and growing fields with\nalternative names of machine learning, artificial intelligence, big\ndata, etc. I'm really excited to talk to you about data science and\nstatistics because data science and statistics have long been passions\nof mine. I didn't use to be very good at data science and statistics but\nafter studying data science and statistics for a long time, I got\nbetter and better at it until I became a data science and statistics\nexpert. I'm really excited to talk to you about data science and\nstatistics, thanks for listening to me talk about data science and\nstatistics."

In [5]:
# Sentence tokenization
sentences = sent_tokenize(data)
sentences

['Today we will be learning about the fundamentals of data science and\nstatistics.',
 'Data Science and statistics are hot and growing fields with\nalternative names of machine learning, artificial intelligence, big\ndata, etc.',
 "I'm really excited to talk to you about data science and\nstatistics because data science and statistics have long been passions\nof mine.",
 "I didn't use to be very good at data science and statistics but\nafter studying data science and statistics for a long time, I got\nbetter and better at it until I became a data science and statistics\nexpert.",
 "I'm really excited to talk to you about data science and\nstatistics, thanks for listening to me talk about data science and\nstatistics."]

##### Data preprocessing:

In [6]:
# Removing new line characters
sentences = [sentence.replace("\n", " ") for sentence in sentences]
sentences

['Today we will be learning about the fundamentals of data science and statistics.',
 'Data Science and statistics are hot and growing fields with alternative names of machine learning, artificial intelligence, big data, etc.',
 "I'm really excited to talk to you about data science and statistics because data science and statistics have long been passions of mine.",
 "I didn't use to be very good at data science and statistics but after studying data science and statistics for a long time, I got better and better at it until I became a data science and statistics expert.",
 "I'm really excited to talk to you about data science and statistics, thanks for listening to me talk about data science and statistics."]

In [7]:
# Making dataset lowercase
sentences = [sentence.casefold() for sentence in sentences]
sentences

['today we will be learning about the fundamentals of data science and statistics.',
 'data science and statistics are hot and growing fields with alternative names of machine learning, artificial intelligence, big data, etc.',
 "i'm really excited to talk to you about data science and statistics because data science and statistics have long been passions of mine.",
 "i didn't use to be very good at data science and statistics but after studying data science and statistics for a long time, i got better and better at it until i became a data science and statistics expert.",
 "i'm really excited to talk to you about data science and statistics, thanks for listening to me talk about data science and statistics."]

In [8]:
# Removing punctuation from dataset
sentences = [sentence.translate(sentence.maketrans("", "", string.punctuation)) for sentence in sentences]
sentences

['today we will be learning about the fundamentals of data science and statistics',
 'data science and statistics are hot and growing fields with alternative names of machine learning artificial intelligence big data etc',
 'im really excited to talk to you about data science and statistics because data science and statistics have long been passions of mine',
 'i didnt use to be very good at data science and statistics but after studying data science and statistics for a long time i got better and better at it until i became a data science and statistics expert',
 'im really excited to talk to you about data science and statistics thanks for listening to me talk about data science and statistics']

In [9]:
# Word tokenization
sentences = [word_tokenize(sentence) for sentence in sentences]
print(sentences)

[['today', 'we', 'will', 'be', 'learning', 'about', 'the', 'fundamentals', 'of', 'data', 'science', 'and', 'statistics'], ['data', 'science', 'and', 'statistics', 'are', 'hot', 'and', 'growing', 'fields', 'with', 'alternative', 'names', 'of', 'machine', 'learning', 'artificial', 'intelligence', 'big', 'data', 'etc'], ['im', 'really', 'excited', 'to', 'talk', 'to', 'you', 'about', 'data', 'science', 'and', 'statistics', 'because', 'data', 'science', 'and', 'statistics', 'have', 'long', 'been', 'passions', 'of', 'mine'], ['i', 'didnt', 'use', 'to', 'be', 'very', 'good', 'at', 'data', 'science', 'and', 'statistics', 'but', 'after', 'studying', 'data', 'science', 'and', 'statistics', 'for', 'a', 'long', 'time', 'i', 'got', 'better', 'and', 'better', 'at', 'it', 'until', 'i', 'became', 'a', 'data', 'science', 'and', 'statistics', 'expert'], ['im', 'really', 'excited', 'to', 'talk', 'to', 'you', 'about', 'data', 'science', 'and', 'statistics', 'thanks', 'for', 'listening', 'to', 'me', 'talk'

In [10]:
# Removing stop words from dataset
stop_words = set(stopwords.words("english"))
filtered_sentences = []
for sentence in sentences:
    original_sentence = sentence
    sentence = [word for word in sentence if word not in stop_words]
    if len(sentence) < 1:
        sentence = original_sentence
    filtered_sentences.append(sentence)
sentences = filtered_sentences
print(sentences)

[['today', 'learning', 'fundamentals', 'data', 'science', 'statistics'], ['data', 'science', 'statistics', 'hot', 'growing', 'fields', 'alternative', 'names', 'machine', 'learning', 'artificial', 'intelligence', 'big', 'data', 'etc'], ['im', 'really', 'excited', 'talk', 'data', 'science', 'statistics', 'data', 'science', 'statistics', 'long', 'passions', 'mine'], ['didnt', 'use', 'good', 'data', 'science', 'statistics', 'studying', 'data', 'science', 'statistics', 'long', 'time', 'got', 'better', 'better', 'became', 'data', 'science', 'statistics', 'expert'], ['im', 'really', 'excited', 'talk', 'data', 'science', 'statistics', 'thanks', 'listening', 'talk', 'data', 'science', 'statistics']]


#####  Word2Vec (Skip-gram) model:

In [11]:
vector_size = 5
window = 3
learning_rate = 0.001
epochs = 100
model = Word2VecSkipGram(sentences, vector_size, window, learning_rate, epochs, verbose=True)

Epoch: 0, Loss: 1010.4370621166835
Epoch: 1, Loss: 1008.286403227592
Epoch: 2, Loss: 1006.1531127118399
Epoch: 3, Loss: 1004.036855780981
Epoch: 4, Loss: 1001.9373392554968
Epoch: 5, Loss: 999.8543198130064
Epoch: 6, Loss: 997.7876112745554
Epoch: 7, Loss: 995.7370907910096
Epoch: 8, Loss: 993.7027038193805
Epoch: 9, Loss: 991.6844678093704
Epoch: 10, Loss: 989.6824745524144
Epoch: 11, Loss: 987.6968911778624
Epoch: 12, Loss: 985.727959812491
Epoch: 13, Loss: 983.7759959492895
Epoch: 14, Loss: 981.8413855984904
Epoch: 15, Loss: 979.9245813173684
Epoch: 16, Loss: 978.0260972349549
Epoch: 17, Loss: 976.1465032029964
Epoch: 18, Loss: 974.2864182153612
Epoch: 19, Loss: 972.4465032444394
Epoch: 20, Loss: 970.6274536452374
Epoch: 21, Loss: 968.8299912762287
Epoch: 22, Loss: 967.0548564808503
Epoch: 23, Loss: 965.3028000655272
Epoch: 24, Loss: 963.574575399683
Epoch: 25, Loss: 961.8709307509355
Epoch: 26, Loss: 960.192601955216
Epoch: 27, Loss: 958.540305507198
Epoch: 28, Loss: 956.9147321418

In [12]:
model.most_similar("data")

[('statistics', 0.14859777221789916),
 ('science', 0.07893528809438585),
 ('good', 0.05704925368399773),
 ('became', 0.05512691988759127),
 ('talk', 0.047754659246576965),
 ('better', 0.03313228113539583),
 ('really', 0.03123615350212278),
 ('studying', 0.030067961617975696),
 ('big', 0.029765136960201312),
 ('fields', 0.0286821829338157)]

In [13]:
model.most_similar("statistics")

[('data', 0.41371046381530585),
 ('science', 0.1336779523410498),
 ('became', 0.08664259790088952),
 ('good', 0.04534134002253001),
 ('talk', 0.0438375350344247),
 ('long', 0.03729519228312033),
 ('really', 0.02800214931207578),
 ('fields', 0.02475041681919624),
 ('better', 0.014835301207424666),
 ('expert', 0.013819005960043934)]

##### Computing cosine similarity:

In [14]:
cosine_similarity_matrix = np.zeros((model.vocabulary_size, model.vocabulary_size))
for i in range(model.vocabulary_size):
    for j in range(model.vocabulary_size):
        cosine_similarity_matrix[i][j] = cosine_similarity(model.get_embedding(list(model.vocabulary)[i]), model.get_embedding(list(model.vocabulary)[j]))
pd.DataFrame(data=cosine_similarity_matrix, columns=list(model.vocabulary), index=list(model.vocabulary)).head(10)

Unnamed: 0,alternative,artificial,became,better,big,data,didnt,etc,excited,expert,...,passions,really,science,statistics,studying,talk,thanks,time,today,use
alternative,1.0,-0.285043,0.053769,-0.679491,0.096176,0.446685,-0.017968,-0.21045,-0.000193,-0.280719,...,0.197267,-0.434694,-0.167805,0.203478,0.356667,0.745365,-0.029041,0.365627,0.805696,-0.298269
artificial,-0.285043,1.0,-0.104594,0.865117,-0.291089,0.269344,-0.321764,-0.487023,0.833241,-0.654566,...,-0.41709,-0.524127,0.333866,-0.204251,-0.063536,-0.206273,-0.385485,-0.432621,-0.113201,0.382615
became,0.053769,-0.104594,1.0,-0.173582,-0.674288,-0.505346,0.674966,-0.197224,-0.378414,-0.104401,...,-0.055766,-0.497719,-0.577099,-0.229863,-0.745073,-0.24709,0.45994,-0.015184,0.329703,-0.923156
better,-0.679491,0.865117,-0.173582,1.0,-0.338618,0.144273,-0.133794,-0.267717,0.676114,-0.304351,...,-0.21182,-0.189377,0.489451,-0.063513,-0.057351,-0.359567,-0.146419,-0.324189,-0.384291,0.495798
big,0.096176,-0.291089,-0.674288,-0.338618,1.0,-0.109615,-0.799045,0.359512,-0.184043,0.179161,...,-0.348911,0.685902,-0.130032,-0.287401,0.258845,-0.098238,-0.650114,-0.300283,-0.483541,0.484026
data,0.446685,0.269344,-0.505346,0.144273,-0.109615,1.0,-0.076411,-0.158209,0.69759,-0.178943,...,0.45456,-0.360867,0.765877,0.714018,0.810506,0.808092,-0.011499,0.44535,0.569706,0.433511
didnt,-0.017968,-0.321764,0.674966,-0.133794,-0.799045,-0.076411,1.0,-0.092801,-0.358848,0.23052,...,0.660714,-0.274423,-0.057522,0.391909,-0.168388,0.204639,0.936601,0.613639,0.46047,-0.623865
etc,-0.21045,-0.487023,-0.197224,-0.267717,0.359512,-0.158209,-0.092801,1.0,-0.212168,0.913285,...,-0.098404,0.604097,0.248404,0.362761,-0.132711,-0.256201,-0.200563,-0.254981,-0.296195,-0.041037
excited,-0.000193,0.833241,-0.378414,0.676114,-0.184043,0.69759,-0.358848,-0.212168,1.0,-0.411168,...,-0.182676,-0.495829,0.7063,0.279263,0.306641,0.185451,-0.424756,-0.230805,0.158435,0.489501
expert,-0.280719,-0.654566,-0.104401,-0.304351,0.179161,-0.178943,0.23052,0.913285,-0.411168,1.0,...,0.249014,0.665488,0.239389,0.480184,-0.0432,-0.141606,0.191799,0.080122,-0.23845,-0.09738
