## Word2Vec with SGD (Stochastic Gradient Descent) and NEG (Negative Sampling) 
### Implementation Steps:
1. Assign the target word and neighboring context words as **Positive** examples.
2. Assign randomly sampled words in the lexicon based on a unigram distrubution (built using word frequency) as **Negative** examples.
3. Train the model using a Logistic Classifier by optimizing the loss function.
4. Use the regression weights as the embedding vectors.

## Import Required Modules

In [9]:
import pandas as pd
import numpy as np
import pickle

import random
import string
import time

from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

import re
from collections import defaultdict

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rojin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def sigmoid(x):
    '''Function to compute the value of x after applying the Sigmoid function'''
    return 1.0 /(1 + np.exp(-x))

## Data Preprocessing

In [8]:
def preprocessing(corpus):
    '''Function for data preprocessing'''
    processed = []
    
    stop_words = stopwords.words('english')
    
    # Split text corpus into sentences
    sentences = corpus.split(".")
    
    # Loop through each sentence
    for i in range(len(sentences)):
        
        # Remove leading and trailing characters
        sentences[i] = sentences[i].strip()
        
        # Split sentence into list of words
        sentence = sentences[i].split()
        
        # Remove punctuations
        x = [word.strip(string.punctuation) for word in sentence if word not in stop_words]
        
        # Convert to lower case
        x = [word.lower() for word in x]
        
        processed.append(x) 
        
    print('\nProcessed sentence is:',  processed)
        
    return processed

## Word2Vec with NEG

In [13]:
class Word2Vec_with_NEG:
    '''Implmentation of Skip-Gram Word2Vec model with negative sampling'''
    def __init__(self, sentences):
        self.sentences = sentences
        self.N = 5 # dimension of word embeddings
        self.learning_rate = 0.01 # learning rate
        self.epochs = 5000 # number of training epochs
        self.window = 2 # window size
        self.negative_rate = 5 #ratio of negative samples over positive samples
        self.min_count = 5 # minimum count of words to be considered
        self.word2idx = None
        self.unigram = None
        pass
    
    def generate_training_data(unigram_power=0.75):
        '''Function to generate the word counts and mapping from word to index and vice versa
        Input: List of tokenized sentences
        Output: 
        v: Vocabulary size
        word_list: list of words in vocabulary sorted in alphabetical order
        word2idx: dict with word as key and index as value
        word_freq: dict with word as key and frequency as value'''
        
         # Initialize a dictionary of word frequency
        word_freq = {}
        
        # Iterate over each sentence in the list of sentences
        for sent in self.sentences:
            # Iterate over each word in sentence
            for word in sent:
                # Create the frequency dictionary to count each word
                word_freq[word] = word_freq.get(word, 0) + 1

        # Remove words that have frequency < minCount
        if self.min_count > 1:
            word_freq = {word:freq for word, freq in word_freq.items() if freq >= self.min_count}

        # Create word2idx and idx2word dictionaries from word_list
        self.word2idx = {w: idx for (idx, w) in enumerate(word_freq.keys())}

        # Compute unigram
        
        # Initialize an array of unigram
        unigram = np.zeros(len(self.word2idx))
        
        # Iterate over list of words and calculate the probability for each word
        for word, frequency in word_freq.items():
            # Raise each word frequency to the power chosen
            f = frequency ** unigram_power
            # Update unigram array
            unigram[self.word2idx[word]] = f
        
        # Normalization
        self.unigram = unigram / np.sum(unigram)
    
    def generate_positive_words():
        '''Function to generate positive training words'''
        
        P = [] # Initialize list of positive words
        V = len(self.word2id) # Size of vocabulary
        
        N_sentences = len(self.sentences)
        
        # If the word does not exist in the dictionary (due to min_count) then set its index to -1
        sentences_index_form = [None]* N_Sentences
        for idx, sent in enumerate(self.sentences):
            sentences_index_form[idx] = [self.word2idx.get(w, -1) for w in sent]
        
        
        

In [10]:
# Set random seed
np.random.seed(0) 

# Get text data
text = "Welcome students to the Department of Computer Science. We have great faculty and professors. We will have a welcome program today."

# Pre-process the data
corpus = preprocessing(text)


Processed sentence is: [['welcome', 'students', 'department', 'computer', 'science'], ['we', 'great', 'faculty', 'professors'], ['we', 'welcome', 'program', 'today'], []]


In [11]:
corpus

[['welcome', 'students', 'department', 'computer', 'science'],
 ['we', 'great', 'faculty', 'professors'],
 ['we', 'welcome', 'program', 'today'],
 []]