<a href="https://colab.research.google.com/github/psych0man/Continuous-Bag-of-Words-Model/blob/master/Continuous_Bag_of_Words_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries 

In [1]:
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
from collections import Counter

In [2]:
# Add the current working directory to the list of locations NLTK searches
# when it looks up data packages.
nltk.data.path.append('.')

In [None]:
# Download the 'punkt' resource (the tokenizer models that word_tokenize
# depends on). NOTE(review): recent NLTK releases may ask for 'punkt_tab'
# instead - confirm against the installed version.
nltk.download('punkt')

# Pre-Processing

In [None]:
import re

# Read the complete corpus into memory as one string.
with open('shakespeare.txt') as corpus_file:
  raw_text = corpus_file.read()

# Treat every comma, exclamation mark, question mark, semicolon and hyphen
# as a sentence boundary by turning it into a full stop.
punctuation = re.compile(r'[,!?;-]')
raw_text = punctuation.sub('.', raw_text)

# Tokenise, then keep only purely alphabetic tokens and full stops,
# lower-casing everything that is kept.
data = [
  token.lower()
  for token in word_tokenize(raw_text)
  if token == '.' or token.isalpha()
]

In [7]:
# Count how many times each token occurs in the cleaned corpus.
freq_dist = nltk.FreqDist(data)

# Training the CBOW Model

In [8]:
def initialize(N,V, random_seed=1):
  """Create the four randomly initialised parameter arrays of the network.

  Args:
    N: number of rows of the first weight matrix (hidden-layer size).
    V: number of columns of the first weight matrix (vocabulary size).
    random_seed: seed given to NumPy's global random generator so that the
      initial values are reproducible.

  Returns:
    Tuple (w1, w2, b1, b2) with shapes (N, V), (V, N), (N, 1) and (V, 1),
    filled with samples from np.random.rand.
  """
  np.random.seed(random_seed)

  # The draws must happen in exactly this order to reproduce the same values
  # for a given seed.
  shapes = ((N, V), (V, N), (N, 1), (V, 1))
  w1, w2, b1, b2 = (np.random.rand(*shape) for shape in shapes)

  return w1, w2, b1, b2

In [9]:
def softmax(z):
  """Softmax along axis 0.

  Args:
    z: array-like of scores. Axis 0 is normalised, so for a 2-D input every
      column is turned into its own probability distribution.

  Returns:
    Float array with the same shape as `z` whose entries along axis 0 are
    positive and sum to 1.
  """
  z = np.asarray(z, dtype=float)
  # Shifting by the maximum along axis 0 does not change the result
  # mathematically but prevents np.exp from overflowing on large scores.
  shifted = z - np.max(z, axis=0, keepdims=True)
  e = np.exp(shifted)
  # keepdims=True keeps the normaliser broadcastable against `e`.
  y_hat = e / np.sum(e, axis=0, keepdims=True)

  return y_hat

In [10]:
def forward_propagation(x,w1,w2,b1,b2):
  """Run the forward pass of the two-layer network.

  Args:
    x: input array, multiplied on the left by `w1`.
    w1: first-layer weights.
    w2: second-layer weights.
    b1: first-layer bias, added to `w1 . x`.
    b2: second-layer bias, added to `w2 . h`.

  Returns:
    Tuple (z, h): `h` is the hidden activation after the ReLU and `z` is the
    output-layer score before any softmax is applied.
  """
  pre_activation = np.dot(w1, x) + b1
  # ReLU: negative pre-activations are clamped to zero.
  h = np.maximum(0, pre_activation)
  z = np.dot(w2, h) + b2

  return z, h

In [11]:
def cost(y,y_hat, batch_size):
  """Cross-entropy style cost between targets and predictions.

  Every entry contributes y*log(y_hat) + (1 - y)*log(1 - y_hat); the
  contributions are summed over the whole array, negated and divided by
  `batch_size`.

  Args:
    y: array of target values.
    y_hat: array of predicted probabilities with the same shape as `y`.
    batch_size: divisor applied to the summed log-probabilities.

  Returns:
    The cost with any length-1 dimensions squeezed away.
  """
  target_term = y * np.log(y_hat)
  complement_term = (1 - y) * np.log(1 - y_hat)
  summed = np.sum(target_term + complement_term)

  return np.squeeze(-1/batch_size * summed)

In [13]:
def back_prob(x, y_hat, y, h, w1, w2, b1, b2, batch_size):
  """Compute the gradients of the cost with respect to all four parameters.

  Args:
    x: network input for the batch (one example per column - assumed to be
      the same array that was given to the forward pass).
    y_hat: predicted probabilities, same shape as `y`.
    y: target values.
    h: hidden activations after the ReLU, as returned by the forward pass.
    w1: first-layer weights (unused; kept for interface compatibility).
    w2: second-layer weights.
    b1: first-layer bias (unused; kept for interface compatibility).
    b2: second-layer bias (unused; kept for interface compatibility).
    batch_size: divisor used to average the gradients over the batch.

  Returns:
    Tuple (grad_w1, grad_w2, grad_b1, grad_b2); each gradient has the same
    shape as the parameter it belongs to.
  """
  output_error = y_hat - y

  # Push the output error back through the second layer, then through the
  # ReLU: the gradient only flows where the hidden unit was active (h > 0).
  # Clamping the gradient itself with np.maximum would discard every negative
  # gradient and let gradient through inactive units.
  l1 = np.dot(w2.T, output_error) * (h > 0)

  grad_w1 = (1/batch_size)*np.dot(l1, x.T)
  grad_w2 = (1/batch_size)*np.dot(output_error, h.T)
  # A bias gradient is the layer error summed over the batch axis, not a
  # row-sum of the corresponding weight gradient.
  grad_b1 = (1/batch_size)*np.sum(l1, axis=1, keepdims=True)
  grad_b2 = (1/batch_size)*np.sum(output_error, axis=1, keepdims=True)

  return grad_w1, grad_w2, grad_b1, grad_b2

In [14]:
def get_batches(data, word2Ind, V, C, batch_size):
    """Group the examples produced by `get_vectors` into fixed-size batches.

    Args:
        data: corpus forwarded unchanged to `get_vectors`.
        word2Ind: word-to-index mapping forwarded to `get_vectors`.
        V: vocabulary size forwarded to `get_vectors`.
        C: context-size parameter forwarded to `get_vectors`.
        batch_size: number of (x, y) examples per batch; must be at least 1.

    Yields:
        Tuples (batch_x, batch_y) of NumPy arrays in which every column is
        one example, i.e. the stacked examples transposed.

    Raises:
        ValueError: if `batch_size` is smaller than 1.

    Note:
        If `get_vectors` is exhausted while a batch is only partly filled,
        the leftover examples are dropped so that every yielded batch holds
        exactly `batch_size` examples.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    batch_x = []
    batch_y = []
    for x, y in get_vectors(data, word2Ind, V, C):
        # Exactly one example is added per iteration, so a batch consists of
        # `batch_size` distinct consecutive examples.
        batch_x.append(x)
        batch_y.append(y)
        if len(batch_x) == batch_size:
            yield np.array(batch_x).T, np.array(batch_y).T
            # Start the next batch from scratch.
            batch_x = []
            batch_y = []

In [15]:
def grad_descent(data, word2Ind, N, V, num_iters, alpha=0.03,
                 batch_size=128, C=2, random_seed=282):
    """Train the network with mini-batch gradient descent.

    Args:
        data: corpus forwarded unchanged to `get_batches`.
        word2Ind: word-to-index mapping forwarded to `get_batches`.
        N: hidden-layer size used when initialising the parameters.
        V: vocabulary size used when initialising the parameters.
        num_iters: number of parameter updates to perform.
        alpha: initial learning rate; it is multiplied by 0.66 after every
            100 updates.
        batch_size: number of examples per batch.
        C: context-size parameter forwarded to `get_batches`.
        random_seed: seed forwarded to `initialize`.

    Returns:
        Tuple (w1, w2, b1, b2) holding the parameters after training.
        Training ends after `num_iters` updates, or earlier if
        `get_batches` runs out of batches.
    """
    w1, w2, b1, b2 = initialize(N, V, random_seed=random_seed)

    batches = get_batches(data, word2Ind, V, C, batch_size)
    for step, (x, y) in enumerate(batches, start=1):

        z, h = forward_propagation(x, w1, w2, b1, b2)
        y_hat = softmax(z)
        # Deliberately not called `cost`: assigning to that name inside this
        # function would shadow the cost() function and make the call fail.
        batch_cost = cost(y, y_hat, batch_size)

        if step % 10 == 0:
            print(f"iters: {step} cost: {batch_cost:.6f}")

        grad_w1, grad_w2, grad_b1, grad_b2 = back_prob(
            x, y_hat, y, h, w1, w2, b1, b2, batch_size)

        w1 -= alpha*grad_w1
        w2 -= alpha*grad_w2
        b1 -= alpha*grad_b1
        b2 -= alpha*grad_b2

        # `>=` rather than `==` so a non-positive num_iters cannot loop
        # forever on an endless batch generator.
        if step >= num_iters:
            break
        if step % 100 == 0:
            alpha *= 0.66

    return w1, w2, b1, b2