In [None]:
import nltk
import re
from nltk.tokenize import word_tokenize
import numpy as np
from collections import Counter
nltk.data.path.append('.')
from scipy import linalg
from collections import defaultdict
def sigmoid(z):
    """
    Logistic sigmoid, applied element-wise.

    Inputs:
        z: scalar or numpy array
    Outputs:
        1 / (1 + exp(-z)), with the same shape as z
    """
    exp_neg_z = np.exp(-z)
    return 1.0 / (1.0 + exp_neg_z)

In [None]:
def get_idx(words, word2Ind):
    """
    Translate a sequence of words into their vocabulary indices.

    Inputs:
        words: iterable of words
        word2Ind: dictionary mapping each word to its index
    Outputs:
        list of indices, one per word, in the same order as `words`
    Raises:
        KeyError if a word is missing from `word2Ind`
    """
    # Build the list in a single pass instead of re-allocating it with
    # `idx = idx + [...]` on every iteration (which is quadratic).
    return [word2Ind[word] for word in words]


def pack_idx_with_frequency(context_words, word2Ind):
    """
    Pair every context word's vocabulary index with its count in the window.

    Inputs:
        context_words: sequence of words forming one context window
        word2Ind: dictionary mapping each word to its index
    Outputs:
        list of (index, frequency) tuples, one per entry of `context_words`
        (a word that occurs several times therefore appears several times,
        each time with the same frequency)
    """
    occurrences = Counter(context_words)
    indices = get_idx(context_words, word2Ind)
    return [(index, occurrences[word]) for index, word in zip(indices, context_words)]


# Given a corpus, walk over it to form (context words, center word) pairs. The context words are passed to
# pack_idx_with_frequency above, which pairs each word's index with its frequency within the context window;
# each frequency is then divided by the number of context words in that window.
# Thus x would look like [0.25, 0.25, 0.  , 0.5 , 0.  ] for a vocabulary of size 5.
# At the same time, y is a one-hot vector with the center word's position turned on,
# for example [0, 0, 0, 1, 0].

def get_vectors(data, word2Ind, V, C):
    """
    Endless generator of (context vector, center one-hot vector) training pairs.

    Inputs:
        data: sequence of tokens supporting slicing and `+` (e.g. a list of words)
        word2Ind: dictionary mapping each word to its index
        V: size of the vocabulary (length of the produced vectors)
        C: number of context words taken on each side of the center word
    Yields:
        x: length-V vector holding, for each context word, its count in the
           window divided by the window size
        y: length-V one-hot vector marking the center word

    The first center position is C. When the end of `data` is reached the
    position wraps around to 0 (a message is printed); at positions smaller
    than C the left slice `data[pos - C:pos]` is empty, so only right-hand
    context words are used there.
    """
    pos = C
    while True:
        center_vec = np.zeros(V)
        context_vec = np.zeros(V)
        center_vec[word2Ind[data[pos]]] = 1
        # Window = C words before the center plus C words after it
        window = data[(pos - C):pos] + data[(pos + 1):(pos + C + 1)]
        window_size = len(window)
        for word_idx, count in pack_idx_with_frequency(window, word2Ind):
            context_vec[word_idx] = count/window_size
        yield context_vec, center_vec
        pos += 1
        if pos >= len(data):
            print('i is being set to 0')
            pos = 0





def get_batches(data, word2Ind, V, C, batch_size):
    """
    Endless generator of mini-batches built from `get_vectors`.

    Inputs:
        data: sequence of tokens
        word2Ind: dictionary mapping each word to its index
        V: size of the vocabulary
        C: number of context words on each side of the center word
        batch_size: number of training examples per batch (must be >= 1)
    Yields:
        (batch_x, batch_y): two arrays of shape (V, batch_size); column j of
        batch_x is the context vector of the j-th example in the batch and
        column j of batch_y is the matching one-hot center vector
    Raises:
        ValueError if batch_size is smaller than 1
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    batch_x = []
    batch_y = []
    for x, y in get_vectors(data, word2Ind, V, C):
        # Collect one *distinct* example per iteration. The previous
        # `while ... else` loop appended the same (x, y) pair batch_size
        # times and never cleared the lists, so every batch was the first
        # example repeated.
        batch_x.append(x)
        batch_y.append(y)
        if len(batch_x) == batch_size:
            # Stack examples as columns: shape (V, batch_size)
            yield np.array(batch_x).T, np.array(batch_y).T
            batch_x = []
            batch_y = []
  



In [None]:
def get_dict(data):
    """
    Build the vocabulary lookup tables for a tokenized corpus.

    Inputs:
        data: iterable of hashable, mutually comparable tokens (e.g. a list of words)
    Outputs:
        word2Ind: dictionary mapping each distinct word to its index
        Ind2word: dictionary mapping each index back to its word

    Indices follow the sorted order of the distinct words, starting at 0.
    """
    word2Ind = {}
    Ind2word = {}
    for idx, word in enumerate(sorted(set(data))):
        word2Ind[word] = idx
        Ind2word[idx] = word
    return word2Ind, Ind2word

In [None]:
data="In an attempt to build an AI-ready workforce, Microsoft announced Intelligent Cloud Hub which has been launched to empower the next generation of students with AI-ready skills. Envisioned as a three-year collaborative program, Intelligent Cloud Hub will support around 100 institutions with AI infrastructure, course content and curriculum, developer support, development tools and give students access to cloud and AI services. As part of the program, the Redmond giant which wants to expand its reach and is planning to build a strong developer ecosystem in India with the program will set up the core AI infrastructure and IoT Hub for the selected campuses. The company will provide AI development tools and Azure AI services such as Microsoft Cognitive Services, Bot Services and Azure Machine Learning.According to Manish Prakash, Country General Manager-PS, Health and Education, Microsoft India, said, 'With AI being the defining technology of our time, it is transforming lives and industry and the jobs of tomorrow will require a different skillset. This will require more collaborations and training and working with AI. That’s why it has become more critical than ever for educational institutions to integrate new cloud and AI technologies. The program is an attempt to ramp up the institutional set-up and build capabilities among the educators to educate the workforce of tomorrow.' The program aims to build up the cognitive skills and in-depth understanding of developing intelligent cloud connected solutions for applications across industry. Earlier in April this year, the company announced Microsoft Professional Program In AI as a learning track open to the public. The program was developed to provide job ready skills to programmers who wanted to hone their skills in AI and data science with a series of online courses which featured hands-on labs and expert instructors as well. 
This program also included developer-focused AI school that provided a bunch of assets to help build AI skills."
# Replace the punctuation marks , ! ? ; - with a full stop
data = re.sub(r'[,!?;-]', '.', data)
# Tokenize the string into words
data = word_tokenize(data)
# Lower-case the tokens and keep only alphabetic tokens and '.'
data = [token.lower() for token in data if token == '.' or token.isalpha()]
print("Number of tokens:", len(data),'\n', data[:15]) 

Number of tokens: 328 
 ['in', 'an', 'attempt', 'to', 'build', 'an', 'workforce', '.', 'microsoft', 'announced', 'intelligent', 'cloud', 'hub', 'which', 'has']


In [None]:
# Build the word <-> index lookup tables from the tokenized corpus
word2Ind, Ind2word = get_dict(data)
# V is the vocabulary size (number of distinct tokens)
V = len(word2Ind)
print("Size of vocabulary: ", V)
# Example lookups in both directions: word -> index and index -> word
print("Index of the word 'workforce' :  ",word2Ind['workforce'] )
print("Word which has index 5:  ",Ind2word[5] )

Size of vocabulary:  155
Index of the word 'workforce' :   152
Word which has index 5:   aims


In [None]:
def initialize_model(N,V, random_seed=1):
    '''
    Create the initial parameters of the two-layer network.

    Inputs:
        N:  dimension of hidden vector
        V:  dimension of vocabulary
        random_seed: seed passed to np.random.seed so results are repeatable
                     (note: this reseeds numpy's global random state)
    Outputs:
        W1: weights of shape (N,V)
        W2: weights of shape (V,N)
        b1: bias of shape (N,1)
        b2: bias of shape (V,1)
        All values are drawn with np.random.rand, in the order W1, W2, b1, b2.
    '''
    np.random.seed(random_seed)

    # Draw order matters for reproducibility: W1, W2, b1, b2
    shapes = ((N, V), (V, N), (N, 1), (V, 1))
    W1, W2, b1, b2 = [np.random.rand(rows, cols) for rows, cols in shapes]

    return W1, W2, b1, b2


def softmax(z):
    '''
    Column-wise softmax: turns scores into probabilities that sum to 1
    along axis 0 (one probability per vocabulary word).

    Inputs:
        z: output scores, array whose first axis indexes the classes
    Outputs:
        yhat: prediction (estimate of y), same shape as z
    '''
    # Subtract the per-column maximum before exponentiating. This leaves the
    # result mathematically unchanged but keeps np.exp from overflowing
    # (which would produce inf/inf = nan) for large scores.
    shifted = z - np.max(z, axis=0, keepdims=True)
    exp_scores = np.exp(shifted)
    yhat = exp_scores/np.sum(exp_scores, axis=0)

    return yhat


def forward_prop(x, W1, W2, b1, b2):
    '''
    Forward pass of the network: affine layer, ReLU, affine layer.

    Inputs:
        x:  average one hot vector for the context
        W1, W2, b1, b2:  matrices and biases to be learned
    Outputs:
        z:  output score vector, W2.h + b2
        h:  hidden activations, relu(W1.x + b1)
    '''
    # Hidden layer pre-activation followed by ReLU
    hidden_pre_activation = np.dot(W1, x) + b1
    h = np.maximum(0, hidden_pre_activation)

    # Output scores
    z = np.dot(W2, h) + b2

    return z, h



In [None]:
def compute_cost(y, yhat, batch_size):
    """
    Cost of a batch of predictions.

    Inputs:
        y: target vectors
        yhat: predicted probabilities, same shape as y
        batch_size: number of examples the sum is averaged over
    Outputs:
        cost: -1/batch_size * sum(y*log(yhat) + (1-y)*log(1-yhat)),
              squeezed with np.squeeze
    """
    target_term = np.multiply(np.log(yhat), y)
    complement_term = np.multiply(np.log(1 - yhat), 1 - y)
    total_log_prob = np.sum(target_term + complement_term)
    return np.squeeze(-1 / batch_size * total_log_prob)


In [None]:
def back_prop(x, yhat, y, h, W1, W2, b1, b2, batch_size):
    '''
    Backward pass: gradients of the softmax cross-entropy loss with respect
    to the parameters of the network computed by forward_prop.

    Inputs:
        x:  context vectors, one column per example
        yhat: predictions (softmax output), one column per example
        y:  target one-hot vectors, one column per example
        h:  hidden activations returned by forward_prop (after the ReLU)
        W1, W2, b1, b2:  current parameters (only W2 is needed here; the
                         others are kept for interface compatibility)
        batch_size: number of examples the gradients are averaged over
    Outputs:
        grad_W1, grad_W2, grad_b1, grad_b2: gradients with the same shapes
        as W1, W2, b1 and b2
    '''
    # Gradient with respect to the output scores z
    dz = yhat - y

    # Back-propagate through W2, then through the ReLU: the gradient only
    # flows where the hidden unit was active (h > 0). Clamping the gradient
    # itself with max(0, .) is not the ReLU derivative.
    l1 = np.dot(W2.T, dz) * (h > 0)

    # Weight gradients, averaged over the batch
    grad_W1 = (1/batch_size)*np.dot(l1, x.T)
    grad_W2 = (1/batch_size)*np.dot(dz, h.T)

    # Bias gradients: sum the per-example gradients over the batch axis
    # (not the rows of the weight-gradient matrices).
    grad_b1 = (1/batch_size)*np.sum(l1, axis=1, keepdims=True)
    grad_b2 = (1/batch_size)*np.sum(dz, axis=1, keepdims=True)

    return grad_W1, grad_W2, grad_b1, grad_b2
  


In [None]:
def gradient_descent(data, word2Ind, N, V, num_iters, alpha=0.03, batch_size=128, C=2):
    '''
    Train the CBOW network with mini-batch gradient descent.

    Inputs:
        data:  tokenized corpus
        word2Ind:  dictionary mapping each word to its index
        N:  dimension of hidden vector
        V:  dimension of vocabulary
        num_iters:  number of parameter updates to perform
        alpha:  initial learning rate; multiplied by 0.66 every 100 iterations
        batch_size:  number of examples per mini-batch (default 128)
        C:  number of context words on each side of the center word (default 2)
    Outputs:
        W1, W2, b1, b2:  parameters after num_iters updates. If num_iters
        is not positive, the freshly initialized parameters are returned.

    The cost is printed every 10 iterations. Parameters are initialized
    with a fixed random seed (282).
    '''
    W1, W2, b1, b2 = initialize_model(N,V, random_seed=282)

    # get_batches never terminates on its own, so without this guard a
    # non-positive num_iters would loop forever.
    if num_iters <= 0:
        return W1, W2, b1, b2

    iters = 0
    for x, y in get_batches(data, word2Ind, V, C, batch_size):

        # Get z and h
        z, h = forward_prop(x,W1,W2,b1,b2)
        # Get yhat
        yhat = softmax(z)
        # Get cost
        cost = compute_cost(y,yhat,batch_size)
        if ( (iters+1) % 10 == 0):
            print(f"iters: {iters + 1} cost: {cost:.6f}")
        # Get gradients
        grad_W1, grad_W2, grad_b1, grad_b2 = back_prop(x, yhat, y, h, W1, W2, b1, b2, batch_size)

        # Update weights and biases
        W1 -= alpha*grad_W1
        W2 -= alpha*grad_W2
        b1 -= alpha*grad_b1
        b2 -= alpha*grad_b2

        iters += 1
        if iters >= num_iters:
            break
        # Learning-rate decay every 100 iterations
        if iters % 100 == 0:
            alpha *= 0.66

    return W1, W2, b1, b2
# Train the model: hidden dimension 50, vocabulary size len(word2Ind), 500 iterations
W1, W2, b1, b2 = gradient_descent(data, word2Ind, 50, len(word2Ind), 500)

# Retrieve embeddings of shape V x N by averaging W1 (transposed) and W2
embs = (W1.T + W2)/2.0


iters: 10 cost: 0.070543
iters: 20 cost: 0.031579
iters: 30 cost: 0.020415
iters: 40 cost: 0.015098
iters: 50 cost: 0.011985
iters: 60 cost: 0.009938
iters: 70 cost: 0.008490
iters: 80 cost: 0.007412
iters: 90 cost: 0.006576
iters: 100 cost: 0.005911
iters: 110 cost: 0.005524
iters: 120 cost: 0.005200
iters: 130 cost: 0.004912
iters: 140 cost: 0.004654
iters: 150 cost: 0.004422
iters: 160 cost: 0.004213
iters: 170 cost: 0.004022
iters: 180 cost: 0.003847
iters: 190 cost: 0.003688
iters: 200 cost: 0.003541
iters: 210 cost: 0.003445
iters: 220 cost: 0.003359
iters: 230 cost: 0.003278
iters: 240 cost: 0.003200
iters: 250 cost: 0.003126
iters: 260 cost: 0.003055
iters: 270 cost: 0.002987
iters: 280 cost: 0.002922
iters: 290 cost: 0.002860
iters: 300 cost: 0.002801
iters: 310 cost: 0.002761
iters: 320 cost: 0.002724
i is being set to 0
iters: 330 cost: 0.002688
iters: 340 cost: 0.002653
iters: 350 cost: 0.002619
iters: 360 cost: 0.002586
iters: 370 cost: 0.002554
iters: 380 cost: 0.002522
i

In [None]:
# Example: similarity between the embeddings of the words 'education' and 'learning'.
# cosine_distance is subtracted from 1 to turn the distance into a similarity;
# the value recorded below (about 0.84) indicates the two vectors are closely aligned.
from nltk.cluster.util import cosine_distance

1-cosine_distance(embs[word2Ind['education']],embs[word2Ind['learning']])

0.8405726777600854