In [93]:
import nltk
import re
import pandas as pd
import numpy as np
import copy

In [94]:
# changing corpus into vocabulary

def process_corp(corpus):
    words = re.sub(" \d+", " ", corpus)
    words = words.split()
    unique_words = set(words)
    vocab = list(unique_words)
    return vocab

vocab_input = process_corp(corpus)


In [95]:
# generating one hot vectors for voacbulary

def one_hot_encoding(vocab_input,list_of_zeroes):
    encoded_vocab = []
    for i in range(len(vocab_input)):
        word_encoding = copy.deepcopy(list_of_zeroes)
        word_encoding[i] = 1
        encoded_vocab.append(word_encoding)
    return encoded_vocab


In [96]:
# generating the mapping words for input.This is for skipgram model.
def generate_mapping(vocab_input,window_size):
    mapped_words = {}
    for index, word in enumerate(vocab_input):
        neighbour_words = []
        try:
            for i in range(window_size):
                if(index-(i+1) >= 0):
                    neighbour_words.append(index-(i+1))
                if(index + i +1 <= len(vocab_input) -1):
                    neighbour_words.append(index+i+1)
                neighbour_words.sort()
                mapped_words[index] = neighbour_words
        except:
            print("index out of bounds")
    return mapped_words


In [97]:
# converting the one hot vectors into matrix.This can be taken as input to neural network.
def onehotvec_to_matrix(list_of_lists):
    mat_input = np.array(list_of_lists)
    return mat_input

In [105]:
from math import exp

# function for random intialisation of weights of a matrix
def initialize_weight_vec(x,y):
    weight_vect = np.random.rand(x,y)
    return weight_vect

# function for multiplication of matrices
def mat_multiply(x,y):
    result_vec = np.zeros((len(x),len(y[0])))
    for i in range(len(x)):
        for j in range(len(y[0])):
            for k in range(len(y)):
                result_vec[i][j] += x[i][k]*y[k][j]
    return result_vec

# function for applying sigmoid function. output will be in [0-1] range
def apply_sigmoid(x): 
    for i in range(len(x)):
        for j in range(len(x[0])):
            x[i][j] = 1.0/(1.0 + exp(-x[i][j]))
    return x

# function for softmax. The sum of probabilities of all elements of row will be 1.
def apply_softmax(x):
    for i in range(len(x)):
        new_list = x[i]
        t = 0
        for k in range(len(new_list)):
            t += exp(new_list[k])
        for j in range(len(x[0])):
            x[i][j] = exp(x[i][j])/t
    return x

def forward_Propagation(input_to_hidden_wt_vec,hidden_to_output_wt_vec ):

    # initialising w1 with random weights    
    #input_to_hidden_wt_vec = initialize_weight_vec(len(input_mat),2)
    #w1T*X
    weight_transpose_times_input = np.matmul(input_to_hidden_wt_vec.T,input_mat)
    #sigmoid(w1T*X) = x'
    weights_after_activation = apply_sigmoid(weight_transpose_times_input)
    # initialising w2 with random weights 
    #hidden_to_output_wt_vec = initialize_weight_vec(2,len(input_mat))
    #print(hidden_to_output_wt_vec)
    # w2T * X'
    weight_transpose_times_hidden = np.matmul(hidden_to_output_wt_vec.T,weights_after_activation)
    # softmax(w2T * X')
    predicted_output = apply_softmax(weight_transpose_times_hidden)
    
    return weights_after_activation,predicted_output


In [99]:
def gen_error_mat(predicted_output,mapped_words,input_mat):
    error_mat = np.zeros((len(predicted_output),len(predicted_output)))
    for i in range(len(predicted_output)):
        neighbours = mapped_words[i]
        error = [0]*len(predicted_output[i])
        for j in range(len(neighbours)):
            error +=  (predicted_output[j] - input_mat[j])
        window_size = len(neighbours)
        error = [float(x)/window_size for x in error]
        error_mat[i] = error
    return error_mat

In [100]:
def softmax_derivative(x):
    for i in range(len(x)):
        for j in range(len(x[0])):
            if i == j:
                x[i][j] = np.multiply(x[i][j],(1-x[i][j]))
            else:
                x[i][j] = -(np.multiply(x[i][j],x[i][j]))
    return x

def sigmoid_derivative(x):
    for i in range(len(x)):
        for j in range(len(x[0])):
            x[i][j] = np.multiply(x[i][j],(1-x[i][j]))
    return x

def gen_column_error(x):
    row_mat = np.zeros((len(x),1))
    for i in range(len(x)):
        row_mat[i] = sum(x[i][:])
    return row_mat



In [101]:
def update_weights(weight_matrix,delta_matrix,learning_rate,input_matrix):
    delta_times_input = np.matmul(input_matrix,delta_matrix.T)
    delta_times_input = learning_rate*delta_times_input
    print(weight_matrix)
    updated_weight_matrix = weight_matrix + delta_times_input
    return updated_weight_matrix

def bp_error_and_update_weights(weights_after_activation,predicted_output,input_to_hidden_wt_vec,hidden_to_output_wt_vec):
    
    ## generate error matrix at output layer
    error_mat = gen_error_mat(predicted_output,mapped_words,input_mat)
    #print(error_mat)
   
    ### compute delta matrix at output layer
    derivative_output = softmax_derivative(predicted_output)
    error_mat = gen_column_error(error_mat)
    delta_mat_output = error_mat*derivative_output
    #print(delta_mat_output)
    
    ## propagate delta values backwards to hidden layer and compute error
    error_mat = delta_mat_output.dot(hidden_to_output_wt_vec.T)
    derivative_hidden = sigmoid_derivative(weights_after_activation)
    error_mat = gen_column_error(error_mat)
    error_mat = error_mat.T
    
    ### calculate delta at hidden layer
    delta_mat_hidden = error_mat*derivative_hidden
    
    
    ### updating weights after completion of backpropagation
    input_to_hidden_wt_vec = update_weights(input_to_hidden_wt_vec,delta_mat_hidden,0.5,input_mat)
    hidden_to_output_wt_vec = update_weights(hidden_to_output_wt_vec,delta_mat_output,0.5,weights_after_activation)

    return input_to_hidden_wt_vec,hidden_to_output_wt_vec

In [102]:
def SkipGram_NN(input_mat,window_size,learning_rate):
    epochs = 3
    no_of_hidden_layers = 2
    
    # initialise weight matrices
    input_to_hidden_wt_vec = initialize_weight_vec(len(input_mat),no_of_hidden_layers)
    hidden_to_output_wt_vec = initialize_weight_vec(no_of_hidden_layers,len(input_mat))

    for i in range(epochs):
        print("entered epoch " + str(i))
        weights_after_activation,predicted_output = forward_Propagation(input_to_hidden_wt_vec,hidden_to_output_wt_vec )
        input_to_hidden_wt_vec,hidden_to_output_wt_vec = bp_error_and_update_weights(weights_after_activation,predicted_output,input_to_hidden_wt_vec,hidden_to_output_wt_vec)
           
    return hidden_to_output_wt_vec

In [106]:

corpus = "my name is subbareddy my mother name is venkayemma"  ## input corpus
vocab = process_corp(corpus) ## preprocess_corpus
list_of_zeroes = [0]*len(vocab) ## generate list of size vocab
encoded_vocab = one_hot_encoding(vocab,list_of_zeroes) ## one_hot_encoding of vocab
window_size = 2    ## window_size 
learning_rate = 0.5
mapped_words = generate_mapping(vocab,window_size)
input_mat = onehotvec_to_matrix(encoded_vocab)

final_weight_vector = SkipGram_NN(input_mat,window_size,learning_rate)
final_weight_vector = final_weight_vector.T

print(final_weight_vector)








entered epoch 0
[[ 0.86967189  0.2137422 ]
 [ 0.91891183  0.97953628]
 [ 0.3176267   0.3507927 ]
 [ 0.81186891  0.37057071]
 [ 0.59007253  0.43194808]
 [ 0.86417176  0.31559078]]
[[ 0.64347001  0.67316424  0.33836549  0.97748249  0.82265267  0.92314928]
 [ 0.77726255  0.50629436  0.1658538   0.14600902  0.99424755  0.26639942]]
entered epoch 1
[[ 0.86967189  0.2137422 ]
 [ 0.91891183  0.97953628]
 [ 0.3176267   0.3507927 ]
 [ 0.81186891  0.37057071]
 [ 0.59007253  0.43194808]
 [ 0.86417176  0.31559078]]
[[ 0.64347001  0.67316424  0.33836549  0.97748249  0.82265267  0.92314928]
 [ 0.77726255  0.50629436  0.1658538   0.14600902  0.99424755  0.26639942]]
entered epoch 2
[[ 0.86967189  0.2137422 ]
 [ 0.91891183  0.97953628]
 [ 0.3176267   0.3507927 ]
 [ 0.81186891  0.37057071]
 [ 0.59007253  0.43194808]
 [ 0.86417176  0.31559078]]
[[ 0.64347001  0.67316424  0.33836549  0.97748249  0.82265267  0.92314928]
 [ 0.77726255  0.50629436  0.1658538   0.14600902  0.99424755  0.26639942]]
[[ 0.64347

In [108]:
np.savetxt("skipgram_final_weights.csv", final_weight_vector, delimiter=",")


In [119]:
vocab.index("subba")

2

In [126]:
word = 'subba'
input_index = vocab.index(word)
input_embedding = final_weight_vector[input_index][:]
print(input_embedding)

[ 0.33836549  0.1658538 ]


In [None]:
printing top 10 neighbours from final weight vec. Take word in strings.
def Top_K(Input_word,vocab,weight_embeddings,K):
    input_index = vocab.index(Input_word)
    input_embedding = weight_embeddings[input_index][:]
    return 