In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
# We use pre-trained GloVe word vectors with a 400k-word vocabulary built from Wikipedia.
# Run GloVe's demo.sh to obtain vocab.txt; the script fetches 100M characters of Wikipedia.

# The embedding class below exposes a few attributes (the vocabulary and so on)
# that are used later:
#   self.vocab
#   self.W

# vocab.txt holds one word per line followed by a number (presumably the word's frequency count in the corpus).
# vectors.txt holds one word per line followed by that word's vector.

class gloveEmbeddings:
    """Row-normalised word vectors loaded from a vocabulary file and a vector file.

    Attributes
    ----------
    vocab : dict
        Maps each word read from the vocabulary file to its row index in ``W``
        (the zero-based line position of the word in that file).
    W : numpy.ndarray
        Array of shape ``(vocabSize, vectorSize)``. Row ``vocab[word]`` holds the
        word's vector scaled to unit Euclidean length. Words without a vector
        (and ``'<unk>'``) keep an all-zero row.
    """

    def __init__(self, vocabFile, vectorFile, vectorSize=None):
        """Read both files and build ``vocab`` and ``W``.

        Parameters
        ----------
        vocabFile : str
            Path to a text file with one entry per line; the first
            space-separated field of each line is taken as the word.
        vectorFile : str
            Path to a text file whose lines are a word followed by its
            space-separated float components.
        vectorSize : int, optional
            Number of components per vector. When omitted it is inferred from
            the first vector read, falling back to 50 if the file has none.

        Raises
        ------
        ValueError
            If a component is not a valid float, or a vector's length does not
            match ``vectorSize``.
        """
        # Use the paths passed in (not notebook globals) and close the files
        # deterministically.
        with open(vocabFile, 'r') as f:
            words = [line.rstrip().split(' ')[0] for line in f]

        vectors = {}
        with open(vectorFile, 'r') as f:
            for line in f:
                values = line.rstrip().split(' ')
                if len(values) < 2:
                    # Blank or word-only line: there is no vector to store.
                    continue
                vectors[values[0]] = [float(num) for num in values[1:]]

        # word -> row index, following the order of the vocabulary file
        vocab = {word: position for position, word in enumerate(words)}

        if vectorSize is None:
            vectorSize = len(next(iter(vectors.values()))) if vectors else 50

        W = np.zeros((len(words), vectorSize))
        for word, vec in vectors.items():
            # '<unk>' is deliberately left as a zero row; words that are not in
            # the vocabulary file have no row to write into.
            if word == '<unk>' or word not in vocab:
                continue
            W[vocab[word], :] = vec

        # Scale every row to unit L2 norm. Rows that are entirely zero would
        # otherwise be divided by zero and turn into NaN, so they are divided
        # by 1 and stay zero.
        norms = np.sum(W ** 2, axis=1) ** 0.5
        norms[norms == 0] = 1.0

        self.vocab = vocab
        self.W = W / norms[:, np.newaxis]

In [None]:
def sumVectors(embed, query):
    """Add up the embedding rows of every known word in ``query``.

    ``query`` is split on single spaces. Tokens that are not keys of
    ``embed.vocab`` contribute nothing. The result has one entry per column
    of ``embed.W`` and is all zeros when no token is recognised.
    """
    # Row indices of the recognised tokens, in query order.
    known_rows = [embed.vocab[token] for token in query.split(' ') if token in embed.vocab]

    total = np.zeros(embed.W.shape[1])
    for row in known_rows:
        total = total + embed.W[row, :]

    return total

def calculateCentroid(embed, samples):
    """Return the mean of the summed word vectors of ``samples``.

    Each sample is turned into one vector with ``sumVectors``; the centroid
    is the element-wise average of those vectors and has one entry per
    column of ``embed.W``.
    """
    dim = embed.W.shape[1]

    # One row per sample, filled in order.
    stacked = np.zeros((len(samples), dim))
    for row, text in enumerate(samples):
        stacked[row, :] = sumVectors(embed, text)

    centroid = np.mean(stacked, axis=0)

    assert centroid.shape[0] == dim

    return centroid


In [None]:
def getDataFromCsv(csvPath, embed=None):
    """Load per-class example sentences from a CSV file and compute class centroids.

    Every column of the CSV is treated as one class: the header is the class
    name and the non-missing cells below it are that class's samples. Samples
    are converted with ``str``, lower-cased and right-stripped.

    Parameters
    ----------
    csvPath : str
        Path of the CSV file to read.
    embed : object, optional
        Embedding object handed to ``calculateCentroid``. When omitted, the
        module-level variable ``embed`` is used, which is what this function
        relied on before the parameter existed.

    Returns
    -------
    tuple
        ``(dicti, classNum, classNames)`` where ``dicti[name]`` is
        ``{"samples": [...], "centroid": ...}``, ``classNum`` is the number of
        columns and ``classNames`` is the list of column headers.

    Raises
    ------
    NameError
        If ``embed`` is not passed and no global ``embed`` exists.
    """
    if embed is None:
        try:
            embed = globals()['embed']
        except KeyError:
            raise NameError("name 'embed' is not defined") from None

    df = pd.read_csv(csvPath)

    classNames = df.columns.values.tolist()
    classNum = len(classNames)

    dicti = {}
    for colName in classNames:
        # Drop missing cells directly instead of filling them with 0 and then
        # filtering on "!= 0", which also discarded genuine zero/False cells.
        samples = [str(sentence).lower().rstrip() for sentence in df[colName].dropna()]

        # Register the class once per column, outside the row loop, so a
        # column without any rows still gets an entry.
        dicti[colName] = {
            "samples": samples,
            "centroid": calculateCentroid(embed, samples),
        }

    return dicti, classNum, classNames

In [None]:
def intentClassifier(embed, query, csvPath='bot_intent_for_model.csv'):
    """Return the name of the class whose centroid is nearest to ``query``.

    The query is embedded with ``sumVectors`` and compared, by Euclidean
    distance, with the centroid of every class loaded from the CSV file.

    Parameters
    ----------
    embed : object
        Embedding object passed to ``sumVectors`` to embed the query.
    query : str
        Text to classify.
    csvPath : str, optional
        CSV file with the example sentences per class. Defaults to
        ``'bot_intent_for_model.csv'``; the path is made absolute before use.

    Returns
    -------
    The header of the CSV column whose centroid has the smallest distance to
    the query vector (the first such column on ties).
    """
    # NOTE(review): the CSV is re-read and every centroid recomputed on each
    # call; cache the result of getDataFromCsv if this is called in a loop.
    dicti, classNum, classNames = getDataFromCsv(os.path.abspath(csvPath))

    vec = sumVectors(embed, query)

    # Euclidean distance from the query vector to each class centroid.
    scores = np.array([np.linalg.norm(vec - dicti[label]["centroid"]) for label in classNames])

    return classNames[np.argmin(scores)]

In [None]:
# Locations of the vocabulary and vector files, resolved against the current
# working directory.
vocabPath = os.path.abspath('glove_vectors_wikipedia/vocab.txt')
vectorPath = os.path.abspath('glove_vectors_wikipedia/vectors.txt')

# Load the embeddings once: the files do not change between queries, so
# re-reading them on every loop iteration only adds latency.
embed = gloveEmbeddings(vocabPath, vectorPath)

# Interactive loop: classify one query per iteration until the user answers
# anything other than "y".
flag = 'y'
while flag == 'y':

    query = input("Enter your query : ")

    intent = intentClassifier(embed, query)

    print('\n classified intent : {}'.format(intent))

    print('\n you want to continue? (y/n)')

    # Accept "Y" and stray surrounding whitespace as a yes.
    flag = input().strip().lower()