# QAwiki Model 01

### Imports

In [22]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import sys
from IPython.display import clear_output as clr
import time
import tensorflow as tf
import keras as K
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer as WNL
%matplotlib inline

In [44]:
# Setting Working Dircetory
cw_path = os.getcwd() + '/../../'
os.listdir(cw_path)

['.git',
 '.ipynb_checkpoints',
 'Datasets',
 'dev-evaluate-v2.0-in1.json',
 'Embeddings',
 'evaluate-v2.0.py',
 'Models',
 'Readings',
 'README.md']

### Creating a Preprocessor

In [50]:
class preprocessor:
    
    def __init__(self):
        # word lemmatizer
        self.articles = 'a|an|and|the'
        self.lemmatizer = WNL()
        self.punctuations = string.punctuation
        self.translator = str.maketrans(string.punctuation, ' '*len(self.punctuations))
        
    def normalize_text(self, s):
        lower_s = self.lower(s)
        rem_p_s = self.remove_punc(lower_s)
        rem_a_s = self.remove_articles(rem_p_s)
        space_s = self.white_space_fix(rem_a_s)
        lemma_s = self.lemmatize(space_s)
        return lemma_s
        
    def lemmatize(self, txt):
        lemmatizer = self.lemmatizer
        return ' '.join(lemmatizer.lemmatize(lemmatizer.lemmatize(word, pos = 'v'), pos = 'a') for word in txt.split())

    def remove_articles(self, text):
        return re.sub('\s+('+self.articles+')(\s+)', ' ',text)

    def white_space_fix(self, text):
        return ' '.join(text.split())

    def remove_punc(self, text):
        return text.translate(self.translator)

    def lower(self, text):
        return text.lower()
    

In [105]:
class embedder:
    
    def __init__(self, embeddingfile = None):
        self.GloveEmbeddings = {}
        self.emb_dim = None
        if(embeddingfile is None):
            pass
        else:
            self.loadEmbeddings(embeddingfile)

    def loadEmbeddings(self, embeddingfile):

        fe = open(embeddingfile,"r",encoding="utf-8",errors="ignore")
        for line in fe:
            tokens= line.strip().split()
            word = tokens[0]
            vec = tokens[1:]
            vec = " ".join(vec)
            self.GloveEmbeddings[word]=vec
        self.emb_dim = len(vec.split(' '))
        #Add Zerovec, this will be useful to pad zeros, it is better to experiment with padding any non-zero constant values also.
        self.GloveEmbeddings["zerovec"] = "0.0 "*(self.emb_dim-1) + "0.0"
        fe.close()  


    def embed(self, text_batch, max_size):
        """
        text_batch : list of texts
        """
        batch_size = len(text_batch)
        emb_batch = np.zeros((batch_size, max_size, self.emb_dim))
        
        for i in range(batch_size):
            doc = text_batch[i]
            for j, word in enumerate(doc.strip().split()):
                if(j < max_size):
                    emb_batch[i,j,:] = np.array(self.GloveEmbeddings[word].strip().split(" ")).astype(float)
            
        return emb_batch

In [106]:
text_processor = preprocessor()

In [107]:
# Testing the [preprocessor]
text_processor.normalize_text("Hi I am Abhishek Kumar! How are You ?")

'hi i be abhishek kumar how be you'

In [108]:
text_embedder = embedder(cw_path+'Embeddings/glove.6B.50d.txt')

In [110]:
# Testing the Embedder
text_embedder.embed(['hi how are you mister simond'],5).shape

(1, 5, 50)

In [116]:
# Global Parameters
emb_dim = text_embedder.emb_dim
time_size = 100
encoder_units = int(text_embedder.emb_dim//2)

In [119]:
# Checking params
print('Embeding_dim : ',emb_dim, ', Time Size :', time_size, ', Encoder_units :', encoder_units)

Embeding_dim :  50 , Time Size : 100 , Encoder_units : 25


In [155]:
class sentence_encoder:
    
    def __init__(self, sess, emd_dim, time_size, units, n_levels = 1):
        self.sess = sess
        self.emb_dim = emb_dim
        self.time_size = time_size
        self.n_levels = n_levels
        self.units = units
        self.make_encoding()
        
        
    def make_encoding(self):
        self.encoding = self.gen_transformer_graph()
    
    def gen_transformer_graph(self):
        
        self.input_text_vec = tf.placeholder(shape = [None, self.time_size, self.emb_dim], dtype=tf.float32)
        
        c_out = self.input_text_vec
        for k in range(self.n_levels):
            out = []
            for i in range(self.time_size):
                out.append(self.get_attension_graph(c_out))
            c_out = tf.nn.leaky_relu(tf.stack(out, axis = 1))
#             print(out)
        
        return c_out
            
            
            
    def get_attension_graph(self, seq):
        print(seq)
        dense = tf.layers.dense(seq, 1, activation=tf.nn.leaky_relu)
        print(dense)
        alphas = tf.nn.softmax(dense, axis = 1)
        print(alphas)
        out = tf.keras.layers.Dot(axes = 1)([alphas, seq])
        return out
    
    
    def get_encoding(self):
        
        return self.encoding

In [156]:
sess = tf.Session()
tf.reset_default_graph()

In [157]:
senc = sentence_encoder(sess, emb_dim, time_size, encoder_units)

Tensor("Placeholder:0", shape=(?, 100, 50), dtype=float32)
Tensor("dense/LeakyRelu:0", shape=(?, 100, 1), dtype=float32)
Tensor("transpose_1:0", shape=(?, 100, 1), dtype=float32)
Tensor("Placeholder:0", shape=(?, 100, 50), dtype=float32)
Tensor("dense_1/LeakyRelu:0", shape=(?, 100, 1), dtype=float32)
Tensor("transpose_3:0", shape=(?, 100, 1), dtype=float32)
Tensor("Placeholder:0", shape=(?, 100, 50), dtype=float32)
Tensor("dense_2/LeakyRelu:0", shape=(?, 100, 1), dtype=float32)
Tensor("transpose_5:0", shape=(?, 100, 1), dtype=float32)
Tensor("Placeholder:0", shape=(?, 100, 50), dtype=float32)
Tensor("dense_3/LeakyRelu:0", shape=(?, 100, 1), dtype=float32)
Tensor("transpose_7:0", shape=(?, 100, 1), dtype=float32)
Tensor("Placeholder:0", shape=(?, 100, 50), dtype=float32)
Tensor("dense_4/LeakyRelu:0", shape=(?, 100, 1), dtype=float32)
Tensor("transpose_9:0", shape=(?, 100, 1), dtype=float32)
Tensor("Placeholder:0", shape=(?, 100, 50), dtype=float32)
Tensor("dense_5/LeakyRelu:0", shape=(?