#implementing a sentence classfier using cnn

In [1]:
import os
import tensorflow as tf
import numpy as np
from six.moves.urllib.request import urlretrieve
import zipfile
import matplotlib.pyplot as plt
import collections
import random
import math


#download the data if not already available


In [2]:
def maybe_download(url,name):
    filename, _ = urlretrieve(url,os.getcwd() + name)
    return filename


In [3]:
#filename = maybe_download('http://cogcomp.org/Data/QA/QC/TREC_10.label','/trec_10.label')

In [4]:
max_sent_length = 0

def read_data(filename):
    global max_sent_length
    
    questions = []
    labels = []
    
    with open(filename,'r',encoding = 'latin-1') as f:
        for row in f:
            row_data = row.split(':')
            
            lab,q = row_data[0],row_data[1]
            questions.append(q.split())
            labels.append(lab)
            if len(questions[-1]) > max_sent_length:
                max_sent_length = len(questions[-1])
    print(max_sent_length)   
    return questions,labels

            
                


In [5]:
# train and test set
test_questions,test_labels = read_data("trec_10.label")
train_questions,train_labels = read_data("file_1000.label")


18
33


In [6]:
#now pad shorter sentences

In [7]:
for qi,que in enumerate(train_questions):
    for _ in range(max_sent_length - len(que)):
        que.append("PAD")
    train_questions[qi] = que
print("train question padded")

train question padded


In [8]:
#now pad test Questions
for qi,que in enumerate(test_questions):
    for _ in range(max_sent_length - len(que)):
        que.append("PAD")
    test_questions[qi] = que
print("test questions padded")

test questions padded


In [9]:
def build_dataset(questions):
    words = []
    data_list = []
    count = []
    for d in questions:
        words.extend(d)
    count.extend(collections.Counter(words).most_common())
    
    dictionary = dict()
    for word,_ in count:
        dictionary[word] = len(dictionary)
    for d in questions:
        data = list()
        for word in d:
            index = dictionary[word]
            data.append(index)
        data_list.append(data)
        
    reverse_dictionary = dict(zip(dictionary.values() , dictionary.keys()))
    
    return data_list,count,dictionary,reverse_dictionary


In [10]:
all_questions = list(train_questions)
all_questions.extend(test_questions)
data_list,count,dictionary,reverse_dictionary = build_dataset(all_questions)

In [11]:
batch_size = 16
sent_length = max_sent_length
num_classes = 6
all_labels = ['NUM','LOC','HUM','DESC','ENTY','ABBR']
vocabulary_size = len(dictionary)

In [12]:
print(vocabulary_size)

3548


In [13]:
class BatchGenerator(object):
    
    def __init__(self,batch_size,questions,labels):
        self.questions = questions
        self.labels = labels
        self.batch_size = batch_size
        self.text_size = len(questions)
        self.data_index = 0
        
    def generate_batch(self):
        
        global sent_length,num_classes
        global dictionary,all_labels
        
        inputs = np.zeros((self.batch_size,sent_length,vocabulary_size),dtype = np.float32)
        label_op = np.zeros((self.batch_size,num_classes),dtype = np.float32)
        
        if self.data_index + self.batch_size > self.text_size:
            self.data_index = 0
        for qi, que in enumerate(self.questions[self.data_index:self.data_index + self.batch_size]):
            
            for wi,word in enumerate(que):
                inputs[qi,wi,dictionary[word]] = 1.0
            label_op[qi,all_labels.index(self.labels[self.data_index + qi])] = 1.0
        self.data_index = (self.data_index + self.batch_size)%self.text_size
        return inputs,label_op
    
    def return_index(self):
        return self.data_index
    
            
sample_gen = BatchGenerator(batch_size,train_questions,train_labels)

sample_batch_inputs,sample_batch_label = sample_gen.generate_batch()
sample_batch_inputs_2,sample_batch_label_2 = sample_gen.generate_batch()

print("sample batch label")
print(np.argmax(sample_batch_label,axis=1))
print(np.argmax(sample_batch_label_2,axis = 1))
print("Shape of sample batch size : (%d,%d,%d) " % (sample_batch_inputs.shape))
print("shape of sample label size : (%d,%d)" % (sample_batch_label.shape))

sample batch label
[3 4 3 4 5 2 2 2 3 2 0 3 2 2 4 1]
[3 0 3 3 0 4 2 3 3 4 2 1 4 1 5 4]
Shape of sample batch size : (16,33,3548) 
shape of sample label size : (16,6)


In [14]:
#defining hyperparaaters

In [15]:
tf.reset_default_graph

batch_size = 32

filter_size = [3,5,7]

sample_input = tf.placeholder(shape=[batch_size,sent_length,vocabulary_size],dtype = tf.float32,name = "sentence_input")
sample_label = tf.placeholder(shape=[batch_size,num_classes],dtype = tf.float32,name = "sentence_label")


In [16]:
#defining model parameters

In [17]:
w1 = tf.Variable(tf.truncated_normal([filter_size[0],vocabulary_size,1],stddev = 0.02, dtype = tf.float32),name = "weight_1")
b1 = tf.Variable(tf.random_uniform([1], 0,0.01, dtype = tf.float32),name = "bias_1")

w2 = tf.Variable(tf.truncated_normal([filter_size[1],vocabulary_size,1],stddev = 0.02,dtype = tf.float32), name = "weight_2")
b2 = tf.Variable(tf.random_uniform([1],0,0.01,dtype = tf.float32), name = "bias_2")

w3 = tf.Variable(tf.truncated_normal([filter_size[2],vocabulary_size,1],stddev = 0.02, dtype = tf.float32),name = "weight_3")
b3 = tf.Variable(tf.random_uniform([1], 0,0.01, dtype = tf.float32),name = "bias_3")

w_fc1 = tf.Variable(tf.truncated_normal([len(filter_size),num_classes], dtype = tf.float32), name = "weight_fulocn1")
b_fc1 = tf.Variable(tf.random_uniform([num_classes],0,0.01,dtype = tf.float32),name = "bias_fulcon1")


In [18]:
#defining inference of the cnn


In [19]:
h1_1 = tf.nn.relu(tf.nn.conv1d(sample_input,w1,stride=1,padding = "SAME") + b1)
h1_2 = tf.nn.relu(tf.nn.conv1d(sample_input,w2,stride = 1,padding= 'SAME') + b2)
h1_3 = tf.nn.relu(tf.nn.conv1d(sample_input,w3,stride = 1,padding = "SAME") + b3)


# pooling operation

h2_1 = tf.reduce_max(h1_1,axis=1)
h2_2 = tf.reduce_max(h1_2,axis = 1)
h2_3 = tf.reduce_max(h1_3,axis = 1)

h2 = tf.concat([h2_1,h2_2,h2_3], axis = 1)

#calculate the fully connectd layer output(no activation)

logits = tf.matmul(h2,w_fc1) + b_fc1

In [20]:
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=sample_label,logits = logits))

optimizer = tf.train.MomentumOptimizer(learning_rate = 0.01, momentum = 0.9).minimize(loss)

In [21]:
predictions = tf.argmax(tf.nn.softmax(logits),axis = 1)

In [22]:
#running our model

In [23]:
def accuracy(labels,preds):
    return np.sum(np.argmax(labels,axis=1)==preds)/labels.shape[0]

In [24]:
init = tf.global_variables_initializer()
num_steps = 50

train_gen = BatchGenerator(batch_size,train_questions,train_labels)
test_gen = BatchGenerator(batch_size,test_questions,test_labels)

test_interval = 5

with tf.Session() as sess:
    sess.run(init)
    for step in range(num_steps):
        avg_loss = []
        
        
        for tr_ in range((len(test_questions)//batch_size) - 1):
            tr_input,tr_label = train_gen.generate_batch()
            
            l, _ = sess.run([loss,optimizer],feed_dict = {sample_input :tr_input,sample_label:tr_label})
            avg_loss.append(l)
        print("train loss at epoch %d : %.2f" % (step,np.mean(avg_loss)))
        
        test_accuracy = []
        
        if(step+1)%test_interval ==0:
            for ts_i in range((len(test_questions)-1)//batch_size):
                
                ts_input,ts_label = test_gen.generate_batch()
                a = sess.run(predictions, feed_dict = {sample_input : ts_input,sample_label:ts_label})
                test_accuracy.append(accuracy(ts_label,a))
                
            print("mean test accuracy in epoch %d is %.3f" % (step,np.mean(test_accuracy)*100.0))

        
    
    

train loss at epoch 0 : 1.77
train loss at epoch 1 : 1.70
train loss at epoch 2 : 1.65
train loss at epoch 3 : 1.56
train loss at epoch 4 : 1.50
mean test accuracy in epoch 4 is 48.542
train loss at epoch 5 : 1.43
train loss at epoch 6 : 1.36
train loss at epoch 7 : 1.31
train loss at epoch 8 : 1.22
train loss at epoch 9 : 1.16
mean test accuracy in epoch 9 is 68.750
train loss at epoch 10 : 1.11
train loss at epoch 11 : 1.06
train loss at epoch 12 : 1.00
train loss at epoch 13 : 0.96
train loss at epoch 14 : 0.90
mean test accuracy in epoch 14 is 76.250
train loss at epoch 15 : 0.84
train loss at epoch 16 : 0.83
train loss at epoch 17 : 0.75
train loss at epoch 18 : 0.73
train loss at epoch 19 : 0.70
mean test accuracy in epoch 19 is 83.750
train loss at epoch 20 : 0.65
train loss at epoch 21 : 0.63
train loss at epoch 22 : 0.59
train loss at epoch 23 : 0.57
train loss at epoch 24 : 0.52
mean test accuracy in epoch 24 is 86.667
train loss at epoch 25 : 0.56
train loss at epoch 26 : 0.