In [1]:
#importing required libraries

from sklearn import model_selection, preprocessing, metrics
from sklearn import decomposition, ensemble
import pandas, numpy, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
from sklearn import model_selection
import pandas as pd
from keras.utils import np_utils
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional


In [2]:
# Load the pretrained fastText word vectors from 'cc.te.300.vec' into
# embeddings_index1: token -> float32 numpy vector (300 values per token are
# expected by the model-building cells further down).

embeddings_index1 = {}

import numpy as np

from keras.preprocessing import text, sequence

# `with` guarantees the file handle is closed once loading finishes; the bare
# open() inside the for-statement left it open for the life of the kernel.
with open('cc.te.300.vec', encoding="utf-8") as embedding_file:
    for i, line in enumerate(embedding_file):
        values = line.split()
        if not values:
            continue  # tolerate blank lines instead of failing on values[0]
        # word2vec-style text files begin with a "<vocab_size> <dim>" header.
        # It is metadata, not a word vector, so it must not become an entry.
        if i == 0 and len(values) == 2:
            continue
        # token as key, remaining fields parsed as the float32 vector
        embeddings_index1[values[0]] = np.asarray(values[1:], dtype='float32')

In [3]:
# Load the pretrained byte-pair-encoding vectors from
# 'te.wiki.bpe.vs200000.d300.w2v.txt' into embeddings_index2:
# token -> float32 numpy vector (300 values per token are expected by the
# model-building cells further down).

embeddings_index2 = {}

# `with` guarantees the file handle is closed once loading finishes; the bare
# open() inside the for-statement left it open for the life of the kernel.
with open('te.wiki.bpe.vs200000.d300.w2v.txt', encoding="utf-8") as embedding_file:
    for i, line in enumerate(embedding_file):
        values = line.split()
        if not values:
            continue  # tolerate blank lines instead of failing on values[0]
        # word2vec-style text files begin with a "<vocab_size> <dim>" header.
        # It is metadata, not a word vector, so it must not become an entry.
        if i == 0 and len(values) == 2:
            continue
        # token as key, remaining fields parsed as the float32 vector
        embeddings_index2[values[0]] = np.asarray(values[1:], dtype='float32')

In [4]:
#creating convolutional neural network model

def create_cnn(input_size,vocab_size,embedding_matrix):
    """Build and compile a 1-D convolutional classifier over frozen embeddings.

    Args:
        input_size: length of each padded token-id sequence fed to the model.
        vocab_size: number of rows of the embedding table.
        embedding_matrix: pretrained weights loaded into the embedding layer;
            the layer is declared with 300 columns, so the matrix is expected
            to be (vocab_size, 300).

    Returns:
        A compiled Keras functional model that outputs an 8-way softmax.
    """
    tokens_in = layers.Input((input_size, ))

    # Pretrained embeddings are loaded as-is and excluded from training.
    frozen_embedding = layers.Embedding(
        vocab_size,
        300,
        weights=[embedding_matrix],
        trainable=False,
    )
    features = frozen_embedding(tokens_in)
    features = layers.SpatialDropout1D(0.25)(features)

    # 256 filters with kernel size 3, then keep the per-filter maximum over
    # the sequence axis.
    features = layers.Convolution1D(256, 3, activation="tanh")(features)
    features = layers.GlobalMaxPool1D()(features)

    # Classification head: hidden dense layer, dropout, 8-class softmax.
    features = layers.Dense(128, activation="tanh")(features)
    features = layers.Dropout(0.25)(features)
    class_probs = layers.Dense(8, activation="softmax")(features)

    classifier = models.Model(inputs=tokens_in, outputs=class_probs)
    classifier.compile(
        optimizer=optimizers.Adam(),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

In [5]:
def create_rnn_lstm(input_size,vocab_size,embedding_matrix):
    """Build and compile an LSTM classifier over frozen embeddings.

    Args:
        input_size: length of each padded token-id sequence fed to the model.
        vocab_size: number of rows of the embedding table.
        embedding_matrix: pretrained weights loaded into the embedding layer;
            the layer is declared with 300 columns, so the matrix is expected
            to be (vocab_size, 300).

    Returns:
        A compiled Keras functional model that outputs an 8-way softmax.
    """
    tokens_in = layers.Input((input_size, ))

    # Pretrained embeddings are loaded as-is and excluded from training.
    embedded = layers.Embedding(
        vocab_size, 300, weights=[embedding_matrix], trainable=False
    )(tokens_in)
    embedded = layers.SpatialDropout1D(0.25)(embedded)

    # Single LSTM with 100 units; only its final output is passed on.
    sequence_summary = layers.LSTM(100)(embedded)

    # Classification head: hidden dense layer, dropout, 8-class softmax.
    hidden = layers.Dense(128, activation="tanh")(sequence_summary)
    hidden = layers.Dropout(0.25)(hidden)
    class_probs = layers.Dense(8, activation="softmax")(hidden)

    classifier = models.Model(inputs=tokens_in, outputs=class_probs)
    classifier.compile(
        optimizer=optimizers.Adam(),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

In [6]:
def create_rnn_gru(input_size,vocab_size,embedding_matrix):
    """Build and compile a GRU classifier over frozen embeddings.

    Args:
        input_size: length of each padded token-id sequence fed to the model.
        vocab_size: number of rows of the embedding table.
        embedding_matrix: pretrained weights loaded into the embedding layer;
            the layer is declared with 300 columns, so the matrix is expected
            to be (vocab_size, 300).

    Returns:
        A compiled Keras functional model that outputs an 8-way softmax.
    """
    tokens_in = layers.Input((input_size, ))

    # Pretrained embeddings are loaded as-is and excluded from training.
    embedded = layers.Embedding(
        vocab_size, 300, weights=[embedding_matrix], trainable=False
    )(tokens_in)
    embedded = layers.SpatialDropout1D(0.25)(embedded)

    # Single GRU with 100 units; only its final output is passed on.
    gru_summary = layers.GRU(100)(embedded)

    # Classification head: hidden dense layer, dropout, 8-class softmax.
    hidden = layers.Dense(128, activation="tanh")(gru_summary)
    hidden = layers.Dropout(0.25)(hidden)
    class_probs = layers.Dense(8, activation="softmax")(hidden)

    classifier = models.Model(inputs=tokens_in, outputs=class_probs)
    classifier.compile(
        optimizer=optimizers.Adam(),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

In [7]:
def create_rnn_bi(input_size,vocab_size,embedding_matrix):
    """Build and compile a bidirectional-GRU classifier over frozen embeddings.

    Note that the recurrent cell wrapped by ``Bidirectional`` is a GRU, not an
    LSTM.

    Args:
        input_size: length of each padded token-id sequence fed to the model.
        vocab_size: number of rows of the embedding table.
        embedding_matrix: pretrained weights loaded into the embedding layer;
            the layer is declared with 300 columns, so the matrix is expected
            to be (vocab_size, 300).

    Returns:
        A compiled Keras functional model that outputs an 8-way softmax.
    """
    tokens_in = layers.Input((input_size, ))

    # Pretrained embeddings are loaded as-is and excluded from training.
    embedded = layers.Embedding(
        vocab_size, 300, weights=[embedding_matrix], trainable=False
    )(tokens_in)
    embedded = layers.SpatialDropout1D(0.25)(embedded)

    # A 100-unit GRU wrapped so the sequence is read in both directions.
    bidirectional_summary = layers.Bidirectional(layers.GRU(100))(embedded)

    # Classification head: hidden dense layer, dropout, 8-class softmax.
    hidden = layers.Dense(128, activation="tanh")(bidirectional_summary)
    hidden = layers.Dropout(0.25)(hidden)
    class_probs = layers.Dense(8, activation="softmax")(hidden)

    classifier = models.Model(inputs=tokens_in, outputs=class_probs)
    classifier.compile(
        optimizer=optimizers.Adam(),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

In [8]:
def QC_Embedding(x_train,x_valid,vocab_size,input_size,embedding_matrix,input_query,encoder,token):
    """Train each model on one embedding, report accuracy and classify the query.

    For the CNN, LSTM, bidirectional and GRU builders, in that order, this
    builds a fresh model, fits it for 20 epochs, prints its accuracy on the
    validation split, and prints the decoded label predicted for
    ``input_query``.

    Args:
        x_train: two-item sequence ``[padded_sequences, one_hot_labels]``.
        x_valid: two-item sequence ``[padded_sequences, one_hot_labels]``.
        vocab_size: number of rows of the embedding table.
        input_size: padded sequence length the models are built for.
        embedding_matrix: pretrained embedding weights handed to each builder.
        input_query: raw texts to classify; passed to
            ``token.texts_to_sequences``.
        encoder: fitted label encoder; ``inverse_transform`` maps predicted
            class ids back to label strings.
        token: fitted tokenizer providing ``texts_to_sequences``.

    Returns:
        None. Results are printed.
    """
    train_seq_x, trainLabels = x_train[0], x_train[1]
    valid_seq_x, validLabels = x_valid[0], x_valid[1]

    # One-hot rows -> class ids, computed once and shared by every model.
    valid_class_ids = validLabels.argmax(axis=-1)

    # Pad the query to the same length the models were built with. This was
    # previously a hard-coded 32, which only worked while input_size == 32.
    query_seq = sequence.pad_sequences(
        token.texts_to_sequences(input_query), maxlen=input_size
    )

    # (printed heading, model builder), kept in the original training order.
    # NOTE(review): the "RNN BI LSTM Model" heading is attached to
    # create_rnn_bi, which wraps a GRU rather than an LSTM; confirm whether the
    # heading or the layer is the intended one.
    model_specs = [
        ("CNN Model: ", create_cnn),
        ("\nRNN LSTM Model: ", create_rnn_lstm),
        ("\nRNN BI LSTM Model: ", create_rnn_bi),
        ("\nRNN GRU Model: ", create_rnn_gru),
    ]

    for heading, build_model in model_specs:
        model = build_model(input_size, vocab_size, embedding_matrix)
        # verbose=0 suppresses the per-epoch progress output
        model.fit(train_seq_x, trainLabels, epochs=20, verbose=0)

        predicted_class_ids = model.predict(valid_seq_x).argmax(axis=-1)
        # accuracy_score takes (y_true, y_pred)
        acc = metrics.accuracy_score(valid_class_ids, predicted_class_ids)
        print(heading, acc)

        # Decode the predicted class id of the query back to its label string.
        query_class_ids = model.predict(query_seq).argmax(axis=-1)
        print(encoder.inverse_transform(query_class_ids))
       

In [9]:
def Question_Classification_DL(input_query):
    """Train and evaluate all deep-learning classifiers, then classify a query.

    Reads the labelled questions from ``data/Dataset.txt``, splits them into
    train and validation sets, tokenises and pads the text, builds one
    embedding matrix per pretrained embedding (``embeddings_index1`` and
    ``embeddings_index2``, defined at notebook level), and runs
    ``QC_Embedding`` once for each.

    Args:
        input_query: list of raw question strings to classify.

    Returns:
        None. Results are printed.
    """
    NUM_CLASSES = 8       # must match the Dense(8) output layer of the models
    EMBEDDING_DIM = 300   # must match the Embedding(vocab_size, 300, ...) layers
    input_size = 32       # padded sequence length

    tags, queries = [], []

    # A forward slash works on every platform. The previous "data\Dataset.txt"
    # relied on the invalid "\D" escape and only resolved on Windows.
    # `with` also closes the file, which was previously left open.
    with open("data/Dataset.txt", "r", encoding="utf-8") as f:
        for line in f:
            line = line.rstrip('\n')
            fields = line.split()
            if not fields:
                continue  # blank line: previously raised IndexError
            lb = fields[0].split(":")[0]
            if len(lb) == 0:
                # No label: skip the whole line so tags and queries stay
                # aligned (previously only the query was appended).
                continue
            tags.append(lb)
            # assumes the question text starts at column 5, i.e. a 4-character
            # label plus one separator character; TODO confirm against the
            # dataset
            queries.append(line[5:])

    # queries contains the questions, tags contains their labels
    trainDF = pd.DataFrame({'text': queries, 'label': tags})

    # split text/labels into train and validation sets using the
    # train_test_split default test size
    train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
        trainDF['text'], trainDF['label'], random_state=10
    )

    # Fit the encoder once on every label and only transform each split.
    # Refitting on the validation split, as before, could assign different
    # ids to the same label whenever the two splits do not contain the same
    # set of classes.
    encoder = preprocessing.LabelEncoder()
    encoder.fit(trainDF['label'])
    trainLabels = np.asarray(
        np_utils.to_categorical(encoder.transform(train_y), num_classes=NUM_CLASSES)
    )
    validLabels = np.asarray(
        np_utils.to_categorical(encoder.transform(valid_y), num_classes=NUM_CLASSES)
    )

    token = text.Tokenizer()
    token.fit_on_texts(trainDF['text'])
    word_index = token.word_index
    vocab_size = len(word_index) + 1  # +1: row 0 is never assigned a word below

    # pad every sentence to the same length
    train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=input_size)
    valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), maxlen=input_size)

    def _build_embedding_matrix(embeddings_index):
        # One row per tokenizer index; words without a pretrained vector keep
        # an all-zero row.
        matrix = np.zeros((vocab_size, EMBEDDING_DIM))
        for word, i in word_index.items():
            embedding_vector = embeddings_index.get(word)
            # Ignore entries of the wrong length (e.g. a file-header line that
            # was loaded as if it were a word); numpy would otherwise
            # broadcast a length-1 vector across the whole row.
            if embedding_vector is not None and np.shape(embedding_vector) == (EMBEDDING_DIM,):
                matrix[i] = embedding_vector
        return matrix

    embedding_matrix1 = _build_embedding_matrix(embeddings_index1)
    embedding_matrix2 = _build_embedding_matrix(embeddings_index2)

    x_train = [train_seq_x, trainLabels]
    x_valid = [valid_seq_x, validLabels]

    print("With Deep learning Techniques")
    print("\n")
    print("Embedding using FastText Pre trained Embedding")
    QC_Embedding(x_train,x_valid,vocab_size,input_size,embedding_matrix1,input_query,encoder,token)
    print("\n")
    print("\nEmbedding using Byte-Pair Encoding Pre trained Embedding")
    QC_Embedding(x_train,x_valid,vocab_size,input_size,embedding_matrix2,input_query,encoder,token)
    
    


In [10]:
# Sample Telugu question to classify (roughly: "What is the capital of
# Madhya Pradesh?"). Named sample_queries rather than `input` so the Python
# builtin input() is not shadowed for the rest of the notebook.
sample_queries = ['మద్యప్రదేశ్ రాజధాని ఏమిటి?']

Question_Classification_DL(sample_queries)

With Deep learning Techniques


Embedding using FastText Pre trained Embedding
CNN Model:  0.8776119402985074
['LOCA']

RNN LSTM Model:  0.8776119402985074
['LOCA']

RNN BI LSTM Model:  0.9014925373134328
['LOCA']

RNN GRU Model:  0.9014925373134328
['LOCA']



Embedding using Byte-Pair Encoding Pre trained Embedding
CNN Model:  0.8686567164179104
['LOCA']

RNN LSTM Model:  0.8537313432835821
['LOCA']

RNN BI LSTM Model:  0.8656716417910447
['LOCA']

RNN GRU Model:  0.8507462686567164
['LOCA']
