In [1]:
# Importing library
import numpy as np
import pandas as pd
import nltk
import re

# for ignoring warning
import warnings
warnings.simplefilter("ignore")

In [2]:
df = pd.read_csv(r"C:\Users\wel\Downloads\LabelledData.txt",sep=",,,",header=None ,names=['Question','Category'])
df.head() 
# .head() is used for looking up first 5 row of dataset


Unnamed: 0,Question,Category
0,how did serfdom develop in and then leave russ...,unknown
1,what films featured the character popeye doyle ?,what
2,how can i find a list of celebrities ' real na...,unknown
3,what fowl grabs the spotlight after the chines...,what
4,what is the full form of .com ?,what


In [3]:
print("The shape of Labelled Data is :",df.shape)



The shape of Labelled Data is : (1483, 2)


In [4]:
print(" The category of Labelled Data Questions are :",df['Category'].unique())

 The category of Labelled Data Questions are : [' unknown' ' what' ' when' ' who' '  what' '  who' ' affirmation']


In [5]:
df.isnull().sum()
# checking for null value

Question    0
Category    0
dtype: int64

#  Using WordNet Lemmatizer

In [6]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import wordnet as wn

In [7]:
class StemTokenizer(object):
    def __init__(self):
        self.ignore_set = {'footnote', 'nietzsche', 'plato', 'mr.'}

    def __call__(self, doc):
        words = []
        for word in word_tokenize(doc):
            word = word.lower()
            w = wn.morphy(word)
            if w and len(w) > 1 and w not in self.ignore_set:
                words.append(w)
        return words

In [8]:
lemmatizer=WordNetLemmatizer()
def stem_tokenize(text):
    return [lemmatizer.lemmatize(i) for i in word_tokenize(text)]

# Using Naive_bayes classifier model

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import pickle as pkl
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [10]:
vectorizer = CountVectorizer(analyzer='word',lowercase=True,tokenizer=stem_tokenize)
X_train = vectorizer.fit_transform(df.Question.values)
with open('vectorizer.pk', 'wb') as fin:
    pkl.dump(vectorizer, fin)


In [11]:
labels = df['Category']

# using train-test split to train the model

In [12]:
# split the data into a training set and a validation set
VALIDATION_SPLIT=0.10
indices = np.arange(X_train.shape[0])
np.random.shuffle(indices)
X_train = X_train[indices]
labels = labels[indices]
validation_samples = int(VALIDATION_SPLIT * X_train.shape[0])

x_train = X_train[:-validation_samples]
y_train = labels[:-validation_samples]
x_val = X_train[-validation_samples:]
y_val = labels[-validation_samples:]

In [13]:
clf = MultinomialNB()
clf.fit(x_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [14]:
# evaluate the model of test data
preds = clf.predict(x_val)
print(classification_report(preds,y_val))
print("Accuracy of the model is:",clf.score(x_val,y_val) )

              precision    recall  f1-score   support

 affirmation       0.43      1.00      0.60         3
     unknown       0.68      0.95      0.79        20
        what       0.99      0.86      0.92        80
        when       0.43      0.75      0.55         4
         who       1.00      0.88      0.94        41

    accuracy                           0.88       148
   macro avg       0.70      0.89      0.76       148
weighted avg       0.92      0.88      0.89       148

Accuracy of the model is: 0.8783783783783784


# validating the model for sentence

In [15]:
# example 1
example=vectorizer.transform([" How are you"])
clf.predict(example)

array([' unknown'], dtype='<U12')

In [16]:
# example 2
example=vectorizer.transform([" who are you"])
clf.predict(example)

array([' who'], dtype='<U12')

# Using Long short-term memory ( LSTM) 

In [17]:
# importing neccessary packges
import tensorflow
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


In [18]:
MAX_NB_WORDS = 20000
MAX_SEQUENCE_LENGTH=30

In [19]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [20]:
data=df.copy()

In [21]:
print(data['Category'].value_counts())

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, split=' ')
tokenizer.fit_on_texts(data['Question'].values)
X = tokenizer.texts_to_sequences(data['Question'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

 what           607
 who            401
 unknown        272
 affirmation    104
 when            96
  what            2
  who             1
Name: Category, dtype: int64


In [22]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Y = data['Category']
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(Y)
Y=le.transform(Y) 
labels = to_categorical(np.asarray(Y))
print('Shape of data tensor:', X.shape)
print('Shape of label tensor:', labels.shape)


# split the data into a training set and a validation set
indices = np.arange(X.shape[0])
np.random.shuffle(indices)
X = X[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * X.shape[0])

x_train = X[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = X[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]


Found 3675 unique tokens.
Shape of data tensor: (1483, 30)
Shape of label tensor: (1483, 7)


In [24]:
""" Using GLoVe pretrained model for word embedding to trained the model for better accuracy
so, i download the large file 'glove.42B.300d' and save it to my local directory"""
embeddings_index = {}
f = open(r'C:\Users\wel\Downloads\glove.42B.300d\glove.42B.300d.txt',encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 1917494 word vectors.


In [25]:
EMBEDDING_DIM=300
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [26]:
from keras.layers import Embedding

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
embed_dim = 300
lstm_out = 196

model = Sequential()
model.add(embedding_layer)
model.add(LSTM(lstm_out, dropout_U=0.25, dropout_W=0.25))
model.add(Dense(7,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 30, 300)           1102800   
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               389648    
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 1379      
Total params: 1,493,827
Trainable params: 391,027
Non-trainable params: 1,102,800
_________________________________________________________________
None


In [27]:
model.fit(x_train, y_train,batch_size=128,epochs=20,validation_data=(x_val, y_val))

Train on 1335 samples, validate on 148 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x27d2f5ccdc8>

# # Validating our model for exceptional cases that result are not accurate by Naive-bayes classifier model

In [28]:
example = tokenizer.texts_to_sequences(["is this time does the train leave"])
example = pad_sequences(example, maxlen=MAX_SEQUENCE_LENGTH)
le.inverse_transform(np.argmax(model.predict(example),axis=1))


array([' affirmation'], dtype=object)

In [29]:
example = tokenizer.texts_to_sequences(["What time does the train leave"])
example = pad_sequences(example, maxlen=MAX_SEQUENCE_LENGTH)
le.inverse_transform(np.argmax(model.predict(example),axis=1))


array([' when'], dtype=object)