In [1]:
# Importing library
import numpy as np
import pandas as pd
import nltk
import re

# for ignoring warning
import warnings
warnings.simplefilter("ignore")

In [2]:
df = pd.read_csv(r"C:\Users\wel\Downloads\LabelledData.txt",sep=",,,",header=None ,names=['Question','Category'])
df.head() 
# .head() is used for looking up first 5 row of dataset


Unnamed: 0,Question,Category
0,how did serfdom develop in and then leave russ...,unknown
1,what films featured the character popeye doyle ?,what
2,how can i find a list of celebrities ' real na...,unknown
3,what fowl grabs the spotlight after the chines...,what
4,what is the full form of .com ?,what


In [3]:
print("The shape of Labelled Data is :",df.shape)



The shape of Labelled Data is : (1483, 2)


In [4]:
print(" The category of Labelled Data Questions are :",df['Category'].unique())

 The category of Labelled Data Questions are : [' unknown' ' what' ' when' ' who' '  what' '  who' ' affirmation']


In [5]:
df.isnull().sum()
# checking for null value

Question    0
Category    0
dtype: int64

#  Using WordNet Lemmatizer

In [6]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import wordnet as wn

In [7]:
class StemTokenizer(object):
    def __init__(self):
        self.ignore_set = {'footnote', 'nietzsche', 'plato', 'mr.'}

    def __call__(self, doc):
        words = []
        for word in word_tokenize(doc):
            word = word.lower()
            w = wn.morphy(word)
            if w and len(w) > 1 and w not in self.ignore_set:
                words.append(w)
        return words

In [8]:
lemmatizer=WordNetLemmatizer()
def stem_tokenize(text):
    return [lemmatizer.lemmatize(i) for i in word_tokenize(text)]

# Using Naive_bayes classifier model

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import pickle as pkl
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [10]:
vectorizer = CountVectorizer(analyzer='word',lowercase=True,tokenizer=stem_tokenize)
X_train = vectorizer.fit_transform(df.Question.values)
with open('vectorizer.pk', 'wb') as fin:
    pkl.dump(vectorizer, fin)


In [11]:
labels = df['Category']

# using train-test split to train the model

In [12]:
# split the data into a training set and a validation set
VALIDATION_SPLIT=0.10
indices = np.arange(X_train.shape[0])
np.random.shuffle(indices)
X_train = X_train[indices]
labels = labels[indices]
validation_samples = int(VALIDATION_SPLIT * X_train.shape[0])

x_train = X_train[:-validation_samples]
y_train = labels[:-validation_samples]
x_val = X_train[-validation_samples:]
y_val = labels[-validation_samples:]

In [13]:
clf = MultinomialNB()
clf.fit(x_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [20]:
# evaluate the model of test data
preds = clf.predict(x_val)
print(classification_report(preds,y_val))
print("Accuracy of the model is:",clf.score(x_val,y_val) )

              precision    recall  f1-score   support

 affirmation       0.50      1.00      0.67         4
     unknown       0.86      0.79      0.83        24
        what       0.98      0.86      0.92        73
        when       0.20      1.00      0.33         2
         who       1.00      0.98      0.99        45

    accuracy                           0.89       148
   macro avg       0.71      0.93      0.75       148
weighted avg       0.95      0.89      0.91       148

Accuracy of the model is: 0.8918918918918919


# validating the model for sentence

In [24]:
# example 1
example=vectorizer.transform([" How are you"])
clf.predict(example)

array([' unknown'], dtype='<U12')

In [25]:
# example 2
example=vectorizer.transform([" who are you"])
clf.predict(example)

array([' who'], dtype='<U12')