<a href="https://colab.research.google.com/github/riyakb/Relevant-Answer-Search-Engine/blob/master/Question_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Mount Google Drive so the project files are reachable from the Colab runtime.

from google.colab import drive
drive.mount('/content/drive')
# Switch the working directory to the project folder on Drive; the relative
# paths used in later cells are resolved against it.

import os
os.chdir("/content/drive/My Drive/Colab Notebooks/Relevant Answer Search Engine")

In [0]:
!pip install sner
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re, nltk
import gensim
import codecs
from sner import Ner
import spacy
from sklearn.metrics import confusion_matrix, accuracy_score, average_precision_score
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.internals import find_jars_within_path
from nltk.tag import StanfordPOSTagger
from nltk.tag import StanfordNERTagger
import spacy
from sklearn import linear_model
from sklearn import svm
from sklearn.metrics import fbeta_score, accuracy_score
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
# Read the raw question files: one row per line, exactly as readlines() yields
# it (label and question text still fused, trailing newline preserved).
# Read-only mode is sufficient here, and the context managers close both
# handles as soon as the lines are loaded instead of leaking them for the
# lifetime of the kernel.
with open('Question-Classification-master/traininig_dataset (1) (1).txt', 'r') as f_train:
    train = pd.DataFrame(f_train.readlines(), columns = ['Question'])
with open('Question-Classification-master/validation_dataset (1) (1).txt', 'r') as f_test:
    test = pd.DataFrame(f_test.readlines(), columns = ['Question'])

In [0]:
# Every raw line has the form "<coarse>:<fine> <question text>".
# Peel the label off the front of each line, then break it into its two levels.
for frame in (train, test):
    # The label must be read before 'Question' is overwritten with the bare text.
    frame['QType'] = frame['Question'].apply(lambda line: line.split(' ', 1)[0])
    frame['Question'] = frame['Question'].apply(lambda line: line.split(' ', 1)[1])
    frame['QType-Coarse'] = frame['QType'].apply(lambda label: label.split(':')[0])
    frame['QType-Fine'] = frame['QType'].apply(lambda label: label.split(':')[1])

As can be observed, the train set contains some duplicate questions (81 to be exact). <br>
The number of unique Coarse:Fine classes is 50 whereas entries corresponding to 42 are present in the test set. <br>
The number of fine classes overall is 47 whereas entries corresponding to 39 are present in test.

In [0]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the three label columns. Each encoder is fitted on the train
# and test labels together so that both frames share one label -> id mapping.

# Full "coarse:fine" label
full_labels = pd.Series(train['QType'].tolist() + test['QType'].tolist()).values
le = LabelEncoder().fit(full_labels)
train['QType'] = le.transform(train['QType'].values)
test['QType'] = le.transform(test['QType'].values)

# Coarse label only
coarse_labels = pd.Series(train['QType-Coarse'].tolist() + test['QType-Coarse'].tolist()).values
le2 = LabelEncoder().fit(coarse_labels)
train['QType-Coarse'] = le2.transform(train['QType-Coarse'].values)
test['QType-Coarse'] = le2.transform(test['QType-Coarse'].values)

# Fine label only
fine_labels = pd.Series(train['QType-Fine'].tolist() + test['QType-Fine'].tolist()).values
le3 = LabelEncoder().fit(fine_labels)
train['QType-Fine'] = le3.transform(train['QType-Fine'].values)
test['QType-Fine'] = le3.transform(test['QType-Fine'].values)

In [0]:
# Concatenate the train and test questions (train first) into one string Series
# so that both go through the same preprocessing pass.
all_corpus = pd.Series(train.Question.tolist() + test.Question.tolist()).astype(str)

Obtaining Dotwords.<br>
Also, performing text cleaning and pre-processing in the next two blocks

In [0]:
# Fetch the NLTK data used by the preprocessing below: the stopword list and
# the WordNet corpus.
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# Exploratory scan, left disabled: collects every whitespace-separated token of
# the combined corpus that contains a '.' and is longer than two characters.
# dot_words = []
# for row in all_corpus:
#     for word in row.split():
#         if '.' in word and len(word)>2:
#             dot_words.append(word)

In [0]:
def text_clean(corpus, keep_list):
    '''
    Purpose : Function to keep only alphabets and digits in every word of a corpus (punctuations, qmarks etc. are
              replaced) and lower-case it, while leaving the words listed in 'keep_list' untouched

    Input : 'corpus' - Iterable of strings, one document per element
            'keep_list' - Words to be retained verbatim. A word is matched exactly (case-sensitive) against the
                          whitespace-separated tokens of each document

    Output : Returns the cleaned text corpus as a pandas Series of strings, one per input document, in input
             order, with a default 0..n-1 index

    Note : Each removed character is replaced by a space rather than deleted, so a cleaned document may contain
           trailing or repeated spaces (e.g. "GDP?" becomes "gdp ")

    '''
    non_alnum = re.compile('[^a-zA-Z0-9]')
    keep = set(keep_list)

    cleaned_rows = []
    for row in corpus:
        words = [word if word in keep else non_alnum.sub(' ', word).lower()
                 for word in row.split()]
        cleaned_rows.append(' '.join(words))

    # Build the Series once at the end: calling Series.append inside the loop copied the whole Series on every
    # row (quadratic) and that method no longer exists in pandas >= 2.0
    return pd.Series(cleaned_rows, dtype=object)

In [0]:
def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):
    
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)
    
    Input : 
    'corpus' - Text corpus (iterable of strings) on which pre-processing tasks will be performed
    'keep_list' - List of words to be retained during cleaning process
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should 
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. 'snowball' corresponds to Snowball Stemmer;
                  any other value (including the default "None") corresponds to Porter Stemmer
    
    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together
    
    Order of operations : cleaning -> whitespace tokenisation -> stopword removal -> lemmatization (as verbs) -> stemming
    
    Output : Returns the processed text corpus as a list of strings, one per input document, with tokens joined by
             single spaces
    
    '''
    if cleaning:
        corpus = text_clean(corpus, keep_list)
    
    # Tokenise on whitespace once; every later stage works on lists of tokens
    tokenised = [row.split() for row in corpus]
    
    if remove_stopwords:
        # Question words are kept in the text, so they are taken out of the stopword set.
        # Set difference (instead of set.remove) does not raise if the installed stopword list lacks one of them
        wh_words = {'who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom'}
        stop = set(stopwords.words('english')) - wh_words
        tokenised = [[token for token in row if token not in stop] for row in tokenised]
    
    if lemmatization:
        lem = WordNetLemmatizer()
        tokenised = [[lem.lemmatize(token, pos = 'v') for token in row] for row in tokenised]
    
    if stemming:
        if stem_type == 'snowball':
            stemmer = SnowballStemmer(language = 'english')
        else :
            stemmer = PorterStemmer()
        tokenised = [[stemmer.stem(token) for token in row] for row in tokenised]
    
    return [' '.join(row) for row in tokenised]

In [0]:
# Dotted abbreviations that must pass through cleaning verbatim; text_clean
# would otherwise replace their '.' with spaces and lower-case them.
common_dot_words = ['U.S.', 'St.', 'Mr.', 'Mrs.', 'D.C.']
# Clean the combined train + test corpus and drop stopwords (stemming and
# lemmatization stay at their default: off). The result is a list of strings.
all_corpus = preprocess(all_corpus, keep_list = common_dot_words, remove_stopwords = True)

# Taking the preprocessed training questions back out of the combined corpus and loading the texts to classify from data.tsv

In [0]:
# Load the tab-separated file without a header row, so its columns are labelled 0, 1, 2, ...
train_data=pd.read_csv('data.tsv',header=None,sep='\t')

In [0]:
# Keep only the rows whose column 3 equals 1.
# NOTE(review): presumably column 3 is a relevance/label flag — confirm against the data.tsv schema.
train_data=train_data.loc[train_data[3] == 1]

In [0]:
# The first len(train) entries of the combined corpus are the preprocessed training questions.
train_corpus = all_corpus[0:train.shape[0]]
# Despite its name, train_data supplies the texts to classify: column 1 of data.tsv.
# NOTE(review): these texts are used raw — they are not passed through preprocess()
# like the training questions. Confirm this train/inference mismatch is intended.
test_corpus = train_data[1].tolist()

Loading the English model for Spacy.<br>
NLTK version for the same performs too slowly, hence opting for Spacy.

In [0]:
!python -m spacy download en
nlp = spacy.load('en')

# Obtaining Features from Train Data, which would be fed to CountVectorizer

Creating lists of Named Entities, Lemmas, POS Tags, Syntactic Dependency Relations and Orthographic Features (token shapes).<br>
Later, these would be used as features for our model.

In [0]:
# Run every training question through spaCy and record, per question, one
# space-separated string for each token attribute plus one for its entity labels.
all_ner, all_lemma, all_tag, all_dep, all_shape = [], [], [], [], []
for row in train_corpus:
    doc = nlp(row)
    # Token-level attributes
    all_lemma.append(" ".join(token.lemma_ for token in doc))
    all_tag.append(" ".join(token.tag_ for token in doc))
    all_dep.append(" ".join(token.dep_ for token in doc))
    all_shape.append(" ".join(token.shape_ for token in doc))
    # Entity labels come from the document's entity spans, not from its tokens
    all_ner.append(" ".join(ent.label_ for ent in doc.ents))

Converting the attributes obtained above into vectors using CountVectorizer.

In [0]:
# One unigram + bigram count vocabulary per feature family, learned from the
# training strings only; each fitted vectorizer is kept for reuse on the test set.
count_vec_ner = CountVectorizer(ngram_range=(1, 2))
ner_ft = count_vec_ner.fit_transform(all_ner)

count_vec_lemma = CountVectorizer(ngram_range=(1, 2))
lemma_ft = count_vec_lemma.fit_transform(all_lemma)

count_vec_tag = CountVectorizer(ngram_range=(1, 2))
tag_ft = count_vec_tag.fit_transform(all_tag)

count_vec_dep = CountVectorizer(ngram_range=(1, 2))
dep_ft = count_vec_dep.fit_transform(all_dep)

count_vec_shape = CountVectorizer(ngram_range=(1, 2))
shape_ft = count_vec_shape.fit_transform(all_shape)

Combining the features obtained into 1 matrix

In [0]:
# Variant that also includes the dependency and shape features (currently disabled):
#x_all_ft_train = hstack([ner_ft, lemma_ft, tag_ft, dep_ft, shape_ft])
# Active feature set: NER, lemma and POS-tag n-gram counts stacked column-wise.
x_all_ft_train = hstack([ner_ft, lemma_ft, tag_ft])

Converting from COOrdinate format to Compressed Sparse Row format for easier mathematical computations.

In [0]:
# Convert the stacked matrix to CSR; the bare name on the last line displays
# its summary as the cell output.
x_all_ft_train = x_all_ft_train.tocsr()
x_all_ft_train

# Now we will obtain the Feature vectors for the test set using the CountVectorizers Obtained from the Training Corpus

In [0]:
# Same per-text feature strings as for the training questions, built for the
# texts to classify. Progress is printed every 500 texts.
all_test_ner, all_test_lemma, all_test_tag, all_test_dep, all_test_shape = [], [], [], [], []
for i, row in enumerate(test_corpus):
    doc = nlp(row)
    # Token-level attributes
    all_test_lemma.append(" ".join(token.lemma_ for token in doc))
    all_test_tag.append(" ".join(token.tag_ for token in doc))
    all_test_dep.append(" ".join(token.dep_ for token in doc))
    all_test_shape.append(" ".join(token.shape_ for token in doc))
    # Entity labels come from the document's entity spans, not from its tokens
    all_test_ner.append(" ".join(ent.label_ for ent in doc.ents))
    if i % 500 == 0:
        print(i)

In [0]:
# Project the test strings onto the vocabularies learned from the training data
# (transform only, no refitting) so that train and test columns line up.
ner_test_ft = count_vec_ner.transform(all_test_ner)
lemma_test_ft = count_vec_lemma.transform(all_test_lemma)
tag_test_ft = count_vec_tag.transform(all_test_tag)
dep_test_ft = count_vec_dep.transform(all_test_dep)
shape_test_ft = count_vec_shape.transform(all_test_shape)

In [0]:
# Variant that also includes the dependency and shape features (currently disabled):
#x_all_ft_test = hstack([ner_test_ft, lemma_test_ft, tag_test_ft, dep_test_ft, shape_test_ft])
# Must stack the same feature families, in the same order, as x_all_ft_train.
x_all_ft_test = hstack([ner_test_ft, lemma_test_ft, tag_test_ft])

In [0]:
# Convert the stacked test matrix to CSR; the bare name on the last line
# displays its summary as the cell output.
x_all_ft_test = x_all_ft_test.tocsr()
x_all_ft_test

# Model Training
Literature study over the years has shown Linear SVM performs best in this Use Case.

In [0]:
# Linear support-vector classifier, created with the library's default hyperparameters.
model = svm.LinearSVC()

First Modelling for Coarse Classes

In [0]:
# Fit on the stacked training features against the integer-encoded coarse labels.
model.fit(x_all_ft_train, train['QType-Coarse'].values)

# Model Evaluation

In [0]:
# Predict a coarse class id for every row of the test feature matrix.
preds = model.predict(x_all_ft_test)

In [0]:
# Display the number of predictions produced.
len(preds)

In [0]:
# Map each text to its predicted class id, cast to a plain int so that it can
# be JSON-serialised. The ids are le2's encoding of the coarse labels.
# Identical texts collapse to a single key (the last prediction wins).
dic=dict(zip(test_corpus, list(map(int, preds))))

In [0]:
import json
# Persist the text -> class-id mapping as JSON in the current working directory.
with open('qc.json', 'w') as fp:
    json.dump(dic, fp)

# Conclusion

We achieved great accuracies using Feature Engineering as compared to accuracies obtained without feature engineering.
(The notebook for models obtained without feature engineering is not being shared and one can try implementing it easily).

Experimenting with informer hypernyms can further help in accuracy improvement as suggested in https://nlp.stanford.edu/courses/cs224n/2010/reports/olalerew.pdf