<a href="https://colab.research.google.com/github/shivammehta007/QuestionGenerator/blob/master/Classifier_to_detect_type_of_questions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Testing Classifier Model

In [41]:
# Install/upgrade the packages this notebook needs
!pip install -U tqdm

Requirement already up-to-date: tqdm in /usr/local/lib/python3.6/dist-packages (4.43.0)


### Imports

In [0]:
import os
import random

import numpy as np
import pandas as pd
import spacy
import torch
from IPython.display import HTML, display
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from tqdm.auto import tqdm, trange

### Environment Setup 

In [0]:
SEED=1234
def seed_all(seed=1234):
    """Make runs repeatable by seeding every random number source in use.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (through ``torch.manual_seed`` and ``torch.cuda.manual_seed``), exports
    ``PYTHONHASHSEED`` and turns on cuDNN's deterministic flag.

    Args:
        seed: Integer seed handed to every generator. Defaults to 1234.
    """
    # NOTE(review): presumably only interpreters started after this point
    # pick up PYTHONHASHSEED - confirm if hash ordering matters here.
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Same seed value for each library's generator.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    torch.backends.cudnn.deterministic = True

seed_all(SEED)

In [0]:
tqdm.pandas()

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
DATASET_LOCATION = '/content/drive/My Drive/Data/GrammarDataset.csv'

## Dataset Overview

In [0]:
original_dataset = pd.read_csv(DATASET_LOCATION, sep="\t")

In [8]:
original_dataset.head()

Unnamed: 0,Question,key,answer,Type of Question,Sub Section
0,She _________________ some chocolates to the p...,bring,She brought some chocolates to the party.,irregular verb,past simple
1,I _________________ a new song on the radio.,hear,I heard a new song on the radio.,irregular verb,past simple
2,I _________________ three books last week.,read,I read three books last week.,irregular verb,past simple
3,They _________________ French to the waitress.,speak,They spoke French to the waitress.,irregular verb,past simple
4,He _________________ during the class but now ...,understand,He understood during the class but now he doe...,irregular verb,past simple


In [9]:
original_dataset.dtypes

Question            object
key                 object
answer              object
Type of Question    object
Sub Section         object
dtype: object

### PreProcessing

In [0]:
def preprocessor(dataset):
    """Normalise the text columns of the grammar dataset.

    Steps applied to the ``Question`` column:
      1. collapse every run of two or more underscores into a single ``_``;
      2. remove round brackets;
      3. strip leading/trailing whitespace.
    Afterwards every string column is lower-cased.

    Args:
        dataset: DataFrame with at least a ``Question`` column of strings.

    Returns:
        A cleaned copy of ``dataset``; the frame passed in is left untouched.
    """
    cleaned = dataset.copy()

    question = cleaned["Question"]
    # regex=True is spelled out: recent pandas treats the pattern as a
    # literal string by default, which would silently skip both replacements.
    question = question.str.replace(r"_{2,}", "_", regex=True)
    question = question.str.replace(r"[()]", "", regex=True)
    # .str.strip() keeps missing values as NaN instead of raising.
    cleaned["Question"] = question.str.strip()

    for column in cleaned.columns:
        # Only string-like columns expose the .str accessor.
        if pd.api.types.is_string_dtype(cleaned[column].dtype):
            cleaned[column] = cleaned[column].str.lower()

    return cleaned

In [0]:
original_dataset = preprocessor(original_dataset)

In [12]:
original_dataset.columns

Index(['Question', 'key', 'answer', 'Type of Question', 'Sub Section'], dtype='object')

#### Encoding Labels

In [0]:
# Replace each distinct "Type of Question" value with the integer id assigned by LabelEncoder.
# NOTE: the original text column is overwritten with the encoded ids; the fitted
# encoder stays available as `label_encoder`.
label_encoder = LabelEncoder()
original_dataset["Type of Question"] = label_encoder.fit_transform(original_dataset["Type of Question"])

#### Split Training and Testing Data

In [0]:
# Features are the three text columns, the target is the encoded question type.
# 15% of the rows are held out for testing and SEED is passed as random_state.
(
    X_train_orig_dataset,
    X_test_orig_dataset,
    y_train_orig_dataset,
    y_test_orig_dataset,
) = train_test_split(
    original_dataset[["Question", "key", "answer"]],
    original_dataset["Type of Question"],
    test_size=0.15,
    random_state=SEED,
)

In [15]:
X_train_orig_dataset.shape, X_test_orig_dataset.shape, y_train_orig_dataset.shape, y_test_orig_dataset.shape

((569, 3), (101, 3), (569,), (101,))

## Experiments:

In [16]:
X_train_orig_dataset.head()

Unnamed: 0,Question,key,answer
271,how / they / get to work?,they,how do they get to work?
488,we were late for the plane because we _our pas...,forgot,we were late for the plane because we had forg...
131,we _ thirsty.,be,we aren’t thirsty.
643,"he_, so the kitchen will be warm.",cook,"he will have been cooking, so the kitchen will..."
155,you / in a cafe?,be,are you in a cafe?


### Model Setup

In [0]:
# Candidate feature extractors and classifiers, each stored as (display name, estimator).
# check_classification iterates over both lists and uses the names as row/column labels.
# NOTE(review): both vectorizers use their default tokenisation - presumably this drops
# punctuation and one-character tokens such as the q/k/a tags added in later experiments;
# confirm against the scikit-learn documentation.
text_vectorizers = [ ('CountVectorizer', CountVectorizer()), ('TfIdFVectorize', TfidfVectorizer())]
classifiers = [('LogisticRegression', LogisticRegression(max_iter=5000)), ('SVM', SVC()), ('RandomForest', RandomForestClassifier())]

In [0]:
# Empty results grid: one row per vectorizer name, one column per classifier name.
# NOTE(review): check_classification builds its own local frame with the same layout,
# so this module-level copy does not appear to be read by any visible cell.
result_dataframe = pd.DataFrame({
        'Vectorizer': [name for name, model in text_vectorizers]
        }, columns = ['Vectorizer'] + [name for name, model in classifiers])
result_dataframe.set_index('Vectorizer', inplace=True)

In [0]:
def check_classification(X_train, y_train, X_test, y_test):
    """Fit every vectorizer/classifier pairing and tabulate the test score.

    For each classifier in the module-level ``classifiers`` list and each
    vectorizer in ``text_vectorizers``, a two-step pipeline is fitted on the
    training split and scored on the test split. The best pairing and its
    score are printed.

    Args:
        X_train: Training texts.
        y_train: Training labels.
        X_test: Test texts.
        y_test: Test labels.

    Returns:
        DataFrame indexed by vectorizer name with one column per classifier;
        each cell holds the score in percent as a string with four decimals.
    """
    # Object-typed grid so the formatted strings can be stored without a dtype clash.
    result_dataframe = pd.DataFrame(
        index=pd.Index([name for name, _ in text_vectorizers], name='Vectorizer'),
        columns=[name for name, _ in classifiers],
        dtype=object,
    )

    best_accuracy = 0
    best_model = None
    for classifier_name, classifier in classifiers:
        for text_vectorizer_name, text_vectorizer in text_vectorizers:
            pipe = Pipeline(steps=[
                ('text_vec', text_vectorizer),
                ('class', classifier),
            ])
            pipe.fit(X_train, y_train)
            acc = pipe.score(X_test, y_test)

            # One .loc write instead of chained df[col][row] = ...: the chained
            # form may assign into a temporary copy and lose the value.
            result_dataframe.loc[text_vectorizer_name, classifier_name] = '{:.4f}'.format(acc * 100)

            if acc > best_accuracy:
                best_accuracy = acc
                best_model = '{} -> {}'.format(text_vectorizer_name, classifier_name)

    print("\n\nBest Accuracy was : {:.4f} with the Model: {}".format(best_accuracy*100, best_model))
    return result_dataframe

In [0]:
results = []

### Experiment 1

Concatenating Texts

In [0]:
experiment_text = "Concatenating question + key + answer."

In [0]:
# Experiment 1 input: question, key and answer joined into one space-separated string per row.
X_train = (
    X_train_orig_dataset["Question"]
    + " " + X_train_orig_dataset["key"]
    + " " + X_train_orig_dataset["answer"]
)
X_test = (
    X_test_orig_dataset["Question"]
    + " " + X_test_orig_dataset["key"]
    + " " + X_test_orig_dataset["answer"]
)
# Labels are used unchanged.
y_train = y_train_orig_dataset
y_test = y_test_orig_dataset

In [23]:
result = check_classification(X_train, y_train, X_test, y_test)
results.append((experiment_text, result))
result



Best Accuracy was : 86.1386 with the Model: CountVectorizer -> RandomForest


Unnamed: 0_level_0,LogisticRegression,SVM,RandomForest
Vectorizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CountVectorizer,69.3069,49.505,86.1386
TfIdFVectorize,52.4752,43.5644,86.1386


The best model combination was CountVectorizer with RandomForestClassifier (TfidfVectorizer with RandomForestClassifier reached the same 86.14% accuracy).

### Experiment 2
With POS Tags

In [0]:
experiment_text = "Adding Pos Tags along with (q, tags) + (k, tags) + (a, tags)"

In [0]:
# Experiment 2 starts from the same concatenated "question key answer" text;
# a following cell adds the POS tags.
y_train, y_test = y_train_orig_dataset, y_test_orig_dataset
X_train = (
    X_train_orig_dataset["Question"] + " "
    + X_train_orig_dataset["key"] + " "
    + X_train_orig_dataset["answer"]
)
X_test = (
    X_test_orig_dataset["Question"] + " "
    + X_test_orig_dataset["key"] + " "
    + X_test_orig_dataset["answer"]
)

In [0]:
nlp = spacy.load("en_core_web_sm") 

In [27]:
def add_pos_tagging(text):
    """Annotate every token of ``text`` with its part-of-speech tag.

    The text is run through the module-level spaCy pipeline ``nlp`` and each
    token is rendered as ``(token, POS)`` using ``token.pos_``.

    Args:
        text: Sentence to annotate.

    Returns:
        The ``(token, POS)`` pairs joined by single spaces.
    """
    return " ".join(
        "({}, {})".format(token.text, token.pos_) for token in nlp(text)
    )

# Testing method
test_sentence = "Testing the Pos Tagger in this sentence let's see how it works!".lower()
add_pos_tagging(test_sentence)

"(testing, VERB) (the, DET) (pos, NOUN) (tagger, NOUN) (in, ADP) (this, DET) (sentence, NOUN) (let, VERB) ('s, PRON) (see, VERB) (how, ADV) (it, PRON) (works, VERB) (!, PUNCT)"

In [28]:
X_train = X_train.progress_apply(add_pos_tagging)
X_test = X_test.progress_apply(add_pos_tagging)

HBox(children=(FloatProgress(value=0.0, max=569.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=101.0), HTML(value='')))




In [29]:
result = check_classification(X_train, y_train, X_test, y_test)
results.append((experiment_text, result))
result



Best Accuracy was : 95.0495 with the Model: CountVectorizer -> RandomForest


Unnamed: 0_level_0,LogisticRegression,SVM,RandomForest
Vectorizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CountVectorizer,81.1881,57.4257,95.0495
TfIdFVectorize,57.4257,55.4455,88.1188


### Experiment 3
With Question, Key, Answer Tagging

In [0]:
experiment_text = "Tagging word with q, k, a example: (word, q) (word, k) (word, a)"

In [0]:
def add_tag(text, tag):
    """Label every token of ``text`` with a fixed tag.

    Args:
        text: String to tokenise with the module-level spaCy pipeline ``nlp``.
        tag: Marker attached to each token (the notebook passes "q", "k" or "a").

    Returns:
        The ``(token, tag)`` pairs joined by single spaces.

    NOTE(review): the vectorizers in this notebook are created with default
    settings; if their tokeniser ignores one-character tokens, the q/k/a tag
    never reaches the model - confirm.
    """
    # Only the token texts are needed, so tokenise without running the rest
    # of the pipeline (assumes no pipeline component re-tokenises the Doc).
    return " ".join(
        "({}, {})".format(token.text, tag) for token in nlp.make_doc(text)
    )

In [0]:
# Experiment 3 input: every token is tagged with the column it came from
# ("q" = Question, "k" = key, "a" = answer) before the three parts are joined.
X_train = (
    X_train_orig_dataset["Question"].apply(add_tag, tag="q")
    + " " + X_train_orig_dataset["key"].apply(add_tag, tag="k")
    + " " + X_train_orig_dataset["answer"].apply(add_tag, tag="a")
)
X_test = (
    X_test_orig_dataset["Question"].apply(add_tag, tag="q")
    + " " + X_test_orig_dataset["key"].apply(add_tag, tag="k")
    + " " + X_test_orig_dataset["answer"].apply(add_tag, tag="a")
)
y_train = y_train_orig_dataset
y_test = y_test_orig_dataset

In [33]:
X_train[0]

'(she, q) (_, q) (some, q) (chocolates, q) (to, q) (the, q) (party, q) ( , k) (bring, k) ( , a) (she, a) (brought, a) (some, a) (chocolates, a) (to, a) (the, a) (party, a) (., a)'

In [34]:
result = check_classification(X_train, y_train, X_test, y_test)
results.append((experiment_text, result))
result



Best Accuracy was : 93.0693 with the Model: CountVectorizer -> RandomForest


Unnamed: 0_level_0,LogisticRegression,SVM,RandomForest
Vectorizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CountVectorizer,69.3069,52.4752,93.0693
TfIdFVectorize,50.495,42.5743,85.1485


### Experiment 4
POS Tagger with q,k,a tagging

In [0]:
experiment_text = "Adding POS Tagging and qka tagging example: (word, tag, q) (word, tag, k) (word, tag, a)"

In [0]:
def add_tag_and_pos(text, tag):
    """Annotate every token of ``text`` with its POS tag and a fixed tag.

    The text is run through the module-level spaCy pipeline ``nlp``; each
    token is rendered as ``(token, POS, tag)`` using ``token.pos_``.

    Args:
        text: String to annotate.
        tag: Marker attached to each token (the notebook passes "q", "k" or "a").

    Returns:
        The ``(token, POS, tag)`` triples joined by single spaces.
    """
    return " ".join(
        "({}, {}, {})".format(token.text, token.pos_, tag) for token in nlp(text)
    )

In [0]:
# Experiment 4 input: every token carries both its POS tag and the column it
# came from ("q" = Question, "k" = key, "a" = answer).
y_train, y_test = y_train_orig_dataset, y_test_orig_dataset
X_train = (
    X_train_orig_dataset["Question"].apply(add_tag_and_pos, tag="q") + " "
    + X_train_orig_dataset["key"].apply(add_tag_and_pos, tag="k") + " "
    + X_train_orig_dataset["answer"].apply(add_tag_and_pos, tag="a")
)
X_test = (
    X_test_orig_dataset["Question"].apply(add_tag_and_pos, tag="q") + " "
    + X_test_orig_dataset["key"].apply(add_tag_and_pos, tag="k") + " "
    + X_test_orig_dataset["answer"].apply(add_tag_and_pos, tag="a")
)

In [38]:
result = check_classification(X_train, y_train, X_test, y_test)
results.append((experiment_text, result))
result



Best Accuracy was : 93.0693 with the Model: CountVectorizer -> RandomForest


Unnamed: 0_level_0,LogisticRegression,SVM,RandomForest
Vectorizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CountVectorizer,80.198,57.4257,93.0693
TfIdFVectorize,58.4158,53.4653,91.0891


# Results

In [40]:
# Replay every recorded experiment: its number, its description and its score table.
# `display` and `HTML` come from the notebook's import cell.
for experiment_number, (description, result) in enumerate(results, start=1):
    print("Experiment {}:".format(experiment_number))
    print(description)
    display(HTML(result.to_html()))
    print("\n\n")

Experiment 1:
Concatenating question + key + answer.


Unnamed: 0_level_0,LogisticRegression,SVM,RandomForest
Vectorizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CountVectorizer,69.3069,49.505,86.1386
TfIdFVectorize,52.4752,43.5644,86.1386





Experiment 2:
Adding Pos Tags along with (q, tags) + (k, tags) + (a, tags)


Unnamed: 0_level_0,LogisticRegression,SVM,RandomForest
Vectorizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CountVectorizer,81.1881,57.4257,95.0495
TfIdFVectorize,57.4257,55.4455,88.1188





Experiment 3:
Tagging word with q, k, a example: (word, q) (word, k) (word, a)


Unnamed: 0_level_0,LogisticRegression,SVM,RandomForest
Vectorizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CountVectorizer,69.3069,52.4752,93.0693
TfIdFVectorize,50.495,42.5743,85.1485





Experiment 4:
Adding POS Tagging and qka tagging example: (word, tag, q) (word, tag, k) (word, tag, a)


Unnamed: 0_level_0,LogisticRegression,SVM,RandomForest
Vectorizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CountVectorizer,80.198,57.4257,93.0693
TfIdFVectorize,58.4158,53.4653,91.0891







In [42]:
X_train[0]

'(she, PRON, q) (_, VERB, q) (some, DET, q) (chocolates, NOUN, q) (to, ADP, q) (the, DET, q) (party, NOUN, q) ( , SPACE, k) (bring, VERB, k) ( , SPACE, a) (she, PRON, a) (brought, VERB, a) (some, DET, a) (chocolates, NOUN, a) (to, ADP, a) (the, DET, a) (party, NOUN, a) (., PUNCT, a)'