## METHOD 1: Using basic Parse tree created using Stanford's CORE NLP

In [26]:
import pandas as pd
from pycorenlp import StanfordCoreNLP

class isQuestionBasic():
    
    # Init Constructor
    # Initialize StanfordCore NLP local instance on port 9000
    def __init__(self):
        self.nlp = StanfordCoreNLP('http://localhost:9000')
        
    # Input: Sentence to be predicted
    # Processing: 1. Uses Stanfors NLP's 'annotate' method to create Parse Tree
    # 2. Checks for occurence of 'SQ' or 'SBARQ' in the parse tree
    # Return: 1 - If sentence is question | 0 - If sentence is not a question
    def isQuestion(self, sentence):
        if '?' in sentence:
            return 1
        output = self.nlp.annotate(sentence, properties={
            'annotators': 'parse',
            'outputFormat': 'json',
            'timeout': 1000,
        })

        if ('SQ' or 'SBARQ') in output['sentences'][0]["parse"]:
            return 1    
        else:
            return 0
        

In [27]:
isQuestionBasic_obj = isQuestionBasic()

### Run on the test data

In [28]:
df = pd.read_csv('queries-10k-txt', sep='\t')
df['is_question'] = df['QUERY'].apply(isQuestionBasic_obj.isQuestion)

In [29]:
df['is_question'].value_counts()

0    9332
1     668
Name: is_question, dtype: int64

## METHOD 2: DETECTION USING NLTK CLASSIFICATION TECHNIQUE

In [30]:
import nltk
# nltk.download('nps_chat')
# nltk.download('punkt')

In [31]:
import re
import nltk.corpus
from nltk.corpus import nps_chat
import pandas as pd

class IsQuestion():
    
    # Init constructor
    def __init__(self):
        posts = self.__get_posts()
        feature_set = self.__get_feature_set(posts)
        self.classifier = self.__perform_classification(feature_set)
        
    # Method (Private): __get_posts
    # Input: None
    # Output: Posts (Text) from NLTK's nps_chat package
    def __get_posts(self):
        return nltk.corpus.nps_chat.xml_posts()
    
    # Method (Private): __get_feature_set
    # Input: Posts from NLTK's nps_chat package
    # Processing: 1. preserve alpha numeric characters, whitespace, apostrophe
    # 2. Tokenize sentences using NLTK's word_tokenize
    # 3. Create a dictionary of list of tuples for each post where tuples index 0 is the dictionary of words occuring in the sentence and index 1 is the class as received from nps_chat package 
    # Return: List of tuples for each post
    def __get_feature_set(self, posts):
        feature_list = []
        for post in posts:
            post_text = post.text            
            features = {}
            words = nltk.word_tokenize(post_text)
            for word in words:
                features['contains({})'.format(word.lower())] = True
            feature_list.append((features, post.get('class')))
        return feature_list
    
    # Method (Private): __perform_classification
    # Input: List of tuples for each post
    # Processing: 1. Divide data into 80% training and 10% testing sets
    # 2. Use NLTK's Multinomial Naive Bayes to perform classifcation
    # 3. Print the Accuracy of the model
    # Return: Classifier object
    def __perform_classification(self, feature_set):
        training_size = int(len(feature_set) * 0.1)
        train_set, test_set = feature_set[training_size:], feature_set[:training_size]
        classifier = nltk.NaiveBayesClassifier.train(train_set)
        print('Accuracy is : ', nltk.classify.accuracy(classifier, test_set))
        return classifier
        
    # Method (private): __get_question_words_set
    # Input: None
    # Return: Set of commonly occuring words in questions
    def __get_question_words_set(self):
        question_word_list = ['what', 'where', 'when','how','why','did','do','does','have','has','am','is','are','can','could','may','would','will','should'
"didn't","doesn't","haven't","isn't","aren't","can't","couldn't","wouldn't","won't","shouldn't",'?']
        return set(question_word_list)        
    
    # Method (Public): predict_question
    # Input: Sentence to be predicted
    # Return: 1 - If sentence is question | 0 - If sentence is not question
    def predict_question(self, text):
        words = nltk.word_tokenize(text.lower())        
        if self.__get_question_words_set().intersection(words) == False:
            return 0
        if '?' in text:
            return 1
        
        features = {}
        for word in words:
            features['contains({})'.format(word.lower())] = True            
        
        prediction_result = self.classifier.classify(features)
        if prediction_result == 'whQuestion' or prediction_result == 'ynQuestion':
            return 1
        return 0
    
    # Method (Public): predict_question_type
    # Input: Sentence to be predicted
    # Return: 'WH' - If question is WH question | 'YN' - If sentence is Yes/NO question | 'unknown' - If unknown question type
    def predict_question_type(self, text):
        words = nltk.word_tokenize(text.lower())                
        features = {}
        for word in words:
            features['contains({})'.format(word.lower())] = True            
        
        prediction_result = self.classifier.classify(features)
        if prediction_result == 'whQuestion':
            return 'WH'
        elif prediction_result == 'ynQuestion':
            return 'YN'
        else:
            return 'unknown'
    
        

In [32]:
isQ = IsQuestion()

Accuracy is :  0.6685606060606061


### Test sample sentence

In [33]:
isQ.predict_question('what is this')

1

In [34]:
isQ.predict_question_type('what is this')

'WH'

### Apply on the test data

In [35]:
df_1 = pd.read_csv('queries-10k-txt', sep='\t')
df_1['is_question'] = df_1['QUERY'].apply(isQ.predict_question)
df_1['question_type'] = df_1[df_1['is_question'] == 1]['QUERY'].apply(isQ.predict_question_type)

In [36]:
df_1['is_question'].value_counts()

0    8799
1    1201
Name: is_question, dtype: int64

In [37]:
df_1['question_type'].value_counts()

WH    944
YN    257
Name: question_type, dtype: int64

## METHOD 3: DETECTION USING ADVANCED CLASSIFICATION

In [38]:
import sklearn
import pandas as pd
import re
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

class IsQuestionAdvanced():
    
    # Init constructor
    # Input: Type of classification: 'MNB' - Multinomial Naive Bayes | 'SVM' - Support Vector Machine
    def __init__(self, classification_type):
        self.classification_type = classification_type
        df = self.__get_data()
        df = self.__clean_data(df)
        df = self.__label_encode(df)
        vectorizer_classifier = self.__create_classifier(df, self.classification_type)
        if vectorizer_classifier is not None:
            self.vectorizer = vectorizer_classifier['vectorizer']
            self.classifier = vectorizer_classifier['classifier']        
        
    # Method (Private):  __clean_data
    # Input: Raw input dataframe
    # Processing: 1. Rename column 
    # 2. lowercase text
    # 3. preserve alpha numeric characters, whitespace, apostrophe
    # 4. filter dataframe with questiin types - what, who, when, affirmation, unknown
    # Return: Processed filtered dataframe
    def __clean_data(self, df):
        df.rename(columns={0: 'text', 1: 'type'}, inplace=True)
        df['type'] = df['type'].str.strip()
        df['text'] = df['text'].apply(lambda x: x.lower())
        df['text'] = df['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s\']','',x)))
        return df[(df['type'] == 'what') | (df['type'] == 'who') | (df['type'] == 'when') | (df['type'] == 'unknown') | (df['type'] == 'affirmation')]
    

    # Method (Private): __label_encode
    # Input: Processed dataframe
    # Processing: Use label encoding to convert text label to integer label and add it to a new column
    # Return: Processed dataframe with label encoding column
    def __label_encode(self, df):
        self.le = preprocessing.LabelEncoder()
        self.le.fit(df['type'])
        df['label'] = list(self.le.transform(df['type']))
        return df
    
    # Method (Private): __create_classifier
    # Input: 1. Processed dataframe 2. Type of classification
    # Processing: 1. Perform TFIDF Vectorization
    # 2. Appy fit_tranform using TFIDF on text column
    # 3. Split data into 70% training and 30% testing
    # 4. Perform Multinomial Naive Bayes OR SVM classifcation based on input provided
    # 5. Peform prediction for both classification techniques on test data
    # 6. Show confusion matrix and accuracy
    # Return: Dict - TFIDF Vetctorizer, Classifier    
    def __create_classifier(self, df, classification_type):
        v = TfidfVectorizer(analyzer='word',lowercase=True)
        X = v.fit_transform(df['text'])
        X_train, X_test, y_train, y_test = train_test_split(X, df['label'], test_size=0.30)
        if classification_type == 'MNB':
            clf = MultinomialNB()
            clf.fit(X_train,y_train)
            preds = clf.predict(X_test)
            print(classification_report(preds,y_test))
            print('Accuracy is: ', clf.score(X_test,y_test))
            return {'vectorizer': v, 'classifier': clf}
        elif classification_type == 'SVM':
            clf_svm = SVC(kernel='linear')
            clf_svm.fit(X_train,y_train)
            preds = clf_svm.predict(X_test)
            preds = print(classification_report(preds,y_test))
            print('Accuracy is: ', clf_svm.score(X_test,y_test))
            return {'vectorizer': v, 'classifier': clf_svm}
        else:
            print("Wrong classification type: \n Type 'MNB' - Multinomial Naive Bayes \n Type 'SVM' - Support Vector Machine")    
            

    # Method (Private): __get_data
    # Processing: Get the sample input data used to create traning, test, vectorizer, classifier data
    # Return: Pandas dataframe
    def __get_data(self):
        return pd.read_csv('sample.txt', sep=',,,', header=None)
    
    # Method (Public): predict
    # Input: An unknown new sentence
    # Return: Prediction - Typpe of question 'what', 'when', 'who'
    def predict(self, sentence):
        ex = self.vectorizer.transform([sentence])
        return list(self.le.inverse_transform(self.classifier.predict(ex)))[0]                

    

In [41]:
# Model created using SVM
obj = IsQuestionAdvanced('SVM')



              precision    recall  f1-score   support

           0       0.89      1.00      0.94        31
           1       0.96      0.95      0.95        76
           2       0.97      0.97      0.97       196
           3       0.90      0.84      0.87        32
           4       0.99      0.99      0.99       110

    accuracy                           0.96       445
   macro avg       0.94      0.95      0.95       445
weighted avg       0.96      0.96      0.96       445

Accuracy is:  0.9640449438202248


In [42]:
obj.predict('what time are you going there')

'when'

### Run model on questions classified by method 1 <br> df is created by method 1

In [43]:
df_2 = df[df['is_question'] == 1].copy()
df_2['question_type'] = df_2['QUERY'].apply(obj.predict)

In [44]:
df_2['question_type'].value_counts()

what       405
unknown    193
who         52
when        18
Name: question_type, dtype: int64

### Run model on questions classified by method 2 <br> df_1 is created by method 2

In [45]:
df_3 = df_1[df_1['is_question'] == 1].copy()
df_3['question_type'] = df_3['QUERY'].apply(obj.predict)

In [46]:
df_3['question_type'].value_counts()

unknown        593
what           544
who             50
when             8
affirmation      6
Name: question_type, dtype: int64