In [1]:
#importing required libraries

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import pandas, numpy, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
from keras.utils import np_utils
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import numpy as np
import pandas as pd

In [2]:
#converting data in the dataset into tfidf word level vectors

def tfidf_word_level(queries,lb_len,y):
    """Encode queries as word-level TF-IDF vectors and split the labelled rows.

    The vectorizer is fitted on every entry of ``queries``, so its vocabulary
    also covers rows that later land in the held-out split and the final query.

    Args:
        queries: Sequence of text documents; the last entry is additionally
            returned on its own as a dense vector.
        lb_len: Number of leading rows of ``queries`` that have a label in ``y``.
        y: Labels for the first ``lb_len`` queries.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, last_query_vector)`` where
        the first four items come from a 70/30 ``train_test_split`` with
        ``random_state=0`` and the last is the dense encoding of the final query.
    """
    # Words of one or more characters are the features; vocabulary capped at 5000.
    vectorizer = TfidfVectorizer(
        analyzer='word',
        token_pattern=r'\w{1,}',
        max_features=5000,
        min_df=1,
    )
    encoded = vectorizer.fit_transform(queries)

    labelled_rows = encoded[:lb_len]
    last_query_vector = encoded[len(queries) - 1].toarray()

    train_X, valid_X, train_y, valid_y = train_test_split(
        labelled_rows, y, test_size=0.3, random_state=0
    )
    return train_X, valid_X, train_y, valid_y, last_query_vector

In [3]:
#converting data in the dataset into tfidf ngram level vectors with words as features

def tfidf_ngram_level(queries,lb_len,y):
    """Encode queries as TF-IDF vectors over word 2-grams and 3-grams, then split.

    The vectorizer is fitted on every entry of ``queries`` (tokenizing, building
    the vocabulary and encoding in one ``fit_transform`` call), so the
    vocabulary also covers rows that later land in the held-out split.

    Args:
        queries: Sequence of text documents; the last entry is additionally
            returned on its own as a dense vector.
        lb_len: Number of leading rows of ``queries`` that have a label in ``y``.
        y: Labels for the first ``lb_len`` queries.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, last_query_vector)`` where
        the first four items come from a 70/30 ``train_test_split`` with
        ``random_state=0`` and the last is the dense encoding of the final query.
    """
    # Word n-grams of length 2 to 3 are the features; vocabulary capped at 5000.
    vectorizer = TfidfVectorizer(
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(2,3),
        max_features=5000,
        min_df=1,
    )
    encoded = vectorizer.fit_transform(queries)

    labelled_rows = encoded[:lb_len]
    last_query_vector = encoded[len(queries) - 1].toarray()

    # A fixed random_state keeps the train/validation partition the same on every run.
    train_X, valid_X, train_y, valid_y = train_test_split(
        labelled_rows, y, test_size=0.3, random_state=0
    )
    return train_X, valid_X, train_y, valid_y, last_query_vector

In [4]:
#converting data in the dataset into tfidf ngram level vectors with characters as features

def tfidf_char_level(queries,lb_len,y):
    """Encode queries as TF-IDF vectors over character 2-grams and 3-grams, then split.

    The vectorizer is fitted on every entry of ``queries``, so its vocabulary
    also covers rows that later land in the held-out split and the final query.

    Args:
        queries: Sequence of text documents; the last entry is additionally
            returned on its own as a dense vector.
        lb_len: Number of leading rows of ``queries`` that have a label in ``y``.
        y: Labels for the first ``lb_len`` queries.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, last_query_vector)`` where
        the first four items come from a 70/30 ``train_test_split`` with
        ``random_state=0`` and the last is the dense encoding of the final query.
    """
    # Character n-grams of length 2 to 3 are the features; vocabulary capped at 15000.
    # No token_pattern here: scikit-learn only uses it when analyzer == 'word',
    # and passing it alongside analyzer='char' just triggers a UserWarning.
    tfidf_vect_ngram_chars = TfidfVectorizer(
        analyzer='char',
        ngram_range=(2,3),
        max_features=15000,
        min_df=1,
    )
    Y3 = tfidf_vect_ngram_chars.fit_transform(queries)

    X3_ = Y3[:lb_len]
    test3 = Y3[len(queries) - 1].toarray()

    X3_train, X3_test, Y3_train, Y3_test = train_test_split(
        X3_, y, test_size=0.3, random_state=0
    )
    return X3_train, X3_test, Y3_train, Y3_test, test3

In [5]:
#converting data in the dataset into count vectors 

def count(queries,lb_len,y):
    """Encode queries as raw term-count vectors and split the labelled rows.

    A default-configured ``CountVectorizer`` is fitted on every entry of
    ``queries``, so its vocabulary also covers rows that later land in the
    held-out split and the final query.

    Args:
        queries: Sequence of text documents; the last entry is additionally
            returned on its own as a dense vector.
        lb_len: Number of leading rows of ``queries`` that have a label in ``y``.
        y: Labels for the first ``lb_len`` queries.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, last_query_vector)`` where
        the first four items come from a 70/30 ``train_test_split`` with
        ``random_state=0`` and the last is the dense encoding of the final query.
    """
    encoded = CountVectorizer().fit_transform(queries)

    labelled_rows = encoded[:lb_len]
    last_query_vector = encoded[len(queries) - 1].toarray()

    train_X, valid_X, train_y, valid_y = train_test_split(
        labelled_rows, y, test_size=0.3, random_state=0
    )
    return train_X, valid_X, train_y, valid_y, last_query_vector

In [6]:
#defining train model function

def train_model(classifier, feature_vector_train, label, feature_vector_valid,valid_y):
    """Fit ``classifier`` on the training split and score it on the validation split.

    Args:
        classifier: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place, so the caller can keep using it for prediction.
        feature_vector_train: Feature matrix used for fitting.
        label: Labels aligned with ``feature_vector_train``.
        feature_vector_valid: Feature matrix to predict on.
        valid_y: True labels aligned with ``feature_vector_valid``.

    Returns:
        The value of ``metrics.accuracy_score`` for the validation predictions.
    """
    classifier.fit(feature_vector_train, label)
    predictions = classifier.predict(feature_vector_valid)
    # accuracy_score is documented as (y_true, y_pred). Accuracy itself is
    # symmetric, but the correct order matters if another metric is swapped in.
    return metrics.accuracy_score(valid_y, predictions)

In [7]:
def QC_Statistical(clf,X_train,y_train,X_test,y_test,tests):
    """Train and evaluate ``clf`` on each of the four feature representations.

    For positions 0-3 of the parallel input lists (word-level TF-IDF, word
    n-gram TF-IDF, character-level TF-IDF, count vectors) this fits ``clf`` via
    ``train_model``, prints the validation accuracy, then predicts the label of
    the encoded input query and prints it decoded through ``tag``.

    Args:
        clf: Classifier passed to ``train_model``; the same object is fitted
            again for every representation.
        X_train: Four training feature matrices, one per representation.
        y_train: Four training label arrays, aligned with ``X_train``.
        X_test: Four validation feature matrices.
        y_test: Four validation label arrays, aligned with ``X_test``.
        tests: Four encoded input-query vectors, one per representation.

    Returns:
        None. Results are only printed.
    """
    headings = (
        "WordLevel TF-IDF: ",
        "N-Gram Vectors: ",
        "CharLevel Vectors: ",
        "Count vectors: ",
    )

    for position, heading in enumerate(headings):
        train_features, train_labels, valid_features, valid_labels, query_vector = (
            X_train[position],
            y_train[position],
            X_test[position],
            y_test[position],
            tests[position],
        )

        accuracy = train_model(clf, train_features, train_labels, valid_features, valid_labels)
        print(heading, accuracy)

        # Predict the encoded label for the input query, then decode it to its name.
        predicted = clf.predict(query_vector)
        print(tag(predicted))
    

In [8]:
#decoding the encoded numerical label to actual label and returning it

def tag(result):
    """Decode an encoded numeric label into its answer-type name.

    Codes 0 through 6 map, in order, to DATE, LOCATION, MONEY, NUMBER,
    ORGANISATION, PERCENTAGE and PERSON. Any value that equals none of those
    codes is reported as TIME.

    Args:
        result: Encoded label; compared with ``==`` against each code in turn.

    Returns:
        The label name as a string.
    """
    names_by_code = (
        "DATE",
        "LOCATION",
        "MONEY",
        "NUMBER",
        "ORGANISATION",
        "PERCENTAGE",
        "PERSON",
    )
    for code, name in enumerate(names_by_code):
        if result == code:
            return name
    # Fallback branch: everything that matched no code above.
    return "TIME"

In [9]:
def _read_labelled_queries(dataset_path):
    """Parse the dataset file into parallel-ish ``(tags, queries)`` lists.

    Each non-blank line contributes its text from column 5 onwards to
    ``queries``. The part of the first whitespace-delimited token before ``:``
    is added to ``tags`` when it is non-empty, so unlabelled lines add a query
    but no tag.
    """
    tags, queries = [], []
    with open(dataset_path, "r", encoding="utf-8") as dataset:
        for raw_line in dataset:
            line = raw_line.rstrip('\n')
            if not line.strip():
                # A blank line has no first token to read a label from.
                continue
            label = line.split()[0].split(":")[0]
            if label:
                tags.append(label)
            # Drops the first five characters, i.e. a 4-character tag plus ':'.
            queries.append(line[5:])
    return tags, queries


def Question_Classification_Statistical(input, dataset_path="data/Dataset.txt"):
    """Train every statistical model on the dataset and classify one query.

    The dataset is read once and never modified: the query is appended to the
    in-memory list of documents (as the last entry, which is the row the
    vectorizer helpers return separately) instead of being written to the file
    and removed again afterwards.

    For each of LR, SVM, MLP, NB and RF this prints, through ``QC_Statistical``,
    the validation accuracy and the predicted answer label for each of the four
    feature representations.

    Args:
        input: The query text to classify. The name shadows the ``input``
            builtin inside this function; it is kept so existing keyword
            callers keep working.
        dataset_path: Path of the labelled dataset file. The default uses a
            forward slash so it resolves on Windows and POSIX alike.

    Returns:
        None. Results are only printed.

    Raises:
        FileNotFoundError: If ``dataset_path`` does not exist.
        ValueError: From the label encoder if a line carries a tag outside the
            eight known codes.
    """
    tags, queries = _read_labelled_queries(dataset_path)
    # The query goes last; each vectorizer helper encodes the final row separately.
    queries.append(input)

    # The eight tag codes that dataset lines are expected to start with.
    labelEncoder = preprocessing.LabelEncoder()
    labelEncoder.fit(['NUMB','PERS','LOCA','DATE','MONE','TIME','PERC','ORGA'])
    labels = labelEncoder.transform(tags)
    lb_len = len(labels)

    # Each helper returns (X_train, X_test, y_train, y_test, query_vector).
    splits = [
        vectorize(queries, lb_len, labels)
        for vectorize in (tfidf_word_level, tfidf_ngram_level, tfidf_char_level, count)
    ]
    X_train = [split[0] for split in splits]
    X_test = [split[1] for split in splits]
    y_train = [split[2] for split in splits]
    y_test = [split[3] for split in splits]
    tests = [split[4] for split in splits]

    # The stochastic estimators are seeded so repeated runs print the same
    # scores, in line with the fixed random_state used for the data split.
    models = [
        ("LR", LogisticRegression(max_iter=500, multi_class='multinomial')),
        ("SVM", LinearSVC(max_iter=500, multi_class='ovr', random_state=0)),
        ("MLP", MLPClassifier(max_iter=500, activation='relu', random_state=0)),
        ("NB", naive_bayes.MultinomialNB()),
        ("RF", ensemble.RandomForestClassifier(random_state=0)),
    ]

    print("Statstical Models")
    for model_name, model in models:
        print("\n"+model_name)
        QC_Statistical(model,X_train,y_train,X_test,y_test,tests)
    print("\n")
        

In [10]:
#defining the query to classify (named `query` so the built-in input() is not shadowed for the rest of the kernel session)
query = "రాజ్యాంగంలో పేర్కొన్న మొత్తం ప్రాథమిక విధుల సంఖ్య ఎంత?"

#passing the query to Question_Classification_Statistical, which prints each model's accuracy per feature type and its predicted answer label for the query
Question_Classification_Statistical(query)

Statstical Models

LR
WordLevel TF-IDF:  0.7661691542288557
NUMBER
N-Gram Vectors:  0.7860696517412935
NUMBER
CharLevel Vectors:  0.8233830845771144
NUMBER
Count vectors:  0.7388059701492538
NUMBER

SVM
WordLevel TF-IDF:  0.8109452736318408
NUMBER
N-Gram Vectors:  0.8407960199004975
NUMBER
CharLevel Vectors:  0.8805970149253731
NUMBER
Count vectors:  0.7288557213930348
NUMBER

MLP
WordLevel TF-IDF:  0.7661691542288557
NUMBER
N-Gram Vectors:  0.8084577114427861
NUMBER
CharLevel Vectors:  0.8606965174129353
NUMBER
Count vectors:  0.7238805970149254
NUMBER

NB
WordLevel TF-IDF:  0.7412935323383084
NUMBER
N-Gram Vectors:  0.7487562189054726
NUMBER
CharLevel Vectors:  0.7810945273631841
NUMBER
Count vectors:  0.7437810945273632
NUMBER

RF
WordLevel TF-IDF:  0.8109452736318408
NUMBER
N-Gram Vectors:  0.8034825870646766
NUMBER
CharLevel Vectors:  0.8258706467661692
NUMBER
Count vectors:  0.7512437810945274
NUMBER


