In [2]:
import json
import pickle
from collections import Counter

import nltk
import numpy as np
import sklearn as sk
from gensim.models import word2vec 
from gensim.similarities import WmdSimilarity
from keras.layers import Dense, Activation,LSTM,Embedding
from keras.models import Sequential
from keras.utils import to_categorical
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier

In [3]:
def preprocess(doc):
    """Lower-case, tokenize and remove English stop words from a text.

    Parameters
    ----------
    doc : str
        Raw text (a paragraph, a sentence or a question).

    Returns
    -------
    list of str
        The lower-cased tokens returned by ``word_tokenize`` that are not in
        NLTK's English stop-word list, in their original order.
    """
    tokens = word_tokenize(doc.lower())  # Lower the text, then split into words.
    # A set gives constant-time membership tests; scanning the stop-word list
    # once per token is needlessly slow on long paragraphs.
    stop_words = set(stopwords.words('english'))
    return [w for w in tokens if w not in stop_words]  # Remove stopwords.

In [4]:
def get_features(inputfile, outputfile):
    """Build (question + candidate sentence, label) records from a SQuAD-style file.

    For every topic in ``data['data']`` a Word2Vec model is trained on that
    topic's preprocessed paragraph contexts. Each paragraph is then split into
    sentences, the sentences are indexed with ``WmdSimilarity`` and every
    question of the paragraph is used as a query. Each sentence returned for a
    question produces one record ``(question + " " + sentence, label)``, where
    ``label`` is 1 when the text of the question's first answer occurs in the
    sentence and 0 otherwise. Records are pickled one after another.

    Parameters
    ----------
    inputfile : str
        Path to a JSON file laid out as
        ``data -> paragraphs -> (context, qas -> (question, answers -> text))``.
    outputfile : str
        Path of the pickle stream. It is opened in append mode, so calling the
        function twice with the same path duplicates every record.

    Notes
    -----
    NOTE(review): ``size=500`` and passing the full Word2Vec model to
    ``WmdSimilarity`` look like the pre-4.0 gensim API -- confirm the pinned
    gensim version before upgrading.
    """
    with open(inputfile) as data_file:
        data = json.load(data_file)

    num_best = 4  # Upper bound on the number of sentences returned per question.

    # Open the output once for the whole run instead of once per record.
    with open(outputfile, 'ab') as out:
        for topic in data['data']:
            paragraphs = topic['paragraphs']
            # Paras to train word2vec on.
            w2v_corpus = [preprocess(para['context']) for para in paragraphs]
            # Train Word2Vec on all the Paras.
            model = word2vec.Word2Vec(w2v_corpus, workers=3, size=500)
            for para in paragraphs:
                # Original sentences (kept un-processed so the record holds readable text).
                documents = nltk.sent_tokenize(para['context'])
                # Para to run queries against.
                wmd_corpus = [preprocess(sentence) for sentence in documents]
                instance = WmdSimilarity(wmd_corpus, model, num_best)
                for quest in para['qas']:
                    q = quest['question']
                    query = preprocess(q)
                    # NOTE(review): compares the number of sentences with the
                    # number of query tokens; kept as in the original -- confirm
                    # this is the intended filter.
                    if not (len(instance.corpus) > len(query) > 0):
                        continue
                    answers = quest['answers']
                    if not answers:
                        # No reference answer means no label can be derived;
                        # indexing answers[0] would raise IndexError.
                        continue
                    answer_text = answers[0]['text']
                    for hit in instance[query]:
                        sentence = documents[hit[0]]  # hit[0] indexes into the corpus.
                        label = 1 if answer_text in sentence else 0
                        pickle.dump((q + " " + sentence, label), out)

In [1]:
#Uncomment and run only once: get_features appends to the output file, so a second run duplicates every record. Creates the training features.
#get_features('train-v1.1.json','train_set.pkl')

In [2]:
#Uncomment and run only once: get_features appends to the output file, so a second run duplicates every record. Creates the test features.
#get_features('dev-v1.1.json','test_set.pkl')

In [3]:
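#Uncomment and run only once: get_features appends to the output file, so a second run duplicates every record. Creates the validation features. NOTE: this reads train-v1.1.json, the same input file as the training features.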
#get_features('train-v1.1.json','dev_set.pkl')

In [5]:
def read_features(filename):
    """Load every pickled ``(features, label)`` record stored in a file.

    The file is read as a sequence of consecutive pickles until the end of
    the file is reached.

    Parameters
    ----------
    filename : str
        Path to the pickle stream.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The first element of every record and the second element of every
        record, in file order.

    Notes
    -----
    Unpickling can execute arbitrary code; only read files you produced.
    """
    records = []
    with open(filename, "rb") as stream:
        while True:
            try:
                record = pickle.load(stream)
            except EOFError:
                # End of the stream: every record has been read.
                break
            records.append(record)
    features = [record[0] for record in records]
    targets = [record[1] for record in records]
    return np.array(features), np.array(targets)

In [6]:
def tune_parameters(X_train,y_train,X_val,y_val):
    """Grid-search LogisticRegression settings for precision, recall and f1.

    For each metric a 5-fold ``GridSearchCV`` is fitted on the training data
    with the macro-averaged metric as the scoring function. The best
    parameters, the mean/std score of every grid point and a classification
    report on the validation data are printed.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``GridSearchCV.fit``.
    X_val, y_val
        Held-out features and labels used only for the printed report.

    Returns
    -------
    dict
        ``best_params_`` of the search run for the last metric in the list
        (``'f1'``); the results for the earlier metrics are only printed.
    """
    # NOTE(review): random_state is part of the grid, which triples the number
    # of fits; kept so the returned dict still carries a 'random_state' key.
    tuned_parameters = [{'random_state': [2,42,None], 'penalty': ['l1', 'l2'],'C': [1,0.1],
                         'class_weight': ['balanced',None]}]

    scores = ['precision', 'recall','f1']
    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()
        # The grid contains penalty='l1', which the lbfgs solver (the default
        # in newer scikit-learn releases) rejects. liblinear supports both
        # 'l1' and 'l2', so it is pinned explicitly.
        clf = GridSearchCV(LogisticRegression(solver='liblinear'), tuned_parameters, cv=5,
                           scoring='%s_macro' % score)
        clf.fit(X_train,y_train)
        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, params))
        print()
        print("Detailed classification report:")
        print()
        print("The model is evaluated on the full Training set.")
        print("The scores are computed on the full evaluation set.")
        print()
        # The report uses the validation split, not the cross-validation folds.
        y_true, y_pred = y_val, clf.predict(X_val)
        print(sk.metrics.classification_report(y_true, y_pred))
    return clf.best_params_

In [7]:
# Single TF-IDF vectorizer shared by the train / validation / test cells that
# follow: it is fitted on the training texts and only applied to the others.
vec = TfidfVectorizer()

In [7]:
# Load the pickled (text, label) training records and fit the TF-IDF
# vocabulary on the training texts.
X_features_train, y_train = read_features("train_set.pkl")
X_train = vec.fit_transform(X_features_train)
print(X_train.shape,y_train.shape)

(70732, 48241) (70732,)


In [8]:
# Load the validation records and project them with the vectorizer as fitted
# so far (transform only, no re-fit).
X_features_val, y_val = read_features("dev_set.pkl")
X_val = vec.transform(X_features_val)
print(X_val.shape,y_val.shape)

(7207, 48241) (7207,)


In [9]:
# Load the test records and project them with the vectorizer as fitted so far
# (transform only, no re-fit).
X_features_test, y_test = read_features("test_set.pkl")
X_test = vec.transform(X_features_test)
print(X_test.shape,y_test.shape)

(8485, 48241) (8485,)


In [64]:
# Runs the grid search on the training matrix and prints a classification
# report on the validation set for each metric. The dict displayed as the cell
# result is the best setting for the last metric searched.
tune_parameters(X_train,y_train,X_val,y_val)

# Tuning hyper-parameters for precision



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

Best parameters set found on development set:

{'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l1', 'random_state': 2}
Grid scores on development set:

0.545 (+/-0.013) for {'C': 1, 'class_weight': 'balanced', 'penalty': 'l1', 'random_state': 2}
0.545 (+/-0.013) for {'C': 1, 'class_weight': 'balanced', 'penalty': 'l1', 'random_state': 42}
0.545 (+/-0.013) for {'C': 1, 'class_weight': 'balanced', 'penalty': 'l1', 'random_state': None}
0.540 (+/-0.017) for {'C': 1, 'class_weight': 'balanced', 'penalty': 'l2', 'random_state': 2}
0.540 (+/-0.017) for {'C': 1, 'class_weight': 'balanced', 'penalty': 'l2', 'random_state': 42}
0.540 (+/-0.017) for {'C': 1, 'class_weight': 'balanced', 'penalty': 'l2', 'random_state': None}
0.494 (+/-0.102) for {'C': 1, 'class_weight': None, 'penalty': 'l1', 'random_state': 2}
0.494 (+/-0.102) for {'C': 1, 'class_weight': None, 'penalty': 'l1', 'random_state': 42}
0.494 (+/-0.102) for {'C': 1, 'class_weight': None, 'penalty': 'l1', 'random_state': None}
0.531

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

Best parameters set found on development set:

{'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l1', 'random_state': 2}
Grid scores on development set:

0.514 (+/-0.025) for {'C': 1, 'class_weight': 'balanced', 'penalty': 'l1', 'random_state': 2}
0.514 (+/-0.025) for {'C': 1, 'class_weight': 'balanced', 'penalty': 'l1', 'random_state': 42}
0.514 (+/-0.025) for {'C': 1, 'class_weight': 'balanced', 'penalty': 'l1', 'random_state': None}
0.518 (+/-0.028) for {'C': 1, 'class_weight': 'balanced', 'penalty': 'l2', 'random_state': 2}
0.518 (+/-0.028) for {'C': 1, 'class_weight': 'balanced', 'penalty': 'l2', 'random_state': 42}
0.518 (+/-0.028) for {'C': 1, 'class_weight': 'balanced', 'penalty': 'l2', 'random_state': None}
0.443 (+/-0.005) for {'C': 1, 'class_weight': None, 'penalty': 'l1', 'random_state': 2}
0.443 (+/-0.005) for {'C': 1, 'class_weight': None, 'penalty': 'l1', 'random_state': 42}
0.443 (+/-0.005) for {'C': 1, 'class_weight': None, 'penalty': 'l1', 'random_state': None}
0.445

{'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l1', 'random_state': 2}

In [65]:
#Logistic Regression
# Hyper-parameters are the ones reported by the grid-search cell. The solver is
# pinned to liblinear because penalty='l1' is rejected by lbfgs, the default
# solver in newer scikit-learn releases.
clf = LogisticRegression(random_state=2,C=0.1,class_weight="balanced",penalty='l1',
                         solver='liblinear')
clf.fit(X_train,y_train)
# Despite the label printed below, this is the mean accuracy over 10
# cross-validation folds of the training data.
acc = cross_val_score(clf,X_train,y_train,cv=10)
print("Average Training Accuracy",np.mean(acc))
y_pred = clf.predict(X_test)
# Debug aid (disabled): list every test pair predicted as a correct answer
# together with its true label and predicted probability.
# y_proba = clf.predict_proba(X_test)
# for i in range(len(y_pred)):
#     if y_pred[i] == 1:
#         print(X_features_test[i], y_test[i], y_proba[i][1])
print(sk.metrics.classification_report(y_test,y_pred))
# The scores below are for the positive class (label 1).
print ("\nPrecision of Correct Answer: ", sk.metrics.precision_score(y_test, y_pred))
print ("\nRecall of Correct Answer: ", sk.metrics.recall_score(y_test, y_pred))
print ("\nf1_score of Correct Answer: ", sk.metrics.f1_score(y_test, y_pred))
print ("\nconfusion_matrix")
print (sk.metrics.confusion_matrix(y_test, y_pred))

Average Training Accuracy 0.567338402486
             precision    recall  f1-score   support

          0       0.82      0.61      0.70      6562
          1       0.29      0.54      0.38      1923

avg / total       0.70      0.60      0.63      8485


Precision of Correct Answer:  0.290233074362

Recall of Correct Answer:  0.54394175767

f1_score of Correct Answer:  0.378505518364

confusion_matrix
[[4004 2558]
 [ 877 1046]]


In [41]:
#MLP Classifier
# Two hidden layers (10 and 4 units) trained with the lbfgs solver.
# NOTE(review): the stored output of this cell reports class supports of
# 6543/1942, while the other cells report 6562/1923 for test_set.pkl -- it
# appears to come from an earlier run on different data; re-run to refresh.
clf = MLPClassifier(random_state=1, solver="lbfgs", hidden_layer_sizes=(10,4))
clf.fit(X_train,y_train)
# Despite the label printed below, this is the mean accuracy over 10
# cross-validation folds of the training data.
acc = cross_val_score(clf,X_train,y_train,cv=10)
print("Average Training Accuracy",np.mean(acc))
y_pred = clf.predict(X_test)
print(sk.metrics.classification_report(y_test,y_pred))
# The scores below are for the positive class (label 1).
print ("\nPrecision of Correct Answer: ", sk.metrics.precision_score(y_test, y_pred))
print ("\nRecall of Correct Answer: ", sk.metrics.recall_score(y_test, y_pred))
print ("\nf1_score of Correct Answer: ", sk.metrics.f1_score(y_test, y_pred))
print ("\nconfusion_matrix")
print (sk.metrics.confusion_matrix(y_test, y_pred))

Average Training Accuracy 0.691200132661
             precision    recall  f1-score   support

          0       0.78      0.89      0.83      6543
          1       0.31      0.16      0.21      1942

avg / total       0.67      0.73      0.69      8485


Precision of Correct Answer:  0.311264822134

Recall of Correct Answer:  0.162203913491

f1_score of Correct Answer:  0.21327014218

confusion_matrix
[[5846  697]
 [1627  315]]


In [8]:
# Re-read the training records under separate names (X_train1 / y_train1) for
# the Keras cells. NOTE: this re-fits the shared `vec` on the training texts.
X_features_train, y_train1 = read_features("train_set.pkl")
X_train1 = vec.fit_transform(X_features_train)
print(X_train1.shape,y_train1.shape)

(70732, 48241) (70732,)


In [9]:
# Re-read the test records under separate names (X_test1 / y_test1) for the
# Keras cells; they are projected with the already fitted `vec`.
X_features_test, y_test1 = read_features("test_set.pkl")
X_test1 = vec.transform(X_features_test)
print(X_test1.shape,y_test1.shape)

(8485, 48241) (8485,)


In [10]:
##########################################
####### Keras sequential method ##########
##########################################
# to_categorical comes from the notebook's import cell (keras.utils); the old
# keras.utils.np_utils path no longer exists in current Keras releases.
#
# NOTE: this cell rebinds X_train / X_test to dense arrays and y_train / y_test
# to one-hot matrices, so the scikit-learn cells above must not be re-run after
# it without reloading their inputs.
# NOTE(review): toarray() densifies the full matrices; with the shapes printed
# above that is tens of gigabytes if the values are float64 -- confirm the
# machine can hold it, or densify only the rows that are actually used.

X_train = X_train1.toarray()
X_test = X_test1.toarray()


y_true = y_test1  # Raw integer labels, used for the scikit-learn metrics below.

y_train = to_categorical(y_train1)
y_test = to_categorical(y_test1)

print(len(X_train),len(y_train),len(X_test),len(y_test))

# NOTE(review): two sigmoid units with binary_crossentropy treat the one-hot
# columns as independent labels; softmax + categorical_crossentropy is the
# usual choice for two mutually exclusive classes. Left unchanged so the
# recorded results stay comparable.
model=Sequential()
model.add(Dense(20,input_dim=X_train.shape[1]))
model.add(Activation('relu'))
#print(model.output_shape)
model.add(Dense(2))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='rmsprop',metrics=['accuracy'])
# Only the first 25000 training rows are used.
model.fit(X_train[:25000],y_train[:25000],epochs=2)
# Evaluate once and reuse the result; a second evaluate() pass over the same
# data returns the same numbers.
loss_metrics=model.evaluate(X_test,y_test)
score = loss_metrics
classes=model.predict(X_test)
print("\n%s: %.2f%%" % (model.metrics_names[1], score[1]*100))

# Predicted class = index of the larger of the two outputs.
y_pred = [np.argmax(c) for c in classes]
print(Counter(y_pred))
print(sk.metrics.classification_report(y_true,y_pred))
print("\nPrecision", sk.metrics.precision_score(y_true, y_pred))
print("\nRecall", sk.metrics.recall_score(y_true, y_pred))
print("\nf1_score", sk.metrics.f1_score(y_true, y_pred))
print("\nconfusion_matrix")
print(sk.metrics.confusion_matrix(y_true, y_pred))

70732 70732 8485 8485
Epoch 1/2
Epoch 2/2
acc: 77.34%
Counter({0: 8485})
             precision    recall  f1-score   support

          0       0.77      1.00      0.87      6562
          1       0.00      0.00      0.00      1923

avg / total       0.60      0.77      0.67      8485


Precision 0.0

Recall 0.0

f1_score 0.0

confusion_matrix
[[6562    0]
 [1923    0]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [None]:
#######################################
######## Keras LSTM method ############
#######################################
# to_categorical comes from the notebook's import cell (keras.utils); the old
# keras.utils.np_utils path no longer exists in current Keras releases.
#
# NOTE(review): X_train holds TF-IDF weights produced by the TfidfVectorizer
# cells, whereas an Embedding layer expects integer token indices. This cell
# has never been executed; the inputs most likely need to be replaced by
# padded token-id sequences before the results mean anything.

# Always encode from the raw integer labels: y_train / y_test may already be
# one-hot after the sequential cell, and encoding them a second time would add
# an extra axis.
y_train = to_categorical(y_train1)
y_test = to_categorical(y_test1)

model=Sequential()
model.add(Embedding(X_train.shape[1],output_dim=2))
# The LSTM takes its input width from the Embedding output; the former
# input_dim=X_train.shape[1] contradicted it and is rejected by newer Keras.
model.add(LSTM(10))
model.add(Activation('relu'))
print(model.output_shape)
model.add(Dense(2))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='sgd',metrics=['accuracy'])

# Only the first 2500 training rows are used.
model.fit(X_train[:2500],y_train[:2500],epochs=1)
# Evaluate once and reuse the result; a second evaluate() pass over the same
# data returns the same numbers.
loss_metrics=model.evaluate(X_test,y_test)
score = loss_metrics
classes=model.predict(X_test)
print("\n%s: %.2f%%" % (model.metrics_names[1], score[1]*100))