In [1]:
import pandas as pd
import numpy as np
import pickle
from textblob import TextBlob
import torch

from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
from numpy import dot
from numpy.linalg import norm
from sklearn.externals import joblib

import warnings
warnings.filterwarnings('ignore')



In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/jaypatel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
import numpy as np
import time

import torch
import torch.nn as nn


class InferSent(nn.Module):

    def __init__(self, config):
        super(InferSent, self).__init__()
        self.bsize = config['bsize']
        self.word_emb_dim = config['word_emb_dim']
        self.enc_lstm_dim = config['enc_lstm_dim']
        self.pool_type = config['pool_type']
        self.dpout_model = config['dpout_model']
        self.version = 1 if 'version' not in config else config['version']

        self.enc_lstm = nn.LSTM(self.word_emb_dim, self.enc_lstm_dim, 1,
                                bidirectional=True, dropout=self.dpout_model)

        assert self.version in [1, 2]
        if self.version == 1:
            self.bos = '<s>'
            self.eos = '</s>'
            self.max_pad = True
            self.moses_tok = False
        elif self.version == 2:
            self.bos = '<p>'
            self.eos = '</p>'
            self.max_pad = False
            self.moses_tok = True

    def is_cuda(self):
        # either all weights are on cpu or they are on gpu
        return self.enc_lstm.bias_hh_l0.data.is_cuda

    def forward(self, sent_tuple):
        # sent_len: [max_len, ..., min_len] (bsize)
        # sent: (seqlen x bsize x worddim)
        sent, sent_len = sent_tuple

        # Sort by length (keep idx)
        sent_len_sorted, idx_sort = np.sort(sent_len)[::-1], np.argsort(-sent_len)
        sent_len_sorted = sent_len_sorted.copy()
        idx_unsort = np.argsort(idx_sort)

        idx_sort = torch.from_numpy(idx_sort).cuda() if self.is_cuda() \
            else torch.from_numpy(idx_sort)
        sent = sent.index_select(1, idx_sort)

        # Handling padding in Recurrent Networks
        sent_packed = nn.utils.rnn.pack_padded_sequence(sent, sent_len_sorted)
        sent_output = self.enc_lstm(sent_packed)[0]  # seqlen x batch x 2*nhid
        sent_output = nn.utils.rnn.pad_packed_sequence(sent_output)[0]

        # Un-sort by length
        idx_unsort = torch.from_numpy(idx_unsort).cuda() if self.is_cuda() \
            else torch.from_numpy(idx_unsort)
        sent_output = sent_output.index_select(1, idx_unsort)

        # Pooling
        if self.pool_type == "mean":
            sent_len = torch.FloatTensor(sent_len.copy()).unsqueeze(1).cuda()
            emb = torch.sum(sent_output, 0).squeeze(0)
            emb = emb / sent_len.expand_as(emb)
        elif self.pool_type == "max":
            if not self.max_pad:
                sent_output[sent_output == 0] = -1e9
            emb = torch.max(sent_output, 0)[0]
            if emb.ndimension() == 3:
                emb = emb.squeeze(0)
                assert emb.ndimension() == 2

        return emb

    def set_w2v_path(self, w2v_path):
        self.w2v_path = w2v_path

    def get_word_dict(self, sentences, tokenize=True):
        # create vocab of words
        word_dict = {}
        sentences = [s.split() if not tokenize else self.tokenize(s) for s in sentences]
        for sent in sentences:
            for word in sent:
                if word not in word_dict:
                    word_dict[word] = ''
        word_dict[self.bos] = ''
        word_dict[self.eos] = ''
        return word_dict

    def get_w2v(self, word_dict):
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        # create word_vec with w2v vectors
        word_vec = {}
        with open(self.w2v_path, encoding='utf-8') as f:
            for line in f:
                word, vec = line.split(' ', 1)
                if word in word_dict:
                    word_vec[word] = np.fromstring(vec, sep=' ')
        print('Found %s(/%s) words with w2v vectors' % (len(word_vec), len(word_dict)))
        return word_vec

    def get_w2v_k(self, K):
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        # create word_vec with k first w2v vectors
        k = 0
        word_vec = {}
        with open(self.w2v_path, encoding='utf-8') as f:
            for line in f:
                word, vec = line.split(' ', 1)
                if k <= K:
                    word_vec[word] = np.fromstring(vec, sep=' ')
                    k += 1
                if k > K:
                    if word in [self.bos, self.eos]:
                        word_vec[word] = np.fromstring(vec, sep=' ')

                if k > K and all([w in word_vec for w in [self.bos, self.eos]]):
                    break
        return word_vec

    def build_vocab(self, sentences, tokenize=True):
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        word_dict = self.get_word_dict(sentences, tokenize)
        self.word_vec = self.get_w2v(word_dict)
        print('Vocab size : %s' % (len(self.word_vec)))

    # build w2v vocab with k most frequent words
    def build_vocab_k_words(self, K):
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        self.word_vec = self.get_w2v_k(K)
        print('Vocab size : %s' % (K))

    def update_vocab(self, sentences, tokenize=True):
        assert hasattr(self, 'w2v_path'), 'warning : w2v path not set'
        assert hasattr(self, 'word_vec'), 'build_vocab before updating it'
        word_dict = self.get_word_dict(sentences, tokenize)

        # keep only new words
        for word in self.word_vec:
            if word in word_dict:
                del word_dict[word]

        # udpate vocabulary
        if word_dict:
            new_word_vec = self.get_w2v(word_dict)
            self.word_vec.update(new_word_vec)
        else:
            new_word_vec = []
        print('New vocab size : %s (added %s words)'% (len(self.word_vec), len(new_word_vec)))

    def get_batch(self, batch):
        # sent in batch in decreasing order of lengths
        # batch: (bsize, max_len, word_dim)
        embed = np.zeros((len(batch[0]), len(batch), self.word_emb_dim))

        for i in range(len(batch)):
            for j in range(len(batch[i])):
                embed[j, i, :] = self.word_vec[batch[i][j]]

        return torch.FloatTensor(embed)

    def tokenize(self, s):
        from nltk.tokenize import word_tokenize
        if self.moses_tok:
            s = ' '.join(word_tokenize(s))
            s = s.replace(" n't ", "n 't ")  # HACK to get ~MOSES tokenization
            return s.split()
        else:
            return word_tokenize(s)

    def prepare_samples(self, sentences, bsize, tokenize, verbose):
        sentences = [[self.bos] + s.split() + [self.eos] if not tokenize else
                     [self.bos] + self.tokenize(s) + [self.eos] for s in sentences]
        n_w = np.sum([len(x) for x in sentences])

        # filters words without w2v vectors
        for i in range(len(sentences)):
            s_f = [word for word in sentences[i] if word in self.word_vec]
            if not s_f:
                import warnings
                warnings.warn('No words in "%s" (idx=%s) have w2v vectors. \
                               Replacing by "</s>"..' % (sentences[i], i))
                s_f = [self.eos]
            sentences[i] = s_f

        lengths = np.array([len(s) for s in sentences])
        n_wk = np.sum(lengths)
        if verbose:
            print('Nb words kept : %s/%s (%.1f%s)' % (
                        n_wk, n_w, 100.0 * n_wk / n_w, '%'))

        # sort by decreasing length
        lengths, idx_sort = np.sort(lengths)[::-1], np.argsort(-lengths)
        sentences = np.array(sentences)[idx_sort]

        return sentences, lengths, idx_sort

    def encode(self, sentences, bsize=64, tokenize=True, verbose=False):
        tic = time.time()
        sentences, lengths, idx_sort = self.prepare_samples(
                        sentences, bsize, tokenize, verbose)

        embeddings = []
        for stidx in range(0, len(sentences), bsize):
            batch = self.get_batch(sentences[stidx:stidx + bsize])
            if self.is_cuda():
                batch = batch.cuda()
            with torch.no_grad():
                batch = self.forward((batch, lengths[stidx:stidx + bsize])).data.cpu().numpy()
            embeddings.append(batch)
        embeddings = np.vstack(embeddings)

        # unsort
        idx_unsort = np.argsort(idx_sort)
        embeddings = embeddings[idx_unsort]

        if verbose:
            print('Speed : %.1f sentences/s (%s mode, bsize=%s)' % (
                    len(embeddings)/(time.time()-tic),
                    'gpu' if self.is_cuda() else 'cpu', bsize))
        return embeddings

    def visualize(self, sent, tokenize=True):

        sent = sent.split() if not tokenize else self.tokenize(sent)
        sent = [[self.bos] + [word for word in sent if word in self.word_vec] + [self.eos]]

        if ' '.join(sent[0]) == '%s %s' % (self.bos, self.eos):
            import warnings
            warnings.warn('No words in "%s" have w2v vectors. Replacing \
                           by "%s %s"..' % (sent, self.bos, self.eos))
        batch = self.get_batch(sent)

        if self.is_cuda():
            batch = batch.cuda()
        output = self.enc_lstm(batch)[0]
        output, idxs = torch.max(output, 0)
        # output, idxs = output.squeeze(), idxs.squeeze()
        idxs = idxs.data.cpu().numpy()
        argmaxs = [np.sum((idxs == k)) for k in range(len(sent[0]))]

        # visualize model
        import matplotlib.pyplot as plt
        x = range(len(sent[0]))
        y = [100.0 * n / np.sum(argmaxs) for n in argmaxs]
        plt.xticks(x, sent[0], rotation=45)
        plt.bar(x, y)
        plt.ylabel('%')
        plt.title('Visualisation of words importance')
        plt.show()

        return output, idxs

In [5]:
V = 1
MODEL_PATH = 'infersent%s.pkl' % V
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
W2V_PATH = 'glove.840B.300d.txt'
infersent.set_w2v_path(W2V_PATH)

In [11]:
train = pd.read_json('train-v1.1.json')
train.size

884

In [15]:
contexts = []
questions = []
answers = []
titles = []

def get_attributes(item):
    data = item['data']
    title = data['title']
    for paragraph in data['paragraphs']:
        for qas in paragraph['qas']:
            answers.append(qas['answers'][0]['text'])
            questions.append(qas['question'])
            contexts.append(paragraph['context'])
            titles.append(title)
            
def build_dataframe(train):
    train.apply(get_attributes, axis = 1)
    train_df = pd.DataFrame({
    'contexts':contexts,
    'questions': questions,
    'answers': answers,
    'titles': titles
})
    return train_df
    
train_df = build_dataframe(train)
train_df = train_df.head(5000)

In [16]:
train_df.shape

(5000, 4)

In [17]:
train_df['sentences'] = train_df['contexts'].apply(lambda x : [item.raw for item in TextBlob(x).sentences ])

In [18]:
def get_target(item):
    """ Builds the target using the index number of answer in the list of sentences
    """
    for index, sentence in enumerate( item['sentences']):
        if item['answers'] in sentence:
            return index
    return 0
    
train_df['target'] = train_df.apply(get_target, axis = 1)

In [19]:
def get_all_sentences(sentences):
    all_sentences = []
    sentences = sentences.tolist()
    for context_sentences in sentences:
        for setence in context_sentences:
            all_sentences.append(setence)
        
    all_sentences = list(dict.fromkeys(all_sentences))
    return all_sentences

In [20]:
paras = list(train_df["contexts"].drop_duplicates().reset_index(drop= True))
blob = TextBlob(" ".join(paras))
sentences = get_all_sentences(train_df['sentences'])
infersent.build_vocab(sentences, tokenize=True)

Found 14620(/15387) words with w2v vectors
Vocab size : 14620


In [22]:

# Sentence Embeddings
dict_embeddings = {}
for i in range(len(sentences)):
    dict_embeddings[sentences[i]] = infersent.encode([sentences[i]], tokenize=True)[0]
  
# Question Embeddings
questions = list(train_df["questions"])    
for i in range(len(questions)):
    dict_embeddings[questions[i]] = infersent.encode([questions[i]], tokenize=True)[0]

In [23]:
def get_context_embeddings(item):
    embeddings = []
    for sentence in item.sentences:
        embeddings.append(dict_embeddings[sentence])
    return embeddings

In [24]:
train_df['question_embedding'] = train_df['questions'].apply(lambda x : dict_embeddings[x])
train_df['context_embedding'] = train_df.apply(get_context_embeddings, axis = 1)

In [25]:
train_df.head()

Unnamed: 0,contexts,questions,answers,titles,sentences,target,question_embedding,context_embedding
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,University_of_Notre_Dame,"[Architecturally, the school has a Catholic ch...",5,"[0.1101008, 0.1142294, 0.11560898, 0.05489475,...","[[0.055199966, 0.05013141, 0.047870383, 0.0162..."
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ,University_of_Notre_Dame,"[Architecturally, the school has a Catholic ch...",2,"[0.10951651, 0.11030624, 0.052100062, 0.030539...","[[0.055199966, 0.05013141, 0.047870383, 0.0162..."
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building,University_of_Notre_Dame,"[Architecturally, the school has a Catholic ch...",1,"[0.011956469, 0.14930709, 0.028481217, 0.05278...","[[0.055199966, 0.05013141, 0.047870383, 0.0162..."
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,a Marian place of prayer and reflection,University_of_Notre_Dame,"[Architecturally, the school has a Catholic ch...",4,"[0.0711433, 0.054118324, -0.013959841, 0.05310...","[[0.055199966, 0.05013141, 0.047870383, 0.0162..."
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary,University_of_Notre_Dame,"[Architecturally, the school has a Catholic ch...",1,"[0.16131133, 0.15654244, 0.08214858, 0.0437286...","[[0.055199966, 0.05013141, 0.047870383, 0.0162..."


In [26]:
from sklearn.metrics.pairwise import euclidean_distances

In [27]:
def get_metric(item, metric):
    result = []
    for i in range(0,len(item.sentences)):
        question_embedding = [item.question_embedding]
        sentence_embedding = [item['context_embedding'][i]]

        if metric == 'cosine_similarity':
            metric = cosine_similarity(question_embedding, sentence_embedding)
            
        if metric == 'euclidean':
            metric = euclidean_distances(question_embedding, sentence_embedding)  

        result.append(metric[0][0])  
    return result

In [28]:
train_df['cosine_similarity'] = train_df.apply(lambda item : get_metric(item, 'cosine_similarity'), axis = 1)
train_df['euclidean'] = train_df.apply(lambda item : get_metric(item, 'euclidean'), axis = 1)

In [29]:
train_df.head()

Unnamed: 0,contexts,questions,answers,titles,sentences,target,question_embedding,context_embedding,cosine_similarity,euclidean
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,University_of_Notre_Dame,"[Architecturally, the school has a Catholic ch...",5,"[0.1101008, 0.1142294, 0.11560898, 0.05489475,...","[[0.055199966, 0.05013141, 0.047870383, 0.0162...","[0.5752636, 0.5752636, 0.5752636, 0.5752636, 0...","[3.8162625, 3.8162625, 3.8162625, 3.8162625, 3..."
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ,University_of_Notre_Dame,"[Architecturally, the school has a Catholic ch...",2,"[0.10951651, 0.11030624, 0.052100062, 0.030539...","[[0.055199966, 0.05013141, 0.047870383, 0.0162...","[0.5459254, 0.5459254, 0.5459254, 0.5459254, 0...","[3.590196, 3.590196, 3.590196, 3.590196, 3.590..."
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building,University_of_Notre_Dame,"[Architecturally, the school has a Catholic ch...",1,"[0.011956469, 0.14930709, 0.028481217, 0.05278...","[[0.055199966, 0.05013141, 0.047870383, 0.0162...","[0.60825235, 0.60825235, 0.60825235, 0.6082523...","[3.4122276, 3.4122276, 3.4122276, 3.4122276, 3..."
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,a Marian place of prayer and reflection,University_of_Notre_Dame,"[Architecturally, the school has a Catholic ch...",4,"[0.0711433, 0.054118324, -0.013959841, 0.05310...","[[0.055199966, 0.05013141, 0.047870383, 0.0162...","[0.50993013, 0.50993013, 0.50993013, 0.5099301...","[3.6493201, 3.6493201, 3.6493201, 3.6493201, 3..."
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary,University_of_Notre_Dame,"[Architecturally, the school has a Catholic ch...",1,"[0.16131133, 0.15654244, 0.08214858, 0.0437286...","[[0.055199966, 0.05013141, 0.047870383, 0.0162...","[0.52223635, 0.52223635, 0.52223635, 0.5222363...","[3.7629066, 3.7629066, 3.7629066, 3.7629066, 3..."


In [30]:
train_df_copy = train_df.copy()

In [31]:
def find_max_number_of_sentences():
    """
        finds the maximum number of sentences possible in any context
    """
    max_number_of_sentences = 0
    for i in range(0, train_df.shape[0]):
        length = len(train_df.iloc[i].sentences)
        if length > max_number_of_sentences:
            max_number_of_sentences = length  
    return max_number_of_sentences     
    
max_number_of_sentences = find_max_number_of_sentences()

In [32]:
max_number_of_sentences

22

In [33]:
def pad(data, max_length):
    mean = sum(data)/len(data)
    length_of_data = len(data)
    pad_number = max_length - length_of_data
    data = data + [mean]*pad_number
    return data

In [34]:
resultant_data = []
def combine_features(item):
    """
    Pads the euclidean and cosine values for particualr instance and generates resultant dataframe
    for modelling , it has eculidean distance between question and all sentnces and cosine similarity
    between between question and all sentences as well and last feature is the index of the answer in the sentnces
    """
    length_of_sentence = len(item.sentences)
    cosine_similarity = item.cosine_similarity
    euclidean = item.euclidean
    
    if length_of_sentence < max_number_of_sentences:
        euclidean = pad(euclidean, max_number_of_sentences)
        cosine_similarity = pad(cosine_similarity, max_number_of_sentences)
        
    features = euclidean + cosine_similarity + [item.target]    
    resultant_data.append(features)
train_df_copy.apply(combine_features, axis = 1)

resultant_data = pd.DataFrame(resultant_data)

In [35]:
resultant_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,41,42,43,44
0,3.816262,3.816262,3.816262,3.816262,3.816262,3.816262,3.816262,3.816262,3.816262,3.816262,...,0.575264,0.575264,0.575264,0.575264,0.575264,0.575264,0.575264,0.575264,0.575264,5
1,3.590196,3.590196,3.590196,3.590196,3.590196,3.590196,3.590196,3.590196,3.590196,3.590196,...,0.545925,0.545925,0.545925,0.545925,0.545925,0.545925,0.545925,0.545925,0.545925,2
2,3.412228,3.412228,3.412228,3.412228,3.412228,3.412228,3.412228,3.412228,3.412228,3.412228,...,0.608252,0.608252,0.608252,0.608252,0.608252,0.608252,0.608252,0.608252,0.608252,1
3,3.64932,3.64932,3.64932,3.64932,3.64932,3.64932,3.64932,3.64932,3.64932,3.64932,...,0.50993,0.50993,0.50993,0.50993,0.50993,0.50993,0.50993,0.50993,0.50993,4
4,3.762907,3.762907,3.762907,3.762907,3.762907,3.762907,3.762907,3.762907,3.762907,3.762907,...,0.522236,0.522236,0.522236,0.522236,0.522236,0.522236,0.522236,0.522236,0.522236,1


In [36]:

X = resultant_data.iloc[:,:-1]
y = resultant_data.iloc[:,-1]

In [37]:
train_x, test_x, train_y, test_y = train_test_split(X, y, train_size=0.8, random_state = 5)

In [38]:
mul_lr = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg')
mul_lr.fit(train_x, train_y)

print("Multinomial Logistic regression Train Accuracy : ", metrics.accuracy_score(train_y, mul_lr.predict(train_x)))
print("Multinomial Logistic regression Test Accuracy : ", metrics.accuracy_score(test_y, mul_lr.predict(test_x)))

Multinomial Logistic regression Train Accuracy :  0.374
Multinomial Logistic regression Test Accuracy :  0.384


In [39]:
rf = RandomForestClassifier(min_samples_leaf=8, n_estimators=60)
rf.fit(train_x, train_y)

print("Multinomial Logistic regression Train Accuracy : ", metrics.accuracy_score(train_y, rf.predict(train_x)))
print("Multinomial Logistic regression Test Accuracy : ", metrics.accuracy_score(test_y, rf.predict(test_x)))

Multinomial Logistic regression Train Accuracy :  0.54975
Multinomial Logistic regression Test Accuracy :  0.359
