In [1]:
import warnings
warnings.filterwarnings('ignore')
import pickle
import numpy as np
import pandas as pd
import json

from textblob import TextBlob
import nltk
from scipy import spatial
import torch
import spacy
en_nlp = spacy.load('en_core_web_sm')

In [2]:
train = pd.read_json(r"/content/drive/MyDrive/train-v1.1.json")

In [3]:
contexts = []
questions = []
answers_text = []
answers_start = []
for i in range(train.shape[0]):
    topic = train.iloc[i,0]['paragraphs']
    for sub_para in topic:
        for q_a in sub_para['qas']:
            questions.append(q_a['question'])
            answers_start.append(q_a['answers'][0]['answer_start'])
            answers_text.append(q_a['answers'][0]['text'])
            contexts.append(sub_para['context'])
df = pd.DataFrame({"context":contexts, "question": questions, "answer_start": answers_start, "text": answers_text})

In [4]:
paras = list(df["context"].drop_duplicates().reset_index(drop= True))


In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
df

Unnamed: 0,context,question,answer_start,text
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,515,Saint Bernadette Soubirous
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,188,a copper statue of Christ
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,279,the Main Building
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,381,a Marian place of prayer and reflection
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,92,a golden statue of the Virgin Mary
...,...,...,...,...
87594,"Kathmandu Metropolitan City (KMC), in order to...",In what US state did Kathmandu first establish...,229,Oregon
87595,"Kathmandu Metropolitan City (KMC), in order to...",What was Yangon previously known as?,414,Rangoon
87596,"Kathmandu Metropolitan City (KMC), in order to...",With what Belorussian city does Kathmandu have...,476,Minsk
87597,"Kathmandu Metropolitan City (KMC), in order to...",In what year did Kathmandu create its initial ...,199,1975


In [7]:
df['sentences'] = df['context'].apply(lambda x: [item.raw for item in TextBlob(x).sentences])

In [8]:
data=df

In [9]:
def get_target(x):
    idx = -1
    for i in range(len(x["sentences"])):
        if x["text"] in x["sentences"][i]: idx = i
    return idx

In [10]:
data["target"] = data.apply(get_target, axis = 1)

In [11]:
def get_text(x):
  r= x['target']
  s=x['sentences']
  return s[r]



In [12]:
data["answer text"] = data.apply(get_text, axis = 1)

In [13]:
data.head(2)

Unnamed: 0,context,question,answer_start,text,sentences,target,answer text
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,515,Saint Bernadette Soubirous,"[Architecturally, the school has a Catholic ch...",5,"It is a replica of the grotto at Lourdes, Fran..."
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,188,a copper statue of Christ,"[Architecturally, the school has a Catholic ch...",2,Immediately in front of the Main Building and ...


In [14]:
from gensim.models import KeyedVectors
word2vec_model_path = r"/content/drive/MyDrive/GoogleNews-vectors-negative300.bin.gz"
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_model_path, binary=True)


In [15]:
def embed_text(text, model):
    words = text.lower().split()
    vectors = [model[word] for word in words if word in model]
    return sum(vectors) / len(vectors) if vectors else None


In [16]:
data['question_word2vec'] = data['question'].apply(lambda x: embed_text(x, word2vec_model))
data['answer_word2vec'] = data['answer text'].apply(lambda x: embed_text(x, word2vec_model))

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
data

Unnamed: 0,context,question,answer_start,text,sentences,target,answer text,question_word2vec,answer_word2vec
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,515,Saint Bernadette Soubirous,"[Architecturally, the school has a Catholic ch...",5,"It is a replica of the grotto at Lourdes, Fran...","[0.04432763, 0.05114068, 0.025661893, 0.027520...","[0.07558136, 0.047441483, 0.050408937, 0.11994..."
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,188,a copper statue of Christ,"[Architecturally, the school has a Catholic ch...",2,Immediately in front of the Main Building and ...,"[0.053157806, 0.030845642, 0.10076904, 0.05203...","[0.07821013, 0.036659643, 0.03910346, 0.070101..."
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,279,the Main Building,"[Architecturally, the school has a Catholic ch...",3,Next to the Main Building is the Basilica of t...,"[0.07283991, 0.03819691, 0.12393466, 0.1254106...","[0.1040819, 0.022325303, 0.123535156, 0.070712..."
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,381,a Marian place of prayer and reflection,"[Architecturally, the school has a Catholic ch...",4,"Immediately behind the basilica is the Grotto,...","[0.05777486, 0.10460409, 0.13729858, 0.0961710...","[0.07532077, -0.003913032, 0.119249135, 0.0689..."
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,92,a golden statue of the Virgin Mary,"[Architecturally, the school has a Catholic ch...",1,Atop the Main Building's gold dome is a golden...,"[0.0009358724, 0.013122559, 0.124565974, 0.051...","[0.03282089, 0.09317627, 0.05167236, 0.0454956..."
...,...,...,...,...,...,...,...,...,...
87594,"Kathmandu Metropolitan City (KMC), in order to...",In what US state did Kathmandu first establish...,229,Oregon,"[Kathmandu Metropolitan City (KMC), in order t...",1,KMC's first international relationship was est...,"[0.0033081055, 0.057928465, 0.058052063, 0.067...","[-0.02399292, 0.06692505, 0.0375, 0.056396484,..."
87595,"Kathmandu Metropolitan City (KMC), in order to...",What was Yangon previously known as?,414,Rangoon,"[Kathmandu Metropolitan City (KMC), in order t...",2,This activity has been further enhanced by est...,"[-0.024749756, -0.01738739, 0.10687256, 0.0566...","[-0.02366222, 0.006723577, 0.008219112, 0.1252..."
87596,"Kathmandu Metropolitan City (KMC), in order to...",With what Belorussian city does Kathmandu have...,476,Minsk,"[Kathmandu Metropolitan City (KMC), in order t...",2,This activity has been further enhanced by est...,"[0.017283121, 0.0051574707, 0.08671061, 0.0952...","[-0.02366222, 0.006723577, 0.008219112, 0.1252..."
87597,"Kathmandu Metropolitan City (KMC), in order to...",In what year did Kathmandu create its initial ...,199,1975,"[Kathmandu Metropolitan City (KMC), in order t...",1,KMC's first international relationship was est...,"[0.019544814, 0.08488634, 0.08759393, 0.062188...","[-0.02399292, 0.06692505, 0.0375, 0.056396484,..."


In [19]:
import ast

In [22]:

data = data[data["sentences"].str.len() < 11].reset_index(drop=True)


In [23]:
len(data)

85120

In [24]:
def cosine_sim(x):
    li = []
    for item in x["sent_emb"]:
        li.append(spatial.distance.cosine(item,x["quest_emb"][0]))
    return li


In [25]:
def pred_idx(distances):
    return np.argmin(distances)

In [28]:
def embed_list_of_strings(lst, model):
    return [embed_text(sentence, model) for sentence in lst]


In [29]:
data['sent_emb'] = data['sentences'].apply(lambda x: embed_list_of_strings(x, word2vec_model))

In [30]:
data

Unnamed: 0,context,question,answer_start,text,sentences,target,answer text,question_word2vec,answer_word2vec,sent_emb
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,515,Saint Bernadette Soubirous,"[Architecturally, the school has a Catholic ch...",5,"It is a replica of the grotto at Lourdes, Fran...","[0.04432763, 0.05114068, 0.025661893, 0.027520...","[0.07558136, 0.047441483, 0.050408937, 0.11994...","[[0.08312988, 0.02627182, 0.11063385, 0.193359..."
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,188,a copper statue of Christ,"[Architecturally, the school has a Catholic ch...",2,Immediately in front of the Main Building and ...,"[0.053157806, 0.030845642, 0.10076904, 0.05203...","[0.07821013, 0.036659643, 0.03910346, 0.070101...","[[0.08312988, 0.02627182, 0.11063385, 0.193359..."
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,279,the Main Building,"[Architecturally, the school has a Catholic ch...",3,Next to the Main Building is the Basilica of t...,"[0.07283991, 0.03819691, 0.12393466, 0.1254106...","[0.1040819, 0.022325303, 0.123535156, 0.070712...","[[0.08312988, 0.02627182, 0.11063385, 0.193359..."
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,381,a Marian place of prayer and reflection,"[Architecturally, the school has a Catholic ch...",4,"Immediately behind the basilica is the Grotto,...","[0.05777486, 0.10460409, 0.13729858, 0.0961710...","[0.07532077, -0.003913032, 0.119249135, 0.0689...","[[0.08312988, 0.02627182, 0.11063385, 0.193359..."
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,92,a golden statue of the Virgin Mary,"[Architecturally, the school has a Catholic ch...",1,Atop the Main Building's gold dome is a golden...,"[0.0009358724, 0.013122559, 0.124565974, 0.051...","[0.03282089, 0.09317627, 0.05167236, 0.0454956...","[[0.08312988, 0.02627182, 0.11063385, 0.193359..."
...,...,...,...,...,...,...,...,...,...,...
85115,"Kathmandu Metropolitan City (KMC), in order to...",In what US state did Kathmandu first establish...,229,Oregon,"[Kathmandu Metropolitan City (KMC), in order t...",1,KMC's first international relationship was est...,"[0.0033081055, 0.057928465, 0.058052063, 0.067...","[-0.02399292, 0.06692505, 0.0375, 0.056396484,...","[[-0.03152466, 0.10537284, 0.07245091, 0.00766..."
85116,"Kathmandu Metropolitan City (KMC), in order to...",What was Yangon previously known as?,414,Rangoon,"[Kathmandu Metropolitan City (KMC), in order t...",2,This activity has been further enhanced by est...,"[-0.024749756, -0.01738739, 0.10687256, 0.0566...","[-0.02366222, 0.006723577, 0.008219112, 0.1252...","[[-0.03152466, 0.10537284, 0.07245091, 0.00766..."
85117,"Kathmandu Metropolitan City (KMC), in order to...",With what Belorussian city does Kathmandu have...,476,Minsk,"[Kathmandu Metropolitan City (KMC), in order t...",2,This activity has been further enhanced by est...,"[0.017283121, 0.0051574707, 0.08671061, 0.0952...","[-0.02366222, 0.006723577, 0.008219112, 0.1252...","[[-0.03152466, 0.10537284, 0.07245091, 0.00766..."
85118,"Kathmandu Metropolitan City (KMC), in order to...",In what year did Kathmandu create its initial ...,199,1975,"[Kathmandu Metropolitan City (KMC), in order t...",1,KMC's first international relationship was est...,"[0.019544814, 0.08488634, 0.08759393, 0.062188...","[-0.02399292, 0.06692505, 0.0375, 0.056396484,...","[[-0.03152466, 0.10537284, 0.07245091, 0.00766..."


In [31]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np



In [32]:
def compute_cosine_similarity(embedding1, embedding2):
    if embedding1 is None or embedding2 is None:
        return None
    else:
        return cosine_similarity([embedding1], [embedding2])[0]

In [33]:
for i, row in data.iterrows():
    question_embedding = row['question_word2vec']
    sent_embeddings = row['sent_emb']
    for j, sent_embedding in enumerate(sent_embeddings):
        # Compute cosine similarity between question_embedding and each sent_embedding
        cosine_sim = compute_cosine_similarity(question_embedding, sent_embedding)
        # Add the cosine similarity to a new column
        data.at[i, f'cos{j}'] = cosine_sim

In [34]:
data

Unnamed: 0,context,question,answer_start,text,sentences,target,answer text,question_word2vec,answer_word2vec,sent_emb,cos0,cos1,cos2,cos3,cos4,cos5,cos6,cos7,cos8,cos9
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,515,Saint Bernadette Soubirous,"[Architecturally, the school has a Catholic ch...",5,"It is a replica of the grotto at Lourdes, Fran...","[0.04432763, 0.05114068, 0.025661893, 0.027520...","[0.07558136, 0.047441483, 0.050408937, 0.11994...","[[0.08312988, 0.02627182, 0.11063385, 0.193359...",0.431621,0.472877,0.564637,0.421326,0.425999,0.705260,0.524225,,,
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,188,a copper statue of Christ,"[Architecturally, the school has a Catholic ch...",2,Immediately in front of the Main Building and ...,"[0.053157806, 0.030845642, 0.10076904, 0.05203...","[0.07821013, 0.036659643, 0.03910346, 0.070101...","[[0.08312988, 0.02627182, 0.11063385, 0.193359...",0.369423,0.554328,0.632660,0.599598,0.513503,0.625822,0.621614,,,
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,279,the Main Building,"[Architecturally, the school has a Catholic ch...",3,Next to the Main Building is the Basilica of t...,"[0.07283991, 0.03819691, 0.12393466, 0.1254106...","[0.1040819, 0.022325303, 0.123535156, 0.070712...","[[0.08312988, 0.02627182, 0.11063385, 0.193359...",0.420924,0.627810,0.638587,0.777642,0.726845,0.743519,0.619731,,,
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,381,a Marian place of prayer and reflection,"[Architecturally, the school has a Catholic ch...",4,"Immediately behind the basilica is the Grotto,...","[0.05777486, 0.10460409, 0.13729858, 0.0961710...","[0.07532077, -0.003913032, 0.119249135, 0.0689...","[[0.08312988, 0.02627182, 0.11063385, 0.193359...",0.409562,0.557070,0.542247,0.638662,0.615400,0.777871,0.573276,,,
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,92,a golden statue of the Virgin Mary,"[Architecturally, the school has a Catholic ch...",1,Atop the Main Building's gold dome is a golden...,"[0.0009358724, 0.013122559, 0.124565974, 0.051...","[0.03282089, 0.09317627, 0.05167236, 0.0454956...","[[0.08312988, 0.02627182, 0.11063385, 0.193359...",0.344913,0.578154,0.618867,0.697404,0.579587,0.542959,0.658568,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85115,"Kathmandu Metropolitan City (KMC), in order to...",In what US state did Kathmandu first establish...,229,Oregon,"[Kathmandu Metropolitan City (KMC), in order t...",1,KMC's first international relationship was est...,"[0.0033081055, 0.057928465, 0.058052063, 0.067...","[-0.02399292, 0.06692505, 0.0375, 0.056396484,...","[[-0.03152466, 0.10537284, 0.07245091, 0.00766...",0.636493,0.754431,0.672387,0.630387,,,,,,
85116,"Kathmandu Metropolitan City (KMC), in order to...",What was Yangon previously known as?,414,Rangoon,"[Kathmandu Metropolitan City (KMC), in order t...",2,This activity has been further enhanced by est...,"[-0.024749756, -0.01738739, 0.10687256, 0.0566...","[-0.02366222, 0.006723577, 0.008219112, 0.1252...","[[-0.03152466, 0.10537284, 0.07245091, 0.00766...",0.299827,0.485796,0.441521,0.375872,,,,,,
85117,"Kathmandu Metropolitan City (KMC), in order to...",With what Belorussian city does Kathmandu have...,476,Minsk,"[Kathmandu Metropolitan City (KMC), in order t...",2,This activity has been further enhanced by est...,"[0.017283121, 0.0051574707, 0.08671061, 0.0952...","[-0.02366222, 0.006723577, 0.008219112, 0.1252...","[[-0.03152466, 0.10537284, 0.07245091, 0.00766...",0.466352,0.593181,0.614880,0.580059,,,,,,
85118,"Kathmandu Metropolitan City (KMC), in order to...",In what year did Kathmandu create its initial ...,199,1975,"[Kathmandu Metropolitan City (KMC), in order t...",1,KMC's first international relationship was est...,"[0.019544814, 0.08488634, 0.08759393, 0.062188...","[-0.02399292, 0.06692505, 0.0375, 0.056396484,...","[[-0.03152466, 0.10537284, 0.07245091, 0.00766...",0.597686,0.708139,0.659406,0.658406,,,,,,


In [35]:
def compute_euclidean_distance(embedding1, embedding2):
    if embedding1 is None or embedding2 is None:
        return None
    else:
        return np.linalg.norm(embedding1 - embedding2)

In [36]:
for i, row in data.iterrows():
    question_embedding = row['question_word2vec']
    sent_embeddings = row['sent_emb']
    for j, sent_embedding in enumerate(sent_embeddings):
        # Compute Euclidean distance between question_embedding and each sent_embedding
        euclidean_dist = compute_euclidean_distance(question_embedding, sent_embedding)
        # Add the Euclidean distance to a new column
        data.at[i, f'euclidean{j}'] = euclidean_dist

In [37]:
data

Unnamed: 0,context,question,answer_start,text,sentences,target,answer text,question_word2vec,answer_word2vec,sent_emb,...,euclidean0,euclidean1,euclidean2,euclidean3,euclidean4,euclidean5,euclidean6,euclidean7,euclidean8,euclidean9
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,515,Saint Bernadette Soubirous,"[Architecturally, the school has a Catholic ch...",5,"It is a replica of the grotto at Lourdes, Fran...","[0.04432763, 0.05114068, 0.025661893, 0.027520...","[0.07558136, 0.047441483, 0.050408937, 0.11994...","[[0.08312988, 0.02627182, 0.11063385, 0.193359...",...,1.296807,1.176548,0.923805,1.185900,1.144119,0.807753,0.951149,,,
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,188,a copper statue of Christ,"[Architecturally, the school has a Catholic ch...",2,Immediately in front of the Main Building and ...,"[0.053157806, 0.030845642, 0.10076904, 0.05203...","[0.07821013, 0.036659643, 0.03910346, 0.070101...","[[0.08312988, 0.02627182, 0.11063385, 0.193359...",...,1.389548,1.108372,0.885345,1.014190,1.085253,0.938403,0.888404,,,
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,279,the Main Building,"[Architecturally, the school has a Catholic ch...",3,Next to the Main Building is the Basilica of t...,"[0.07283991, 0.03819691, 0.12393466, 0.1254106...","[0.1040819, 0.022325303, 0.123535156, 0.070712...","[[0.08312988, 0.02627182, 0.11063385, 0.193359...",...,1.355416,1.032379,0.908919,0.772131,0.833630,0.798622,0.922944,,,
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,381,a Marian place of prayer and reflection,"[Architecturally, the school has a Catholic ch...",4,"Immediately behind the basilica is the Grotto,...","[0.05777486, 0.10460409, 0.13729858, 0.0961710...","[0.07532077, -0.003913032, 0.119249135, 0.0689...","[[0.08312988, 0.02627182, 0.11063385, 0.193359...",...,1.386715,1.142865,1.043907,1.001641,1.008912,0.761802,1.001150,,,
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,92,a golden statue of the Virgin Mary,"[Architecturally, the school has a Catholic ch...",1,Atop the Main Building's gold dome is a golden...,"[0.0009358724, 0.013122559, 0.124565974, 0.051...","[0.03282089, 0.09317627, 0.05167236, 0.0454956...","[[0.08312988, 0.02627182, 0.11063385, 0.193359...",...,1.393793,1.062279,0.874101,0.867809,0.987312,1.013277,0.817541,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85115,"Kathmandu Metropolitan City (KMC), in order to...",In what US state did Kathmandu first establish...,229,Oregon,"[Kathmandu Metropolitan City (KMC), in order t...",1,KMC's first international relationship was est...,"[0.0033081055, 0.057928465, 0.058052063, 0.067...","[-0.02399292, 0.06692505, 0.0375, 0.056396484,...","[[-0.03152466, 0.10537284, 0.07245091, 0.00766...",...,0.873747,0.666640,0.727949,0.814696,,,,,,
85116,"Kathmandu Metropolitan City (KMC), in order to...",What was Yangon previously known as?,414,Rangoon,"[Kathmandu Metropolitan City (KMC), in order t...",2,This activity has been further enhanced by est...,"[-0.024749756, -0.01738739, 0.10687256, 0.0566...","[-0.02366222, 0.006723577, 0.008219112, 0.1252...","[[-0.03152466, 0.10537284, 0.07245091, 0.00766...",...,1.392376,1.152521,1.165259,1.260769,,,,,,
85117,"Kathmandu Metropolitan City (KMC), in order to...",With what Belorussian city does Kathmandu have...,476,Minsk,"[Kathmandu Metropolitan City (KMC), in order t...",2,This activity has been further enhanced by est...,"[0.017283121, 0.0051574707, 0.08671061, 0.0952...","[-0.02366222, 0.006723577, 0.008219112, 0.1252...","[[-0.03152466, 0.10537284, 0.07245091, 0.00766...",...,1.067820,0.872944,0.807602,0.884282,,,,,,
85118,"Kathmandu Metropolitan City (KMC), in order to...",In what year did Kathmandu create its initial ...,199,1975,"[Kathmandu Metropolitan City (KMC), in order t...",1,KMC's first international relationship was est...,"[0.019544814, 0.08488634, 0.08759393, 0.062188...","[-0.02399292, 0.06692505, 0.0375, 0.056396484,...","[[-0.03152466, 0.10537284, 0.07245091, 0.00766...",...,0.910435,0.718573,0.730262,0.774187,,,,,,


In [39]:
selected_columns = ['target',
       'cos0', 'cos1', 'cos2', 'cos3', 'cos4', 'cos5', 'cos6', 'cos7', 'cos8',
       'cos9', 'euclidean0', 'euclidean1', 'euclidean2', 'euclidean3',
       'euclidean4', 'euclidean5', 'euclidean6', 'euclidean7', 'euclidean8',
       'euclidean9']

In [40]:
train = data.loc[:, selected_columns]

In [41]:
train

Unnamed: 0,target,cos0,cos1,cos2,cos3,cos4,cos5,cos6,cos7,cos8,...,euclidean0,euclidean1,euclidean2,euclidean3,euclidean4,euclidean5,euclidean6,euclidean7,euclidean8,euclidean9
0,5,0.431621,0.472877,0.564637,0.421326,0.425999,0.705260,0.524225,,,...,1.296807,1.176548,0.923805,1.185900,1.144119,0.807753,0.951149,,,
1,2,0.369423,0.554328,0.632660,0.599598,0.513503,0.625822,0.621614,,,...,1.389548,1.108372,0.885345,1.014190,1.085253,0.938403,0.888404,,,
2,3,0.420924,0.627810,0.638587,0.777642,0.726845,0.743519,0.619731,,,...,1.355416,1.032379,0.908919,0.772131,0.833630,0.798622,0.922944,,,
3,4,0.409562,0.557070,0.542247,0.638662,0.615400,0.777871,0.573276,,,...,1.386715,1.142865,1.043907,1.001641,1.008912,0.761802,1.001150,,,
4,1,0.344913,0.578154,0.618867,0.697404,0.579587,0.542959,0.658568,,,...,1.393793,1.062279,0.874101,0.867809,0.987312,1.013277,0.817541,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85115,1,0.636493,0.754431,0.672387,0.630387,,,,,,...,0.873747,0.666640,0.727949,0.814696,,,,,,
85116,2,0.299827,0.485796,0.441521,0.375872,,,,,,...,1.392376,1.152521,1.165259,1.260769,,,,,,
85117,2,0.466352,0.593181,0.614880,0.580059,,,,,,...,1.067820,0.872944,0.807602,0.884282,,,,,,
85118,1,0.597686,0.708139,0.659406,0.658406,,,,,,...,0.910435,0.718573,0.730262,0.774187,,,,,,


In [43]:
subset1 = train.iloc[:,:10].fillna(0)
subset2 = train.iloc[:,10:].fillna(60)

In [45]:
train2 = pd.concat([subset1, subset2], axis=1)


In [46]:
train2

Unnamed: 0,target,cos0,cos1,cos2,cos3,cos4,cos5,cos6,cos7,cos8,...,euclidean0,euclidean1,euclidean2,euclidean3,euclidean4,euclidean5,euclidean6,euclidean7,euclidean8,euclidean9
0,5,0.431621,0.472877,0.564637,0.421326,0.425999,0.705260,0.524225,0.0,0.0,...,1.296807,1.176548,0.923805,1.185900,1.144119,0.807753,0.951149,60.0,60.0,60.0
1,2,0.369423,0.554328,0.632660,0.599598,0.513503,0.625822,0.621614,0.0,0.0,...,1.389548,1.108372,0.885345,1.014190,1.085253,0.938403,0.888404,60.0,60.0,60.0
2,3,0.420924,0.627810,0.638587,0.777642,0.726845,0.743519,0.619731,0.0,0.0,...,1.355416,1.032379,0.908919,0.772131,0.833630,0.798622,0.922944,60.0,60.0,60.0
3,4,0.409562,0.557070,0.542247,0.638662,0.615400,0.777871,0.573276,0.0,0.0,...,1.386715,1.142865,1.043907,1.001641,1.008912,0.761802,1.001150,60.0,60.0,60.0
4,1,0.344913,0.578154,0.618867,0.697404,0.579587,0.542959,0.658568,0.0,0.0,...,1.393793,1.062279,0.874101,0.867809,0.987312,1.013277,0.817541,60.0,60.0,60.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85115,1,0.636493,0.754431,0.672387,0.630387,0.000000,0.000000,0.000000,0.0,0.0,...,0.873747,0.666640,0.727949,0.814696,60.000000,60.000000,60.000000,60.0,60.0,60.0
85116,2,0.299827,0.485796,0.441521,0.375872,0.000000,0.000000,0.000000,0.0,0.0,...,1.392376,1.152521,1.165259,1.260769,60.000000,60.000000,60.000000,60.0,60.0,60.0
85117,2,0.466352,0.593181,0.614880,0.580059,0.000000,0.000000,0.000000,0.0,0.0,...,1.067820,0.872944,0.807602,0.884282,60.000000,60.000000,60.000000,60.0,60.0,60.0
85118,1,0.597686,0.708139,0.659406,0.658406,0.000000,0.000000,0.000000,0.0,0.0,...,0.910435,0.718573,0.730262,0.774187,60.000000,60.000000,60.000000,60.0,60.0,60.0


In [47]:

train2.apply(max, axis = 0)

target         9.000000
cos0           1.000000
cos1           1.000000
cos2           1.000000
cos3           1.000000
cos4           1.000000
cos5           0.997890
cos6           0.980326
cos7           0.979544
cos8           0.985612
cos9          60.000000
euclidean0    60.000000
euclidean1    60.000000
euclidean2    60.000000
euclidean3    60.000000
euclidean4    60.000000
euclidean5    60.000000
euclidean6    60.000000
euclidean7    60.000000
euclidean8    60.000000
euclidean9    60.000000
dtype: float64

In [48]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [49]:
scaler = MinMaxScaler()
X = scaler.fit_transform(train2.iloc[:,1:])

In [51]:
from sklearn.model_selection import train_test_split


In [53]:
train_x, test_x, train_y, test_y = train_test_split(X,
train.iloc[:,0], train_size=0.8, random_state = 5)

In [56]:
train_x.shape

(68096, 20)

In [58]:
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn import linear_model
from sklearn import metrics

In [81]:
mul_lr = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg')
mul_lr.fit(train_x, train_y)

print("Multinomial Logistic regression Train Accuracy : ", metrics.accuracy_score(train_y, mul_lr.predict(train_x)))
print("Multinomial Logistic regression Test Accuracy : ", metrics.accuracy_score(test_y, mul_lr.predict(test_x)))

Multinomial Logistic regression Train Accuracy :  0.647703242481203
Multinomial Logistic regression Test Accuracy :  0.6544290413533834


In [175]:
context="Mount Everest, also known as Sagarmatha in Nepal and Chomolungma in Tibet, is Earth's highest mountain above sea level, located in the Mahalangur Himal sub-range of the Himalayas. Its peak is 8,848.86 meters (29,031.7 feet) above sea level, making it one of the Seven Summits. The international border between China and Nepal runs across Everest's precise summit point."

In [176]:
question="What is the height of Mount Everest above sea level?"

In [177]:
blob = TextBlob(context)
sentences = blob.sentences
sentences_list = [str(sentence) for sentence in sentences]
print(len(sentences_list))

3


In [178]:
def embed_text_list(list_of_strings, model):
    embeddings = []
    for text in list_of_strings:
        embedding = embed_text(text, model)
        embeddings.append(embedding)
    return embeddings

In [179]:
sentences_embeddings = embed_text_list(sentences_list, word2vec_model)

In [180]:
quest_emb=embed_text(question, word2vec_model)

In [181]:
def compute_cosine_similarity(separate_array, array_list):
    similarities = []
    for array in array_list:
        similarity = cosine_similarity([separate_array], [array])[0][0]
        similarities.append(similarity)
    return similarities

In [182]:
cossimilarities = compute_cosine_similarity(quest_emb, sentences_embeddings)

In [183]:
def compute_euclidean_distances(separate_array, array_list):
    distances = []
    for array in array_list:
        distance = np.linalg.norm(separate_array - array)
        distances.append(distance)
    return distances


In [184]:
euc_dist=compute_euclidean_distances(quest_emb, sentences_embeddings)


In [185]:
def pad_list(l1,l2):
    while len(l1) <10:
        l1.append(0)
        l2.append(60)
    return l1 , l2


In [186]:
cossim , euc = pad_list(cossimilarities, euc_dist)

In [187]:
p = cossim + euc

In [189]:
predictions = mul_lr.predict([p])
print(predictions)

[1]


In [190]:
sentences_list

["Mount Everest, also known as Sagarmatha in Nepal and Chomolungma in Tibet, is Earth's highest mountain above sea level, located in the Mahalangur Himal sub-range of the Himalayas.",
 'Its peak is 8,848.86 meters (29,031.7 feet) above sea level, making it one of the Seven Summits.',
 "The international border between China and Nepal runs across Everest's precise summit point."]

In [191]:
question

'What is the height of Mount Everest above sea level?'

In [192]:
context

"Mount Everest, also known as Sagarmatha in Nepal and Chomolungma in Tibet, is Earth's highest mountain above sea level, located in the Mahalangur Himal sub-range of the Himalayas. Its peak is 8,848.86 meters (29,031.7 feet) above sea level, making it one of the Seven Summits. The international border between China and Nepal runs across Everest's precise summit point."