In [1]:
# Notebook-wide imports and one-time setup objects.
import numpy as np, pandas as pd
import json
import ast 
from textblob import TextBlob
import nltk
import torch
import pickle
from scipy import spatial
import warnings
# Silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')
import spacy
from nltk import Tree
# NOTE(review): the 'en' shortcut name is presumably only resolvable on older
# spaCy releases - confirm the pinned spaCy version before re-running.
en_nlp = spacy.load('en')
from nltk.stem.lancaster import LancasterStemmer
# Shared Lancaster stemmer instance.
st = LancasterStemmer()
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [4]:
# Load the raw question/answer table and report its size.
train = pd.read_csv("data/train.csv")
print(train.shape)
# Drop every row that has a missing value, then report the size again.
train = train.dropna()
print(train.shape)
train.head()

(84600, 4)
(84599, 4)


Unnamed: 0,context,question,answer_start,text
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,269,in the late 1990s
1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,207,singing and dancing
2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,526,2003
3,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,166,"Houston, Texas"
4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,276,late 1990s


In [3]:
# Load the precomputed embedding lookup; later cells test membership with
# `text in dict_emb` and read values with `dict_emb[text]`.
# NOTE(review): pickle.load executes arbitrary code from the file - only load
# a pickle that was produced by a trusted step of this project.
with open("data/dict_embeddings.pickle", "rb") as f:
    dict_emb = pickle.load(f)
# Number of entries in the lookup.
print(len(dict_emb))

176931


In [5]:
def get_target(x):
    """Return the index of the sentence that contains the answer text.

    Parameters
    ----------
    x : mapping
        Row-like object providing ``x["sentences"]`` (a sequence of sentence
        strings) and ``x["text"]`` (the answer string).

    Returns
    -------
    int
        Position of the last sentence in which the answer occurs as a
        substring, or ``-1`` when no sentence contains it.
    """
    hits = [
        position
        for position, sentence in enumerate(x["sentences"])
        if x["text"] in sentence
    ]
    # When several sentences contain the answer, the last one wins.
    return hits[-1] if hits else -1

In [6]:
def process_data(train, emb_dim=4096):
    """Add sentence splits, the target sentence index and embeddings to ``train``.

    The frame is modified in place and also returned. Four columns are added:

    * ``sentences`` - raw text of each sentence TextBlob finds in ``context``.
    * ``target``    - result of ``get_target`` for the row.
    * ``sent_emb``  - one vector per sentence, read from the module-level
      ``dict_emb`` as ``dict_emb[sentence][0]``; a zero vector of length
      ``emb_dim`` is used for sentences missing from ``dict_emb``.
    * ``quest_emb`` - ``dict_emb[question]`` as stored; an all-zero array of
      shape ``(1, emb_dim)`` is used for questions missing from ``dict_emb``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must provide the columns ``context``, ``question`` and ``text``.
    emb_dim : int, optional
        Length of one embedding vector, used only to build the zero
        fallbacks. Defaults to 4096.

    Returns
    -------
    pandas.DataFrame
        The same ``train`` object with the four columns added.
    """
    print("step 1")
    train['sentences'] = train['context'].apply(
        lambda context: [sentence.raw for sentence in TextBlob(context).sentences])

    print("step 2")
    train["target"] = train.apply(get_target, axis = 1)

    print("step 3")
    train['sent_emb'] = train['sentences'].apply(
        lambda sentences: [dict_emb[sentence][0] if sentence in dict_emb
                           else np.zeros(emb_dim) for sentence in sentences])

    print("step 4")
    # Stored entries are consumed with a leading axis (``[0]`` selects the
    # vector), so the fallback needs the same (1, emb_dim) layout. A flat
    # zero vector would make ``quest_emb[0]`` a scalar downstream.
    train['quest_emb'] = train['question'].apply(
        lambda question: dict_emb[question] if question in dict_emb
        else np.zeros((1, emb_dim)))

    return train

In [7]:
# Add the sentences / target / sent_emb / quest_emb columns; process_data
# prints "step 1" .. "step 4" as it goes.
train = process_data(train)
# Preview the first rows including the new columns.
train.head()

step 1
step 2
step 3
step 4


Unnamed: 0,context,question,answer_start,text,sentences,target,sent_emb,quest_emb
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,269,in the late 1990s,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,1,"[[0.0074688885, -0.04449936, 0.113475606, 0.01...","[[0.0074688885, -0.047163706, 0.08111103, -0.0..."
1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,207,singing and dancing,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,1,"[[0.0074688885, -0.04449936, 0.113475606, 0.01...","[[0.0074688885, -0.05823731, 0.07162362, -0.00..."
2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,526,2003,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,3,"[[0.0074688885, -0.04449936, 0.113475606, 0.01...","[[0.0074688885, -0.047163706, 0.112324856, -0...."
3,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,166,"Houston, Texas",[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,1,"[[0.0074688885, -0.04449936, 0.113475606, 0.01...","[[0.0074688885, -0.034030482, 0.08177199, -0.0..."
4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,276,late 1990s,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,1,"[[0.0074688885, -0.04449936, 0.113475606, 0.01...","[[0.0074688885, -0.06298249, 0.10006339, -0.00..."


In [20]:
# Keep only the samples whose context has at most 10 sentences, and renumber
# the surviving rows from 0.
sentence_counts = train["sentences"].apply(len)
train = train[sentence_counts < 11].reset_index(drop=True)
print(train.shape)
train.head()

(82174, 10)


Unnamed: 0,context,question,answer_start,text,sentences,target,sent_emb,quest_emb,cosine_sim,euclidean_dis
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,269,in the late 1990s,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,1,"[[0.0074688885, -0.04449936, 0.113475606, 0.01...","[[0.0074688885, -0.047163706, 0.08111103, -0.0...","[0.6082267463207245, 0.7082076072692871, 0.669...","[8.0352955, 9.462915, 6.914433, 9.608288]"
1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,207,singing and dancing,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,1,"[[0.0074688885, -0.04449936, 0.113475606, 0.01...","[[0.0074688885, -0.05823731, 0.07162362, -0.00...","[0.5250032842159271, 0.51096311211586, 0.62473...","[6.9849253, 6.9543734, 6.4365134, 8.201017]"
2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,526,2003,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,3,"[[0.0074688885, -0.04449936, 0.113475606, 0.01...","[[0.0074688885, -0.047163706, 0.112324856, -0....","[0.4191964268684387, 0.4746186137199402, 0.584...","[5.770602, 6.6236463, 6.2470593, 7.1245747]"
3,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,166,"Houston, Texas",[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,1,"[[0.0074688885, -0.04449936, 0.113475606, 0.01...","[[0.0074688885, -0.034030482, 0.08177199, -0.0...","[0.5637381374835968, 0.5945537984371185, 0.651...","[7.3106747, 7.8408995, 6.481517, 8.796563]"
4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,276,late 1990s,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,1,"[[0.0074688885, -0.04449936, 0.113475606, 0.01...","[[0.0074688885, -0.06298249, 0.10006339, -0.00...","[0.4646044373512268, 0.5985883176326752, 0.589...","[6.2806025, 8.1350765, 6.1614323, 8.060831]"


In [21]:
# Write the filtered frame to disk without the row index.
# NOTE(review): the sent_emb / quest_emb cells hold numpy arrays, which CSV
# stores as their text representation - presumably truncated for long
# vectors, so the embeddings may not be recoverable from this file; confirm
# before relying on it downstream.
train.to_csv('data/formatted_train.csv', index=False)

## Predicted Sentence Index from Cosine and Euclidean Distance

In [22]:
def cosine_sim(x):
    """Return the cosine distance from the question to every sentence of a row.

    Despite the name, the values come from ``scipy.spatial.distance.cosine``,
    which is a distance: smaller means more similar.

    Parameters
    ----------
    x : mapping
        Row-like object with ``x["sent_emb"]`` (an iterable of sentence
        vectors) and ``x["quest_emb"]`` (whose first element is the question
        vector).

    Returns
    -------
    list
        One distance per sentence, in sentence order.
    """
    return [
        spatial.distance.cosine(sentence_vec, x["quest_emb"][0])
        for sentence_vec in x["sent_emb"]
    ]

In [23]:
def pred_idx(distances):
    """Return the position of the smallest value in ``distances``.

    Parameters
    ----------
    distances : array-like
        Distance of each candidate sentence to the question.

    Returns
    -------
    int
        Index chosen by ``np.argmin`` (the first one when there are ties).
    """
    closest = np.argmin(distances)
    return closest

In [24]:
def predictions(train):
    """Score every context sentence against the question and pick the closest.

    The frame is modified in place and also returned. Four columns are added:

    * ``cosine_sim``    - per-sentence cosine distances from ``cosine_sim``.
    * ``euclidean_dis`` - per-sentence squared Euclidean distances (no square
      root is taken, which does not change which sentence is closest).
    * ``pred_idx_cos``  - ``pred_idx`` of ``cosine_sim``.
    * ``pred_idx_euc``  - ``pred_idx`` of ``euclidean_dis``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must provide the columns ``sent_emb`` and ``quest_emb``.

    Returns
    -------
    pandas.DataFrame
        The same ``train`` object with the four columns added.
    """
    train["cosine_sim"] = train.apply(cosine_sim, axis = 1)

    # Compute the squared distances one row at a time. This avoids a scratch
    # column holding every (n_sentences, dim) difference matrix at once, and
    # leaves any pre-existing "diff" column of the caller untouched.
    train["euclidean_dis"] = pd.Series(
        [
            list(np.sum((quest - np.asarray(sents)) ** 2, axis=1))
            for quest, sents in zip(train["quest_emb"], train["sent_emb"])
        ],
        index=train.index,
        dtype=object,
    )

    print("cosine start")

    train["pred_idx_cos"] = train["cosine_sim"].apply(pred_idx)
    train["pred_idx_euc"] = train["euclidean_dis"].apply(pred_idx)

    return train

In [25]:
# predictions() adds its columns to the frame it receives and returns that
# same frame, so `predicted` and `train` refer to one object after this call.
predicted = predictions(train)
# Preview the first rows including the distance and prediction columns.
predicted.head()

cosine start


Unnamed: 0,context,question,answer_start,text,sentences,target,sent_emb,quest_emb,cosine_sim,euclidean_dis,pred_idx_cos,pred_idx_euc
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,269,in the late 1990s,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,1,"[[0.0074688885, -0.04449936, 0.113475606, 0.01...","[[0.0074688885, -0.047163706, 0.08111103, -0.0...","[0.6082267463207245, 0.7082076072692871, 0.669...","[8.0352955, 9.462915, 6.914433, 9.608288]",0,2
1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,207,singing and dancing,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,1,"[[0.0074688885, -0.04449936, 0.113475606, 0.01...","[[0.0074688885, -0.05823731, 0.07162362, -0.00...","[0.5250032842159271, 0.51096311211586, 0.62473...","[6.9849253, 6.9543734, 6.4365134, 8.201017]",1,2
2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,526,2003,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,3,"[[0.0074688885, -0.04449936, 0.113475606, 0.01...","[[0.0074688885, -0.047163706, 0.112324856, -0....","[0.4191964268684387, 0.4746186137199402, 0.584...","[5.770602, 6.6236463, 6.2470593, 7.1245747]",0,0
3,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,166,"Houston, Texas",[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,1,"[[0.0074688885, -0.04449936, 0.113475606, 0.01...","[[0.0074688885, -0.034030482, 0.08177199, -0.0...","[0.5637381374835968, 0.5945537984371185, 0.651...","[7.3106747, 7.8408995, 6.481517, 8.796563]",0,2
4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,276,late 1990s,[Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ ...,1,"[[0.0074688885, -0.04449936, 0.113475606, 0.01...","[[0.0074688885, -0.06298249, 0.10006339, -0.00...","[0.4646044373512268, 0.5985883176326752, 0.589...","[6.2806025, 8.1350765, 6.1614323, 8.060831]",0,2


In [28]:
# Write the frame with distances and predicted indices to disk without the
# row index.
# NOTE(review): array-valued columns are stored as text in CSV - presumably
# truncated for long vectors; confirm before reloading embeddings from it.
predicted.to_csv('data/train_unsupervised_predicted.csv', index=False)

## Accuracy

In [26]:
def accuracy(target, predicted):
    """Return the fraction of positions where ``predicted`` equals ``target``.

    Parameters
    ----------
    target, predicted : array-like
        Objects whose ``==`` comparison yields something with a ``.sum()``
        method (for example pandas Series or numpy arrays).

    Returns
    -------
    float
        Number of matching positions divided by ``len(target)``.
    """
    matches = target == predicted
    n_correct = matches.sum()
    return n_correct / len(target)

In [27]:
# Report how often each distance measure selects the target sentence.
for label, column in (
    ('Accuracy for Euclidean Distance: ', "pred_idx_euc"),
    ('Accuracy for Cosine Similarity:', "pred_idx_cos"),
):
    print(label, accuracy(predicted["target"], predicted[column]))

Accuracy for Euclidean Distance:  0.48901112273955266
Accuracy for Cosine Similarity: 0.5993988366149877
