# Exploring WikiQA

## Setting up the environment

In [1]:
!mkdir -p /scratch/sagarsj42/torch-cache
!mkdir -p /scratch/sagarsj42/transformers
!mkdir -p /scratch/sagarsj42/hf-datasets

import os
os.chdir('/scratch/sagarsj42')
os.environ['TORCH_HOME'] = '/scratch/sagarsj42/torch-cache'
os.environ['TRANSFORMERS_CACHE'] = '/scratch/sagarsj42/transformers'
os.environ['HF_DATASETS_CACHE'] = '/scratch/sagarsj42/hf-datasets'

## Imports

In [2]:
import string

import pandas as pd
import numpy as np
import datasets

## Using a predownloaded dataset

In [3]:
!scp sagarsj42@ada:/share1/sagarsj42/WikiQACorpus.zip .
!unzip -o WikiQACorpus.zip

WikiQACorpus.zip                              100% 6928KB   6.8MB/s   00:00    
Archive:  WikiQACorpus.zip
   creating: WikiQACorpus/emnlp-table/
  inflating: WikiQACorpus/emnlp-table/WikiQA.CNN.dev.rank  
  inflating: WikiQACorpus/emnlp-table/WikiQA.CNN.test.rank  
  inflating: WikiQACorpus/emnlp-table/WikiQA.CNN-Cnt.dev.rank  
  inflating: WikiQACorpus/emnlp-table/WikiQA.CNN-Cnt.test.rank  
  inflating: WikiQACorpus/eval.py    
  inflating: WikiQACorpus/Guidelines_Phase1.pdf  
  inflating: WikiQACorpus/Guidelines_Phase2.pdf  
  inflating: WikiQACorpus/WikiQA.tsv  
  inflating: WikiQACorpus/WikiQA-dev.ref  
  inflating: WikiQACorpus/WikiQA-dev.tsv  
  inflating: WikiQACorpus/WikiQA-dev.txt  
  inflating: WikiQACorpus/WikiQA-dev-filtered.ref  
  inflating: WikiQACorpus/WikiQASent.pos.ans.tsv  
  inflating: WikiQACorpus/WikiQA-test.ref  
  inflating: WikiQACorpus/WikiQA-test.tsv  
  inflating: WikiQACorpus/WikiQA-test.txt  
  inflating: WikiQACorpus/WikiQA-test-filtered.ref  
  inflatin

In [3]:
data_path = 'WikiQACorpus'
filename = 'WikiQA-train.tsv'

wikiqa_train = pd.read_csv(os.path.join(data_path, filename), sep='\t')
wikiqa_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20347 entries, 0 to 20346
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   QuestionID     20347 non-null  object
 1   Question       20347 non-null  object
 2   DocumentID     20347 non-null  object
 3   DocumentTitle  20347 non-null  object
 4   SentenceID     20347 non-null  object
 5   Sentence       20347 non-null  object
 6   Label          20347 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 1.1+ MB


In [4]:
wikiqa_train.head()

Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label
0,Q1,how are glacier caves formed?,D1,Glacier cave,D1-0,A partly submerged glacier cave on Perito More...,0
1,Q1,how are glacier caves formed?,D1,Glacier cave,D1-1,The ice facade is approximately 60 m high,0
2,Q1,how are glacier caves formed?,D1,Glacier cave,D1-2,Ice formations in the Titlis glacier cave,0
3,Q1,how are glacier caves formed?,D1,Glacier cave,D1-3,A glacier cave is a cave formed within the ice...,1
4,Q1,how are glacier caves formed?,D1,Glacier cave,D1-4,"Glacier caves are often called ice caves , but...",0


In [5]:
filename = 'WikiQA-dev.tsv'

wikiqa_dev = pd.read_csv(os.path.join(data_path, filename), sep='\t')
wikiqa_dev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2733 entries, 0 to 2732
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   QuestionID     2733 non-null   object
 1   Question       2733 non-null   object
 2   DocumentID     2733 non-null   object
 3   DocumentTitle  2733 non-null   object
 4   SentenceID     2733 non-null   object
 5   Sentence       2733 non-null   object
 6   Label          2733 non-null   int64 
dtypes: int64(1), object(6)
memory usage: 149.6+ KB


In [6]:
filename = 'WikiQA-test.tsv'

wikiqa_test = pd.read_csv(os.path.join(data_path, filename), sep='\t')
wikiqa_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6116 entries, 0 to 6115
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   QuestionID     6116 non-null   object
 1   Question       6116 non-null   object
 2   DocumentID     6116 non-null   object
 3   DocumentTitle  6116 non-null   object
 4   SentenceID     6116 non-null   object
 5   Sentence       6116 non-null   object
 6   Label          6116 non-null   int64 
dtypes: int64(1), object(6)
memory usage: 334.6+ KB


## Downloading the dataset from HuggingFace Datasets

In [7]:
wikiqa = datasets.load_dataset('wiki_qa')
wikiqa

Using the latest cached version of the module from /home2/sagarsj42/.cache/huggingface/modules/datasets_modules/datasets/wiki_qa/d2d236b5cbdc6fbdab45d168b4d678a002e06ddea3525733a24558150585951c (last modified on Sat Nov 20 13:05:25 2021) since it couldn't be found locally at /scratch/sagarsj42/wiki_qa/wiki_qa.py, or remotely (ConnectionError).
Using custom data configuration default
Reusing dataset wiki_qa (/scratch/sagarsj42/hf-datasets/wiki_qa/default/0.1.0/d2d236b5cbdc6fbdab45d168b4d678a002e06ddea3525733a24558150585951c)


DatasetDict({
    test: Dataset({
        features: ['question_id', 'question', 'document_title', 'answer', 'label'],
        num_rows: 6165
    })
    validation: Dataset({
        features: ['question_id', 'question', 'document_title', 'answer', 'label'],
        num_rows: 2733
    })
    train: Dataset({
        features: ['question_id', 'question', 'document_title', 'answer', 'label'],
        num_rows: 20360
    })
})

In [8]:
wikiqa['train']

Dataset({
    features: ['question_id', 'question', 'document_title', 'answer', 'label'],
    num_rows: 20360
})

In [9]:
wikiqa['validation']

Dataset({
    features: ['question_id', 'question', 'document_title', 'answer', 'label'],
    num_rows: 2733
})

In [10]:
wikiqa['test']

Dataset({
    features: ['question_id', 'question', 'document_title', 'answer', 'label'],
    num_rows: 6165
})

## Get question IDs with atleast one answer contained in the set of sentences

In [11]:
def get_valid_questions(wikiqa):
    question_status = dict()

    for split in wikiqa:
        split_dataset = wikiqa[split]
        n_samples = len(split_dataset)

        for i in range(n_samples):
            qid = split_dataset[i]['question_id']
            label = split_dataset[i]['label']
            if qid not in question_status:
                question_status[qid] = label
            else:
                question_status[qid] = max(question_status[qid], label)

    valid_questions = set([qid for qid in question_status if question_status[qid] > 0])
    
    return valid_questions

valid_questions = get_valid_questions(wikiqa)
len(valid_questions)

1242

In [12]:
wikiqa_filtered = wikiqa.filter(lambda sample: sample['question_id'] in valid_questions)
wikiqa_filtered

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/21 [00:00<?, ?ba/s]

DatasetDict({
    test: Dataset({
        features: ['question_id', 'question', 'document_title', 'answer', 'label'],
        num_rows: 2351
    })
    validation: Dataset({
        features: ['question_id', 'question', 'document_title', 'answer', 'label'],
        num_rows: 1130
    })
    train: Dataset({
        features: ['question_id', 'question', 'document_title', 'answer', 'label'],
        num_rows: 8672
    })
})

In [13]:
fil_ids = set()

for sample in wikiqa_filtered['test']:
    fil_ids.add(sample['question_id'])

len(fil_ids)

243

In [14]:
def get_tokens(sample):
    question = sample['question'].translate(str.maketrans('', '', string.punctuation))
    question = question.lower().split()
    
    answer = sample['answer'].translate(str.maketrans('', '', string.punctuation))
    answer = answer.lower().split()
    
    return question, answer

sample_tokens = get_tokens(wikiqa_filtered['train'][0])
sample_tokens

(['how', 'are', 'glacier', 'caves', 'formed'],
 ['a',
  'partly',
  'submerged',
  'glacier',
  'cave',
  'on',
  'perito',
  'moreno',
  'glacier'])

## Working with GloVe

In [19]:
!wget https://nlp.stanford.edu/data/glove.6B.zip
!unzip -o glove.6B.zip

--2021-11-29 11:54:40--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-11-29 11:54:41--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’


2021-11-29 11:57:30 (4.87 MB/s) - ‘glove.6B.zip.1’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [15]:
def load_glove(filename):
    glove = dict()

    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line_content = line.split()
            word = line_content[0].strip()
            vec = np.array(line_content[1:], dtype='float32')
            glove[word] = vec
            
    return glove

glove = load_glove('glove.6B.300d.txt')
len(glove)

400000

In [16]:
def get_embeddings(q_a_tokens, glove):
    embed_size = len(list(glove.values())[0])
    q_vecs = [glove[q_word] if q_word in glove else np.zeros(embed_size) for q_word in q_a_tokens[0]]
    a_vecs = [glove[a_word] if a_word in glove else np.zeros(embed_size) for a_word in q_a_tokens[1]]
    
    return q_vecs, a_vecs

sample_vecs = get_embeddings(sample_tokens, glove)
sample_vecs

([array([-2.8520e-01, -1.3883e-02,  3.1607e-01, -1.9182e-01,  5.9983e-02,
          6.0524e-01, -1.8121e-01, -2.0191e-01,  5.6732e-02, -2.1441e+00,
          2.6505e-01, -2.7387e-02, -2.6467e-01,  8.9337e-02,  2.3024e-03,
          1.7254e-02, -2.9702e-02, -1.5041e-01, -1.8500e-02,  9.5384e-02,
          3.8578e-01,  7.2993e-01,  2.1815e-01,  1.4281e-01, -3.5614e-01,
         -1.1845e-01,  1.1216e-01, -7.2290e-02, -3.2908e-01,  8.8392e-02,
          9.7741e-02,  4.5268e-01, -5.4059e-01, -3.6629e-02, -8.9579e-01,
          3.5898e-01, -1.9695e-02,  5.7514e-02, -3.0125e-01, -4.2060e-01,
          2.6366e-01, -3.9566e-01,  1.6363e-02,  9.2313e-02, -5.9094e-02,
          1.0586e-01, -8.6629e-02, -1.4179e-01, -2.3508e-01,  1.4154e-01,
          2.9091e-01, -2.6861e-01,  2.2645e-02, -3.5018e-01, -3.1623e-01,
          2.6834e-01, -1.1712e-01,  1.6216e-01, -3.4595e-02,  2.9888e-01,
          3.1337e-01, -4.6440e-02,  1.9017e-01,  5.1518e-01, -1.7441e-01,
         -2.2635e-01, -2.5727e-02,  3.

In [17]:
c = 0

for split in wikiqa:
    for sample in wikiqa[split]:
        tokens = get_tokens(sample)
        embeds = get_embeddings(tokens, glove)
        c += 1
        
c

29258