In [1]:
from transformers import BertConfig, BertModel
import torch

from torch import nn
import numpy as np
import pandas as pd
import pickle, time
import re, os, string, typing, gc, json
import torch.nn.functional as F
import spacy
from collections import Counter

from tqdm import tqdm
from transformers import BertTokenizer

from scipy.stats.stats import spearmanr
import collections
import itertools

from nltk.corpus import wordnet,stopwords
import random

from nltk.tokenize import sent_tokenize

import EDA as eda

In [2]:
# Pretrained uncased BERT word-piece tokenizer (not used again in the cells
# visible here).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the pre-processed SQuAD train / validation frames from local pickles.
# NOTE(review): pickle files can execute code on load — only read trusted files.
train_df, valid_df = map(
    pd.read_pickle,
    ('NEW_DATA/train.pkl', 'NEW_DATA/valid.pkl'),
)

In [3]:
# Row counts of the train and validation frames.
train_df.shape[0], valid_df.shape[0]

(87599, 34726)

In [4]:
# Preview the first five rows of the raw training frame.
train_df.head(n=5)

Unnamed: 0,id,context,question,label,answer,context_ids,question_ids,label_idx
0,5733be284776f41900661182,"architecturally, the school has a catholic cha...",to whom did the virgin mary allegedly appear i...,"[515, 541]",saint bernadette soubirous,"[101, 6549, 2135, 1010, 1996, 2082, 2038, 1037...","[101, 2000, 3183, 2106, 1996, 6261, 2984, 9382...","[114, 122]"
1,5733be284776f4190066117f,"architecturally, the school has a catholic cha...",what is in front of the notre dame main building?,"[188, 213]",a copper statue of christ,"[101, 6549, 2135, 1010, 1996, 2082, 2038, 1037...","[101, 2054, 2003, 1999, 2392, 1997, 1996, 1028...","[40, 45]"
2,5733be284776f41900661180,"architecturally, the school has a catholic cha...",the basilica of the sacred heart at notre dame...,"[279, 296]",the main building,"[101, 6549, 2135, 1010, 1996, 2082, 2038, 1037...","[101, 1996, 13546, 1997, 1996, 6730, 2540, 201...","[12, 15]"
3,5733be284776f41900661181,"architecturally, the school has a catholic cha...",what is the grotto at notre dame?,"[381, 420]",a marian place of prayer and reflection,"[101, 6549, 2135, 1010, 1996, 2082, 2038, 1037...","[101, 2054, 2003, 1996, 24665, 23052, 2012, 10...","[85, 93]"
4,5733be284776f4190066117e,"architecturally, the school has a catholic cha...",what sits on top of the main building at notre...,"[92, 126]",a golden statue of the virgin mary,"[101, 6549, 2135, 1010, 1996, 2082, 2038, 1037...","[101, 2054, 7719, 2006, 2327, 1997, 1996, 2364...","[20, 28]"


In [5]:
# Build the sentence-pair frames.
#   "sentence A": one row per distinct context paragraph of the source frame.
#   "sentence B": empty-string placeholder, filled by the augmentation cells.
# drop_duplicates() + reset_index(drop=True) yields consecutive labels
# 0..n-1 without hand-building the index and without in-place mutation.
training_data = (
    pd.DataFrame({"sentence A": train_df["context"], "sentence B": ""})
    .drop_duplicates()
    .reset_index(drop=True)
)
validation_data = (
    pd.DataFrame({"sentence A": valid_df["context"], "sentence B": ""})
    .drop_duplicates()
    .reset_index(drop=True)
)

# Number of distinct contexts kept in each split.
len(training_data), len(validation_data)

(18891, 2067)

In [6]:
# Preview the de-duplicated contexts; "sentence B" is still empty here.
training_data.head(n=5)

Unnamed: 0,sentence A,sentence B
0,"architecturally, the school has a catholic cha...",
1,"as at most other universities, notre dame's st...",
2,the university is the major seat of the congre...,
3,the college of engineering was established in ...,
4,all of notre dame's undergraduate students are...,


In [7]:
# Persist both sentence-pair frames as pickles in the working directory.
for frame, pickle_path in (
    (training_data, "Training_Squad.pkl"),
    (validation_data, "Validating_Squad.pkl"),
):
    frame.to_pickle(pickle_path)

In [8]:
# Reload the sentence-pair frames from the pickles in the working directory.
training_data, validation_data = map(
    pd.read_pickle,
    ('Training_Squad.pkl', 'Validating_Squad.pkl'),
)

In [9]:
# Preview the first five rows of the reloaded training frame.
training_data.head(n=5)

Unnamed: 0,sentence A,sentence B
0,"architecturally, the school has a catholic cha...",
1,"as at most other universities, notre dame's st...",
2,the university is the major seat of the congre...,
3,the college of engineering was established in ...,
4,all of notre dame's undergraduate students are...,


# Data Augmentation

In [10]:
# Draw one integer from {1, 2, 3, 4}; the upper bound 5 is exclusive.
# NOTE(review): EDA_case is not referenced by the cells visible here (the
# augmentation loops draw their own EDA_cases) — confirm it is still needed.
EDA_case = np.random.randint(low=1, high=5)
EDA_case

4

In [11]:
# Show the context stored under label 1 next to its sentence-level split.
sample_context = training_data.at[1, "sentence A"]
sample_context, sent_tokenize(sample_context)

("as at most other universities, notre dame's students run a number of news media outlets. the nine student-run outlets include three newspapers, both a radio and television station, and several magazines and journals. begun as a one-page journal in september 1876, the scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the united states. the other magazine, the juggler, is released twice a year and focuses on student literature and artwork. the dome yearbook is published annually. the newspapers have varying publication interests, with the observer published daily and mainly reporting university and other news, and staffed by students from both notre dame and saint mary's college. unlike scholastic and the dome, the observer is an independent publication and does not have a faculty advisor or any editorial oversight from the university. in 1987, when some students believed that the observer began to show a conservative bias, a l

In [12]:
# Third argument passed to every eda.Easy_Data_Augmentation call below.
# NOTE(review): presumably the number of word-level edits per operation —
# confirm against EDA.py.
n = 4

# For each context keep only its longest sentence ("sentence A") and derive
# "sentence B" from it by chaining two distinct, randomly chosen EDA cases.
# NOTE(review): no random seed is set in the visible cells, so the augmented
# text differs between runs.
longest_sentences = []
augmented_sentences = []
for context in training_data["sentence A"]:
    # Two different case ids sampled without replacement from {1, 2, 3, 4}.
    first_case, second_case = random.sample(range(1, 5), 2)

    longest = max(sent_tokenize(context), key=len)
    augmented = eda.Easy_Data_Augmentation(longest, first_case, n)
    augmented = eda.Easy_Data_Augmentation(augmented, second_case, n)

    longest_sentences.append(longest)
    augmented_sentences.append(augmented)

# Assign whole columns in one step. The previous per-row chained assignment
# (frame["col"][i] = value) raises SettingWithCopyWarning and does not update
# the frame when pandas Copy-on-Write is active.
training_data["sentence A"] = longest_sentences
training_data["sentence B"] = augmented_sentences

In [13]:
# Third argument passed to every eda.Easy_Data_Augmentation call below.
# NOTE(review): presumably the number of word-level edits per operation —
# confirm against EDA.py.
n = 4

# For each context keep only its longest sentence ("sentence A") and derive
# "sentence B" from it by chaining two distinct, randomly chosen EDA cases.
# NOTE(review): no random seed is set in the visible cells, so the augmented
# text differs between runs.
longest_sentences = []
augmented_sentences = []
for context in validation_data["sentence A"]:
    # Two different case ids sampled without replacement from {1, 2, 3, 4}.
    first_case, second_case = random.sample(range(1, 5), 2)

    longest = max(sent_tokenize(context), key=len)
    augmented = eda.Easy_Data_Augmentation(longest, first_case, n)
    augmented = eda.Easy_Data_Augmentation(augmented, second_case, n)

    longest_sentences.append(longest)
    augmented_sentences.append(augmented)

# Assign whole columns in one step. The previous per-row chained assignment
# (frame["col"][i] = value) raises SettingWithCopyWarning and does not update
# the frame when pandas Copy-on-Write is active.
validation_data["sentence A"] = longest_sentences
validation_data["sentence B"] = augmented_sentences

In [14]:
# Preview the first five (longest sentence, augmented sentence) training pairs.
training_data.head(n=5)

Unnamed: 0,sentence A,sentence B
0,"it is a replica of the grotto at lourdes, fran...",appeared is a replica of the grotto at lourdes...
1,the newspapers have varying publication intere...,the newspapers commentator varying publication...
2,retired priests and brothers reside in fatima ...,retired priests and strike out brothers amp fi...
3,"today the college, housed in the fitzpatrick, ...","today the college, housed study the fitzpatric..."
4,each student is given an academic advisor from...,each student is given an faculty member advise...


In [15]:
# Preview the first five (longest sentence, augmented sentence) validation pairs.
validation_data.head(n=5)

Unnamed: 0,sentence A,sentence B
0,"as this was the 50th super bowl, the league em...","as this was the 50th super bowl, game league a..."
1,they defeated the arizona cardinals 49–15 in t...,they defeated the arizona cardinals 49–15 in t...
2,"newton was limited by denver's defense, which ...","newton including by denver's defense, which sa..."
3,the super bowl 50 halftime show was headlined ...,the ball bowling ball 50 halftime show headlin...
4,"in early 2012, nfl commissioner roger goodell ...",in early nfl commissioner goodell stated that ...


In [28]:
# Spot-check one randomly chosen training pair (original vs. augmented).
i = np.random.randint(low=0, high=len(training_data))
training_data.at[i, "sentence A"], training_data.at[i, "sentence B"]

('the dominant land plant species of the time were gymnosperms, which are vascular, cone-bearing, non-flowering plants such as conifers that produce seeds without a coating.',
 'rife the dominant institute plant species of the time gymnosperms, which cone-bearing, non-flowering plants institute such as institute conifers that produce seeds without a coating.')

In [None]:
# Write both frames back to the same pickle files, replacing their contents.
for frame, pickle_path in (
    (training_data, "Training_Squad.pkl"),
    (validation_data, "Validating_Squad.pkl"),
):
    frame.to_pickle(pickle_path)

In [None]:
# Reload both frames from the pickles in the working directory.
training_data, validation_data = map(
    pd.read_pickle,
    ('Training_Squad.pkl', 'Validating_Squad.pkl'),
)