In [1]:
import os

import pandas as pd
import numpy as np

import time

# The spaCy lemmatizer emits warnings through the "spacy" logger; raise that
# logger's threshold to ERROR so those warnings are suppressed.
import logging
logger = logging.getLogger("spacy")
logger.setLevel(logging.ERROR)

In [2]:
# Move the working directory two levels up, presumably to the repository root
# so that the `src.*` imports in the following cells resolve.
# The guard keeps the cell idempotent: an unconditional os.chdir('../..')
# climbs two more levels every time the cell is re-run, after which `src` can
# no longer be found.
if not os.path.isdir('src'):
    os.chdir('../..')

In [3]:
from src.data.dataload import load_sst
from src.data.dataload import load_agnews

In [4]:
# Build one loader object per corpus; the train/dev/test splits are read from
# each object's `train_val_test` attribute further down.
sst = load_sst()
agnews = load_agnews()

In [5]:
# Loader objects keyed by a short dataset name; `datasets` is the list of
# those names, in insertion order, used to drive the loops below.
input_data = dict(sst=sst, agnews=agnews)
datasets = list(input_data)

In [6]:
# One dict per split, each mapping dataset name -> that dataset's split.
train, dev, test = {}, {}, {}

for dataset in datasets:
    # `train_val_test` is read once per dataset and unpacked into the three
    # per-split dicts in (train, dev, test) order.
    (
        train[dataset],
        dev[dataset],
        test[dataset],
    ) = input_data[dataset].train_val_test

Using custom data configuration default
Reusing dataset ag_news (/Users/olivier/.cache/huggingface/datasets/ag_news/default/0.0.0/fb5c5e74a110037311ef5e904583ce9f8b9fbc1354290f97b4929f01b3f48b1a)


In [7]:
# One output line per dataset: the shapes of its train, dev and test splits.
for dataset in datasets:
    print(*(split[dataset].shape for split in (train, dev, test)))

(8544, 2) (1101, 2) (2210, 2)
(108000, 2) (12000, 2) (7600, 2)


In [8]:
# Preview the first rows of the SST training split.
train['sst'].head()

Unnamed: 0,sentence,label
0,The Rock is destined to be the 21st Century 's...,3
1,The gorgeously elaborate continuation of `` Th...,4
2,Singer/composer Bryan Adams contributes a slew...,3
3,You 'd think by now America would have had eno...,2
4,Yet the act is still charming here .,3


In [9]:
# The AG News training split comes out of the loader with a shuffled index
# (it is not confirmed whether the loader has been fixed since), so renumber
# the rows 0..n-1.
# `drop=True` discards the old index directly instead of materialising it as
# an `index` column and deleting it afterwards; unlike that two-step form it
# also works when the index is named or an `index` column already exists.
train['agnews'] = train['agnews'].reset_index(drop=True)

In [10]:
# Preview the first rows of the AG News training split.
train['agnews'].head()

Unnamed: 0,label,text
0,0,"Rocket Explodes Above US Base in Kabul, No Cas..."
1,0,Pakistan elects new prime minister Pakistan #3...
2,0,Crippled Canadian sub reaches Scotland The cri...
3,0,"US, Nigeria to hold joint military training ne..."
4,2,Vodafone denies backing Verizon #39;s bid for ...


In [11]:
from src.data.perturbations import add_perturbations

In [12]:
from src.data.perturbations import checklist_contract_sentence

In [13]:
# DISABLED: checklist-based perturbations of the SST training split.
# NOTE(review): among the cells visible here only `checklist_contract_sentence`
# is imported; the other three `checklist_*` functions would have to be
# imported before this call can be re-enabled.
# train_perturbations = add_perturbations(
#     df=train['sst'], 
#     sentence_col_name='sentence', 
#     perturbation_functions=[
#         checklist_add_typos, 
#         checklist_change_names, 
#         checklist_contract_sentence, 
#         checklist_strip_punctuation
#     ]
# )

In [14]:
# train_perturbations.head()

In [15]:
# Custom perturbation functions. Parenthesised form instead of backslash
# continuations; the imported names and their order are unchanged.
from src.data.perturbations import (
    custom_remove_commas,
    custom_remove_all_punctuation,
    custom_switch_gender,
    custom_strip_trailing_punct,
    custom_add_typo,
    custom_change_first_names,
    custom_change_last_names,
    custom_change_location,
)

In [16]:
# Independent copy of the first 100 rows of the AG News training split
# (positional slice), used as a small sample for the perturbation run.
train_subset = train['agnews'].iloc[:100].copy()

In [17]:
# Preview the first rows of the 100-row AG News sample.
train_subset.head()

Unnamed: 0,label,text
0,0,"Rocket Explodes Above US Base in Kabul, No Cas..."
1,0,Pakistan elects new prime minister Pakistan #3...
2,0,Crippled Canadian sub reaches Scotland The cri...
3,0,"US, Nigeria to hold joint military training ne..."
4,2,Vodafone denies backing Verizon #39;s bid for ...


In [18]:
# Time the first-name, last-name and location perturbations on the sample.
# time.perf_counter() is the high-resolution clock intended for measuring
# short durations; time.time() follows the wall clock, so the difference of
# two readings can be distorted if the system clock is adjusted in between.
t0 = time.perf_counter()

train_perturbations = add_perturbations(
    df=train_subset,
    sentence_col_name='text',
    perturbation_functions=[
        custom_change_first_names,
        custom_change_last_names,
        custom_change_location,
    ],
)

t1 = time.perf_counter()
# Elapsed seconds for the add_perturbations call above.
print(t1-t0)

2.200652837753296


In [25]:
# Inspect the frame returned by add_perturbations (first rows).
train_perturbations.head()

Unnamed: 0,label,text,text_tokens,text_change_first_name_concat,text_change_first_name_tokens,success_change_first_name,text_change_last_name_concat,text_change_last_name_tokens,success_change_last_name,text_change_location_concat,text_change_location_tokens,success_change_location_name
0,0,"Rocket Explodes Above US Base in Kabul, No Cas...","[Rocket, Explodes, Above, US, Base, in, Kabul,...","Rocket Explodes Above US Base in Kabul, No Cas...","[Rocket, Explodes, Above, US, Base, in, Kabul,...",0,"Rocket Explodes Above US Base in Kabul, No Cas...","[Rocket, Explodes, Above, US, Base, in, Kabul,...",0,"Rocket Explodes Above US Base in Kabul, No Cas...","[Rocket, Explodes, Above, US, Base, in, Kabul,...","[1, [29]]"
1,0,Pakistan elects new prime minister Pakistan #3...,"[Pakistan, elects, new, prime, minister, Pakis...",Pakistan elects new prime minister Pakistan #3...,"[Pakistan, elects, new, prime, minister, Pakis...","[1, [12]]",Pakistan elects new prime minister Pakistan #3...,"[Pakistan, elects, new, prime, minister, Pakis...",0,Sudan elects new prime minister Sudan #39;s Na...,"[Sudan, elects, new, prime, minister, Pakistan...","[1, [0]]"
2,0,Crippled Canadian sub reaches Scotland The cri...,"[Crippled, Canadian, sub, reaches, Scotland, T...",Crippled Canadian sub reaches Scotland The cri...,"[Crippled, Canadian, sub, reaches, Scotland, T...",0,Crippled Canadian sub reaches Scotland The cri...,"[Crippled, Canadian, sub, reaches, Scotland, T...",0,Crippled Canadian sub reaches Scotland The cri...,"[Crippled, Canadian, sub, reaches, Scotland, T...",0
3,0,"US, Nigeria to hold joint military training ne...","[US, ,, Nigeria, to, hold, joint, military, tr...","US, Nigeria to hold joint military training ne...","[US, ,, Nigeria, to, hold, joint, military, tr...",0,"US, Nigeria to hold joint military training ne...","[US, ,, Nigeria, to, hold, joint, military, tr...",0,"US, Iraq to hold joint military training next ...","[US, ,, Iraq, to, hold, joint, military, train...","[1, [2]]"
4,2,Vodafone denies backing Verizon #39;s bid for ...,"[Vodafone, denies, backing, Verizon, #, 39;s, ...",Vodafone denies backing Verizon #39;s bid for ...,"[Vodafone, denies, backing, Verizon, #, 39;s, ...",0,Vodafone denies backing Verizon #39;s bid for ...,"[Vodafone, denies, backing, Verizon, #, 39;s, ...",0,Vodafone denies backing Verizon #39;s bid for ...,"[Vodafone, denies, backing, Verizon, #, 39;s, ...",0
