In [1]:
import os

import pandas as pd
import numpy as np

import time

# need to filter warnings related to spacy lemmatizer
import logging
# Raise the threshold of the "spacy" logger so only ERROR and above get through.
logger = logging.getLogger(name="spacy")
logger.setLevel(level=logging.ERROR)

In [2]:
# Move the working directory two levels up (presumably to the repository root,
# so the `src.*` imports in the following cells resolve -- confirm layout).
# A bare relative chdir climbs two more levels every time this cell is
# re-executed, so remember where the kernel started and always resolve the
# target against that anchor; re-running the cell is then a no-op.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, '..', '..'))

In [3]:
from src.data.dataload import load_sst
from src.data.dataload import load_agnews

In [4]:
# Build both dataset objects via the project loaders (SST first, then AG News).
sst, agnews = load_sst(), load_agnews()

In [5]:
# Map each dataset name to its loaded object; `datasets` keeps the names in
# insertion order and drives the loops in the cells below.
input_data = dict(sst=sst, agnews=agnews)
datasets = [*input_data]

In [6]:
# One dict per split, keyed by dataset name.
train, dev, test = {}, {}, {}

# `train_val_test` is unpacked as a (train, dev, test) triple for each dataset.
for dataset, corpus in input_data.items():
    train[dataset], dev[dataset], test[dataset] = corpus.train_val_test

Using custom data configuration default
Reusing dataset ag_news (/Users/olivier/.cache/huggingface/datasets/ag_news/default/0.0.0/fb5c5e74a110037311ef5e904583ce9f8b9fbc1354290f97b4929f01b3f48b1a)


In [7]:
# Print the train / dev / test shapes of every dataset, one line per dataset.
for dataset in datasets:
    print(*(split[dataset].shape for split in (train, dev, test)))

(8544, 2) (1101, 2) (2210, 2)
(108000, 2) (12000, 2) (7600, 2)


In [8]:
# Preview the first rows of the AG News training split (note the index values
# in the rendered output are not in order).
train['agnews'].head()

Unnamed: 0,label,text
61679,3,Hurricane aid covers NASA repair funds A hurri...
30331,0,N.Korea Blast Scene a Construction Site -- Dip...
57703,2,Automakers look to rev up sales with fun new a...
1199,3,Auction for Shares in Google's IPO May End Goo...
108660,1,Drugs in sport: Jones denies fresh Conte claim...


In [9]:
# The AG News training split comes back with shuffled index labels (see the
# preview above: 61679, 30331, ...), so renumber the rows 0..n-1.
# Not sure if this has been fixed in the data loader.
# `drop=True` discards the old labels directly instead of first inserting them
# as an 'index' column that then has to be deleted by hand (which would also
# raise a KeyError if the index ever carried a name).
train['agnews'] = train['agnews'].reset_index(drop=True)

In [10]:
from src.data.perturbations import add_perturbations

In [11]:
from src.data.perturbations import remove_commas, \
remove_all_punctuation, \
switch_gender, \
strip_trailing_punct, \
add_typo, \
change_first_name, \
change_last_name, \
change_location, \
contraction, \
swap_adjectives

In [12]:
# Take an independent copy of the first 1000 SST training rows (by position)
# so later steps can work on it without touching the full split.
train_subset = train['sst'].iloc[:1000].copy()

In [13]:
# Preview the first rows of the SST subset ('sentence' and 'label' columns).
train_subset.head()

Unnamed: 0,sentence,label
0,The Rock is destined to be the 21st Century 's...,3
1,The gorgeously elaborate continuation of `` Th...,4
2,Singer/composer Bryan Adams contributes a slew...,3
3,You 'd think by now America would have had eno...,2
4,Yet the act is still charming here .,3


In [14]:
# Apply three perturbations to the SST subset and report how long it takes.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() is a wall clock that can jump if the system time
# is adjusted while the cell runs.
# NOTE(review): the saved output suggests change_last_name picks replacement
# names at random (e.g. "Adams" -> "Lee"); if so, consider seeding the RNG it
# uses so the run is reproducible -- confirm against src.data.perturbations.
t0 = time.perf_counter()

train_perturbations = add_perturbations(
    df=train_subset,
    sentence_col_name='sentence',
    perturbation_functions=[
        contraction,
        change_last_name,
        swap_adjectives,
    ],
)

t1 = time.perf_counter()
# Elapsed seconds for the whole add_perturbations call.
print(t1 - t0)

5.688600778579712


In [17]:
# Display the perturbed frame. In the rendered output each perturbation adds a
# `sentence_<name>_concat`, a `sentence_<name>_tokens` and a `success_<name>`
# column next to the original 'sentence' / 'label' / 'sentence_tokens'.
train_perturbations

Unnamed: 0,sentence,label,sentence_tokens,sentence_contraction_concat,sentence_contraction_tokens,success_contraction,sentence_change_last_name_concat,sentence_change_last_name_tokens,success_change_last_name,sentence_swap_adj_concat,sentence_swap_adj_tokens,success_swap_adj
0,The Rock is destined to be the 21st Century 's...,3,"[The, Rock, is, destined, to, be, the, 21st, C...",The Rock is destined to be the 21st Century 's...,"[The, Rock, is, destined, to, be, the, 21st, C...","[1, [[17, 18]]]",The Rock is destined to be the 21st Century 's...,"[The, Rock, is, destined, to, be, the, 21st, C...","[1, [37]]",The Rock is destined to be the 21st Century 's...,"[The, Rock, is, destined, to, be, the, 21st, C...",0
1,The gorgeously elaborate continuation of `` Th...,4,"[The, gorgeously, elaborate, continuation, of,...",The gorgeously elaborate continuation of `` Th...,"[The, gorgeously, elaborate, continuation, of,...",0,The gorgeously elaborate continuation of `` Th...,"[The, gorgeously, elaborate, continuation, of,...","[1, [32]]",The gorgeously elaborate continuation of `` Th...,"[The, gorgeously, elaborate, continuation, of,...",0
2,Singer/composer Bryan Adams contributes a slew...,3,"[Singer, /, composer, Bryan, Adams, contribute...",Singer/composer Bryan Adams contributes a slew...,"[Singer, /, composer, Bryan, Adams, contribute...",0,Singer/composer Bryan Lee contributes a slew o...,"[Singer, /, composer, Bryan, Lee, contributes,...","[1, [4]]",Singer/composer Bryan Adams contributes a slew...,"[Singer, /, composer, Bryan, Adams, contribute...",0
3,You 'd think by now America would have had eno...,2,"[You, 'd, think, by, now, America, would, have...",You 'd think by now America would have had eno...,"[You, 'd, think, by, now, America, would, have...",0,You 'd think by now America would have had eno...,"[You, 'd, think, by, now, America, would, have...",0,You 'd think by now America would have had eno...,"[You, 'd, think, by, now, America, would, have...",0
4,Yet the act is still charming here .,3,"[Yet, the, act, is, still, charming, here, .]",Yet the act is still charming here .,"[Yet, the, act, is, still, charming, here, .]",0,Yet the act is still charming here .,"[Yet, the, act, is, still, charming, here, .]",0,Yet the act is still charming here .,"[Yet, the, act, is, still, charming, here, .]",0
...,...,...,...,...,...,...,...,...,...,...,...,...
995,A lot of the credit for the film 's winning to...,3,"[A, lot, of, the, credit, for, the, film, 's, ...",A lot of the credit for the film 's winning to...,"[A, lot, of, the, credit, for, the, film, 's, ...","[1, [[17, 18]]]",A lot of the credit for the film 's winning to...,"[A, lot, of, the, credit, for, the, film, 's, ...",0,A lot of the credit for the film 's winning to...,"[A, lot, of, the, credit, for, the, film, 's, ...",0
996,Exploits ( headbanger ) stereotypes in good fu...,3,"[Exploits, (, headbanger, ), stereotypes, in, ...",Exploits ( headbanger ) stereotypes in good fu...,"[Exploits, (, headbanger, ), stereotypes, in, ...",0,Exploits ( headbanger ) stereotypes in good fu...,"[Exploits, (, headbanger, ), stereotypes, in, ...",0,Exploits ( headbanger ) stereotypes in good fu...,"[Exploits, (, headbanger, ), stereotypes, in, ...",0
997,A journey that is as difficult for the audienc...,3,"[A, journey, that, is, as, difficult, for, the...",A journey that 's as difficult for the audienc...,"[A, journey, that, 's, as, difficult, for, the...","[1, [[2, 3, 12, 13, 19, 20]]]",A journey that is as difficult for the audienc...,"[A, journey, that, is, as, difficult, for, the...",0,A journey that is as difficult for the audienc...,"[A, journey, that, is, as, difficult, for, the...",0
998,"Ratliff 's two previous titles , Plutonium Cir...",3,"[Ratliff, 's, two, previous, titles, ,, Pluton...","Ratliff 's two previous titles , Plutonium Cir...","[Ratliff, 's, two, previous, titles, ,, Pluton...",0,"Ratliff 's two previous titles , Plutonium Cir...","[Ratliff, 's, two, previous, titles, ,, Pluton...",0,"Ratliff 's two previous titles , Plutonium Cir...","[Ratliff, 's, two, previous, titles, ,, Pluton...",0
