In [1]:
import os
import time
import logging

import pandas as pd
import numpy as np
from allennlp.data.tokenizers.spacy_tokenizer import SpacyTokenizer
from transformers import BertTokenizer

# spaCy's lemmatizer emits warnings that clutter the notebook output;
# only let records at ERROR level or above through the "spacy" logger.
logger = logging.getLogger("spacy")
logger.setLevel(logging.ERROR)

In [2]:
# Move the working directory up two levels (presumably to the repository
# root — TODO confirm) so that the `src.*` imports in the following cells
# resolve. The guard makes the cell idempotent: once a `src` directory is
# visible from the current directory, re-running the cell no longer keeps
# climbing the directory tree.
if not os.path.isdir('src'):
    os.chdir('../..')

In [3]:
from src.data.dataload import load_sst
from src.data.dataload import load_agnews

AttributeError: 'PathDistribution' object has no attribute 'name'

In [4]:
from src.data.perturbations import add_perturbations
from src.data.perturbations import \
    remove_commas, \
    remove_all_punctuation, \
    switch_gender, \
    strip_trailing_punct, \
    add_typo, \
    change_first_name, \
    change_last_name, \
    change_location, \
    contraction, \
    swap_adjectives

## Load datasets

In [5]:
# Build the SST and AG News dataset objects via the project loaders; their
# `train_val_test` attribute is unpacked into splits further down.
sst = load_sst()
agnews = load_agnews()

In [6]:
# Registry of dataset objects keyed by short name, plus the list of those
# names (in insertion order) for the loops that follow.
input_data = dict(sst=sst, agnews=agnews)
datasets = [*input_data]

In [7]:
# One dict per split, each keyed by dataset name.
train, dev, test = {}, {}, {}

for dataset, loaded in input_data.items():
    # `train_val_test` is unpacked into exactly three objects, in this order.
    train[dataset], dev[dataset], test[dataset] = loaded.train_val_test

Using custom data configuration default
Reusing dataset ag_news (/Users/stevengeorge/.cache/huggingface/datasets/ag_news/default/0.0.0/17ec33e23df9e89565131f989e0fdf78b0cc4672337b582da83fc3c9f79fe34d)


In [8]:
# Print the shapes of the train / dev / test splits, one line per dataset.
for dataset in datasets:
    split_shapes = [split[dataset].shape for split in (train, dev, test)]
    print(*split_shapes)

(8544, 2) (1101, 2) (2210, 2)
(108000, 3) (12000, 3) (7600, 3)


In [9]:
# Preview the first rows of the AG News training split.
train['agnews'].head()

Unnamed: 0,sentence,label,title
0,"Reuters - Short-sellers, Wall Street's dwindli...",2,Wall St. Bears Claw Back Into the Black (Reuters)
1,Reuters - Private investment firm Carlyle Grou...,2,Carlyle Looks Toward Commercial Aerospace (Reu...
2,Reuters - Soaring crude prices plus worriesabo...,2,Oil and Economy Cloud Stocks' Outlook (Reuters)
3,Reuters - Authorities have halted oil exportfl...,2,Iraq Halts Oil Exports from Main Southern Pipe...
4,"AFP - Tearaway world oil prices, toppling reco...",2,"Oil prices soar to all-time record, posing new..."


In [10]:
# Seed NumPy's global RNG, then draw a 100-row sample from a copy of the SST
# training split.
# NOTE(review): `sample` is called without `random_state`, so it presumably
# draws from the global NumPy state seeded here — confirm against the pandas
# docs before relying on this for reproducibility.
np.random.seed(3)
train_subset = train['sst'].copy().sample(n=100)

In [11]:
# Preview the first rows of the sampled SST subset.
train_subset.head()

Unnamed: 0,sentence,label
6341,"Schnieder bounces around with limp wrists , we...",1
2119,It is an indelible epic American story about t...,2
5105,"Mr. Goyer 's loose , unaccountable direction i...",0
1594,Director Nancy Savoca 's no-frills record of a...,4
387,An average coming-of-age tale elevated by the ...,2


## Perturbations

### spaCy tokenizer

In [17]:
# Instantiate AllenNLP's SpacyTokenizer with its default arguments; it is
# passed to `add_perturbations` in the next cell.
tokenizer = SpacyTokenizer()

In [24]:
%%time

# Perturbation functions handed to `add_perturbations`, in this order.
perturbation_functions = [
    remove_commas,
    remove_all_punctuation,
    switch_gender,
    strip_trailing_punct,
    add_typo,
    change_first_name,
    change_last_name,
    change_location,
    contraction,
    swap_adjectives,
]

# Run the perturbations over the sampled SST sentences with the current
# tokenizer.
train_perturbations = add_perturbations(
    df=train_subset,
    tokenizer=tokenizer,
    sentence_col_name='sentence',
    perturbation_functions=perturbation_functions,
)

CPU times: user 816 ms, sys: 131 ms, total: 947 ms
Wall time: 953 ms


In [25]:
# Preview the first rows of the frame returned by `add_perturbations`.
train_perturbations.head()

Unnamed: 0,sentence,label,tokens_orig,remove_commas_concat,remove_commas_tokens,remove_commas_success,remove_commas_pert_ind,remove_all_punct_concat,remove_all_punct_tokens,remove_all_punct_success,...,change_location_success,change_location_pert_ind,contraction_concat,contraction_tokens,contraction_success,contraction_pert_ind,swap_adj_concat,swap_adj_tokens,swap_adj_success,swap_adj_pert_ind
6341,"Schnieder bounces around with limp wrists , we...",1,"[Schnieder, bounces, around, with, limp, wrist...",Schnieder bounces around with limp wrists wea...,"[Schnieder, bounces, around, with, limp, wrist...",1,"[6, 14]",Schnieder bounces around with limp wrists wea...,"[Schnieder, bounces, around, with, limp, wrist...",1,...,0,,"Schnieder bounces around with limp wrists , we...","[Schnieder, bounces, around, with, limp, wrist...",1,"[23, 24]","Schnieder bounces around with limp wrists , we...","[Schnieder, bounces, around, with, limp, wrist...",1,"[10, 12]"
2119,It is an indelible epic American story about t...,2,"[It, is, an, indelible, epic, American, story,...",It is an indelible epic American story about t...,"[It, is, an, indelible, epic, American, story,...",1,"[10, 16]",It is an indelible epic American story about t...,"[It, is, an, indelible, epic, American, story,...",1,...,0,,It is an indelible epic American story about t...,"[It, is, an, indelible, epic, American, story,...",0,,It is an indelible epic American story about t...,"[It, is, an, indelible, epic, American, story,...",1,"[12, 14, 22, 24]"
5105,"Mr. Goyer 's loose , unaccountable direction i...",0,"[Mr., Goyer, 's, loose, ,, unaccountable, dire...",Mr. Goyer 's loose unaccountable direction is...,"[Mr., Goyer, 's, loose, , unaccountable, direc...",1,[4],Mr Goyer s loose unaccountable direction is t...,"[Mr, Goyer, s, loose, , unaccountable, directi...",1,...,0,,"Mr. Goyer 's loose , unaccountable direction i...","[Mr., Goyer, 's, loose, ,, unaccountable, dire...",0,,"Mr. Goyer 's loose , unaccountable direction i...","[Mr., Goyer, 's, loose, ,, unaccountable, dire...",0,
1594,Director Nancy Savoca 's no-frills record of a...,4,"[Director, Nancy, Savoca, 's, no, -, frills, r...",Director Nancy Savoca 's no-frills record of a...,"[Director, Nancy, Savoca, 's, no, -, frills, r...",0,,Director Nancy Savoca s no frills record of a...,"[Director, Nancy, Savoca, s, no, , frills, rec...",1,...,0,,Director Nancy Savoca 's no-frills record of a...,"[Director, Nancy, Savoca, 's, no, -, frills, r...",0,,Director Nancy Savoca 's no - frills record of...,"[Director, Nancy, Savoca, 's, no, -, frills, r...",1,"[33, 35]"
387,An average coming-of-age tale elevated by the ...,2,"[An, average, coming, -, of, -, age, tale, ele...",An average coming-of-age tale elevated by the ...,"[An, average, coming, -, of, -, age, tale, ele...",0,,An average coming of age tale elevated by th...,"[An, average, coming, , of, , age, tale, eleva...",1,...,0,,An average coming-of-age tale elevated by the ...,"[An, average, coming, -, of, -, age, tale, ele...",0,,An average coming-of-age tale elevated by the ...,"[An, average, coming, -, of, -, age, tale, ele...",0,


In [27]:
# Collect every column whose name contains '_tokens'. 'tokens_orig' does not
# contain that substring, so only the per-perturbation token columns match.
token_marker = '_tokens'
perturb_columns = [
    name
    for name in train_perturbations.columns
    if token_marker in name
]
perturb_columns

['remove_commas_tokens',
 'remove_all_punct_tokens',
 'switch_gender_tokens',
 'strip_punct_tokens',
 'add_typo_tokens',
 'change_first_name_tokens',
 'change_last_name_tokens',
 'change_location_tokens',
 'contraction_tokens',
 'swap_adj_tokens']

In [30]:
# Check, per perturbation, that every row has as many perturbed tokens as
# original tokens, and report each column where at least one row differs.
# A plain conditional is used instead of `assert` inside try/except:
# assertions are stripped under `python -O`, which would silently disable
# the check.
orig_token_counts = train_perturbations['tokens_orig'].apply(len)  # loop-invariant

for col in perturb_columns:
    if (orig_token_counts != train_perturbations[col].apply(len)).any():
        print(f'Not true for {col}')

Not true for strip_punct_tokens


### BERT tokenizer

In [17]:
# Load the pretrained BERT tokenizer for the 'bert-base-uncased' checkpoint
# with lower-casing enabled. This rebinds `tokenizer`, replacing the spaCy
# tokenizer created in the previous section.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [24]:
%%time

# Perturbation functions handed to `add_perturbations`, in this order.
perturbation_functions = [
    remove_commas,
    remove_all_punctuation,
    switch_gender,
    strip_trailing_punct,
    add_typo,
    change_first_name,
    change_last_name,
    change_location,
    contraction,
    swap_adjectives,
]

# Re-run the perturbations over the same SST sample, this time with the BERT
# tokenizer bound to `tokenizer`; the result rebinds `train_perturbations`.
train_perturbations = add_perturbations(
    df=train_subset,
    tokenizer=tokenizer,
    sentence_col_name='sentence',
    perturbation_functions=perturbation_functions,
)

CPU times: user 816 ms, sys: 131 ms, total: 947 ms
Wall time: 953 ms


In [25]:
# Preview the first rows of the frame returned by `add_perturbations`.
# NOTE(review): the saved outputs in this section (timings, table, check
# result) match the spaCy section's outputs exactly, and the execution counts
# repeat, so they look stale — re-run the section to confirm.
train_perturbations.head()

Unnamed: 0,sentence,label,tokens_orig,remove_commas_concat,remove_commas_tokens,remove_commas_success,remove_commas_pert_ind,remove_all_punct_concat,remove_all_punct_tokens,remove_all_punct_success,...,change_location_success,change_location_pert_ind,contraction_concat,contraction_tokens,contraction_success,contraction_pert_ind,swap_adj_concat,swap_adj_tokens,swap_adj_success,swap_adj_pert_ind
6341,"Schnieder bounces around with limp wrists , we...",1,"[Schnieder, bounces, around, with, limp, wrist...",Schnieder bounces around with limp wrists wea...,"[Schnieder, bounces, around, with, limp, wrist...",1,"[6, 14]",Schnieder bounces around with limp wrists wea...,"[Schnieder, bounces, around, with, limp, wrist...",1,...,0,,"Schnieder bounces around with limp wrists , we...","[Schnieder, bounces, around, with, limp, wrist...",1,"[23, 24]","Schnieder bounces around with limp wrists , we...","[Schnieder, bounces, around, with, limp, wrist...",1,"[10, 12]"
2119,It is an indelible epic American story about t...,2,"[It, is, an, indelible, epic, American, story,...",It is an indelible epic American story about t...,"[It, is, an, indelible, epic, American, story,...",1,"[10, 16]",It is an indelible epic American story about t...,"[It, is, an, indelible, epic, American, story,...",1,...,0,,It is an indelible epic American story about t...,"[It, is, an, indelible, epic, American, story,...",0,,It is an indelible epic American story about t...,"[It, is, an, indelible, epic, American, story,...",1,"[12, 14, 22, 24]"
5105,"Mr. Goyer 's loose , unaccountable direction i...",0,"[Mr., Goyer, 's, loose, ,, unaccountable, dire...",Mr. Goyer 's loose unaccountable direction is...,"[Mr., Goyer, 's, loose, , unaccountable, direc...",1,[4],Mr Goyer s loose unaccountable direction is t...,"[Mr, Goyer, s, loose, , unaccountable, directi...",1,...,0,,"Mr. Goyer 's loose , unaccountable direction i...","[Mr., Goyer, 's, loose, ,, unaccountable, dire...",0,,"Mr. Goyer 's loose , unaccountable direction i...","[Mr., Goyer, 's, loose, ,, unaccountable, dire...",0,
1594,Director Nancy Savoca 's no-frills record of a...,4,"[Director, Nancy, Savoca, 's, no, -, frills, r...",Director Nancy Savoca 's no-frills record of a...,"[Director, Nancy, Savoca, 's, no, -, frills, r...",0,,Director Nancy Savoca s no frills record of a...,"[Director, Nancy, Savoca, s, no, , frills, rec...",1,...,0,,Director Nancy Savoca 's no-frills record of a...,"[Director, Nancy, Savoca, 's, no, -, frills, r...",0,,Director Nancy Savoca 's no - frills record of...,"[Director, Nancy, Savoca, 's, no, -, frills, r...",1,"[33, 35]"
387,An average coming-of-age tale elevated by the ...,2,"[An, average, coming, -, of, -, age, tale, ele...",An average coming-of-age tale elevated by the ...,"[An, average, coming, -, of, -, age, tale, ele...",0,,An average coming of age tale elevated by th...,"[An, average, coming, , of, , age, tale, eleva...",1,...,0,,An average coming-of-age tale elevated by the ...,"[An, average, coming, -, of, -, age, tale, ele...",0,,An average coming-of-age tale elevated by the ...,"[An, average, coming, -, of, -, age, tale, ele...",0,


In [27]:
# Collect every column whose name contains '_tokens'. 'tokens_orig' does not
# contain that substring, so only the per-perturbation token columns match.
token_marker = '_tokens'
perturb_columns = [
    name
    for name in train_perturbations.columns
    if token_marker in name
]
perturb_columns

['remove_commas_tokens',
 'remove_all_punct_tokens',
 'switch_gender_tokens',
 'strip_punct_tokens',
 'add_typo_tokens',
 'change_first_name_tokens',
 'change_last_name_tokens',
 'change_location_tokens',
 'contraction_tokens',
 'swap_adj_tokens']

In [30]:
# Check, per perturbation, that every row has as many perturbed tokens as
# original tokens, and report each column where at least one row differs.
# A plain conditional is used instead of `assert` inside try/except:
# assertions are stripped under `python -O`, which would silently disable
# the check.
orig_token_counts = train_perturbations['tokens_orig'].apply(len)  # loop-invariant

for col in perturb_columns:
    if (orig_token_counts != train_perturbations[col].apply(len)).any():
        print(f'Not true for {col}')

Not true for strip_punct_tokens
