In [10]:
from utils import (
    load_huggingface_dataset,
    generate_cda,
    generate_cda_df
)
from tqdm.auto import tqdm

In [3]:
# Fetch the SNLI training split through the project helper, requesting pandas output
data = load_huggingface_dataset(
    dataset_name='snli',
    split='train',
    to_pandas=True,
)
data.head(n=10)

Found cached dataset snli (/home/aditya/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)


Unnamed: 0,premise,hypothesis,label
0,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,1
1,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",2
2,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",0
3,Children smiling and waving at camera,They are smiling at their parents,1
4,Children smiling and waving at camera,There are children present,0
5,Children smiling and waving at camera,The kids are frowning,2
6,A boy is jumping on skateboard in the middle o...,The boy skates down the sidewalk.,2
7,A boy is jumping on skateboard in the middle o...,The boy does a skateboarding trick.,0
8,A boy is jumping on skateboard in the middle o...,The boy is wearing safety equipment.,1
9,An older man sits with his orange juice at a s...,An older man drinks his juice as he waits for ...,1


In [4]:
# Keep only the entailment pairs from the SNLI dataset
# (entailment pairs are labeled as 0).
#
# .copy() detaches the filtered rows from the originally loaded frame, so
# that new columns can be assigned onto `data` in later cells without pandas
# raising SettingWithCopyWarning.
data = data[data.label == 0].copy()
data.head(10)

Unnamed: 0,premise,hypothesis,label
2,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",0
4,Children smiling and waving at camera,There are children present,0
7,A boy is jumping on skateboard in the middle o...,The boy does a skateboarding trick.,0
14,Two blond women are hugging one another.,There are women showing affection.,0
17,"A few people in a restaurant setting, one of t...",The diners are at a restaurant.,0
18,An older man is drinking orange juice at a res...,A man is drinking juice.,0
23,"A man with blond-hair, and a brown shirt drink...",A blond man drinking water from a fountain.,0
25,Two women who just had lunch hugging and sayin...,There are two woman in this picture.,0
29,"Two women, holding food carryout containers, hug.",Two women hug each other.,0
31,A Little League team tries to catch a runner s...,A team is trying to tag a runner out.,0


In [5]:
# Gendered word pairs taken from the MABEL paper (keyword = word, value = counterpart)
word_pairs = dict(
    man='woman',
    men='women',
    boy='girl',
    boys='girls',
    he='she',
    father='mother',
    son='daughter',
    sons='daughters',
    guy='gal',
    guys='gals',
    male='female',
    his='her',
    himself='herself',
)

# Mirror every pair so the lookup works in both directions.
# The values/keys are snapshotted into lists first, so the dict is not
# iterated while it is being extended.
word_pairs.update(zip(list(word_pairs.values()), list(word_pairs.keys())))

In [8]:
# Try Counterfactual Data Augmentation (CDA) on a single hand-written sentence
sentence_ex = "A man and a woman are drinking coffee"

# generate_cda gives back the transformed sentence together with a boolean
# recording whether the input was transformed
generate_cda(sentence_ex, word_pairs)

('A woman and a man are drinking coffee', True)

In [11]:
# Enable tqdm's progress_apply on pandas objects
tqdm.pandas()

# Run generate_cda_df over every row (axis=1) with the word map as its extra
# argument; the expanded result is stored in three new columns
data[['aug_sent0', 'aug_sent1', 'both']] = data.progress_apply(
    generate_cda_df,
    axis=1,
    args=(word_pairs,),
    result_type='expand',
)

  0%|          | 0/183416 [00:00<?, ?it/s]

In [15]:
# Preview the first 20 rows, now including the augmented columns
data.head(n=20)

Unnamed: 0,premise,hypothesis,label,aug_sent0,aug_sent1,both
2,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",0,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",0
4,Children smiling and waving at camera,There are children present,0,Children smiling and waving at camera,There are children present,0
7,A boy is jumping on skateboard in the middle o...,The boy does a skateboarding trick.,0,A girl is jumping on skateboard in the middle ...,The girl does a skateboarding trick.,1
14,Two blond women are hugging one another.,There are women showing affection.,0,Two blond men are hugging one another.,There are men showing affection.,1
17,"A few people in a restaurant setting, one of t...",The diners are at a restaurant.,0,"A few people in a restaurant setting, one of t...",The diners are at a restaurant.,0
18,An older man is drinking orange juice at a res...,A man is drinking juice.,0,An older woman is drinking orange juice at a r...,A woman is drinking juice.,1
23,"A man with blond-hair, and a brown shirt drink...",A blond man drinking water from a fountain.,0,"A woman with blond-hair, and a brown shirt dri...",A blond woman drinking water from a fountain.,1
25,Two women who just had lunch hugging and sayin...,There are two woman in this picture.,0,Two men who just had lunch hugging and saying ...,There are two man in this picture.,1
29,"Two women, holding food carryout containers, hug.",Two women hug each other.,0,"Two women, holding food carryout containers, hug.",Two men hug each other.,0
31,A Little League team tries to catch a runner s...,A team is trying to tag a runner out.,0,A Little League team tries to catch a runner s...,A team is trying to tag a runner out.,0


# TODOs for upcoming milestones

* Making the word map case-insensitive and handling punctuation. For example, in the output above, although 'woman' is changed to 'man' in the augmented sentence, 'Woman' is not transformed to 'Man'. The same holds true for words followed by punctuation, such as 'woman,'.
* Expanding the word map to incorporate more transformations. The word map used in this notebook is the one listed in the original paper. For this project, we intend to use a much larger word map available [here](https://github.com/uclanlp/gn_glove/tree/master/wordlist).