# Data Augmentation

Motivation: Labels in PAP are unbalanced and skew toward 1 (plausible). To mitigate the negative effect of data imbalance on model's performance, we need to generate more implausbile data.

Approach: Sample implausible instances from pap multiclass data.

In [1]:
import json, random
import pandas as pd
from collections import Counter

In [2]:
# Project path.
PROJECT_PATH = '/mount/studenten/arbeitsdaten-studenten1/semantic-plausibility/plausible-parrots/'

# Path for all datasets.
DATA_PATH = '/mount/studenten/semantic-plausibility/datasets/'

# Paths for PaP.
# pap binary, preprocessed
PAP_BI_PREPROCESSED = PROJECT_PATH + '1-data_preprocessing/output/'
# unfiltered version, multiclass
PAP_TRAIN_PATH_MULT = DATA_PATH + 'pap/train-dev-test-split/multiclass/train.csv'
PAP_DEV_PATH_MULT = DATA_PATH + 'pap/train-dev-test-split/multiclass/dev.csv'
PAP_TEST_PATH_MULT = DATA_PATH + 'pap/train-dev-test-split/multiclass/test.csv'

## Pap

In [3]:
pap_train_processed = pd.read_csv(PAP_BI_PREPROCESSED + 'pap_train_processed.csv')
pap_train_processed

Unnamed: 0,text,original_label,label
0,Event occurs year.,plausible,1
1,Tortoise brings limb.,plausible,1
2,Headliner overpowers function.,implausible,1
3,County receives hour.,plausible,0
4,Traveler acknowledges recognition.,implausible,1
...,...,...,...
1723,Classification hauls slavery.,implausible,1
1724,Library needs space.,plausible,1
1725,Analysis constrains theory.,plausible,1
1726,Row elicits game.,plausible,1


In [4]:
pap_dev_processed = pd.read_csv(PAP_BI_PREPROCESSED + 'pap_dev_processed.csv')
pap_dev_processed

Unnamed: 0,text,original_label,label
0,Method seizes bacterium.,implausible,0
1,Technician visits community.,plausible,1
2,Inclusion expands range.,plausible,1
3,Pencil puts norm.,implausible,0
4,Solution musters team.,implausible,1
...,...,...,...
211,Variable represents domain.,plausible,1
212,Circuit transcends driver.,implausible,1
213,Arrow wears chest.,implausible,0
214,Blast leaves person.,plausible,0


In [5]:
pap_test_processed = pd.read_csv(PAP_BI_PREPROCESSED + 'pap_test_processed.csv')
pap_test_processed

Unnamed: 0,text,original_label,label
0,Interpretation construes title.,plausible,1
1,Mask sustains axis.,implausible,0
2,Trader ensures strategy.,implausible,1
3,Animator comprises trip.,implausible,1
4,Welfare constructs hundred.,implausible,0
...,...,...,...
211,Malcontent pervades effect.,implausible,1
212,Realism overpowers alignment.,implausible,1
213,Outcome presides part.,implausible,0
214,Ship collides head.,plausible,1


In [6]:
pap_train_mult = pd.read_csv(PAP_TRAIN_PATH_MULT)
pap_dev_mult = pd.read_csv(PAP_DEV_PATH_MULT)
pap_test_mult = pd.read_csv(PAP_TEST_PATH_MULT)
pap_mult = pd.concat([pap_train_mult, 
                      pap_dev_mult, 
#                       pap_test_mult,
                     ])
# pap_mult = pap_mult[pap_mult['label'] != 3]
# pap_mult['label'] = pap_mult['label'].apply(lambda x: 0 if x <= 2 else 1)
pap_mult

Unnamed: 0,text,original_label,label
0,trader navigates tour,implausible,4
1,credit throws evidence,implausible,4
2,resentment perceives man,implausible,4
3,scandal hurts department,plausible,5
4,team rejoices constancy,implausible,4
...,...,...,...
211,desensitization demolishes farmhouse,implausible,4
212,head weighs flash,implausible,1
213,lack mitigates disruption,implausible,2
214,description overestimates crispness,implausible,4


In [7]:
pap_mult_implausible = pap_mult[pap_mult['original_label']=='implausible']
# pap_mult_implausible = pap_mult[pap_mult['label']==0]

# Set the label of all instances with original_label == 'implausible' to 0.
pap_mult_implausible['label'] = pap_mult_implausible['label'].apply(lambda x: 0)
pap_mult_implausible

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pap_mult_implausible['label'] = pap_mult_implausible['label'].apply(lambda x: 0)


Unnamed: 0,text,original_label,label
0,trader navigates tour,implausible,0
1,credit throws evidence,implausible,0
2,resentment perceives man,implausible,0
4,team rejoices constancy,implausible,0
5,faculty inherits knee,implausible,0
...,...,...,...
208,resistance imparts width,implausible,0
211,desensitization demolishes farmhouse,implausible,0
212,head weighs flash,implausible,0
213,lack mitigates disruption,implausible,0


### Sample 734 instances from pap_mult_implausible to augment pap_train_processed.

In [8]:
pap_sample_train = pap_mult_implausible.sample(n=734, random_state=123)
pap_sample_train

Unnamed: 0,text,original_label,label
327,parliament enunciates precaution,implausible,0
725,fantail fits skill,implausible,0
1586,event shakes flesh,implausible,0
298,photograph rewrites bank,implausible,0
326,subculture vandalizes basis,implausible,0
...,...,...,...
224,tile cuts knot,implausible,0
644,dish tolerates cinnamon,implausible,0
1205,density afflicts variety,implausible,0
614,shrew laments livelihood,implausible,0


In [9]:
# Drop the instances in pap_mult_implausible that have been sampled.
pap_mult_implausible.drop(index=pap_sample_train.index.tolist(), inplace=True)
pap_mult_implausible

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pap_mult_implausible.drop(index=pap_sample_train.index.tolist(), inplace=True)


Unnamed: 0,text,original_label,label
2,resentment perceives man,implausible,0
25,prison correlates plot,implausible,0
31,cylinder plucks sympathy,implausible,0
84,moon severs debut,implausible,0
99,state reconciles version,implausible,0
...,...,...,...
142,throat passes subsistence,implausible,0
183,taxi exemplifies accessibility,implausible,0
203,refinement reverts thing,implausible,0
206,cobblestone imitates envelope,implausible,0


### Sample 92 instances from pap_mult_implausible to augment pap_dev_processed.

In [10]:
pap_sample_dev = pap_mult_implausible.sample(n=92, random_state=123)
pap_sample_dev

Unnamed: 0,text,original_label,label
596,architecture shouts diploma,implausible,0
951,building postdates connotation,implausible,0
1585,anthropologist unlocks quarterback,implausible,0
1331,advertisement cheers windmill,implausible,0
278,sensor serves cartridge,implausible,0
...,...,...,...
433,statue rebuffs oak,implausible,0
2,resentment perceives man,implausible,0
1690,uniform stops rupture,implausible,0
524,plane creates shower,implausible,0


In [11]:
# Drop the instances in pap_mult_implausible that have been sampled.
pap_mult_implausible.drop(index=pap_sample_dev.index.tolist(), inplace=True)
pap_mult_implausible

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pap_mult_implausible.drop(index=pap_sample_dev.index.tolist(), inplace=True)


Unnamed: 0,text,original_label,label
25,prison correlates plot,implausible,0
31,cylinder plucks sympathy,implausible,0
84,moon severs debut,implausible,0
119,completion hones objection,implausible,0
130,property straightens earth,implausible,0
...,...,...,...
142,throat passes subsistence,implausible,0
183,taxi exemplifies accessibility,implausible,0
203,refinement reverts thing,implausible,0
206,cobblestone imitates envelope,implausible,0


### Sample 92 instances from pap_mult_implausible to augment pap_test_processed.

In [12]:
pap_sample_test = pap_mult_implausible.sample(n=92, random_state=123)
pap_sample_test

Unnamed: 0,text,original_label,label
1256,altar reaches banana,implausible,0
1290,invasion requires obligation,implausible,0
1304,stage stands justice,implausible,0
747,beginning mentions side,implausible,0
957,rainstorm disagrees scandal,implausible,0
...,...,...,...
923,majority stops helmet,implausible,0
827,beach picks involvement,implausible,0
1007,book realizes size,implausible,0
1202,landfill intersects number,implausible,0


## Augment the pap binary dataset with the instances sampled from pap multiclass.

In [13]:
def postprocess(text):
    sentence = text[0].upper() + text[1:] + "."
    return sentence

### pap train

In [14]:
pap_sample_train['text'] = pap_sample_train['text'].apply(postprocess)
pap_sample_train

Unnamed: 0,text,original_label,label
327,Parliament enunciates precaution.,implausible,0
725,Fantail fits skill.,implausible,0
1586,Event shakes flesh.,implausible,0
298,Photograph rewrites bank.,implausible,0
326,Subculture vandalizes basis.,implausible,0
...,...,...,...
224,Tile cuts knot.,implausible,0
644,Dish tolerates cinnamon.,implausible,0
1205,Density afflicts variety.,implausible,0
614,Shrew laments livelihood.,implausible,0


In [15]:
# Concatenate pap_train_processed and pap_sample_train.
pap_train_processed = pd.concat([pap_train_processed, pap_sample_train], ignore_index=True)
pap_train_processed

Unnamed: 0,text,original_label,label
0,Event occurs year.,plausible,1
1,Tortoise brings limb.,plausible,1
2,Headliner overpowers function.,implausible,1
3,County receives hour.,plausible,0
4,Traveler acknowledges recognition.,implausible,1
...,...,...,...
2457,Tile cuts knot.,implausible,0
2458,Dish tolerates cinnamon.,implausible,0
2459,Density afflicts variety.,implausible,0
2460,Shrew laments livelihood.,implausible,0


In [26]:
Counter(pap_train_processed['label'])

Counter({1: 1231, 0: 1231})

In [16]:
pap_train_processed.to_csv('./output/pap_train_processed_augmented.csv', index=False)

### pap dev

In [17]:
pap_sample_dev['text'] = pap_sample_dev['text'].apply(postprocess)
pap_sample_dev

Unnamed: 0,text,original_label,label
596,Architecture shouts diploma.,implausible,0
951,Building postdates connotation.,implausible,0
1585,Anthropologist unlocks quarterback.,implausible,0
1331,Advertisement cheers windmill.,implausible,0
278,Sensor serves cartridge.,implausible,0
...,...,...,...
433,Statue rebuffs oak.,implausible,0
2,Resentment perceives man.,implausible,0
1690,Uniform stops rupture.,implausible,0
524,Plane creates shower.,implausible,0


In [18]:
# Concatenate pap_dev_processed and pap_sample_dev.
pap_dev_processed = pd.concat([pap_dev_processed, pap_sample_dev], ignore_index=True)
pap_dev_processed

Unnamed: 0,text,original_label,label
0,Method seizes bacterium.,implausible,0
1,Technician visits community.,plausible,1
2,Inclusion expands range.,plausible,1
3,Pencil puts norm.,implausible,0
4,Solution musters team.,implausible,1
...,...,...,...
303,Statue rebuffs oak.,implausible,0
304,Resentment perceives man.,implausible,0
305,Uniform stops rupture.,implausible,0
306,Plane creates shower.,implausible,0


In [27]:
Counter(pap_dev_processed['label'])

Counter({0: 154, 1: 154})

In [19]:
pap_dev_processed.to_csv('./output/pap_dev_processed_augmented.csv', index=False)

### pap test

In [20]:
pap_sample_test['text'] = pap_sample_test['text'].apply(postprocess)
pap_sample_test

Unnamed: 0,text,original_label,label
1256,Altar reaches banana.,implausible,0
1290,Invasion requires obligation.,implausible,0
1304,Stage stands justice.,implausible,0
747,Beginning mentions side.,implausible,0
957,Rainstorm disagrees scandal.,implausible,0
...,...,...,...
923,Majority stops helmet.,implausible,0
827,Beach picks involvement.,implausible,0
1007,Book realizes size.,implausible,0
1202,Landfill intersects number.,implausible,0


In [21]:
# Concatenate pap_test_processed and pap_sample_test.
pap_test_processed = pd.concat([pap_test_processed, pap_sample_test], ignore_index=True)
pap_test_processed

Unnamed: 0,text,original_label,label
0,Interpretation construes title.,plausible,1
1,Mask sustains axis.,implausible,0
2,Trader ensures strategy.,implausible,1
3,Animator comprises trip.,implausible,1
4,Welfare constructs hundred.,implausible,0
...,...,...,...
303,Majority stops helmet.,implausible,0
304,Beach picks involvement.,implausible,0
305,Book realizes size.,implausible,0
306,Landfill intersects number.,implausible,0


In [28]:
Counter(pap_test_processed['label'])

Counter({1: 154, 0: 154})

In [22]:
pap_test_processed.to_csv('./output/pap_test_processed_augmented.csv', index=False)