# Read and Sample Data

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive (force_remount=True requests a fresh mount).
# NOTE(review): this cell is Colab-specific, while the data-loading cells below read
# from /kaggle/input — confirm which runtime this notebook is meant to run on.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Train Data

In [None]:
import pandas as pd

# Load the full training split.
df_train = pd.read_json('/kaggle/input/ropptdata/train_rospacy.json')
print("Length of Train df:", len(df_train))

# Down-sample to 3000 rows. random_state pins the draw so the same rows are
# selected on every run; the index is reset so the sample is numbered from 0.
sampled_df_train = df_train.sample(n=3000, random_state=42).reset_index(drop=True)
print("Length of downsampled Train df:", len(sampled_df_train))

# Preview the first rows of the sample.
sampled_df_train.head()

## Validation Data

In [None]:
import pandas as pd

# Read the complete validation split from the Kaggle input dataset.
df_val = pd.read_json('/kaggle/input/ropptdata/val_rospacy.json')
print("Length of Val df:", len(df_val))

# Draw 1000 rows with a fixed seed and renumber them from zero in one chain.
sampled_df_val = (
    df_val
    .sample(n=1000, random_state=42)
    .reset_index(drop=True)
)
print("Length of downsampled Val df:", len(sampled_df_val))

# Preview the first rows of the sample.
sampled_df_val.head()

## Test Data

In [None]:
import pandas as pd

# Load the full test split.
df_test = pd.read_json('/kaggle/input/ropptdata/test_rospacy.json')
print("Length of Test df:", len(df_test))

# Down-sample to 1000 rows. random_state pins the draw so the same rows are
# selected on every run; the index is reset so the sample is numbered from 0.
sampled_df_test = df_test.sample(n=1000, random_state=42).reset_index(drop=True)
print("Length of downsampled Test df:", len(sampled_df_test))

# Preview the first rows of the sample.
sampled_df_test.head()

## Save to files

In [None]:
# Persist each sampled split as a JSON array of row records.
# NOTE(review): /kaggle/working/NLPTry is only created by the `git clone` in a later
# cell — confirm the directory exists before this cell is run.
for sampled_split, output_path in (
    (sampled_df_train, '/kaggle/working/NLPTry/train_rospacy_sampled.json'),
    (sampled_df_val, '/kaggle/working/NLPTry/val_rospacy_sampled.json'),
    (sampled_df_test, '/kaggle/working/NLPTry/test_rospacy_sampled.json'),
):
    sampled_split.to_json(output_path, orient='records')

# Defining baseline

In [None]:
import nltk
# Fetch the NLTK 'punkt' resource.
nltk.download('punkt')
# Clone the model repository, switch into the checkout, and run its setup script.
!git clone https://github.com/pranay-sharma793/NLPTry.git
%cd /kaggle/working/NLPTry
# NOTE(review): env_setup.sh lives in the cloned repo and is not visible here —
# presumably it installs the project's dependencies; confirm.
!sh env_setup.sh

In [None]:
# Baseline run: invoke the repo's main.py with MELBERT_GAT on roberta-base for the vua18 task.
# NOTE(review): this uses /content/... (Colab) paths, while the setup cell changes into
# /kaggle/working/NLPTry — confirm the paths match the runtime in use.
!python "/content/NLPTry/main.py" --data_dir '/content/drive/MyDrive/NLP-Rajas-data' --task_name vua18 --model_type MELBERT_GAT --class_weight 3 --bert_model roberta-base --num_train_epoch 3 --train_batch_size 8 --learning_rate 3e-5 --warmup_epoch 2 --dataset_name vua18

# Robustness Module 1: Inducing incorrect spellings

## Defining the Misspelling Module

In [None]:
## COMMENT THIS BLOCK ##
# NOTE(review): presumably this cell should be commented out when sampled_df_test is
# already in memory from the sampling cells, since it overwrites that variable — confirm.
import pandas as pd

# Reload the previously saved test sample from Google Drive.
sampled_df_test = pd.read_json(
    '/content/drive/MyDrive/NLP-Rajas-data/test_rospacy_sampled.json'
)
# head() returns five rows by default.
sampled_df_test.head()

In [None]:
import pandas as pd
import numpy as np

# Precondition: sampled_df_test must already be defined by an earlier cell
# (the "Test Data" sampling cell or the reload cell directly above).

# Copy the frame so the column reassignments below leave sampled_df_test untouched
misspelt_sampled_df_test = sampled_df_test.copy()

# Function to induce misspellings in a token
def induce_misspelling(token, rng=None):
    """Return ``token`` with exactly one character swapped for a different lowercase letter.

    Args:
        token: The string to perturb.
        rng: Optional ``numpy.random.Generator`` (or legacy ``RandomState``) used
            for the random draws. When omitted, NumPy's global random state is
            used, so ``np.random.seed`` controls reproducibility.

    Returns:
        ``token`` unchanged when it has at most one character; otherwise a string
        of the same length that differs from ``token`` at exactly one position.
    """
    if len(token) <= 1:
        return token  # Skip empty and single-character tokens

    if rng is None:
        draw = np.random.randint
    else:
        # Generator exposes ``integers``; RandomState exposes ``randint``.
        # Both treat the upper bound as exclusive.
        draw = getattr(rng, "integers", None) or rng.randint

    idx = int(draw(0, len(token)))  # Position to corrupt
    # Exclude the letter already at this position (ignoring case). Otherwise
    # roughly one call in 26 would hand the token back unmodified, or with
    # nothing but a case change, and no misspelling would be induced.
    current = token[idx].lower()
    candidates = [chr(code) for code in range(97, 123) if chr(code) != current]
    replacement = candidates[int(draw(0, len(candidates)))]
    return token[:idx] + replacement + token[idx + 1:]

# Seed NumPy's global random state so the same misspelt test set is produced on every run
np.random.seed(42)


def _misspell_row(row):
    """Misspell every token of ``row`` except those that belong to the target expression."""
    # A multi-token target (e.g. ori_aspect "know,." in the displayed sample) never
    # equals a single token, so comparing tokens with ori_aspect alone leaves the
    # target word unprotected. The from_to spans are therefore protected as well.
    # NOTE(review): from_to is assumed to hold inclusive [start, end] positions into
    # `tokens` (e.g. [[4, 6]] for "know", ",", ".") — confirm against the dataset schema.
    spans = row.get('from_to')
    if not isinstance(spans, (list, tuple)):
        spans = []
    target_positions = {
        position
        for start, end in spans
        for position in range(start, end + 1)
    }
    target_text = row['ori_aspect'].lower()
    return [
        token
        if position in target_positions or token.lower() == target_text
        else induce_misspelling(token)
        for position, token in enumerate(row['tokens'])
    ]


# Induce misspellings in all tokens except the target expression
misspelt_tokens = misspelt_sampled_df_test.apply(_misspell_row, axis=1)

# Detokenize misspelled tokens to create sentences (re-attach commas and apostrophes)
misspelt_sentences = misspelt_tokens.apply(
    lambda tokens: ' '.join(tokens).replace(' ,', ',').replace(" '", "'")
)

# Replace the sentence, ori_sentence and tokens columns with their misspelt versions
misspelt_sampled_df_test['sentence'] = misspelt_sentences
misspelt_sampled_df_test['ori_sentence'] = misspelt_sentences
misspelt_sampled_df_test['tokens'] = misspelt_tokens

# Save the misspelt test json
misspelt_sampled_df_test.to_json('misspelt_test_rospacy_sampled.json', orient='records')

# Verify the changes
misspelt_sampled_df_test.head()

Unnamed: 0,sentence,aspect_sentiment,ori_sentence,ori_aspect,ori_index,ori_pos,from_to,tokens,tags,predicted_dependencies,dependencies,predicted_heads
0,"Sartse'x fundamentul thesib, ehat âd tistory ...","[[lead, metaphor]]","Sartse'x fundamentul thesib, ehat âd tistory ...",lead,42,VERB,"[[51, 51]]","[Sartse, 'x, fundamentul, thesib, ,, ehat, âd...","[PROPN, PART, ADJ, NOUN, PUNCT, SCONJ, X, NOUN...","[poss, case, amod, nsubj, punct, mark, punct, ...","[[poss, 4, 1], [case, 1, 2], [amod, 4, 3], [ns...","[4, 1, 4, 24, 4, 10, 10, 10, 10, 4, 10, 11, 12..."
1,"Anv tzing pastry yoe knoh, . \n","[[know,., non_meta]]","Anv tzing pastry yoe knoh, . \n","know,.",4,VERB,"[[4, 6]]","[Anv, tzing, pastry, yoe, knoh, ,, ., \n]","[DET, NOUN, NOUN, PRON, VERB, PUNCT, PUNCT, SP...","[det, compound, ROOT, nsubj, parataxis, punct,...","[[det, 2, 1], [compound, 3, 2], [ROOT, 0, 3], ...","[2, 3, 0, 5, 3, 3, 3, 0]"
2,cf yeu ct nft get ct dont tomorrew I xll fry a...,"[[get, metaphor]]",cf yeu ct nft get ct dont tomorrew I xll fry a...,get,3,VERB,"[[4, 4]]","[cf, yeu, ct, nft, get, ct, dont, tomorrew, I,...","[SCONJ, PRON, AUX, PART, VERB, PRON, VERB, NOU...","[mark, nsubj, aux, neg, advcl, nsubj, ccomp, n...","[[mark, 5, 1], [nsubj, 5, 2], [aux, 5, 3], [ne...","[5, 5, 5, 5, 11, 7, 5, 7, 11, 11, 0, 11, 11, 1..."
3,"He lived locably, to hou dii neigcbours vieb o...","[[He, non_meta]]","He lived locably, to hou dii neigcbours vieb o...",He,0,PRON,"[[0, 0]]","[He, lived, locably, ,, to, hou, dii, neigcbou...","[PRON, VERB, ADV, PUNCT, ADV, ADV, AUX, NOUN, ...","[nsubj, ROOT, advmod, punct, advmod, advmod, a...","[[nsubj, 2, 1], [ROOT, 0, 2], [advmod, 2, 3], ...","[2, 0, 2, 2, 9, 9, 9, 9, 2, 11, 9, 9, 9, 0]"
4,Nqtional hower wns giveh b0 peb cenn gf the CE...,"[[the, non_meta]]",Nqtional hower wns giveh b0 peb cenn gf the CE...,the,8,DET,"[[8, 8]]","[Nqtional, hower, wns, giveh, b0, peb, cenn, g...","[PROPN, PROPN, AUX, VERB, NUM, NOUN, NOUN, ADP...","[compound, nsubjpass, auxpass, ROOT, nummod, c...","[[compound, 2, 1], [nsubjpass, 4, 2], [auxpass...","[2, 4, 4, 0, 7, 7, 4, 7, 10, 13, 10, 13, 8, 4,..."


## Testing Module 1

In [None]:
# Evaluation run for the misspelling module.
# NOTE(review): this command is identical to the baseline run, and the misspelt data was
# written to the working directory as 'misspelt_test_rospacy_sampled.json' — presumably it
# must first be placed in --data_dir under the file name main.py expects; confirm.
!python "/content/NLPTry/main.py" --data_dir '/content/drive/MyDrive/NLP-Rajas-data' --task_name vua18 --model_type MELBERT_GAT --class_weight 3 --bert_model roberta-base --num_train_epoch 3 --train_batch_size 8 --learning_rate 3e-5 --warmup_epoch 2 --dataset_name vua18

# Robustness Module 2: Introducing noise

## Defining noise module

In [None]:
import pandas as pd
import numpy as np

# Precondition: sampled_df_test must already be defined by an earlier cell.

# Copy the frame so the column reassignments below leave sampled_df_test untouched
noisy_sampled_df_test = sampled_df_test.copy()

# Function to induce random noise in a token
def induce_noise(token, ori_aspect):
    """Scramble ``token`` by resampling its own characters with replacement.

    Tokens of length 0 or 1, and tokens equal to ``ori_aspect`` ignoring case,
    are returned unchanged. Otherwise the result has the same length as
    ``token`` and consists only of characters that occur in ``token``.
    """
    # Guard 1: nothing to scramble in empty or single-character tokens.
    if len(token) <= 1:
        return token
    # Guard 2: leave the target expression intact.
    if token.lower() == ori_aspect.lower():
        return token
    pool = list(token)
    drawn = np.random.choice(pool, size=len(pool), replace=True)
    return ''.join(drawn)

# Seed NumPy's global random state so the same noisy test set is produced on every run
np.random.seed(42)


def _add_noise_to_row(row):
    """Scramble every token of ``row`` except those that belong to the target expression."""
    # induce_noise only skips a token that equals ori_aspect as a whole, so a
    # multi-token target (e.g. "know,." in the displayed sample) would still be
    # scrambled. The from_to spans are therefore protected as well.
    # NOTE(review): from_to is assumed to hold inclusive [start, end] positions into
    # `tokens` (e.g. [[4, 6]] for "know", ",", ".") — confirm against the dataset schema.
    spans = row.get('from_to')
    if not isinstance(spans, (list, tuple)):
        spans = []
    target_positions = {
        position
        for start, end in spans
        for position in range(start, end + 1)
    }
    return [
        token if position in target_positions else induce_noise(token, row['ori_aspect'])
        for position, token in enumerate(row['tokens'])
    ]


# Induce random noise in all tokens except the target expression
noisy_tokens = noisy_sampled_df_test.apply(_add_noise_to_row, axis=1)

# Detokenize noisy tokens to create sentences (re-attach commas and apostrophes)
noisy_sentences = noisy_tokens.apply(
    lambda tokens: ' '.join(tokens).replace(' ,', ',').replace(" '", "'")
)

# Replace the sentence, ori_sentence and tokens columns with their noisy versions
noisy_sampled_df_test['sentence'] = noisy_sentences
noisy_sampled_df_test['ori_sentence'] = noisy_sentences
noisy_sampled_df_test['tokens'] = noisy_tokens

# Save the noisy test json
noisy_sampled_df_test.to_json('noisy_test_rospacy_sampled.json', orient='records')

# Verify the changes
noisy_sampled_df_test.head()


Unnamed: 0,sentence,aspect_sentiment,ori_sentence,ori_aspect,ori_index,ori_pos,from_to,tokens,tags,predicted_dependencies,dependencies,predicted_heads
0,"aSetra s' afmnnuamlfu isitts, aaht ââ syHitry...","[[lead, metaphor]]","aSetra s' afmnnuamlfu isitts, aaht ââ syHitry...",lead,42,VERB,"[[51, 51]]","[aSetra, s', afmnnuamlfu, isitts, ,, aaht, ââ...","[PROPN, PART, ADJ, NOUN, PUNCT, SCONJ, X, NOUN...","[poss, case, amod, nsubj, punct, mark, punct, ...","[[poss, 4, 1], [case, 1, 2], [amod, 4, 3], [ns...","[4, 1, 4, 24, 4, 10, 10, 10, 10, 4, 10, 11, 12..."
1,"AAy hhhtt taayay yyu nnnk, . \n","[[know,., non_meta]]","AAy hhhtt taayay yyu nnnk, . \n","know,.",4,VERB,"[[4, 6]]","[AAy, hhhtt, taayay, yyu, nnnk, ,, ., \n]","[DET, NOUN, NOUN, PRON, VERB, PUNCT, PUNCT, SP...","[det, compound, ROOT, nsubj, parataxis, punct,...","[[det, 2, 1], [compound, 3, 2], [ROOT, 0, 3], ...","[2, 3, 0, 5, 3, 3, 3, 0]"
2,II uoo ac''t get ti eono rotwtoom I ll' tyt nd...,"[[get, metaphor]]",II uoo ac''t get ti eono rotwtoom I ll' tyt nd...,get,3,VERB,"[[4, 4]]","[II, uoo, ac, ''t, get, ti, eono, rotwtoom, I,...","[SCONJ, PRON, AUX, PART, VERB, PRON, VERB, NOU...","[mark, nsubj, aux, neg, advcl, nsubj, ccomp, n...","[[mark, 5, 1], [nsubj, 5, 2], [aux, 5, 3], [ne...","[5, 5, 5, 5, 11, 7, 5, 7, 11, 11, 0, 11, 11, 1..."
3,"He dvidl yocllal, oo oww ddd heoiubisru wevv i...","[[He, non_meta]]","He dvidl yocllal, oo oww ddd heoiubisru wevv i...",He,0,PRON,"[[0, 0]]","[He, dvidl, yocllal, ,, oo, oww, ddd, heoiubis...","[PRON, VERB, ADV, PUNCT, ADV, ADV, AUX, NOUN, ...","[nsubj, ROOT, advmod, punct, advmod, advmod, a...","[[nsubj, 2, 1], [ROOT, 0, 2], [advmod, 2, 3], ...","[2, 0, 2, 2, 9, 9, 9, 9, 2, 11, 9, 9, 9, 0]"
4,aNlnoana rwewe sas geggi 77 ree nnte ff the GE...,"[[the, non_meta]]",aNlnoana rwewe sas geggi 77 ree nnte ff the GE...,the,8,DET,"[[8, 8]]","[aNlnoana, rwewe, sas, geggi, 77, ree, nnte, f...","[PROPN, PROPN, AUX, VERB, NUM, NOUN, NOUN, ADP...","[compound, nsubjpass, auxpass, ROOT, nummod, c...","[[compound, 2, 1], [nsubjpass, 4, 2], [auxpass...","[2, 4, 4, 0, 7, 7, 4, 7, 10, 13, 10, 13, 8, 4,..."


## Testing noise module

In [None]:
# Evaluation run for the noise module.
# NOTE(review): this command is identical to the baseline run, and the noisy data was
# written to the working directory as 'noisy_test_rospacy_sampled.json' — presumably it
# must first be placed in --data_dir under the file name main.py expects; confirm.
!python "/content/NLPTry/main.py" --data_dir '/content/drive/MyDrive/NLP-Rajas-data' --task_name vua18 --model_type MELBERT_GAT --class_weight 3 --bert_model roberta-base --num_train_epoch 3 --train_batch_size 8 --learning_rate 3e-5 --warmup_epoch 2 --dataset_name vua18

# Robustness Module 3: Mislabelling random records (Invariance test (INV))

## Defining INV Test

In [None]:
import pandas as pd
import numpy as np

# Function to replace metaphor with non_meta and vice versa for a nested list
def replace_aspect_sentiment_nested_list(lst):
    """Return a new list of ``[aspect, label]`` pairs with every label flipped.

    ``'metaphor'`` becomes ``'non_meta'``; any other label becomes ``'metaphor'``.
    Only the first two elements of each item are read, and the input list is
    not modified.
    """
    flipped = []
    for item in lst:
        aspect = item[0]
        if item[1] == 'metaphor':
            new_label = 'non_meta'
        else:
            new_label = 'metaphor'
        flipped.append([aspect, new_label])
    return flipped

# Create a copy of the original DataFrame
mislabelled_sampled_df_test = sampled_df_test.copy()

# Flip the labels of a random subset of rows. Each row is selected independently
# with probability 0.4, so roughly (not exactly) 40% of the rows are mislabelled.
# A seeded generator makes the selection identical on every run.
flip_rng = np.random.default_rng(42)
flip_mask = flip_rng.random(len(mislabelled_sampled_df_test)) < 0.4
mislabelled_sampled_df_test['aspect_sentiment'] = [
    replace_aspect_sentiment_nested_list(labels) if flip else labels
    for labels, flip in zip(mislabelled_sampled_df_test['aspect_sentiment'], flip_mask)
]

# Save the mislabelled test json
mislabelled_sampled_df_test.to_json('mislabelled_test_rospacy_sampled.json', orient='records')

# Verify the changes
mislabelled_sampled_df_test.head()

Unnamed: 0,sentence,aspect_sentiment,ori_sentence,ori_aspect,ori_index,ori_pos,from_to,tokens,tags,predicted_dependencies,dependencies,predicted_heads
0,"Sartre's fundamental thesis, that â History ...","[[lead, non_meta]]","Sartre's fundamental thesis, that â History ...",lead,42,VERB,"[[51, 51]]","[Sartre, 's, fundamental, thesis, ,, that, â...","[PROPN, PART, ADJ, NOUN, PUNCT, SCONJ, X, NOUN...","[poss, case, amod, nsubj, punct, mark, punct, ...","[[poss, 4, 1], [case, 1, 2], [amod, 4, 3], [ns...","[4, 1, 4, 24, 4, 10, 10, 10, 10, 4, 10, 11, 12..."
1,"Any thing pastry you know,.","[[know,., non_meta]]","Any thing pastry you know,.","know,.",4,VERB,"[[4, 6]]","[Any, thing, pastry, you, know, ,, ., \n]","[DET, NOUN, NOUN, PRON, VERB, PUNCT, PUNCT, SP...","[det, compound, ROOT, nsubj, parataxis, punct,...","[[det, 2, 1], [compound, 3, 2], [ROOT, 0, 3], ...","[2, 3, 0, 5, 3, 3, 3, 0]"
2,If you can't get it done tomorrow I'll try and...,"[[get, metaphor]]",If you can't get it done tomorrow I'll try and...,get,3,VERB,"[[4, 4]]","[If, you, ca, n't, get, it, done, tomorrow, I,...","[SCONJ, PRON, AUX, PART, VERB, PRON, VERB, NOU...","[mark, nsubj, aux, neg, advcl, nsubj, ccomp, n...","[[mark, 5, 1], [nsubj, 5, 2], [aux, 5, 3], [ne...","[5, 5, 5, 5, 11, 7, 5, 7, 11, 11, 0, 11, 11, 1..."
3,"He lived locally, so how did neighbours view h...","[[He, metaphor]]","He lived locally, so how did neighbours view h...",He,0,PRON,"[[0, 0]]","[He, lived, locally, ,, so, how, did, neighbou...","[PRON, VERB, ADV, PUNCT, ADV, ADV, AUX, NOUN, ...","[nsubj, ROOT, advmod, punct, advmod, advmod, a...","[[nsubj, 2, 1], [ROOT, 0, 2], [advmod, 2, 3], ...","[2, 0, 2, 2, 9, 9, 9, 9, 2, 11, 9, 9, 9, 0]"
4,National Power was given 70 per cent of the CE...,"[[the, metaphor]]",National Power was given 70 per cent of the CE...,the,8,DET,"[[8, 8]]","[National, Power, was, given, 70, per, cent, o...","[PROPN, PROPN, AUX, VERB, NUM, NOUN, NOUN, ADP...","[compound, nsubjpass, auxpass, ROOT, nummod, c...","[[compound, 2, 1], [nsubjpass, 4, 2], [auxpass...","[2, 4, 4, 0, 7, 7, 4, 7, 10, 13, 10, 13, 8, 4,..."


## Testing Module 3

In [None]:
# Evaluation run for the mislabelling module.
# NOTE(review): this command is identical to the baseline run, and the mislabelled data was
# written to the working directory as 'mislabelled_test_rospacy_sampled.json' — presumably it
# must first be placed in --data_dir under the file name main.py expects; confirm.
!python "/content/NLPTry/main.py" --data_dir '/content/drive/MyDrive/NLP-Rajas-data' --task_name vua18 --model_type MELBERT_GAT --class_weight 3 --bert_model roberta-base --num_train_epoch 3 --train_batch_size 8 --learning_rate 3e-5 --warmup_epoch 2 --dataset_name vua18

# Robustness Module 4: Grammar noise

## Defining Grammar noise module

In [None]:
# Install spaCy and download its small English pipeline.
# NOTE(review): no cell visible in this notebook imports spacy (the grammar-noise cell
# below uses NLTK's PorterStemmer) — confirm these installs are still needed.
!pip install spacy
!python -m spacy download en_core_web_sm

2023-11-27 22:44:33.999721: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-27 22:44:34.001618: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-27 22:44:34.001693: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-27 22:44:34.023871: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting en-core-web-sm==3.6.0
  Downloading htt

In [None]:
import pandas as pd
from nltk.stem import PorterStemmer

# Precondition: sampled_df_test must already be defined by an earlier cell.

# Copy the frame so the column reassignments below leave sampled_df_test untouched
grammar_noisy_sampled_df_test = sampled_df_test.copy()

# Shared Porter stemmer instance used by induce_grammar_noise
porter_stemmer = PorterStemmer()

def induce_grammar_noise(token):
    """Return the Porter stem of ``token``.

    Stripping inflectional endings this way is what this notebook calls
    "grammar noise". In the displayed sample output the stems also come back
    lowercased (e.g. "Sartre" -> "sartr").
    """
    stemmed = porter_stemmer.stem(token)
    return stemmed

def _stem_row(row):
    """Stem every token of ``row`` except those that belong to the target expression."""
    # A multi-token target (e.g. ori_aspect "know,." in the displayed sample) never
    # equals a single token, so comparing tokens with ori_aspect alone leaves the
    # target word unprotected. The from_to spans are therefore protected as well.
    # NOTE(review): from_to is assumed to hold inclusive [start, end] positions into
    # `tokens` (e.g. [[4, 6]] for "know", ",", ".") — confirm against the dataset schema.
    spans = row.get('from_to')
    if not isinstance(spans, (list, tuple)):
        spans = []
    target_positions = {
        position
        for start, end in spans
        for position in range(start, end + 1)
    }
    target_text = row['ori_aspect'].lower()
    return [
        token
        if position in target_positions or token.lower() == target_text
        else induce_grammar_noise(token)
        for position, token in enumerate(row['tokens'])
    ]


# Induce grammar noise in all tokens except the target expression
grammar_noisy_tokens = grammar_noisy_sampled_df_test.apply(_stem_row, axis=1)

# Detokenize grammar noisy tokens to create sentences (re-attach commas and apostrophes)
grammar_noisy_sentences = grammar_noisy_tokens.apply(
    lambda tokens: ' '.join(tokens).replace(' ,', ',').replace(" '", "'")
)

# Replace the sentence, ori_sentence and tokens columns with their grammar-noisy versions
grammar_noisy_sampled_df_test['sentence'] = grammar_noisy_sentences
grammar_noisy_sampled_df_test['ori_sentence'] = grammar_noisy_sentences
grammar_noisy_sampled_df_test['tokens'] = grammar_noisy_tokens

# Save the grammar noisy test json
grammar_noisy_sampled_df_test.to_json('grammar_noisy_test_nltk_sampled.json', orient='records')

# Verify the changes
grammar_noisy_sampled_df_test.head()

Unnamed: 0,sentence,aspect_sentiment,ori_sentence,ori_aspect,ori_index,ori_pos,from_to,tokens,tags,predicted_dependencies,dependencies,predicted_heads
0,"sartr's fundament thesi, that â histori cont...","[[lead, metaphor]]","sartr's fundament thesi, that â histori cont...",lead,42,VERB,"[[51, 51]]","[sartr, 's, fundament, thesi, ,, that, â, hi...","[PROPN, PART, ADJ, NOUN, PUNCT, SCONJ, X, NOUN...","[poss, case, amod, nsubj, punct, mark, punct, ...","[[poss, 4, 1], [case, 1, 2], [amod, 4, 3], [ns...","[4, 1, 4, 24, 4, 10, 10, 10, 10, 4, 10, 11, 12..."
1,"ani thing pastri you know, . \n","[[know,., non_meta]]","ani thing pastri you know, . \n","know,.",4,VERB,"[[4, 6]]","[ani, thing, pastri, you, know, ,, ., \n]","[DET, NOUN, NOUN, PRON, VERB, PUNCT, PUNCT, SP...","[det, compound, ROOT, nsubj, parataxis, punct,...","[[det, 2, 1], [compound, 3, 2], [ROOT, 0, 3], ...","[2, 3, 0, 5, 3, 3, 3, 0]"
2,if you ca n't get it done tomorrow i'll tri an...,"[[get, metaphor]]",if you ca n't get it done tomorrow i'll tri an...,get,3,VERB,"[[4, 4]]","[if, you, ca, n't, get, it, done, tomorrow, i,...","[SCONJ, PRON, AUX, PART, VERB, PRON, VERB, NOU...","[mark, nsubj, aux, neg, advcl, nsubj, ccomp, n...","[[mark, 5, 1], [nsubj, 5, 2], [aux, 5, 3], [ne...","[5, 5, 5, 5, 11, 7, 5, 7, 11, 11, 0, 11, 11, 1..."
3,"He live local, so how did neighbour view hi jo...","[[He, non_meta]]","He live local, so how did neighbour view hi jo...",He,0,PRON,"[[0, 0]]","[He, live, local, ,, so, how, did, neighbour, ...","[PRON, VERB, ADV, PUNCT, ADV, ADV, AUX, NOUN, ...","[nsubj, ROOT, advmod, punct, advmod, advmod, a...","[[nsubj, 2, 1], [ROOT, 0, 2], [advmod, 2, 3], ...","[2, 0, 2, 2, 9, 9, 9, 9, 2, 11, 9, 9, 9, 0]"
4,nation power wa given 70 per cent of the cegb'...,"[[the, non_meta]]",nation power wa given 70 per cent of the cegb'...,the,8,DET,"[[8, 8]]","[nation, power, wa, given, 70, per, cent, of, ...","[PROPN, PROPN, AUX, VERB, NUM, NOUN, NOUN, ADP...","[compound, nsubjpass, auxpass, ROOT, nummod, c...","[[compound, 2, 1], [nsubjpass, 4, 2], [auxpass...","[2, 4, 4, 0, 7, 7, 4, 7, 10, 13, 10, 13, 8, 4,..."


## Testing Grammar Noise

In [None]:
!python "/content/NLPTry/main.py" --data_dir '/content/drive/MyDrive/NLP-Rajas-data' --task_name vua18 --model_type MELBERT_GAT --class_weight 3 --bert_model bert-base --num_train_epoch 3 --train_batch_size 8 --learning_rate 3e-5 --warmup_epoch 2 --dataset_name vua18