In [1]:
# --- Updated Setup Cell ---

# 1. Clone your GitHub repository
!git clone https://github.com/rafidreezwan/nlp-robustness-study.git

# 2. Navigate into your project directory
%cd nlp-robustness-study

# 3. Install all the required Python libraries
!pip install -q datasets transformers torch sentencepiece nltk pandas tqdm sentence-transformers

# 4. Download the necessary NLTK data (this fixes the error)
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

print("\n✅✅✅ Setup complete! You are ready to run the experiment. ✅✅✅")

Cloning into 'nlp-robustness-study'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.
/content/nlp-robustness-study
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m87.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m49.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [3

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



✅✅✅ Setup complete! You are ready to run the experiment. ✅✅✅


In [2]:
# ==============================================================================
# ALL-IN-ONE SCRIPT FOR NLP ROBUSTNESS EXPERIMENT (PATH A) - FINAL VERSION
# ==============================================================================

# --- Part 1: Imports ---
import pandas as pd
import nltk
from tqdm import tqdm
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer
from sentence_transformers import SentenceTransformer, util
import torch

def run_full_experiment(
    num_root_sentences=200,
    perturbed_dataset_file='imdb_perturbation_dataset.csv',
    results_file='experiment_results.csv',
    evaluation_model='all-MiniLM-L6-v2',
    paraphrase_model='tuner007/pegasus_paraphrase',
    max_paraphrase_tokens=60,
):
    """Run the entire experimental pipeline end to end.

    The pipeline:
      1. loads the IMDb train split and keeps the reviews with label 0,
      2. extracts up to ``num_root_sentences`` sentences of 11-44 words,
      3. perturbs every sentence twice (en->de->en back-translation and
         model-based paraphrasing),
      4. for every perturbed sentence, retrieves the most similar root sentence
         by cosine similarity of sentence embeddings,
      5. prints overall and per-method retrieval accuracy.

    Args:
        num_root_sentences: Number of root sentences to collect (must be >= 1).
        perturbed_dataset_file: CSV path for the (root, perturbed, method) rows.
        results_file: CSV path for the per-row evaluation results.
        evaluation_model: SentenceTransformer model name used for retrieval.
        paraphrase_model: Hugging Face model name used for paraphrasing.
        max_paraphrase_tokens: Sentences whose tokenized length exceeds this
            value are not paraphrased; the original sentence is kept instead.

    Raises:
        ValueError: If ``num_root_sentences`` is smaller than 1, or if no
            sentence satisfying the length filter could be extracted.

    Side effects:
        Writes ``perturbed_dataset_file`` and ``results_file`` and prints
        progress information. Returns ``None``.
    """
    if num_root_sentences < 1:
        raise ValueError("num_root_sentences must be a positive integer.")

    # Run the generation pipelines on the first GPU when one is available and
    # fall back to CPU (-1) otherwise, instead of crashing on a CPU-only runtime.
    pipeline_device = 0 if torch.cuda.is_available() else -1

    print("="*50)
    print("🚀 STARTING EXPERIMENT 🚀")
    print(f"Number of root sentences: {num_root_sentences}")
    print("="*50)

    # --- Load and Filter IMDb Data ---
    print("\n[PHASE 1/5] Loading and filtering IMDb data...")
    imdb_dataset = load_dataset("imdb", split='train')
    negative_reviews = imdb_dataset.filter(lambda example: example['label'] == 0)
    # Take twice as many reviews as sentences needed, but never more than exist.
    num_source_reviews = min(num_root_sentences * 2, len(negative_reviews))
    source_reviews = negative_reviews.select(range(num_source_reviews))

    # --- Extract and Clean Sentences ---
    print("\n[PHASE 2/5] Extracting and cleaning sentences...")
    root_sentences = []
    for review in source_reviews:
        if len(root_sentences) >= num_root_sentences:
            break
        for sentence in nltk.sent_tokenize(review['text']):
            # Keep only mid-length sentences (strictly between 10 and 45 words).
            if 10 < len(sentence.split()) < 45:
                root_sentences.append(sentence)
                if len(root_sentences) >= num_root_sentences:
                    break

    print(f"Successfully extracted {len(root_sentences)} root sentences.")
    if not root_sentences:
        # Without this guard the run would only fail at the very end with a
        # ZeroDivisionError when computing the accuracy.
        raise ValueError("No root sentences could be extracted from the source reviews.")

    # --- Perturb Sentences ---
    print("\n[PHASE 3/5] Perturbing sentences...")

    # Method 1: Back-Translation
    print("  -> Method A: Back-Translation (en-de-en)...")
    en_to_de_translator = pipeline('translation_en_to_de', model='Helsinki-NLP/opus-mt-en-de', device=pipeline_device)
    de_to_en_translator = pipeline('translation_de_to_en', model='Helsinki-NLP/opus-mt-de-en', device=pipeline_device)
    back_translated = []
    for sentence in tqdm(root_sentences, desc="Back-Translating"):
        german_text = en_to_de_translator(sentence)[0]['translation_text']
        back_translated.append(de_to_en_translator(german_text)[0]['translation_text'])

    # Method 2: AI Paraphrasing (with safety check)
    print("  -> Method B: AI Paraphrasing with safety check...")
    paraphraser = pipeline('text2text-generation', model=paraphrase_model, device=pipeline_device)
    paraphrase_tokenizer = AutoTokenizer.from_pretrained(paraphrase_model)

    paraphrased = []
    num_fallbacks = 0
    for sentence in tqdm(root_sentences, desc="Paraphrasing"):
        # Tokenize the sentence and check its length
        token_length = len(paraphrase_tokenizer.encode(sentence, truncation=False))

        if token_length > max_paraphrase_tokens:
            # Too long for the paraphrase model: keep the original sentence.
            # NOTE: such rows are unperturbed (perturbed_sentence == root_sentence)
            # and will almost certainly be retrieved correctly, so they inflate
            # the paraphrasing accuracy; the count is reported below.
            paraphrased.append(sentence)
            num_fallbacks += 1
        else:
            result = paraphraser(sentence, num_beams=5, num_return_sequences=1)[0]['generated_text']
            paraphrased.append(result)

    if num_fallbacks:
        print(f"  -> {num_fallbacks} sentence(s) exceeded {max_paraphrase_tokens} tokens and were left unparaphrased.")

    # --- Assemble Final Dataset (two rows per root sentence) ---
    final_data = []
    for root, via_translation, via_paraphrase in zip(root_sentences, back_translated, paraphrased):
        final_data.append({'root_sentence': root, 'perturbed_sentence': via_translation, 'method': 'back-translation'})
        final_data.append({'root_sentence': root, 'perturbed_sentence': via_paraphrase, 'method': 'paraphrasing'})

    df_perturbed = pd.DataFrame(final_data)
    df_perturbed.to_csv(perturbed_dataset_file, index=False)
    print(f"\n✅ Perturbed dataset saved to '{perturbed_dataset_file}'")

    # --- Run Evaluation ---
    print("\n[PHASE 4/5] Evaluating model performance...")
    model = SentenceTransformer(evaluation_model)

    # Embed every root sentence once; each query is compared against all of them.
    corpus_embeddings = model.encode(root_sentences, convert_to_tensor=True, show_progress_bar=True)

    correct_predictions = 0
    results_list = []

    for _, row in tqdm(df_perturbed.iterrows(), total=df_perturbed.shape[0], desc="Evaluating"):
        true_root = row['root_sentence']
        query = row['perturbed_sentence']

        query_embedding = model.encode(query, convert_to_tensor=True)
        cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
        top_result_index = torch.argmax(cos_scores).item()
        predicted_root = root_sentences[top_result_index]

        # Compared by text, so identical duplicate root sentences count as a match.
        is_correct = (predicted_root == true_root)
        if is_correct:
            correct_predictions += 1

        results_list.append({
            'root_sentence': true_root,
            'perturbed_sentence': query,
            'method': row['method'],
            'predicted_root': predicted_root,
            'is_correct': is_correct
        })

    # --- Display and Save Final Results ---
    print("\n[PHASE 5/5] Finalizing results...")
    accuracy = (correct_predictions / len(df_perturbed)) * 100

    df_results = pd.DataFrame(results_list)
    df_results.to_csv(results_file, index=False)

    print("\n" + "="*50)
    print("🎉 EXPERIMENT COMPLETE 🎉")
    print("="*50)
    print(f"\nModel Tested: {evaluation_model}")
    print(f"Total Perturbed Sentences: {len(df_perturbed)}")
    print(f"Correctly Identified: {correct_predictions}")
    print(f"Overall Accuracy: {accuracy:.2f}%")
    # Break the accuracy down per perturbation method so the two can be compared.
    for method_name, method_accuracy in df_results.groupby('method')['is_correct'].mean().items():
        print(f"  - {method_name}: {method_accuracy * 100:.2f}%")
    print(f"\nDetailed results saved to '{results_file}'")
    print(f"Perturbed dataset saved to '{perturbed_dataset_file}'")
    print("="*50)
    print("You can now start writing your paper!")

In [3]:
# Entry point: calls run_full_experiment() (defined in the cell above) with its
# default configuration. The function prints its progress and the final summary
# and writes the perturbed dataset and the evaluation results to CSV files.
run_full_experiment()

🚀 STARTING EXPERIMENT 🚀
Number of root sentences: 200

[PHASE 1/5] Loading and filtering IMDb data...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]


[PHASE 2/5] Extracting and cleaning sentences...
Successfully extracted 200 root sentences.

[PHASE 3/5] Perturbing sentences...
  -> Method A: Back-Translation (en-de-en)...


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/298M [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/298M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0

Back-Translating:   0%|          | 0/200 [00:00<?, ?it/s][A
Back-Translating:   0%|          | 1/200 [00:02<07:18,  2.20s/it][A
Back-Translating:   1%|          | 2/200 [00:02<04:12,  1.28s/it][A
Back-Translating:   2%|▏         | 3/200 [00:03<03:57,  1.21s/it][A
Back-Translating:   2%|▏         | 4/200 [00:04<03:03,  1.07it/s][A
Back-Translating:   2%|▎         | 5/200 [00:04<02:31,  1.29it/s][A
Back-Translating:   3%|▎         | 6/200 [00:05<02:16,  1.42it/s][A
Back-Translating:   4%|▎         | 7/200 [00:05<01:56,  1.66it/s][A
Back-Translating:   4%|▍         | 8/200 [00:06<01:46,  1.80it/s][A
Back-Translating:   4%|▍         | 9/200 [00:06<01:36,  1.98it/s][A
Back-Translating:   5%|▌         | 10/200 [00:07<01:39,  1.90it/s][AYou seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset

Back-Translating:   6%|▌         | 11/200 [00:07<01:35,  1.98it/s][A
Back-Translating:   6%|▌         | 12/200 [

  -> Method B: AI Paraphrasing with safety check...


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at tuner007/pegasus_paraphrase and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Device set to use cuda:0

Paraphrasing:   0%|          | 0/200 [00:00<?, ?it/s][A
Paraphrasing:   0%|          | 1/200 [00:04<16:33,  4.99s/it][A
Paraphrasing:   1%|          | 2/200 [00:05<08:10,  2.48s/it][A
Paraphrasing:   2%|▏         | 3/200 [00:06<05:46,  1.76s/it][A
Paraphrasing:   2%|▏         | 4/200 [00:07<04:27,  1.37s/it][A
Paraphrasing:   2%|▎         | 5/200 [00:07<03:26,  1.06s/it][A
Paraphrasing:   3%|▎         | 6/200 [00:09<03:35,  1.11s/it][A
Paraphrasing:   4%|▎         | 7/200 [00:09<02:47,  1.15it/s][A
Paraphrasing:   4%|▍         | 8/200 [00:10<02:45,  1.16it/s][A
Paraphrasing:   4%|▍         | 9/200 [00:11<02:54,  1.10it/s][A
Paraphrasing:   5%|▌         | 10/200 [00:12<03:03,  1.04it/s][A
Paraphrasing:   6%|▌         | 11/200 [00:13<03:07,  1.01it/s][A
Paraphrasing:   6%|▌         | 12/200 [00:14<03:16,  1.04s/it][AToken indices sequence length is longer than the specified maximum sequence length for this model (61 > 60). Running this sequence thro


✅ Perturbed dataset saved to 'imdb_perturbation_dataset.csv'

[PHASE 4/5] Evaluating model performance...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating: 100%|██████████| 400/400 [00:02<00:00, 152.48it/s]



[PHASE 5/5] Finalizing results...

🎉 EXPERIMENT COMPLETE 🎉

Model Tested: all-MiniLM-L6-v2
Total Perturbed Sentences: 400
Correctly Identified: 396
Overall Accuracy: 99.00%

Detailed results saved to 'experiment_results.csv'
Perturbed dataset saved to 'imdb_perturbation_dataset.csv'
You can now start writing your paper!


In [8]:
# Configure Git with your name and email
!git config --global user.name "rafidreezwan"
!git config --global user.email "rafidreezwan@gmail.com"

# Store your GitHub username and token
username =  "rafidreezwan"
token ="my token"

# Add, commit, and push the new result files
!git add imdb_perturbation_dataset.csv experiment_results.csv
!git commit -m "chore: Add experimental results from Colab run"

# Push to your repository using the token for authentication
!git push https://{my token}@github.com/rafidreezwan/nlp-robustness-study.git

On branch main
Your branch is ahead of 'origin/main' by 1 commit.
  (use "git push" to publish your local commits)

nothing to commit, working tree clean
Enumerating objects: 5, done.
Counting objects: 100% (5/5), done.
Delta compression using up to 2 threads
Compressing objects: 100% (4/4), done.
Writing objects: 100% (4/4), 24.12 KiB | 3.45 MiB/s, done.
Total 4 (delta 1), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (1/1), done.[K
To https://github.com/rafidreezwan/nlp-robustness-study.git
   ab5ece3..52f5054  main -> main
