# **Initial Setup**

In [None]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
pip install transformers accelerate datasets



In [None]:
from datasets import load_dataset
from transformers import T5Tokenizer
from datasets import load_from_disk
from torch.utils.data import DataLoader
from transformers import T5ForConditionalGeneration
import torch
from transformers import AdamW
from transformers import get_scheduler
from torch.nn import functional as F
from tqdm import tqdm
import pandas as pd

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Idea 1**

## **Load and split data**

In [9]:
# Load the TellMeWhy dataset
dataset = load_dataset("StonyBrookNLP/tellmewhy")

# Access the splits
train_data = dataset["train"]
val_data = dataset["validation"]
test_data = dataset["test"]

# Inspect the first example in the training set
print(train_data[0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.76k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

train.json:   0%|          | 0.00/70.1M [00:00<?, ?B/s]

validation.json:   0%|          | 0.00/8.71M [00:00<?, ?B/s]

test.json:   0%|          | 0.00/10.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/71892 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8976 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10689 [00:00<?, ? examples/s]

{'narrative': 'Cam ordered a pizza and took it home. He opened the box to take out a slice. Cam discovered that the store did not cut the pizza for him. He looked for his pizza cutter but did not find it. He had to use his chef knife to cut a slice.', 'question': 'Why did Cam order a pizza?', 'original_sentence_for_question': 'Cam ordered a pizza and took it home.', 'narrative_lexical_overlap': 0.33333333330000003, 'is_ques_answerable': 'Not Answerable', 'answer': 'Cam was hungry.', 'is_ques_answerable_annotator': 'Not Answerable', 'original_narrative_form': ['Cam ordered a pizza and took it home.', 'He opened the box to take out a slice.', 'Cam discovered that the store did not cut the pizza for him.', 'He looked for his pizza cutter but did not find it.', 'He had to use his chef knife to cut a slice.'], 'question_meta': 'rocstories_narrative_41270_sentence_0_question_0', 'helpful_sentences': [], 'human_eval': False, 'val_ann': [], 'gram_ann': []}


## **Data Preprocessing**

In [6]:
def preprocess_data(examples):
    """
    Format the dataset for T5:
    - Input: 'question: <question> context: <narrative>'
    - Target: '<answer>'
    """
    inputs = ["question: " + q + " context: " + n for q, n in zip(examples["question"], examples["narrative"])]
    targets = examples["answer"]  # Use 'answer' field as the target
    return {"input_text": inputs, "target_text": targets}

In [10]:
train_data = train_data.map(preprocess_data, batched=True)
val_data = val_data.map(preprocess_data, batched=True)
test_data = test_data.map(preprocess_data, batched=True)
print(train_data[0])

Map:   0%|          | 0/71892 [00:00<?, ? examples/s]

Map:   0%|          | 0/8976 [00:00<?, ? examples/s]

Map:   0%|          | 0/10689 [00:00<?, ? examples/s]

{'narrative': 'Cam ordered a pizza and took it home. He opened the box to take out a slice. Cam discovered that the store did not cut the pizza for him. He looked for his pizza cutter but did not find it. He had to use his chef knife to cut a slice.', 'question': 'Why did Cam order a pizza?', 'original_sentence_for_question': 'Cam ordered a pizza and took it home.', 'narrative_lexical_overlap': 0.33333333330000003, 'is_ques_answerable': 'Not Answerable', 'answer': 'Cam was hungry.', 'is_ques_answerable_annotator': 'Not Answerable', 'original_narrative_form': ['Cam ordered a pizza and took it home.', 'He opened the box to take out a slice.', 'Cam discovered that the store did not cut the pizza for him.', 'He looked for his pizza cutter but did not find it.', 'He had to use his chef knife to cut a slice.'], 'question_meta': 'rocstories_narrative_41270_sentence_0_question_0', 'helpful_sentences': [], 'human_eval': False, 'val_ann': [], 'gram_ann': [], 'input_text': 'question: Why did Cam 

In [11]:
print(len(train_data), len(val_data), len(test_data))

71892 8976 10689


In [12]:
train_data = train_data.remove_columns([
    'narrative', 'question', 'original_sentence_for_question',
    'narrative_lexical_overlap', 'is_ques_answerable',
    'is_ques_answerable_annotator', 'original_narrative_form',
    'question_meta', 'helpful_sentences', 'human_eval',
    'val_ann', 'gram_ann'
])
val_data = val_data.remove_columns([
    'narrative', 'question', 'original_sentence_for_question',
    'narrative_lexical_overlap', 'is_ques_answerable',
    'is_ques_answerable_annotator', 'original_narrative_form',
    'question_meta', 'helpful_sentences', 'human_eval',
    'val_ann', 'gram_ann'
])
test_data = test_data.remove_columns([
    'narrative', 'question', 'original_sentence_for_question',
    'narrative_lexical_overlap', 'is_ques_answerable',
    'is_ques_answerable_annotator', 'original_narrative_form',
    'question_meta', 'helpful_sentences', 'human_eval',
    'val_ann', 'gram_ann'
])

In [13]:
print(train_data[0])
print(val_data[0])
print(test_data[0])

{'answer': 'Cam was hungry.', 'input_text': 'question: Why did Cam order a pizza? context: Cam ordered a pizza and took it home. He opened the box to take out a slice. Cam discovered that the store did not cut the pizza for him. He looked for his pizza cutter but did not find it. He had to use his chef knife to cut a slice.', 'target_text': 'Cam was hungry.'}
{'answer': 'Nick was attending a university.', 'input_text': 'question: Why was Nick a student? context: Nick was a student at a large university. Nick had a classmate named Phil. Nick and Phil worked on a project together. They got along quite well. Nick and Phil became close friends.', 'target_text': 'Nick was attending a university.'}
{'answer': 'Bob enjoyed the language processing that was part of the job.', 'input_text': 'question: Why was Bob a computer scientist? context: Bob was a computer scientist. He enjoyed natural language processing. He decided to revolutionize the industry! He formulated a machine learning algorithm

## **Tokenization**

In [14]:
# Load the T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
def tokenize_data(examples):
    """
    Tokenize the input_text and target_text for T5.
    """
    # Tokenize input text
    model_inputs = tokenizer(
        examples["input_text"],
        max_length=512,  # Max input sequence length
        padding="max_length",
        truncation=True
    )

    # Tokenize target text
    labels = tokenizer(
        examples["target_text"],
        max_length=64,  # Max target sequence length
        padding="max_length",
        truncation=True
    )

    # Add labels to the model inputs
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [None]:
# Apply tokenization
tokenized_train_data = train_data.map(tokenize_data, batched=True)
tokenized_val_data = val_data.map(tokenize_data, batched=True)
tokenized_test_data = test_data.map(tokenize_data, batched=True)

Map:   0%|          | 0/71892 [00:00<?, ? examples/s]

Map:   0%|          | 0/8976 [00:00<?, ? examples/s]

Map:   0%|          | 0/10689 [00:00<?, ? examples/s]

In [None]:
tokenized_train_data.save_to_disk('/content/drive/My Drive/CSE354_Project/Idea1/tokenized_train_data')
tokenized_val_data.save_to_disk('/content/drive/My Drive/CSE354_Project/Idea1/tokenized_val_data')
tokenized_test_data.save_to_disk('/content/drive/My Drive/CSE354_Project/Idea1/tokenized_test_data')

Saving the dataset (0/1 shards):   0%|          | 0/71892 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8976 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10689 [00:00<?, ? examples/s]

### Load tokenized Data

In [15]:
tokenized_train_data = load_from_disk('/content/drive/My Drive/CSE354_Project/Idea1/tokenized_train_data')
tokenized_val_data = load_from_disk('/content/drive/My Drive/CSE354_Project/Idea1/tokenized_val_data')
tokenized_test_data = load_from_disk('/content/drive/My Drive/CSE354_Project/Idea1/tokenized_test_data')

In [16]:
print(tokenized_train_data[0])

{'narrative': 'Cam ordered a pizza and took it home. He opened the box to take out a slice. Cam discovered that the store did not cut the pizza for him. He looked for his pizza cutter but did not find it. He had to use his chef knife to cut a slice.', 'question': 'Why did Cam order a pizza?', 'original_sentence_for_question': 'Cam ordered a pizza and took it home.', 'narrative_lexical_overlap': 0.33333333330000003, 'is_ques_answerable': 'Not Answerable', 'answer': 'Cam was hungry.', 'is_ques_answerable_annotator': 'Not Answerable', 'original_narrative_form': ['Cam ordered a pizza and took it home.', 'He opened the box to take out a slice.', 'Cam discovered that the store did not cut the pizza for him.', 'He looked for his pizza cutter but did not find it.', 'He had to use his chef knife to cut a slice.'], 'question_meta': 'rocstories_narrative_41270_sentence_0_question_0', 'helpful_sentences': [], 'human_eval': False, 'val_ann': [], 'gram_ann': [], 'input_text': 'question: Why did Cam 

## **Data Loaders**

In [17]:
# Set format to PyTorch
tokenized_train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_val_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_test_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [18]:
# Define batch size
BATCH_SIZE = 16

# Create DataLoader objects
train_dataloader = DataLoader(tokenized_train_data, shuffle=True, batch_size=BATCH_SIZE)
val_dataloader = DataLoader(tokenized_val_data, shuffle=False, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(tokenized_test_data, shuffle=False, batch_size=BATCH_SIZE)

In [19]:
# Get a batch of data
batch = next(iter(train_dataloader))

# Inspect the batch
print(batch.keys())  # Should show 'input_ids', 'attention_mask', 'labels'
print(batch['input_ids'].shape)  # Shape: [BATCH_SIZE, SEQ_LEN]
print(batch['labels'].shape)  # Shape: [BATCH_SIZE, TARGET_SEQ_LEN]


dict_keys(['input_ids', 'attention_mask', 'labels'])
torch.Size([16, 512])
torch.Size([16, 64])


## **Load Pre-Trained T5 Model**

In [20]:
# Load the pre-trained T5 model
model = T5ForConditionalGeneration.from_pretrained("t5-small")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [21]:
# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

### Set Up the optimizer and scheduler

In [22]:
# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# set up a scheduler
num_training_steps = len(train_dataloader) * 3  # Assuming 3 epochs
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)



### Fine-tuning loop

In [None]:
# Set the number of epochs
num_epochs = 3

# Training loop
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    epoch_loss = 0

    # Iterate over batches
    for batch in tqdm(train_dataloader):
        # Move batch to device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss
        logits = outputs.logits

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update epoch loss
        epoch_loss += loss.item()

    # Print epoch loss
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss/len(train_dataloader)}")

    # Validation (optional for now)
    model.eval()
    # (Add validation loop here if desired)


  0%|          | 0/4494 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
100%|██████████| 4494/4494 [40:19<00:00,  1.86it/s]


Epoch 1/3, Loss: 0.39912637944188606


100%|██████████| 4494/4494 [40:19<00:00,  1.86it/s]


Epoch 2/3, Loss: 0.2935716943690179


100%|██████████| 4494/4494 [40:20<00:00,  1.86it/s]

Epoch 3/3, Loss: 0.2820474669556115





In [None]:
save_path = "/content/drive/MyDrive/Colab Notebooks/CSE_354_Project_outputs"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

('/content/drive/MyDrive/Colab Notebooks/CSE_354_Project_outputs/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/CSE_354_Project_outputs/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/CSE_354_Project_outputs/spiece.model',
 '/content/drive/MyDrive/Colab Notebooks/CSE_354_Project_outputs/added_tokens.json')

In [None]:
!zip -r /content/drive/My\ Drive/CSE_354_Project_outputs.zip /content/drive/MyDrive/Colab\ Notebooks/CSE_354_Project_outputs

  adding: content/drive/MyDrive/Colab Notebooks/CSE_354_Project_outputs/ (stored 0%)
  adding: content/drive/MyDrive/Colab Notebooks/CSE_354_Project_outputs/config.json (deflated 62%)
  adding: content/drive/MyDrive/Colab Notebooks/CSE_354_Project_outputs/generation_config.json (deflated 29%)
  adding: content/drive/MyDrive/Colab Notebooks/CSE_354_Project_outputs/model.safetensors (deflated 13%)
  adding: content/drive/MyDrive/Colab Notebooks/CSE_354_Project_outputs/tokenizer_config.json (deflated 94%)
  adding: content/drive/MyDrive/Colab Notebooks/CSE_354_Project_outputs/special_tokens_map.json (deflated 85%)
  adding: content/drive/MyDrive/Colab Notebooks/CSE_354_Project_outputs/added_tokens.json (deflated 83%)
  adding: content/drive/MyDrive/Colab Notebooks/CSE_354_Project_outputs/spiece.model (deflated 48%)


## **Evaluation**

In [23]:
# Load the fine-tuned model and tokenizer
model_path = "/content/drive/MyDrive/CSE354_Project/Idea1/fine_tuned_t5"
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)
model.to("cuda" if torch.cuda.is_available() else "cpu")

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [None]:
# Maybe delete it??
val_data = load_from_disk("tokenized_val_data")
test_data = load_from_disk("tokenized_test_data")

In [None]:
def generate_predictions(data, model, tokenizer, device):
    predictions = []
    references = []
    for example in tqdm(data):
        input_ids = example["input_ids"]
        attention_mask = example["attention_mask"]

        # Generate prediction
        outputs = model.generate(
            input_ids=torch.tensor([input_ids]).to(device),
            attention_mask=torch.tensor([attention_mask]).to(device),
            max_length=64,
            num_beams=5,
        )

        # Decode prediction
        predicted_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(predicted_answer)
        references.append(example["target_text"])

    return predictions, references

# Generate predictions for validation and test sets
val_predictions, val_references = generate_predictions(val_data, model, tokenizer, model.device)
test_predictions, test_references = generate_predictions(test_data, model, tokenizer, model.device)


100%|██████████| 8976/8976 [51:16<00:00,  2.92it/s]
100%|██████████| 10689/10689 [1:03:07<00:00,  2.82it/s]


In [None]:
# Save CSVs to Drive
val_csv_path = "/content/drive/MyDrive/Colab Notebooks/CSE_354_Project_outputs/val_predictions.csv"
test_csv_path = "/content/drive/MyDrive/Colab Notebooks/CSE_354_Project_outputs/test_predictions.csv"

pd.DataFrame({"predicted_answer": val_predictions, "gold_answer": val_references}).to_csv(val_csv_path, index=False)
pd.DataFrame({"predicted_answer": test_predictions, "gold_answer": test_references}).to_csv(test_csv_path, index=False)


In [25]:
val_df = pd.read_csv("/content/drive/MyDrive/CSE354_Project/Idea1/val_predictions.csv")
test_df = pd.read_csv("/content/drive/MyDrive/CSE354_Project/Idea1/test_predictions.csv")

In [26]:
val_predictions = val_df["predicted_answer"].tolist()
val_references = val_df["gold_answer"].tolist()
test_predictions = test_df["predicted_answer"].tolist()
test_references = test_df["gold_answer"].tolist()

In [27]:
# Clone BLEURT GitHub repository
!git clone https://github.com/google-research/bleurt.git

# Change to BLEURT directory
%cd bleurt

# Install BLEURT package
!pip install .

Cloning into 'bleurt'...
remote: Enumerating objects: 134, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 134 (delta 0), reused 17 (delta 0), pack-reused 116 (from 1)[K
Receiving objects: 100% (134/134), 31.28 MiB | 16.64 MiB/s, done.
Resolving deltas: 100% (49/49), done.
/content/bleurt
Processing /content/bleurt
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: BLEURT
  Building wheel for BLEURT (setup.py) ... [?25l[?25hdone
  Created wheel for BLEURT: filename=BLEURT-0.0.2-py3-none-any.whl size=16456764 sha256=2336718301abb0fa77b63a685e28a0383969d8a573a67377e9a5c0fc4e1df8d9
  Stored in directory: /tmp/pip-ephem-wheel-cache-hlbcxey1/wheels/92/4f/fb/afa555fa27aa9e2c7958df797a62cc4e74f0f459cec9c4fa7c
Successfully built BLEURT
Installing collected packages: BLEURT
Successfully installed BLEURT-0.0.2


In [28]:
pip install sacrebleu rouge-metric bert-score bleurt

Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge-metric
  Downloading rouge_metric-1.0.1-py3-none-any.whl.metadata (9.5 kB)
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rouge_metric-1.0.1-py3-none-any.whl (151 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import sacrebleu

def calculate_bleu(predictions, references):
    bleu_score = sacrebleu.corpus_bleu(predictions, [references])
    print(f"BLEU Score: {bleu_score.score}")
    return bleu_score

val_bleu = calculate_bleu(val_predictions, val_references)
test_bleu = calculate_bleu(test_predictions, test_references)


BLEU Score: 13.514502010881479
BLEU Score: 10.814816756836143


In [None]:
from rouge_metric import PyRouge

def calculate_rouge(predictions, references):
    rouge = PyRouge(rouge_n=(1, 2), rouge_l=True)
    scores = rouge.evaluate(predictions, [[ref] for ref in references])
    print("ROUGE Scores:", scores)
    return scores

val_rouge = calculate_rouge(val_predictions, val_references)
test_rouge = calculate_rouge(test_predictions, test_references)


ROUGE Scores: {'rouge-1': {'r': 0.334020950025961, 'p': 0.2357586567544979, 'f': 0.27641680947814007}, 'rouge-2': {'r': 0.17277351497535268, 'p': 0.1318554242850683, 'f': 0.14956638839110828}, 'rouge-l': {'r': 0.3163004309334884, 'p': 0.22469847073398636, 'f': 0.2627444266677654}}
ROUGE Scores: {'rouge-1': {'r': 0.2964631580870306, 'p': 0.2124517493431638, 'f': 0.2475231738422904}, 'rouge-2': {'r': 0.13730525285908488, 'p': 0.10719389589615001, 'f': 0.12039538833491478}, 'rouge-l': {'r': 0.2802751293340354, 'p': 0.20193462300967485, 'f': 0.2347412191726675}}


In [None]:
from bert_score import BERTScorer

def calculate_bertscore(predictions, references):
    scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    P, R, F1 = scorer.score(predictions, references)
    print(f"BERTScore - Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}")
    return F1.mean()

val_bertscore = calculate_bertscore(val_predictions, val_references)
test_bertscore = calculate_bertscore(test_predictions, test_references)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore - Precision: 0.3749, Recall: 0.4709, F1: 0.4223


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore - Precision: 0.3563, Recall: 0.4488, F1: 0.4020


In [None]:
!wget https://storage.googleapis.com/bleurt-oss/bleurt-base-128.zip
!unzip bleurt-base-128.zip -d bleurt-base-128

--2024-12-02 04:29:23--  https://storage.googleapis.com/bleurt-oss/bleurt-base-128.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.210.207, 173.194.215.207, 173.194.216.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.210.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 405489453 (387M) [application/zip]
Saving to: ‘bleurt-base-128.zip’


2024-12-02 04:29:27 (96.4 MB/s) - ‘bleurt-base-128.zip’ saved [405489453/405489453]

Archive:  bleurt-base-128.zip
   creating: bleurt-base-128/bleurt-base-128/
  inflating: bleurt-base-128/bleurt-base-128/vocab.txt  
  inflating: bleurt-base-128/bleurt-base-128/bert_config.json  
   creating: bleurt-base-128/bleurt-base-128/variables/
  inflating: bleurt-base-128/bleurt-base-128/variables/variables.index  
  inflating: bleurt-base-128/bleurt-base-128/variables/variables.data-00000-of-00001  
  inflating: bleurt-base-128/bleurt-base-128/bleurt_config.json  
  inflating

In [None]:
!ls bleurt-base-128

bleurt-base-128


In [None]:
!unzip bleurt-base-128.zip -d bleurt-base-128

Archive:  bleurt-base-128.zip
replace bleurt-base-128/bleurt-base-128/vocab.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: bleurt-base-128/bleurt-base-128/vocab.txt  
replace bleurt-base-128/bleurt-base-128/bert_config.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: bleurt-base-128/bleurt-base-128/bert_config.json  
replace bleurt-base-128/bleurt-base-128/variables/variables.index? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: bleurt-base-128/bleurt-base-128/variables/variables.index  
replace bleurt-base-128/bleurt-base-128/variables/variables.data-00000-of-00001? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: bleurt-base-128/bleurt-base-128/variables/variables.data-00000-of-00001  y

replace bleurt-base-128/bleurt-base-128/bleurt_config.json? [y]es, [n]o, [A]ll, [N]one, [r]ename:   inflating: bleurt-base-128/bleurt-base-128/bleurt_config.json  
replace bleurt-base-128/bleurt-base-128/saved_model.pb? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflatin

In [None]:
!mv bleurt-base-128/bleurt-base-128/* bleurt-base-128/
!rm -r bleurt-base-128/bleurt-base-128

In [None]:
!ls bleurt-base-128


bert_config.json  bleurt_config.json  saved_model.pb  variables  vocab.txt


In [None]:
from bleurt import score

def calculate_bleurt(predictions, references, checkpoint="bleurt-base-128"):
    scorer = score.BleurtScorer(checkpoint)
    scores = scorer.score(references=references, candidates=predictions)
    print(f"Average BLEURT Score: {sum(scores) / len(scores):.4f}")
    return scores

val_bleurt = calculate_bleurt(val_predictions, val_references)
test_bleurt = calculate_bleurt(test_predictions, test_references)


Average BLEURT Score: -0.7531
Average BLEURT Score: -0.7950


In [None]:
val_metrics = {
    "BLEU": val_bleu.score,
    "ROUGE": val_rouge,
    "BERTScore": val_bertscore,
    "BLEURT": sum(val_bleurt) / len(val_bleurt),
}

test_metrics = {
    "BLEU": test_bleu.score,
    "ROUGE": test_rouge,
    "BERTScore": test_bertscore,
    "BLEURT": sum(test_bleurt) / len(test_bleurt),
}

print("Validation Metrics:", val_metrics)
print("Test Metrics:", test_metrics)


Validation Metrics: {'BLEU': 13.514502010881479, 'ROUGE': {'rouge-1': {'r': 0.334020950025961, 'p': 0.2357586567544979, 'f': 0.27641680947814007}, 'rouge-2': {'r': 0.17277351497535268, 'p': 0.1318554242850683, 'f': 0.14956638839110828}, 'rouge-l': {'r': 0.3163004309334884, 'p': 0.22469847073398636, 'f': 0.2627444266677654}}, 'BERTScore': tensor(0.4223), 'BLEURT': -0.7531431899011739}
Test Metrics: {'BLEU': 10.814816756836143, 'ROUGE': {'rouge-1': {'r': 0.2964631580870306, 'p': 0.2124517493431638, 'f': 0.2475231738422904}, 'rouge-2': {'r': 0.13730525285908488, 'p': 0.10719389589615001, 'f': 0.12039538833491478}, 'rouge-l': {'r': 0.2802751293340354, 'p': 0.20193462300967485, 'f': 0.2347412191726675}}, 'BERTScore': tensor(0.4020), 'BLEURT': -0.7949906133525844}


In [30]:
for i in range(20,25):  # Print a few examples
    print("Gold Answer:", test_references[i])
    print("Predicted Answer:", test_predictions[i])
    print("---")

Gold Answer: he apparently missed the friends from the school.
Predicted Answer: Jay is an alumni. so He visited his friends.
---
Gold Answer: he was headed to his old college.
Predicted Answer: Jay took a trip to his old college.
---
Gold Answer: he is at his old college.
Predicted Answer: He visited his friends. so Jay is an alumni.
---
Gold Answer: Jay was at his old college.
Predicted Answer: Jay is an alumni. so He visited his friends.
---
Gold Answer: Jay wanted to visit his friends.
Predicted Answer: he wanted to get drunk.
---


# **Idea 2**

## Data Preporocessing

In [31]:
# Load the TellMeWhy dataset
dataset = load_dataset("StonyBrookNLP/tellmewhy")

# Access the splits
train_data = dataset["train"]
val_data = dataset["validation"]
test_data = dataset["test"]

# Inspect the first example in the training set
print(train_data[0])

{'narrative': 'Cam ordered a pizza and took it home. He opened the box to take out a slice. Cam discovered that the store did not cut the pizza for him. He looked for his pizza cutter but did not find it. He had to use his chef knife to cut a slice.', 'question': 'Why did Cam order a pizza?', 'original_sentence_for_question': 'Cam ordered a pizza and took it home.', 'narrative_lexical_overlap': 0.33333333330000003, 'is_ques_answerable': 'Not Answerable', 'answer': 'Cam was hungry.', 'is_ques_answerable_annotator': 'Not Answerable', 'original_narrative_form': ['Cam ordered a pizza and took it home.', 'He opened the box to take out a slice.', 'Cam discovered that the store did not cut the pizza for him.', 'He looked for his pizza cutter but did not find it.', 'He had to use his chef knife to cut a slice.'], 'question_meta': 'rocstories_narrative_41270_sentence_0_question_0', 'helpful_sentences': [], 'human_eval': False, 'val_ann': [], 'gram_ann': []}


### Normalize text

In [32]:
def normalize_text(text):
    """
    Cleans text by removing unnecessary spaces, special characters, and line breaks.
    """
    if text is None:
        return ""
    text = text.replace("\n", " ").strip()  # Remove line breaks
    text = " ".join(text.split())  # Remove extra spaces
    return text

### Create Input and Target text

In [33]:
def preprocess_data(row):
    """
    Prepares data for T5 fine-tuning with contextual clues.
    """
    narrative = normalize_text(row["narrative"])
    question = normalize_text(row["question"])
    answer = normalize_text(row["answer"])

    # Extract helpful sentences
    helpful_sentences = row.get("helpful_sentences", [])
    highlighted_clues = " ".join([f"<highlight> {s} </highlight>" for s in helpful_sentences])

    # Combine narrative with highlighted clues
    input_narrative = narrative
    if highlighted_clues:
        input_narrative += f" {highlighted_clues}"

    input_text = f"question: {question} context: {input_narrative}"
    target_text = answer

    return {"input_text": input_text, "target_text": target_text}

In [34]:
train_data = train_data.map(preprocess_data, batched=False)
val_data = val_data.map(preprocess_data, batched=False)
test_data = test_data.map(preprocess_data, batched=False)

Map:   0%|          | 0/71892 [00:00<?, ? examples/s]

Map:   0%|          | 0/8976 [00:00<?, ? examples/s]

Map:   0%|          | 0/10689 [00:00<?, ? examples/s]

In [35]:
# Inspect processed data
print(train_data[0])

{'narrative': 'Cam ordered a pizza and took it home. He opened the box to take out a slice. Cam discovered that the store did not cut the pizza for him. He looked for his pizza cutter but did not find it. He had to use his chef knife to cut a slice.', 'question': 'Why did Cam order a pizza?', 'original_sentence_for_question': 'Cam ordered a pizza and took it home.', 'narrative_lexical_overlap': 0.33333333330000003, 'is_ques_answerable': 'Not Answerable', 'answer': 'Cam was hungry.', 'is_ques_answerable_annotator': 'Not Answerable', 'original_narrative_form': ['Cam ordered a pizza and took it home.', 'He opened the box to take out a slice.', 'Cam discovered that the store did not cut the pizza for him.', 'He looked for his pizza cutter but did not find it.', 'He had to use his chef knife to cut a slice.'], 'question_meta': 'rocstories_narrative_41270_sentence_0_question_0', 'helpful_sentences': [], 'human_eval': False, 'val_ann': [], 'gram_ann': [], 'input_text': 'question: Why did Cam 

## Tokenize the data

In [None]:
from transformers import T5TokenizerFast

# Load the T5 tokenizer
tokenizer = T5TokenizerFast.from_pretrained("t5-base")

def tokenize_data(data, tokenizer, max_input_length=512, max_target_length=64):
    """
    Tokenizes the input_text and target_text fields for T5 fine-tuning.
    """
    def tokenize_row(row):
        # Tokenize input_text
        input_encodings = tokenizer(
            row["input_text"],
            max_length=max_input_length,
            padding="max_length",
            truncation=True
        )
        # Tokenize target_text
        target_encodings = tokenizer(
            row["target_text"],
            max_length=max_target_length,
            padding="max_length",
            truncation=True
        )

        # Return tokenized input and target
        return {
            "input_ids": input_encodings["input_ids"],
            "attention_mask": input_encodings["attention_mask"],
            "labels": target_encodings["input_ids"]
        }

    # Apply the tokenization row-wise
    tokenized_data = data.map(tokenize_row, batched=False)
    return tokenized_data


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

In [None]:
# Tokenize the processed data
tokenized_train_data = tokenize_data(train_data, tokenizer)
tokenized_val_data = tokenize_data(val_data, tokenizer)
tokenized_test_data = tokenize_data(test_data, tokenizer)

# Inspect tokenized data
print(tokenized_train_data[0])

Map:   0%|          | 0/71892 [00:00<?, ? examples/s]

Map:   0%|          | 0/8976 [00:00<?, ? examples/s]

Map:   0%|          | 0/10689 [00:00<?, ? examples/s]

{'narrative': 'Cam ordered a pizza and took it home. He opened the box to take out a slice. Cam discovered that the store did not cut the pizza for him. He looked for his pizza cutter but did not find it. He had to use his chef knife to cut a slice.', 'question': 'Why did Cam order a pizza?', 'original_sentence_for_question': 'Cam ordered a pizza and took it home.', 'narrative_lexical_overlap': 0.33333333330000003, 'is_ques_answerable': 'Not Answerable', 'answer': 'Cam was hungry.', 'is_ques_answerable_annotator': 'Not Answerable', 'original_narrative_form': ['Cam ordered a pizza and took it home.', 'He opened the box to take out a slice.', 'Cam discovered that the store did not cut the pizza for him.', 'He looked for his pizza cutter but did not find it.', 'He had to use his chef knife to cut a slice.'], 'question_meta': 'rocstories_narrative_41270_sentence_0_question_0', 'helpful_sentences': [], 'human_eval': False, 'val_ann': [], 'gram_ann': [], 'input_text': 'question: Why did Cam 

In [36]:
save_path = "/content/drive/MyDrive/CSE354_Project/Idea2"
tokenized_train_data.save_to_disk(f"{save_path}/tokenized_train_data")
tokenized_val_data.save_to_disk(f"{save_path}/tokenized_val_data")
tokenized_test_data.save_to_disk(f"{save_path}/tokenized_test_data")

Saving the dataset (0/1 shards):   0%|          | 0/71892 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8976 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10689 [00:00<?, ? examples/s]

## Data Loader

In [37]:
pip install torch datasets



In [38]:
import torch
from torch.utils.data import DataLoader
from datasets import Dataset

In [39]:
from datasets import Dataset
# Load tokenized data from disk
tokenized_train_data = Dataset.load_from_disk("/content/drive/MyDrive/CSE354_Project/Idea2/tokenized_train_data")
tokenized_val_data = Dataset.load_from_disk("/content/drive/MyDrive/CSE354_Project/Idea2/tokenized_val_data")
tokenized_test_data = Dataset.load_from_disk("/content/drive/MyDrive/CSE354_Project/Idea2/tokenized_test_data")


In [40]:
# Convert the dataset to PyTorch format
tokenized_train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_val_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_test_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [41]:
def create_dataloader(data, batch_size=16, shuffle=False):
    """
    Creates a PyTorch DataLoader from a tokenized dataset.
    """
    return DataLoader(data, batch_size=batch_size, shuffle=shuffle)


In [42]:
# Batch size for training
BATCH_SIZE = 16

# Create DataLoaders
train_dataloader = create_dataloader(tokenized_train_data, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = create_dataloader(tokenized_val_data, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = create_dataloader(tokenized_test_data, batch_size=BATCH_SIZE, shuffle=False)

# Inspect a batch
batch = next(iter(train_dataloader))
print("Batch keys:", batch.keys())  # Should show: 'input_ids', 'attention_mask', 'labels'
print("Input IDs shape:", batch["input_ids"].shape)  # Shape: (BATCH_SIZE, max_input_length)
print("Labels shape:", batch["labels"].shape)  # Shape: (BATCH_SIZE, max_target_length)


Batch keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
Input IDs shape: torch.Size([16, 512])
Labels shape: torch.Size([16, 64])


## Fine Tune T5 Model

In [43]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from tqdm import tqdm

In [44]:
# Load the pretrained T5 model and tokenizer
MODEL_NAME = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

# Send the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [None]:
from transformers import AdamW

# Define optimizer and learning rate
learning_rate = 5e-5
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Print details
print(f"Learning Rate: {learning_rate}")


Learning Rate: 5e-05




In [None]:
def train_model(model, train_dataloader, val_dataloader, epochs=3, learning_rate=5e-5):
    """
    Function to fine-tune the T5 model.
    """
    # Optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Loop through epochs
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        print("-" * 50)

        # Training phase
        model.train()
        train_loss = 0.0
        for batch in tqdm(train_dataloader, desc="Training"):
            # Move data to GPU/CPU
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        avg_train_loss = train_loss / len(train_dataloader)
        print(f"Average Training Loss: {avg_train_loss}")

        # Validation phase
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in tqdm(val_dataloader, desc="Validation"):
                # Move data to GPU/CPU
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                # Forward pass
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                val_loss += loss.item()

        avg_val_loss = val_loss / len(val_dataloader)
        print(f"Average Validation Loss: {avg_val_loss}")

    print("Fine-tuning complete!")



In [None]:
# Set training parameters
EPOCHS = 3
LEARNING_RATE = 5e-5

# Fine-tune the model
train_model(model, train_dataloader, val_dataloader, epochs=EPOCHS, learning_rate=LEARNING_RATE)

Epoch 1/3
--------------------------------------------------


Training:   0%|          | 0/4494 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Training: 100%|██████████| 4494/4494 [40:26<00:00,  1.85it/s]


Average Training Loss: 0.39985093399451266


Validation: 100%|██████████| 561/561 [01:45<00:00,  5.34it/s]


Average Validation Loss: 0.2828656521338618
Epoch 2/3
--------------------------------------------------


Training: 100%|██████████| 4494/4494 [40:24<00:00,  1.85it/s]


Average Training Loss: 0.2938739153998769


Validation: 100%|██████████| 561/561 [01:44<00:00,  5.35it/s]


Average Validation Loss: 0.27359800325597033
Epoch 3/3
--------------------------------------------------


Training: 100%|██████████| 4494/4494 [40:22<00:00,  1.85it/s]


Average Training Loss: 0.28231669706758733


Validation: 100%|██████████| 561/561 [01:44<00:00,  5.34it/s]

Average Validation Loss: 0.26878647949818835
Fine-tuning complete!





In [None]:
# Save the fine-tuned model and tokenizer
save_path = "/content/drive/MyDrive/CSE354_Project/Idea2/fine_tuned_t5"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Model saved to {save_path}")


Model saved to /content/drive/MyDrive/Colab Notebooks/CSE_354_Project_outputs/Idea2/Idea2/fine_tuned_t5


## Evaluation Steps

In [45]:
# Load the fine-tuned model and tokenizer
model_path = "/content/drive/MyDrive/CSE354_Project/Idea2/fine_tuned_t5"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [None]:
def generate_predictions(dataloader, tokenizer, model, device):
    predictions, references = [], []
    model.to(device)

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Generate predictions
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=64,  # Adjust based on task
                num_beams=5,    # Beam search for better outputs
                early_stopping=True
            )

            # Decode predictions and references
            preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            refs = tokenizer.batch_decode(labels, skip_special_tokens=True)

            predictions.extend(preds)
            references.extend(refs)

    return predictions, references


In [None]:
# Generate predictions on the test dataset
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
test_predictions, test_references = generate_predictions(test_dataloader, tokenizer, model, device)

In [None]:
# Save predictions and references
results_df = pd.DataFrame({"Input": [d['input_text'] for d in test_data],
                           "Gold Answer": test_references,
                           "Predicted Answer": test_predictions})
results_df.to_csv("/content/drive/MyDrive/CSE354_Project/Idea2/test_results.csv", index=False)


In [46]:
import pandas as pd

# Define the path where the predictions were saved
results_path = "/content/drive/MyDrive/CSE354_Project/Idea2/test_results.csv"

# Load the predictions CSV file
results_df = pd.read_csv(results_path)

# Access specific columns if needed
predictions = results_df["Predicted Answer"].tolist()
references = results_df["Gold Answer"].tolist()

In [None]:
!pip install evaluate



In [None]:
import evaluate

# BLEU Score
def calculate_bleu(predictions, references):
    bleu = evaluate.load("bleu")

    # Ensure predictions are strings
    predictions = [" ".join(pred) if isinstance(pred, list) else pred for pred in predictions]

    # Ensure references are lists of strings
    references = [[" ".join(ref)] if isinstance(ref, list) else [ref] for ref in references]

    # Compute BLEU score
    result = bleu.compute(predictions=predictions, references=references)
    print(f"BLEU Score: {result['bleu'] * 100:.2f}")
    return result['bleu']

# Calculate BLEU
bleu_score = calculate_bleu(test_predictions, test_references)

BLEU Score: 13.55


In [None]:
# ROUGE Score
def calculate_rouge(predictions, references):
    rouge = evaluate.load("rouge")
    result = rouge.compute(predictions=predictions, references=references)
    print("ROUGE Scores:", result)
    return result
rouge_scores = calculate_rouge(test_predictions, test_references)

ROUGE Scores: {'rouge1': 0.2734870919850048, 'rouge2': 0.12799702435915664, 'rougeL': 0.2626370102560428, 'rougeLsum': 0.26245643088481263}


In [None]:
# BERTScore
from bert_score import score as bertscore

def calculate_bertscore(predictions, references):
    P, R, F1 = bertscore(predictions, references, lang="en", verbose=True)
    print(f"BERTScore - Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}")
    return {"Precision": P.mean().item(), "Recall": R.mean().item(), "F1": F1.mean().item()}
bert_scores = calculate_bertscore(test_predictions, test_references)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/207 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/168 [00:00<?, ?it/s]

done in 34.96 seconds, 305.71 sentences/sec
BERTScore - Precision: 0.9046, Recall: 0.9056, F1: 0.9049


In [None]:
# BLEURT Score
from bleurt import score as bleurt
import numpy as np

def calculate_bleurt(predictions, references, checkpoint="bleurt-base-128"):
    scorer = bleurt.BleurtScorer(checkpoint)
    scores = scorer.score(references=references, candidates=predictions)
    print(f"Average BLEURT Score: {np.mean(scores):.4f}")
    return scores
bleurt_scores = calculate_bleurt(test_predictions, test_references)

Average BLEURT Score: -0.7370


In [50]:
# Inspect Sample Predictions
# Print some sample predictions and references
print("Sample Predictions:")
for i in range(20,25):  # Display first 5 samples
    print(f"Input: {test_data[i]['input_text']}")
    print(f"Prediction: {predictions[i]}")
    print(f"Reference: {references[i]}")
    print("-" * 50)

Sample Predictions:
Input: question: Why did He visit his friends? context: Jay took a trip to his old college. Jay is an alumni. He visited his friends. He went and got drunk. He had a good time.
Prediction: Jay is an alumni.
Reference: he apparently missed the friends from the school.
--------------------------------------------------
Input: question: Why did Jay take a trip? context: Jay took a trip to his old college. Jay is an alumni. He visited his friends. He went and got drunk. He had a good time.
Prediction: Jay took a trip to his old college.
Reference: he was headed to his old college.
--------------------------------------------------
Input: question: Why is Jay an alumni? context: Jay took a trip to his old college. Jay is an alumni. He visited his friends. He went and got drunk. He had a good time.
Prediction: Jay is an alumni.
Reference: he is at his old college.
--------------------------------------------------
Input: question: Why did He visit his friends? context: Ja

# **Idea 3**

## Enviornment Setup

In [51]:
!pip install transformers datasets torch sentencepiece



In [52]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm

## Load Dataset

In [54]:
dataset = load_dataset("StonyBrookNLP/tellmewhy")

# Access dataset splits
train_data = dataset["train"]
val_data = dataset["validation"]
test_data = dataset["test"]

print(f"Train size: {len(train_data)}, Val size: {len(val_data)}, Test size: {len(test_data)}")

Train size: 71892, Val size: 8976, Test size: 10689


## Generate Commonsense Knowledge

In [None]:
# Load the tokenizer and model for the UnifiedQA T5 model
commonsense_model_name = "allenai/unifiedqa-t5-base"
commonsense_tokenizer = AutoTokenizer.from_pretrained(commonsense_model_name)
commonsense_model = AutoModelForSeq2SeqLM.from_pretrained(commonsense_model_name)

# Set the model to evaluation mode
commonsense_model.eval()


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [None]:
save_path = "/content/drive/MyDrive/CSE354_Project/Idea3/Processed_Datasets"
import os
os.makedirs(save_path, exist_ok=True)

In [None]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
commonsense_model.to(device)

# Generate commonsense batch with device parameter
def generate_commonsense_batch(narratives, device):
    """
    Generates commonsense knowledge for a batch of narratives.
    """
    prompts = [f"generate commonsense for: {n}" for n in narratives]
    inputs = commonsense_tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    ).to(device)

    # Generate outputs
    outputs = commonsense_model.generate(
        inputs["input_ids"],
        max_length=50,
        num_beams=5,
        early_stopping=True
    )
    commonsense_batch = [
        commonsense_tokenizer.decode(output, skip_special_tokens=True) for output in outputs
    ]
    return commonsense_batch

In [None]:
from tqdm import tqdm
import os
import json

def preprocess_with_commonsense_batch(data, save_path, dataset_name, batch_size=32):
    """
    Processes the dataset in batches, generating commonsense knowledge for each batch.
    Dynamically updates progress in one line.
    """
    processed_data = []
    with tqdm(total=len(data), desc=f"Processing {dataset_name} data") as pbar:
        for i in range(0, len(data), batch_size):
            batch = data[i:i + batch_size]

            # Extract narratives
            narratives = [row["narrative"] for row in batch]

            # Generate commonsense for the batch
            commonsense_batch = generate_commonsense_batch(narratives, device)

            # Process each row in the batch
            batch_output = []
            for j, row in enumerate(batch):
                question = row["question"]
                answer = row["answer"]
                commonsense = commonsense_batch[j]

                input_text = f"question: {question} context: {row['narrative']} Commonsense: {commonsense}"
                target_text = answer

                processed_row = {"input_text": input_text, "target_text": target_text}
                batch_output.append(processed_row)
                processed_data.append(processed_row)

            # Save the current batch to a JSON file
            batch_file_path = os.path.join(save_path, f"{dataset_name}_batch_{i // batch_size}.json")
            with open(batch_file_path, "w") as f:
                json.dump(batch_output, f)

            # Update progress bar and description
            pbar.update(len(batch))
            pbar.set_description(f"Processing {dataset_name} data - Saved batch {i // batch_size}")

        # Save the full processed dataset as a single JSON file
        full_dataset_path = os.path.join(save_path, f"{dataset_name}_full.json")
        with open(full_dataset_path, "w") as f:
            json.dump(processed_data, f)
        print(f"Full dataset saved to {full_dataset_path}")

    return processed_data

In [None]:
train_data = [dict(row) for row in train_data]
val_data = [dict(row) for row in val_data]
test_data = [dict(row) for row in test_data]

In [None]:
# Process datasets in batches
print("Processing train data...")
train_data = preprocess_with_commonsense_batch(train_data, save_path, "train")

print("Processing validation data...")
val_data = preprocess_with_commonsense_batch(val_data, save_path, "validation")

print("Processing test data...")
test_data = preprocess_with_commonsense_batch(test_data, save_path, "test")

Processing train data...


Processing train data - Saved batch 2246: 100%|██████████| 71892/71892 [30:04<00:00, 39.85it/s]


Full dataset saved to /content/drive/MyDrive/Commonsense/Processed_Datasets/train_full.json
Processing validation data...


Processing validation data - Saved batch 280: 100%|██████████| 8976/8976 [03:23<00:00, 44.13it/s]


Full dataset saved to /content/drive/MyDrive/Commonsense/Processed_Datasets/validation_full.json
Processing test data...


Processing test data - Saved batch 334: 100%|██████████| 10689/10689 [04:06<00:00, 43.38it/s]

Full dataset saved to /content/drive/MyDrive/Commonsense/Processed_Datasets/test_full.json





In [None]:
import json
import glob

def combine_batches(save_path, dataset_name):
    combined_data = []
    batch_files = glob.glob(f"{save_path}/{dataset_name}_batch_*.json")
    batch_files.sort()  # Ensure files are processed in order
    for batch_file in batch_files:
        with open(batch_file, "r") as f:
            batch_data = json.load(f)
            combined_data.extend(batch_data)
    return combined_data

# Combine train, validation, and test datasets
train_data_combined = combine_batches(save_path, "train")
val_data_combined = combine_batches(save_path, "validation")
test_data_combined = combine_batches(save_path, "test")

print(f"Combined train data size: {len(train_data_combined)}")
print(f"Combined validation data size: {len(val_data_combined)}")
print(f"Combined test data size: {len(test_data_combined)}")

Combined train data size: 71892
Combined validation data size: 8976
Combined test data size: 10689


In [None]:
save_path = "/content/drive/MyDrive/CSE354_Project/Idea3"
import os
os.makedirs(save_path, exist_ok=True)

In [None]:
with open(f"{save_path}/train_combined.json", "w") as f:
    json.dump(train_data_combined, f)

with open(f"{save_path}/validation_combined.json", "w") as f:
    json.dump(val_data_combined, f)

with open(f"{save_path}/test_combined.json", "w") as f:
    json.dump(test_data_combined, f)

In [55]:
import json

# Define the path where the files are saved
save_path = "/content/drive/MyDrive/CSE354_Project/Idea3"

# Load the training, validation, and test data
with open(f"{save_path}/train_combined.json", "r") as f:
    train_data_combined = json.load(f)

with open(f"{save_path}/validation_combined.json", "r") as f:
    val_data_combined = json.load(f)

with open(f"{save_path}/test_combined.json", "r") as f:
    test_data_combined = json.load(f)

print("Data successfully loaded!")

Data successfully loaded!


In [56]:
print("Sample from training data:")
print(train_data_combined[:3])  # First 3 entries

Sample from training data:
[{'input_text': 'question: Why did Cam order a pizza? context: Cam ordered a pizza and took it home. He opened the box to take out a slice. Cam discovered that the store did not cut the pizza for him. He looked for his pizza cutter but did not find it. He had to use his chef knife to cut a slice. Commonsense: cam', 'target_text': 'Cam was hungry.'}, {'input_text': 'question: Why did He open the box? context: Cam ordered a pizza and took it home. He opened the box to take out a slice. Cam discovered that the store did not cut the pizza for him. He looked for his pizza cutter but did not find it. He had to use his chef knife to cut a slice. Commonsense: cam', 'target_text': 'The pizza was in the box.'}, {'input_text': 'question: Why did Cam discover that the store did not cut the pizza for him? context: Cam ordered a pizza and took it home. He opened the box to take out a slice. Cam discovered that the store did not cut the pizza for him. He looked for his pizz

## Tokenize the Data

In [None]:
from transformers import T5Tokenizer

# Load the tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

def tokenize_data(data, tokenizer, max_input_length=512, max_target_length=64):
    tokenized_data = []
    for row in data:
        input_encodings = tokenizer(
            row["input_text"],
            max_length=max_input_length,
            padding="max_length",
            truncation=True
        )
        target_encodings = tokenizer(
            row["target_text"],
            max_length=max_target_length,
            padding="max_length",
            truncation=True
        )
        tokenized_data.append({
            "input_ids": input_encodings["input_ids"],
            "attention_mask": input_encodings["attention_mask"],
            "labels": target_encodings["input_ids"]
        })
    return tokenized_data

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# Tokenize datasets
tokenized_train_data = tokenize_data(train_data_combined, tokenizer)
tokenized_val_data = tokenize_data(val_data_combined, tokenizer)
tokenized_test_data = tokenize_data(test_data_combined, tokenizer)

In [None]:
from datasets import Dataset

# Convert lists to Dataset objects
tokenized_train_dataset = Dataset.from_list(tokenized_train_data)
tokenized_val_dataset = Dataset.from_list(tokenized_val_data)
tokenized_test_dataset = Dataset.from_list(tokenized_test_data)

In [None]:
tokenized_train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [None]:
# Save the datasets to disk
tokenized_train_dataset.save_to_disk(f"{save_path}/tokenized_train_data")
tokenized_val_dataset.save_to_disk(f"{save_path}/tokenized_val_data")
tokenized_test_dataset.save_to_disk(f"{save_path}/tokenized_test_data")

print("Tokenized data saved to Google Drive.")

Saving the dataset (0/1 shards):   0%|          | 0/71892 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8976 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10689 [00:00<?, ? examples/s]

Tokenized data saved to Google Drive.


In [57]:
# Load the tokenized datasets
tokenized_train_dataset = load_from_disk(f"{save_path}/tokenized_train_data")
tokenized_val_dataset = load_from_disk(f"{save_path}/tokenized_val_data")
tokenized_test_dataset = load_from_disk(f"{save_path}/tokenized_test_data")

print("Tokenized datasets loaded successfully.")
print(f"Train dataset size: {len(tokenized_train_dataset)}")
print(f"Validation dataset size: {len(tokenized_val_dataset)}")
print(f"Test dataset size: {len(tokenized_test_dataset)}")

Tokenized datasets loaded successfully.
Train dataset size: 71892
Validation dataset size: 8976
Test dataset size: 10689


## Fine Tune Model

In [None]:
import os
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Adafactor
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm

In [None]:
train_dataloader = DataLoader(tokenized_train_dataset, batch_size=16, shuffle=True, num_workers=4, pin_memory=True)
val_dataloader = DataLoader(tokenized_val_dataset, batch_size=16, shuffle=False, num_workers=4, pin_memory=True)
test_dataloader = DataLoader(tokenized_test_dataset, batch_size=16, shuffle=False, num_workers=4, pin_memory=True)



In [None]:
# Check device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define save paths
checkpoint_dir = "/content/drive/MyDrive/CSE354_Project/Idea3/Model_Checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

# Load tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

Using device: cpu


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:

# Define optimizer
optimizer = Adafactor(
    model.parameters(),
    scale_parameter=True,
    relative_step=True,
    warmup_init=True
)

In [None]:
# Initialize GradScaler
scaler = GradScaler()

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}/{num_epochs}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        with autocast():  # Mixed precision
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        # Backward pass with GradScaler
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

        total_loss += loss.item()

    # Save checkpoint
    checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_epoch_{epoch + 1}.pt")
    torch.save({
        "epoch": epoch + 1,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "loss": total_loss / len(train_dataloader)
    }, checkpoint_path)
    print(f"Epoch {epoch + 1} Loss: {total_loss / len(train_dataloader)}")
    print(f"Checkpoint saved at {checkpoint_path}")

  scaler = GradScaler()
  with autocast():  # Mixed precision
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Training Epoch 1/3: 100%|██████████| 4494/4494 [26:07<00:00,  2.87it/s]


Epoch 1 Loss: 0.5648293511956121
Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/CSE_354_Project_outputs/Commonsense/Model_Checkpoints/checkpoint_epoch_1.pt


Training Epoch 2/3: 100%|██████████| 4494/4494 [26:02<00:00,  2.88it/s]


Epoch 2 Loss: 0.31320402981124035
Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/CSE_354_Project_outputs/Commonsense/Model_Checkpoints/checkpoint_epoch_2.pt


Training Epoch 3/3: 100%|██████████| 4494/4494 [26:01<00:00,  2.88it/s]


Epoch 3 Loss: 0.3191941633263481
Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/CSE_354_Project_outputs/Commonsense/Model_Checkpoints/checkpoint_epoch_3.pt


In [59]:
checkpoint_path = "/content/drive/MyDrive/CSE354_Project/Idea3/Model_Checkpoints/checkpoint_epoch_3.pt"

# Load the checkpoint with map_location
checkpoint = torch.load(checkpoint_path, map_location=torch.device("cpu"))

# Restore model and optimizer states
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
start_epoch = checkpoint["epoch"]
loss = checkpoint["loss"]

print(f"Checkpoint loaded. Resuming from epoch {start_epoch} with loss {loss:.4f}.")



  checkpoint = torch.load(checkpoint_path, map_location=torch.device("cpu"))


Checkpoint loaded. Resuming from epoch 3 with loss 0.3192.


In [63]:
import torch
from transformers import T5ForConditionalGeneration

# Load the checkpoint
checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))

# Initialize the model
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Load the model's state_dict
model.load_state_dict(checkpoint["model_state_dict"])

# Optional: Load the optimizer state if needed
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

# Retrieve additional training details if needed
start_epoch = checkpoint["epoch"]
loss = checkpoint["loss"]

print(f"Model loaded. Resuming from epoch {start_epoch} with loss {loss:.4f}.")

  checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))


Model loaded. Resuming from epoch 3 with loss 0.3192.


## Evaluate

In [None]:
!pip install evaluate rouge-score bert-score bleurt sacrebleu

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
[31mERROR: Could not find a version that satisfies the requirement bleurt (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for bleurt[0m[31m
[0m

In [None]:
# Clone BLEURT GitHub repository
!git clone https://github.com/google-research/bleurt.git

# Change to BLEURT directory
%cd bleurt

# Install BLEURT package
!pip install .

Cloning into 'bleurt'...
remote: Enumerating objects: 134, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 134 (delta 0), reused 17 (delta 0), pack-reused 116 (from 1)[K
Receiving objects: 100% (134/134), 31.28 MiB | 20.13 MiB/s, done.
Resolving deltas: 100% (49/49), done.
/content/bleurt
Processing /content/bleurt
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: BLEURT
  Building wheel for BLEURT (setup.py) ... [?25l[?25hdone
  Created wheel for BLEURT: filename=BLEURT-0.0.2-py3-none-any.whl size=16456764 sha256=1c8c3cdc34341247a8e873223bbcbe9d5ab34699cc58a71333a2e074dc2d022d
  Stored in directory: /tmp/pip-ephem-wheel-cache-oyrplsg6/wheels/92/4f/fb/afa555fa27aa9e2c7958df797a62cc4e74f0f459cec9c4fa7c
Successfully built BLEURT
Installing collected packages: BLEURT
Successfully installed BLEURT-0.0.2


In [None]:
!wget https://storage.googleapis.com/bleurt-oss/bleurt-base-128.zip
!unzip bleurt-base-128.zip -d bleurt-base-128

--2024-12-08 19:19:36--  https://storage.googleapis.com/bleurt-oss/bleurt-base-128.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.111.207, 64.233.180.207, 142.251.163.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.111.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 405489453 (387M) [application/zip]
Saving to: ‘bleurt-base-128.zip’


2024-12-08 19:19:43 (54.6 MB/s) - ‘bleurt-base-128.zip’ saved [405489453/405489453]

Archive:  bleurt-base-128.zip
   creating: bleurt-base-128/bleurt-base-128/
  inflating: bleurt-base-128/bleurt-base-128/vocab.txt  
  inflating: bleurt-base-128/bleurt-base-128/bert_config.json  
   creating: bleurt-base-128/bleurt-base-128/variables/
  inflating: bleurt-base-128/bleurt-base-128/variables/variables.index  
  inflating: bleurt-base-128/bleurt-base-128/variables/variables.data-00000-of-00001  
  inflating: bleurt-base-128/bleurt-base-128/bleurt_config.json  
  inflating:

In [None]:
!mv bleurt-base-128/bleurt-base-128/* bleurt-base-128/
!rm -r bleurt-base-128/bleurt-base-128

In [None]:
from tqdm import tqdm
import torch

# Generate predictions
def generate_predictions(dataloader, model, tokenizer, device):
    predictions, references = [], []
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Generate predictions
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=64,  # Adjust based on your use case
                num_beams=5,    # Use beam search for better predictions
                early_stopping=True
            )

            # Decode predictions and references
            preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            refs = tokenizer.batch_decode(labels, skip_special_tokens=True)

            predictions.extend(preds)
            references.extend(refs)

    return predictions, references

# Example usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
predictions, references = generate_predictions(test_dataloader, model, tokenizer, device)

Evaluating: 100%|██████████| 669/669 [10:44<00:00,  1.04it/s]


In [None]:
results_df = pd.DataFrame({
    "Input": [row["input_text"] for row in test_data_combined],  # Use the original preprocessed data
    "Gold Answer": references,
    "Predicted Answer": predictions
})

In [None]:
results_path = "/content/drive/MyDrive/CSE354_Project/Idea3/predictions.csv"
results_df.to_csv(results_path, index=False)
print(f"Predictions saved to {results_path}")

Predictions saved to /content/drive/MyDrive/Colab Notebooks/CSE_354_Project_outputs/Commonsense/predictions.csv


In [68]:
import pandas as pd

# Define the path where the predictions were saved
results_path = "/content/drive/MyDrive/CSE354_Project/Idea3/predictions.csv"

# Load the predictions CSV file
results_df = pd.read_csv(results_path)

# Access specific columns if needed
predictions = results_df["Predicted Answer"].tolist()
references = results_df["Gold Answer"].tolist()

# Print some sample predictions and references
print("Sample Predictions:")
for i in range(20,25):  # Display first 5 samples
    print(f"Input: {test_data_combined[i]['input_text']}")
    print(f"Prediction: {predictions[i]}")
    print(f"Reference: {references[i]}")
    print("-" * 50)


Sample Predictions:
Input: question: Why did He visit his friends? context: Jay took a trip to his old college. Jay is an alumni. He visited his friends. He went and got drunk. He had a good time. Commonsense: Jay
Prediction: Jay took a trip to his old college.
Reference: he apparently missed the friends from the school.
--------------------------------------------------
Input: question: Why did Jay take a trip? context: Jay took a trip to his old college. Jay is an alumni. He visited his friends. He went and got drunk. He had a good time. Commonsense: Jay
Prediction: Jay took a trip to his old college.
Reference: he was headed to his old college.
--------------------------------------------------
Input: question: Why is Jay an alumni? context: Jay took a trip to his old college. Jay is an alumni. He visited his friends. He went and got drunk. He had a good time. Commonsense: Jay
Prediction: he took a trip to his old college.
Reference: he is at his old college.
-----------------------

In [None]:
!pip install evaluate

Collecting evaluate
  Using cached evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.0.0-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.0.0 sacrebleu-2.4.3


In [None]:
import evaluate

# Load BLEU metric
bleu = evaluate.load("sacrebleu")

# Ensure predictions and references are strings
formatted_predictions = [" ".join(pred) if isinstance(pred, list) else pred for pred in predictions]
formatted_references = [[" ".join(ref)] if isinstance(ref, list) else [ref] for ref in references]

# Compute BLEU score
bleu_result = bleu.compute(
    predictions=formatted_predictions,
    references=formatted_references
)

print(f"BLEU Score: {bleu_result['score']:.2f}")

BLEU Score: 11.15


In [None]:
pip install rouge_score

Collecting rouge_score
  Using cached rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=e349f42f783d1eb4089474fd1b4ce7151c4071509cc48c548196de6ba1e41ef6
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
# Load ROUGE metric
rouge = evaluate.load("rouge")

# Compute ROUGE score
rouge_result = rouge.compute(
    predictions=predictions,
    references=references
)
print("ROUGE Scores:", rouge_result)

ROUGE Scores: {'rouge1': 0.25203527559910144, 'rouge2': 0.1071995198156111, 'rougeL': 0.24233930099749057, 'rougeLsum': 0.24242070581495978}


In [None]:
!wget https://storage.googleapis.com/bleurt-oss/bleurt-base-128.zip
!unzip bleurt-base-128.zip -d bleurt-base-128

--2024-12-08 19:26:35--  https://storage.googleapis.com/bleurt-oss/bleurt-base-128.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.31.207, 142.251.111.207, 64.233.180.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.31.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 405489453 (387M) [application/zip]
Saving to: ‘bleurt-base-128.zip.1’


2024-12-08 19:26:38 (148 MB/s) - ‘bleurt-base-128.zip.1’ saved [405489453/405489453]

Archive:  bleurt-base-128.zip
   creating: bleurt-base-128/bleurt-base-128/
  inflating: bleurt-base-128/bleurt-base-128/vocab.txt  
  inflating: bleurt-base-128/bleurt-base-128/bert_config.json  
   creating: bleurt-base-128/bleurt-base-128/variables/
  inflating: bleurt-base-128/bleurt-base-128/variables/variables.index  
  inflating: bleurt-base-128/bleurt-base-128/variables/variables.data-00000-of-00001  
  inflating: bleurt-base-128/bleurt-base-128/bleurt_config.json  
  inflating

In [None]:
from bleurt import score

# Initialize BLEURT scorer
bleurt_scorer = score.BleurtScorer("bleurt-base-128")

# Compute BLEURT scores
bleurt_scores = bleurt_scorer.score(
    references=references,
    candidates=predictions
)
average_bleurt_score = sum(bleurt_scores) / len(bleurt_scores)
print(f"Average BLEURT Score: {average_bleurt_score:.4f}")


Average BLEURT Score: -0.8437


In [None]:
from bert_score import score as bert_score

# Compute BERTScore
P, R, F1 = bert_score(predictions, references, lang="en", verbose=True)
print(f"BERTScore - Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/202 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/168 [00:00<?, ?it/s]

done in 29.63 seconds, 360.75 sentences/sec
BERTScore - Precision: 0.9019, Recall: 0.9013, F1: 0.9014


## Evaluate Explicit vs Implicit

In [None]:
!pip install bert_score


Collecting bert_score
  Using cached bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [None]:
from datasets import load_dataset
from bleurt import score as bleurt_score
from bert_score import score as bert_score
import evaluate
import pandas as pd
from collections import Counter

In [None]:
# Inspect the first few rows of test_data_combined
for i, row in enumerate(test_data_combined[:5]):
    print(f"Row {i}: {row}")


Row 0: {'input_text': 'question: Why was Bob a computer scientist? context: Bob was a computer scientist. He enjoyed natural language processing. He decided to revolutionize the industry! He formulated a machine learning algorithm to process words. He won the nobel prize for his work! Commonsense: the industry.', 'target_text': 'Bob enjoyed the language processing that was part of the job.'}
Row 1: {'input_text': 'question: Why did He enjoy natural language processing? context: Bob was a computer scientist. He enjoyed natural language processing. He decided to revolutionize the industry! He formulated a machine learning algorithm to process words. He won the nobel prize for his work! Commonsense: the industry.', 'target_text': 'Bob had an innate talent for it.'}
Row 2: {'input_text': 'question: Why did He decide to revolutionize the industry? context: Bob was a computer scientist. He enjoyed natural language processing. He decided to revolutionize the industry! He formulated a machine 

In [None]:
# Assuming test_data is the original dataset with 'is_ques_answerable'
for i, row in enumerate(test_data):
    test_data_combined[i]["is_ques_answerable"] = row["is_ques_answerable"]

# Verify that the field is correctly reattached
print(f"Row 0 after reattaching: {test_data_combined[0]}")


Row 0 after reattaching: {'input_text': 'question: Why was Bob a computer scientist? context: Bob was a computer scientist. He enjoyed natural language processing. He decided to revolutionize the industry! He formulated a machine learning algorithm to process words. He won the nobel prize for his work! Commonsense: the industry.', 'target_text': 'Bob enjoyed the language processing that was part of the job.', 'is_ques_answerable': 'Answerable'}


In [None]:
# Separate implicit and explicit indices
implicit_indices = [i for i, row in enumerate(test_data_combined) if row.get("is_ques_answerable") == "Not Answerable"]
explicit_indices = [i for i, row in enumerate(test_data_combined) if row.get("is_ques_answerable") == "Answerable"]

# Extract predictions and references for implicit and explicit data
implicit_predictions = [predictions[i] for i in implicit_indices]
implicit_references = [references[i] for i in implicit_indices]

explicit_predictions = [predictions[i] for i in explicit_indices]
explicit_references = [references[i] for i in explicit_indices]

# Verify the counts
print("\nData Split Verification:")
print(f"Number of implicit predictions: {len(implicit_predictions)}")
print(f"Number of implicit references: {len(implicit_references)}")
print(f"Number of explicit predictions: {len(explicit_predictions)}")
print(f"Number of explicit references: {len(explicit_references)}")



Data Split Verification:
Number of implicit predictions: 3195
Number of implicit references: 3195
Number of explicit predictions: 7494
Number of explicit references: 7494


In [None]:
print(f"Number of implicit predictions: {len(implicit_predictions)}")
print(f"Number of implicit references: {len(implicit_references)}")

print(f"Number of explicit predictions: {len(explicit_predictions)}")
print(f"Number of explicit references: {len(explicit_references)}")


Number of implicit predictions: 3195
Number of implicit references: 3195
Number of explicit predictions: 7494
Number of explicit references: 7494


In [None]:
# Step 3: Evaluate Metrics for Implicit and Explicit Examples
import evaluate

# BLEU for Implicit
bleu = evaluate.load("sacrebleu")
implicit_bleu = bleu.compute(
    predictions=implicit_predictions,
    references=[[ref] for ref in implicit_references]
)
print(f"Implicit BLEU: {implicit_bleu['score']:.2f}")

# BLEU for Explicit
explicit_bleu = bleu.compute(
    predictions=explicit_predictions,
    references=[[ref] for ref in explicit_references]
)
print(f"Explicit BLEU: {explicit_bleu['score']:.2f}")


Implicit BLEU: 11.12
Explicit BLEU: 11.14


In [None]:
# Load evaluation metrics
bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
bleurt_scorer = bleurt_score.BleurtScorer("bleurt-base-128")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
rouge = evaluate.load("rouge")

# Implicit ROUGE
implicit_rouge = rouge.compute(
    predictions=implicit_predictions,
    references=implicit_references
)
print("Implicit ROUGE:", implicit_rouge)

# Explicit ROUGE
explicit_rouge = rouge.compute(
    predictions=explicit_predictions,
    references=explicit_references
)
print("Explicit ROUGE:", explicit_rouge)


Implicit ROUGE: {'rouge1': 0.2526269237017725, 'rouge2': 0.1062612502861131, 'rougeL': 0.24291061660719465, 'rougeLsum': 0.24289240353470987}
Explicit ROUGE: {'rouge1': 0.2519933034289846, 'rouge2': 0.10779762855824264, 'rougeL': 0.2420512414934553, 'rougeLsum': 0.24205645112270374}


In [None]:
from bert_score import score as bert_score

# Implicit BERTScore
P, R, F1 = bert_score(implicit_predictions, implicit_references, lang="en", verbose=True)
print(f"Implicit BERTScore - Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}")

# Explicit BERTScore
P, R, F1 = bert_score(explicit_predictions, explicit_references, lang="en", verbose=True)
print(f"Explicit BERTScore - Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/73 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/50 [00:00<?, ?it/s]

done in 503.12 seconds, 6.35 sentences/sec
Implicit BERTScore - Precision: 0.9023, Recall: 0.9007, F1: 0.9013


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/152 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/118 [00:00<?, ?it/s]

done in 1014.73 seconds, 7.39 sentences/sec
Explicit BERTScore - Precision: 0.9017, Recall: 0.9015, F1: 0.9014


In [None]:
from bleurt import score as bleurt_score

# Load BLEURT model
bleurt_scorer = bleurt_score.BleurtScorer("bleurt-base-128")

# Implicit BLEURT
implicit_bleurt = bleurt_scorer.score(
    references=implicit_references,
    candidates=implicit_predictions
)
print(f"Implicit BLEURT: {sum(implicit_bleurt) / len(implicit_bleurt):.4f}")

# Explicit BLEURT
explicit_bleurt = bleurt_scorer.score(
    references=explicit_references,
    candidates=explicit_predictions
)
print(f"Explicit BLEURT: {sum(explicit_bleurt) / len(explicit_bleurt):.4f}")

Implicit BLEURT: -0.8388
Explicit BLEURT: -0.8457


In [None]:
import pandas as pd

# Combine results into a DataFrame
results_df = pd.DataFrame({
    "Type": ["Implicit"] * len(implicit_predictions) + ["Explicit"] * len(explicit_predictions),
    "Prediction": implicit_predictions + explicit_predictions,
    "Reference": implicit_references + explicit_references
})

# Save to CSV
save_path = "/content/drive/MyDrive/CSE354_Project/Results/implicit_explicit_results.csv"
results_df.to_csv(save_path, index=False)
print(f"Results saved to {save_path}")

# **Extra Idea 3**

### Load Data

In [None]:
from datasets import load_dataset
# Load the dataset
tellmewhy = load_dataset('StonyBrookNLP/tellmewhy')
# Check available splits
print(tellmewhy)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.76k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

train.json:   0%|          | 0.00/70.1M [00:00<?, ?B/s]

validation.json:   0%|          | 0.00/8.71M [00:00<?, ?B/s]

test.json:   0%|          | 0.00/10.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/71892 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8976 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10689 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['narrative', 'question', 'original_sentence_for_question', 'narrative_lexical_overlap', 'is_ques_answerable', 'answer', 'is_ques_answerable_annotator', 'original_narrative_form', 'question_meta', 'helpful_sentences', 'human_eval', 'val_ann', 'gram_ann'],
        num_rows: 71892
    })
    validation: Dataset({
        features: ['narrative', 'question', 'original_sentence_for_question', 'narrative_lexical_overlap', 'is_ques_answerable', 'answer', 'is_ques_answerable_annotator', 'original_narrative_form', 'question_meta', 'helpful_sentences', 'human_eval', 'val_ann', 'gram_ann'],
        num_rows: 8976
    })
    test: Dataset({
        features: ['narrative', 'question', 'original_sentence_for_question', 'narrative_lexical_overlap', 'is_ques_answerable', 'answer', 'is_ques_answerable_annotator', 'original_narrative_form', 'question_meta', 'helpful_sentences', 'human_eval', 'val_ann', 'gram_ann'],
        num_rows: 10689
    })
})


### Preprocess Data

In [None]:
def preprocess_tellmewhy(example):
    """
    Prepare the input and target text for T5 fine-tuning.
    """
    # Normalize text (optional: remove excess spaces)
    def normalize_text(text):
        if text:
            return " ".join(text.split())
        return ""

    # Get narrative and helpful sentences
    narrative = normalize_text(example["narrative"])
    question = normalize_text(example["question"])
    answer = normalize_text(example["answer"])

    # Extract helpful sentences if available
    helpful_indices = example.get("helpful_sentences", [])
    helpful_text = " ".join([example["original_narrative_form"][i - 1] for i in helpful_indices])

    # Combine narrative and helpful sentences
    input_context = narrative
    if helpful_text:
        input_context += f" Helpful Context: {helpful_text}"

    # Prepare T5 input and target
    input_text = f"question: {question} context: {input_context}"
    target_text = answer

    return {"input_text": input_text, "target_text": target_text}

In [None]:
# Apply preprocessing to all splits
train_data = tellmewhy['train'].map(preprocess_tellmewhy)
val_data = tellmewhy['validation'].map(preprocess_tellmewhy)
test_data = tellmewhy['test'].map(preprocess_tellmewhy)

# View a processed sample
print(train_data[0])

Map:   0%|          | 0/71892 [00:00<?, ? examples/s]

Map:   0%|          | 0/8976 [00:00<?, ? examples/s]

Map:   0%|          | 0/10689 [00:00<?, ? examples/s]

{'narrative': 'Cam ordered a pizza and took it home. He opened the box to take out a slice. Cam discovered that the store did not cut the pizza for him. He looked for his pizza cutter but did not find it. He had to use his chef knife to cut a slice.', 'question': 'Why did Cam order a pizza?', 'original_sentence_for_question': 'Cam ordered a pizza and took it home.', 'narrative_lexical_overlap': 0.33333333330000003, 'is_ques_answerable': 'Not Answerable', 'answer': 'Cam was hungry.', 'is_ques_answerable_annotator': 'Not Answerable', 'original_narrative_form': ['Cam ordered a pizza and took it home.', 'He opened the box to take out a slice.', 'Cam discovered that the store did not cut the pizza for him.', 'He looked for his pizza cutter but did not find it.', 'He had to use his chef knife to cut a slice.'], 'question_meta': 'rocstories_narrative_41270_sentence_0_question_0', 'helpful_sentences': [], 'human_eval': False, 'val_ann': [], 'gram_ann': [], 'input_text': 'question: Why did Cam 

In [None]:
import json

# Define the path in Google Drive
preprocessed_train_path = "/content/drive/My Drive/CSE354_Project/Final_Model/preprocessed_train.json"
preprocessed_val_path = "/content/drive/My Drive/CSE354_Project/Final_Model/preprocessed_val.json"
preprocessed_test_path = "/content/drive/My Drive/CSE354_Project/Final_Model/preprocessed_test.json"

# Save preprocessed data
train_data.to_json(preprocessed_train_path)
val_data.to_json(preprocessed_val_path)
test_data.to_json(preprocessed_test_path)

print("Preprocessed data saved to Google Drive!")

Creating json from Arrow format:   0%|          | 0/72 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

Preprocessed data saved to Google Drive!


### Tokenize for T5

In [None]:
from transformers import T5TokenizerFast

# Load tokenizer
tokenizer = T5TokenizerFast.from_pretrained("t5-base")

def tokenize_tellmewhy(example):
    """
    Tokenize input and target text for T5.
    """
    # Tokenize input text
    input_encodings = tokenizer(
        example["input_text"],
        max_length=512,
        padding="max_length",
        truncation=True
    )

    # Tokenize target text
    target_encodings = tokenizer(
        example["target_text"],
        max_length=64,
        padding="max_length",
        truncation=True
    )

    # Combine input and target encodings
    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": target_encodings["input_ids"]
    }

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

In [None]:
# Tokenize all splits
tokenized_train_data = train_data.map(tokenize_tellmewhy)
tokenized_val_data = val_data.map(tokenize_tellmewhy)
tokenized_test_data = test_data.map(tokenize_tellmewhy)

# Inspect a tokenized sample
print("Tokenized sample:")
print(tokenized_train_data[0])

Map:   0%|          | 0/71892 [00:00<?, ? examples/s]

Map:   0%|          | 0/8976 [00:00<?, ? examples/s]

Map:   0%|          | 0/10689 [00:00<?, ? examples/s]

Tokenized sample:
{'narrative': 'Cam ordered a pizza and took it home. He opened the box to take out a slice. Cam discovered that the store did not cut the pizza for him. He looked for his pizza cutter but did not find it. He had to use his chef knife to cut a slice.', 'question': 'Why did Cam order a pizza?', 'original_sentence_for_question': 'Cam ordered a pizza and took it home.', 'narrative_lexical_overlap': 0.33333333330000003, 'is_ques_answerable': 'Not Answerable', 'answer': 'Cam was hungry.', 'is_ques_answerable_annotator': 'Not Answerable', 'original_narrative_form': ['Cam ordered a pizza and took it home.', 'He opened the box to take out a slice.', 'Cam discovered that the store did not cut the pizza for him.', 'He looked for his pizza cutter but did not find it.', 'He had to use his chef knife to cut a slice.'], 'question_meta': 'rocstories_narrative_41270_sentence_0_question_0', 'helpful_sentences': [], 'human_eval': False, 'val_ann': [], 'gram_ann': [], 'input_text': 'ques

In [None]:
# Define the paths in Google Drive
tokenized_train_path = "/content/drive/My Drive/CSE354_Project/Final_Model/tokenized_train_data"
tokenized_val_path = "/content/drive/My Drive/CSE354_Project/Final_Model/tokenized_val_data"
tokenized_test_path = "/content/drive/My Drive/CSE354_Project/Final_Model/tokenized_test_data"

# Save tokenized datasets
tokenized_train_data.save_to_disk(tokenized_train_path)
tokenized_val_data.save_to_disk(tokenized_val_path)
tokenized_test_data.save_to_disk(tokenized_test_path)

print("Tokenized data saved to Google Drive!")

Saving the dataset (0/1 shards):   0%|          | 0/71892 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8976 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10689 [00:00<?, ? examples/s]

Tokenized data saved to Google Drive!


In [None]:
from datasets import load_from_disk

# Load tokenized datasets
tokenized_train_data = load_from_disk("/content/drive/My Drive/CSE354_Project/Final_Model/tokenized_train_data")
tokenized_val_data = load_from_disk("/content/drive/My Drive/CSE354_Project/Final_Model/tokenized_val_data")
tokenized_test_data = load_from_disk("/content/drive/My Drive/CSE354_Project/Final_Model/tokenized_test_data")

print("Tokenized data loaded from Google Drive!")

Tokenized data loaded from Google Drive!


### Data Inspection

In [None]:
# Inspect a preprocessed example
print(f"Input Text: {train_data[0]['input_text']}")
print(f"Target Text: {train_data[0]['target_text']}")

Input Text: question: Why did Cam order a pizza? context: Cam ordered a pizza and took it home. He opened the box to take out a slice. Cam discovered that the store did not cut the pizza for him. He looked for his pizza cutter but did not find it. He had to use his chef knife to cut a slice.
Target Text: Cam was hungry.


In [None]:
# Decode tokenized input and labels for verification
decoded_input = tokenizer.decode(tokenized_train_data[0]["input_ids"], skip_special_tokens=True)
decoded_label = tokenizer.decode(tokenized_train_data[0]["labels"], skip_special_tokens=True)

print(f"Decoded Input: {decoded_input}")
print(f"Decoded Target: {decoded_label}")

Decoded Input: question: Why did Cam order a pizza? context: Cam ordered a pizza and took it home. He opened the box to take out a slice. Cam discovered that the store did not cut the pizza for him. He looked for his pizza cutter but did not find it. He had to use his chef knife to cut a slice.
Decoded Target: Cam was hungry.


### Data Loader

In [None]:
from torch.utils.data import DataLoader

# Ensure the tokenized dataset is in PyTorch tensor format
tokenized_train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_val_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_test_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [None]:
def create_dataloader(data, batch_size=16, shuffle=False):
    """
    Creates a PyTorch DataLoader from a tokenized dataset.

    Args:
        data: The tokenized dataset (Hugging Face Dataset in PyTorch format).
        batch_size: The number of examples per batch.
        shuffle: Whether to shuffle the dataset at every epoch.

    Returns:
        DataLoader: A PyTorch DataLoader for the dataset.
    """
    return DataLoader(data, batch_size=batch_size, shuffle=shuffle)

In [None]:
# Define batch size for training, validation, and testing
BATCH_SIZE = 16

# Create DataLoaders for training, validation, and testing
train_dataloader = create_dataloader(tokenized_train_data, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = create_dataloader(tokenized_val_data, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = create_dataloader(tokenized_test_data, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Inspect a batch from the train DataLoader
batch = next(iter(train_dataloader))
print("Batch keys:", batch.keys())  # Should show: 'input_ids', 'attention_mask', 'labels'
print("Input IDs shape:", batch["input_ids"].shape)  # Shape: (BATCH_SIZE, max_input_length)
print("Labels shape:", batch["labels"].shape)  # Shape: (BATCH_SIZE, max_target_length)

Batch keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
Input IDs shape: torch.Size([16, 512])
Labels shape: torch.Size([16, 64])


### Fine Tune T5 with new Hyperparameters

In [None]:
# Select a small subset (10% of training and validation data)
subset_size = int(len(tokenized_train_data) * 0.1)
train_subset = tokenized_train_data.select(range(subset_size))

val_subset_size = int(len(tokenized_val_data) * 0.1)
val_subset = tokenized_val_data.select(range(val_subset_size))

test_subset_size = int(len(tokenized_test_data) * 0.1)
test_subset = tokenized_test_data.select(range(subset_size))

test_dataloader = create_dataloader(test_subset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
from transformers import TrainingArguments

# Training arguments
training_args = TrainingArguments(
    output_dir="/content/drive/My Drive/CSE354_Project/Final_Model/results",  # Directory to save checkpoints
    eval_strategy="epoch",                                                   # Evaluate at the end of each epoch
    save_strategy="epoch",                                                   # Save checkpoint at the end of each epoch
    learning_rate=5e-5,                                                      # Learning rate
    per_device_train_batch_size=8,                                           # Batch size per GPU
    per_device_eval_batch_size=8,                                            # Batch size for evaluation
    num_train_epochs=3,                                                      # Number of epochs
    weight_decay=0.01,                                                       # Weight decay for regularization
    logging_dir="/content/drive/My Drive/CSE354_Project/Final_Model/results/logs",  # Directory for logging
    logging_steps=50,                                                        # Log every 50 steps
    save_total_limit=2,                                                      # Limit to 2 saved checkpoints
    load_best_model_at_end=True,                                             # Load best model after training
    metric_for_best_model="eval_loss",                                       # Metric to evaluate the best model
    save_steps=None,                                                         # Not needed if save_strategy="epoch"
    fp16=True,                                                                # Use mixed precision for faster training
    report_to="none"  # Disable W&B
)


In [None]:
from transformers import T5ForConditionalGeneration, T5TokenizerFast

# Load model and tokenizer
model_name = "t5-base"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5TokenizerFast.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [None]:
from transformers import Trainer

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=val_subset,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [None]:
# Train the model
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.2528,0.257187
2,0.25,0.251364
3,0.2235,0.253161


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=2697, training_loss=0.34172984198547973, metrics={'train_runtime': 1733.1855, 'train_samples_per_second': 12.444, 'train_steps_per_second': 1.556, 'total_flos': 1.313339482570752e+16, 'train_loss': 0.34172984198547973, 'epoch': 3.0})

In [None]:
save_path = "/content/drive/My Drive/CSE354_Project/Final_Model/fine_tuned_t5_small"
# Save the model and tokenizer to Google Drive
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print(f"Model and tokenizer saved to {save_path}!")

Model and tokenizer saved to /content/drive/My Drive/CSE354_Project/Final_Model/fine_tuned_t5_small!


## Evaluation

In [None]:
!pip install datasets rouge-score bert-score evaluate bleurt

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
[31mERROR: Could not find a version that satisfies the requirement bleurt (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for bleurt[0m[31m
[0m

In [None]:
!pip install evaluate

Collecting evaluate
  Using cached evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
# Clone BLEURT GitHub repository
!git clone https://github.com/google-research/bleurt.git

# Change to BLEURT directory
%cd bleurt

# Install BLEURT package
!pip install .

Cloning into 'bleurt'...
remote: Enumerating objects: 134, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 134 (delta 0), reused 17 (delta 0), pack-reused 116 (from 1)[K
Receiving objects: 100% (134/134), 31.28 MiB | 22.94 MiB/s, done.
Resolving deltas: 100% (49/49), done.
/content/bleurt
Processing /content/bleurt
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: BLEURT
  Building wheel for BLEURT (setup.py) ... [?25l[?25hdone
  Created wheel for BLEURT: filename=BLEURT-0.0.2-py3-none-any.whl size=16456764 sha256=1b2688ba2c9977a3b8f8e6172670107bcde838a77f699d4a127b85504b755411
  Stored in directory: /tmp/pip-ephem-wheel-cache-2w8kwmke/wheels/92/4f/fb/afa555fa27aa9e2c7958df797a62cc4e74f0f459cec9c4fa7c
Successfully built BLEURT
Installing collected packages: BLEURT
Successfully installed BLEURT-0.0.2


In [None]:
!wget https://storage.googleapis.com/bleurt-oss/bleurt-base-128.zip
!unzip bleurt-base-128.zip -d bleurt-base-128

--2024-12-07 01:39:33--  https://storage.googleapis.com/bleurt-oss/bleurt-base-128.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.20.207, 108.177.98.207, 74.125.197.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.20.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 405489453 (387M) [application/zip]
Saving to: ‘bleurt-base-128.zip’


2024-12-07 01:39:37 (106 MB/s) - ‘bleurt-base-128.zip’ saved [405489453/405489453]

Archive:  bleurt-base-128.zip
   creating: bleurt-base-128/bleurt-base-128/
  inflating: bleurt-base-128/bleurt-base-128/vocab.txt  
  inflating: bleurt-base-128/bleurt-base-128/bert_config.json  
   creating: bleurt-base-128/bleurt-base-128/variables/
  inflating: bleurt-base-128/bleurt-base-128/variables/variables.index  
  inflating: bleurt-base-128/bleurt-base-128/variables/variables.data-00000-of-00001  
  inflating: bleurt-base-128/bleurt-base-128/bleurt_config.json  
  inflating: bleur

In [None]:
!mv bleurt-base-128/bleurt-base-128/* bleurt-base-128/
!rm -r bleurt-base-128/bleurt-base-128

In [None]:
!pip install sacrebleu rouge-metric bert-score bleurt

Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge-metric
  Downloading rouge_metric-1.0.1-py3-none-any.whl.metadata (9.5 kB)
Collecting bert-score
  Using cached bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rouge_metric-1.0.1-py3-none-any.whl (151 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
def generate_predictions(dataloader, tokenizer, model, device):
    predictions, references = [], []
    model.to(device)

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Generate predictions
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=64,  # Adjust based on task
                num_beams=5,    # Beam search for better outputs
                early_stopping=True
            )

            # Decode predictions and references
            preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            refs = tokenizer.batch_decode(labels, skip_special_tokens=True)

            predictions.extend(preds)
            references.extend(refs)

    return predictions, references

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
test_predictions, test_references = generate_predictions(test_dataloader, tokenizer, model, device)

In [None]:
import evaluate

# BLEU Score
def calculate_bleu(predictions, references):
    bleu = evaluate.load("bleu")
    formatted_references = [[ref] for ref in references]
    result = bleu.compute(predictions=predictions, references=formatted_references)

    # Display BLEU score
    print(f"BLEU Score: {result['bleu'] * 100:.2f}")
    return result['bleu']

In [None]:
# ROUGE Score
def calculate_rouge(predictions, references):
    rouge = evaluate.load("rouge")
    result = rouge.compute(predictions=predictions, references=references)
    print("ROUGE Scores:", result)
    return result

In [None]:
# BERTScore
from bert_score import score as bertscore

def calculate_bertscore(predictions, references):
    P, R, F1 = bertscore(predictions, references, lang="en", verbose=True)
    print(f"BERTScore - Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}")
    return {"Precision": P.mean().item(), "Recall": R.mean().item(), "F1": F1.mean().item()}

In [None]:
# BLEURT Score
from bleurt import score as bleurt

def calculate_bleurt(predictions, references, checkpoint="bleurt-base-128"):
    scorer = bleurt.BleurtScorer(checkpoint)
    scores = scorer.score(references=references, candidates=predictions)
    print(f"Average BLEURT Score: {np.mean(scores):.4f}")
    return scores

In [None]:
# Calculate Metrics
bleu_scores = calculate_bleu(test_predictions, test_references)
rouge_scores = calculate_rouge(test_predictions, test_references)
bert_scores = calculate_bertscore(test_predictions, test_references)
bleurt_scores = calculate_bleurt(test_predictions, test_references)

BLEU Score: 4.67


ImportError: To be able to use evaluate-metric/rouge, you need to install the following dependencies['rouge_score'] using 'pip install rouge_score' for instance'

In [None]:
# Inspect Sample Predictions
for i in range(5):  # Display first 5 examples
    print(f"Input: {test_data[i]['input_text']}")
    print(f"Gold Answer: {test_references[i]}")
    print(f"Predicted Answer: {test_predictions[i]}")
    print("-" * 50)

In [None]:
test_metrics = {
    "BLEU": bleu_score.scores,
    "ROUGE": rouge_scores,
    "BERTScore": bert_scores,
    "BLEURT": sum(bleurt_scores) / len(bleurt_scores),
}

print("Test Metrics:", test_metrics)

In [None]:
# Save predictions and references
results_df = pd.DataFrame({"Input": [d['input_text'] for d in test_data],
                           "Gold Answer": test_references,
                           "Predicted Answer": test_predictions})
results_df.to_csv("/content/drive/MyDrive/CSE354_Project/Idea2/test_results.csv", index=False)

# Save metrics
metrics = {
    "BLEU": bleu_score,
    "ROUGE": rouge_scores,
    "BERTScore": bert_scores,
    "BLEURT": np.mean(bleurt_scores)
}
metrics_df = pd.DataFrame(metrics)
metrics_df.to_csv("/content/drive/MyDrive/CSE354_Project/Idea2/test_metrics.csv", index=False)