In [2]:
!pip install git+https://github.com/google-research/bleurt.git -q
!pip install bert_score rouge-score evaluate -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for BLEURT (setup.py) ... [?25l[?25hdone


In [3]:
import os
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

2025-04-11 16:19:20.964526: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744388361.191746      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744388361.254903      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
#initializing model and tokenizer
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
def preprocess_dataset(path):
    """
    Preprocess the dataset by reading all CSV files in the given path.
    For each conversation (CSV file), the function constructs an input string
    that concatenates each utterance preceded by its emotion token. 
    This way, each line is contextualized with the emotional cues from previous lines.
    """
    csv_files = [os.path.join(path, file) for file in os.listdir(path) if file.endswith(".csv")]
    data = []
    
    for file in csv_files:
        df = pd.read_csv(file)
        
        # Ensure that the CSV file has been enriched with an Emotion column
        # and clean the utterance strings for consistency.
        df['Utterance_cleaned'] = df['Utterance'].str.lower().str.strip()
        
        # Extract the summary row (if available)
        summary_row = df[df["Utterance_cleaned"] == "summary"]
        summary_text = summary_row.iloc[0, 1] if not summary_row.empty else ""
        
        # Filter out rows that are not actual utterances (e.g., summary, primary_topic, secondary_topic)
        dialogue_df = df[~df["Utterance_cleaned"].isin(["summary", "primary_topic", "secondary_topic"])]
        # Drop rows labeled as inactive
        dialogue_df = dialogue_df[dialogue_df['Sub topic'] != 'inactive']
        
        # Construct conversation history:
        # For each utterance, prepend its emotion token, and accumulate all lines.
        conversation_context = ""
        for idx, row in dialogue_df.iterrows():
            # Get the current emotion and utterance.
            # We assume that the CSV was enriched earlier so that the "Emotion" column is non-empty.
            current_emotion = row["Emotion"]
            current_utterance = row["Utterance"]
            # Format the line as "[Emotion] utterance"
            current_line = f"[{current_emotion}] {current_utterance}"
            # Append this line to the conversation context.
            conversation_context += " " + current_line
        
        # Build the final input text. The model will see the full conversation (with emotion tokens).
        input_text = f"summarize: {conversation_context.strip()}"
        data.append({"input": input_text, "summary": summary_text})
    
    return data


In [6]:
train_data = preprocess_dataset("/kaggle/input/memo-kdd/Train_Emo")
val_data = preprocess_dataset("/kaggle/input/memo-kdd/Validation_Emo")
test_data = preprocess_dataset("/kaggle/input/memo-kdd/Test")

In [7]:
from transformers import TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

#making hugging face dataset instance to fine tune with trainer api
train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)
test_dataset = Dataset.from_list(test_data)

#create a dataset dictionary
dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

In [8]:
def preprocess_function(examples):
    """
    Preprocesses the dataset for fine tuning the model.
    """
    model_inputs = tokenizer(examples["input"], padding="max_length", truncation=True, max_length=1024)
    labels = tokenizer(examples["summary"], padding="max_length", truncation=True, max_length=150)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/131 [00:00<?, ? examples/s]

Map:   0%|          | 0/21 [00:00<?, ? examples/s]

Map:   0%|          | 0/39 [00:00<?, ? examples/s]

In [9]:
#fine tuning the model with trainer api and save the model
training_args = TrainingArguments(
    output_dir="./t5-finetuned",
    eval_strategy="epoch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=1e-5,
    num_train_epochs=20,
    weight_decay=0.001,
    save_strategy="epoch",
    save_total_limit=1,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"]
)

trainer.train()

model.save_pretrained("./t5-finetuned")
tokenizer.save_pretrained("./t5-finetuned")

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,7.1344,6.241673
2,4.9409,3.761084
3,3.2246,3.065937
4,2.6101,2.918926
5,2.8617,2.827606
6,2.6032,2.769051
7,2.4426,2.734866
8,2.4542,2.708006
9,2.1975,2.685897
10,2.3486,2.667094


('./t5-finetuned/tokenizer_config.json',
 './t5-finetuned/special_tokens_map.json',
 './t5-finetuned/spiece.model',
 './t5-finetuned/added_tokens.json')

In [10]:
model_path = "/kaggle/working/t5-finetuned"
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)
model.to(device)

for item in test_data:
    input_text = item["input"]

    #tokenize input
    input_ids = tokenizer(input_text, return_tensors="pt", padding=True).input_ids.to(device)

    #generate summary
    summary_ids = model.generate(input_ids, max_length=150, num_beams=8, repetition_penalty=5.0, early_stopping=True)
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    #store the generated summary
    item["generated_summary"] = generated_summary

In [12]:
references = [] #list to store target summaries
predictions = [] #list to store generated summaries

for item in test_data:
    references.append(item["summary"])  #ground truth summaries
    predictions.append(item["generated_summary"])

In [13]:
from bert_score import score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import evaluate
from rouge_score import rouge_scorer


#Rogue score
def compute_rouge(predictions, references):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    for pred, ref in zip(predictions, references):
        score = scorer.score(ref, pred)
        scores['rouge1'].append(score['rouge1'].fmeasure)
        scores['rouge2'].append(score['rouge2'].fmeasure)
        scores['rougeL'].append(score['rougeL'].fmeasure)
    return {key: sum(val)/len(val) for key, val in scores.items()}  # Averaging scores

rouge_scores = compute_rouge(predictions, references)

#BLEURT score
bleurt = evaluate.load("bleurt", config_name="bleurt-base-128")
results = bleurt.compute(predictions=predictions, references=references)
avg_bleurt = sum(results["scores"]) / len(results["scores"])

#BLEU score
smoothie = SmoothingFunction().method4
bleu_scores = [sentence_bleu(ref, pred.split(), smoothing_function=smoothie) for ref, pred in zip(references, predictions)]
avg_bleu = sum(bleu_scores) / len(bleu_scores)

#BERT score
P, R, F1 = score(predictions, [ref[0] for ref in references], lang="en")

Downloading builder script:   0%|          | 0.00/5.20k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/405M [00:00<?, ?B/s]

I0000 00:00:1744389770.414519      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5354 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1744389770.415174      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 7378 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
print(f"Rogue-1 Score: {rouge_scores['rouge1'] * 100:.2f}")
print(f"Rogue-2 Score: {rouge_scores['rouge2'] * 100:.2f}")
print(f"Rogue-L Score: {rouge_scores['rougeL'] * 100:.2f}")
print(f"BLEURT Score: {avg_bleurt:.4f}")
print()
print(f"BLEU score: {avg_bleu * 100:.2f}")
print(f"BERT score F1: {F1.mean().item() * 100:.2f}")
print(f"BERT score Precision: {P.mean().item() * 100:.2f}")
print(f"BERT score Recall: {R.mean().item() * 100:.2f}")

Rogue-1 Score: 31.46
Rogue-2 Score: 9.99
Rogue-L Score: 19.46
BLEURT Score: -0.8224

BLEU score: 0.39
BERT score F1: 75.63
BERT score Precision: 73.81
BERT score Recall: 77.55


In [15]:
print('Original summary:', references[1])
print()
print('Generated summary:', predictions[1])

Original summary: The therapist examines the abdomen and other parts of the body. The therapist pulls down the eyelids and checks for anemia, then for scar signs. The therapist requests to perform shifting dullness test, full lymph retinopathy screen including accelerate and inguinal lymph nodes.

Generated summary: Dr. batata examines patient's abdomen and other parts of body . doctor pulls down on eyelids looking for any evidence of anemia.
