In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Install Dependencies

In [None]:
# Install dependencies
# NOTE(review): versions are unpinned, so a re-run may pull different releases;
# consider pinning them and using `%pip` so the install targets the kernel's environment.
!pip install transformers datasets torch pandas numpy nltk sacrebleu rouge-score bert-score

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
import torch
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import bert_score
import warnings
from transformers import TrainerCallback
import logging
import os
warnings.filterwarnings("ignore")

print("Setup complete!")

## Load DialoCONAN Dataset

In [None]:
# Read the DialoCONAN CSV into a DataFrame.
df = pd.read_csv("/kaggle/input/dialoconan/DIALOCONAN.csv")  # Update path to your dataset

# Quick overview: shape, column names and the distinct values of the key columns.
print("Dataset Shape:", df.shape)
print("Columns:", df.columns.tolist())
for label, column in (
    ("Unique Targets:", "TARGET"),
    ("Unique Dialogue ID:", "dialogue_id"),
    ("Unique Turn ID:", "turn_id"),
    ("Unique Types:", "type"),
    ("Unique Sources:", "source"),
):
    print(label, df[column].unique())
print("Sample Data:")
display(df.head())

# Number of distinct dialogues per target group.
print("Dialogue Count by Target:")
dialogues_per_target = df.groupby("TARGET")["dialogue_id"].nunique()
print(dialogues_per_target)

## Preprocess the data

In [None]:
def create_dialogue_pairs(df, max_context_turns=5):
    """Turn a turn-level dialogue table into (context -> counter-narrative) pairs.

    Every row whose ``type`` is ``"CN"`` yields one pair: the input is the
    space-joined last ``max_context_turns`` formatted turns that precede it in
    the same dialogue, the output is the CN text itself. ``"HS"`` rows only
    extend the running context. Rows with any other ``type`` are ignored.

    Args:
        df: DataFrame with the columns ``dialogue_id``, ``turn_id``, ``type``,
            ``TARGET`` and ``text``.
        max_context_turns: How many preceding turns to keep in each input
            (default 5, the previously hard-coded value). Must be >= 1.

    Returns:
        A list of dicts with the keys ``"input"``, ``"output"`` and
        ``"target"``. Dialogues appear in order of first appearance in ``df``;
        turns within a dialogue are ordered by ``turn_id``.

    Raises:
        ValueError: If ``max_context_turns`` is smaller than 1.
    """
    if max_context_turns < 1:
        raise ValueError("max_context_turns must be >= 1")

    dialogues = []
    # One groupby pass instead of re-filtering the whole frame per dialogue;
    # sort=False keeps the first-appearance order of the dialogue ids.
    for _, dialogue in df.groupby("dialogue_id", sort=False):
        # Stable sort so turns sharing a turn_id keep their file order.
        dialogue = dialogue.sort_values("turn_id", kind="stable")
        context = []
        for turn_type, target, text in zip(
            dialogue["type"], dialogue["TARGET"], dialogue["text"]
        ):
            if turn_type == "HS":
                context.append(f"[{target} HS]: {text}")
            elif turn_type == "CN":
                dialogues.append({
                    "input": " ".join(context[-max_context_turns:]),
                    "output": text,
                    "target": target,
                })
                # The counter-narrative becomes context for later turns.
                context.append(f"[{target} CN]: {text}")
    return dialogues

# Build the (context, counter-narrative) pairs from the raw turns.
data = create_dialogue_pairs(df)
print(f"Total Dialogue Pairs: {len(data)}")

# 80 / 10 / 10 split: hold out 20% first, then halve it into validation and test.
# NOTE(review): the split is done per pair, so pairs built from the same dialogue
# (which share context turns) can land in different splits; consider splitting
# by dialogue instead.
train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Wrap each split in a Hugging Face Dataset.
train_dataset, val_dataset, test_dataset = map(
    Dataset.from_list, (train_data, val_data, test_data)
)

print(f"Train Size: {len(train_dataset)}, Val Size: {len(val_dataset)}, Test Size: {len(test_dataset)}")

## Tokenize Dataset

In [None]:
# Load tokenizer
# model_name is reused when the model is loaded, so tokenizer and model come
# from the same checkpoint.
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Tokenize function
def tokenize(batch):
    """Tokenize a batch of dialogue pairs for seq2seq training.

    Args:
        batch: Mapping with the list-valued keys ``"input"``, ``"output"`` and
            ``"target"`` (as passed by ``Dataset.map(..., batched=True)``).

    Returns:
        A dict with ``input_ids`` / ``attention_mask`` (inputs padded or
        truncated to 512 tokens), ``labels`` (outputs padded or truncated to
        128 tokens, with padding positions set to -100) and the untouched
        ``input`` / ``output`` / ``target`` fields.
    """
    inputs = tokenizer(
        batch["input"],
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="np"
    )
    outputs = tokenizer(
        batch["output"],
        max_length=128,
        truncation=True,
        padding="max_length",
        return_tensors="np"
    )
    # Padding in the targets must not contribute to the loss: -100 is the
    # index the loss ignores, so replace every pad id in the labels with it.
    labels = np.where(outputs.input_ids == tokenizer.pad_token_id, -100, outputs.input_ids)
    return {
        "input_ids": inputs.input_ids,
        "attention_mask": inputs.attention_mask,
        "labels": labels,
        "input": batch["input"],  # Preserve original fields
        "output": batch["output"],
        "target": batch["target"]
    }

# Tokenize every split in batched mode.
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

# Expose tensors for training while keeping the raw text columns readable.
for split in (train_dataset, val_dataset, test_dataset):
    split.set_format(
        "torch",
        columns=["input_ids", "attention_mask", "labels", "input", "output", "target"],
    )

print("Tokenization complete!")

## Initialize Model

In [None]:
# Pick the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained checkpoint and place it on the chosen device.
model = T5ForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)

print(f"Model loaded on {device}")

## Fine-Tune T5 Model

In [None]:
# Suppress RoBERTa warnings from bert-score
warnings.filterwarnings("ignore", message="Some weights of RobertaModel were not initialized")

# Set up logging to both the cell output and a log file.
# force=True replaces any handlers already attached to the root logger (for
# example by an imported library or an earlier run of this cell); without it
# basicConfig silently does nothing and the log file is never written.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("/kaggle/working/training_logs.txt")
    ],
    force=True,
)
logger = logging.getLogger(__name__)

# Custom callback for validation metrics
class CustomMetricsCallback(TrainerCallback):
    """Log BLEU, ROUGE-L and BERTScore on a validation subset after each evaluation.

    The metrics are only written to the logger; they are not returned to the
    Trainer and do not influence checkpoint selection.

    Args:
        tokenizer: Tokenizer used to encode the inputs and decode generations.
        val_dataset: Dataset with the text columns ``"input"`` and ``"output"``.
        max_samples: Upper bound on how many leading validation rows are scored.
    """

    def __init__(self, tokenizer, val_dataset, max_samples=100):
        self.tokenizer = tokenizer
        self.val_dataset = val_dataset.select(range(min(max_samples, len(val_dataset))))
        self.scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

    def on_evaluate(self, args, state, control, model, **kwargs):
        """Generate a reply for each validation sample and log the averaged metrics.

        A sample that fails during generation or scoring is logged and skipped
        so one bad row cannot abort training. Returns None so the Trainer's
        control object is left untouched.
        """
        logger.info(f"Computing custom metrics at step {state.global_step}")
        bleu_scores, rouge_scores = [], []
        preds, refs = [], []

        model.eval()
        for item in self.val_dataset:
            try:
                inputs = self.tokenizer(
                    item["input"],
                    return_tensors="pt",
                    max_length=512,
                    truncation=True
                ).to(model.device)
                # No `past_key_values` argument: the previous code referenced
                # EncoderDecoderCache, which is not imported here, so every
                # sample raised NameError and all metrics were reported as 0.0.
                # Decoding settings match generate_counterspeech.
                outputs = model.generate(
                    inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    max_length=128,
                    num_beams=5,
                    no_repeat_ngram_size=2,
                    early_stopping=True
                )
                pred = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
                ref = item["output"]

                # Compute both scores before appending so the lists stay aligned.
                bleu = sentence_bleu([ref.split()], pred.split())
                rouge_l = self.scorer.score(ref, pred)["rougeL"].fmeasure
            except Exception as e:
                logger.warning(f"Error processing item: {e}")
                continue
            bleu_scores.append(bleu)
            rouge_scores.append(rouge_l)
            preds.append(pred)
            refs.append(ref)

        # Score all pairs with a single BERTScore call instead of one call per sample.
        bert_f1 = 0.0
        if preds:
            try:
                _, _, f1 = bert_score.score(preds, refs, lang="en", rescale_with_baseline=True)
                bert_f1 = float(f1.mean().item())
            except Exception as e:
                logger.warning(f"Error computing BERTScore: {e}")

        # Log metrics
        metrics = {
            "val_bleu": float(np.mean(bleu_scores)) if bleu_scores else 0.0,
            "val_rouge_l": float(np.mean(rouge_scores)) if rouge_scores else 0.0,
            "val_bertscore": bert_f1
        }
        logger.info(f"Validation Metrics: {metrics}")
        # Return None to preserve TrainerControl
        return None

# Training arguments
training_args = TrainingArguments(
    output_dir="/kaggle/working/t5-counterspeech",
    num_train_epochs=5,  # Number of epochs
    per_device_train_batch_size=2,  # Training in batches of 2 to avoid OOM
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="/kaggle/working/logs",
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",  # must match eval_strategy for load_best_model_at_end
    load_best_model_at_end=True,
    save_total_limit=1,
    # Mixed precision only when a GPU is present, so the CPU fallback chosen
    # when the model is loaded does not fail on an unsupported fp16 request.
    # NOTE(review): T5 checkpoints are reported to be numerically unstable in
    # fp16 — confirm the loss stays finite, or switch to bf16/fp32.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

# Build the trainer; the callback logs text-generation metrics after every evaluation.
metrics_callback = CustomMetricsCallback(tokenizer, val_dataset)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[metrics_callback],
)

# Run fine-tuning.
logger.info("Starting training...")
trainer.train()

# Persist the model weights and the tokenizer files side by side.
for artifact in (model, tokenizer):
    artifact.save_pretrained("/kaggle/working/t5-counterspeech-final")

# Dump the trainer's log history as plain text.
with open("/kaggle/working/final_training_logs.txt", "w") as f:
    f.write(str(trainer.state.log_history))

print("Training complete! Check /kaggle/working/training_logs.txt for detailed logs.")

## Plot training/validation loss

In [None]:
import matplotlib.pyplot as plt

# Training entries in the log history carry "loss"; evaluation entries carry "eval_loss".
logs = trainer.state.log_history
steps = [log["step"] for log in logs if "loss" in log]
losses = [log["loss"] for log in logs if "loss" in log]
eval_steps = [log["step"] for log in logs if "eval_loss" in log]
eval_losses = [log["eval_loss"] for log in logs if "eval_loss" in log]

fig, ax = plt.subplots()
ax.plot(steps, losses, label="Training Loss")
# The validation curve was missing although this section is about both losses.
if eval_losses:
    ax.plot(eval_steps, eval_losses, marker="o", label="Validation Loss")
ax.set_xlabel("Batch Steps")
ax.set_ylabel("Loss")
ax.set_title("Training and Validation Loss")
ax.legend()
plt.show()

## Generate Counterspeech on test-split

In [None]:
def generate_counterspeech(hs_text, target, dialogue_history=None):
    """Generate a counter-narrative for one hate-speech turn.

    Args:
        hs_text: Text of the hate-speech turn to respond to.
        target: Target-group label placed inside the ``[<target> HS]:`` tag.
        dialogue_history: Optional already-formatted earlier turns; when truthy
            it is prepended to the tagged hate-speech turn, separated by a space.

    Returns:
        The decoded top beam as a string, special tokens removed.
    """
    # Tag the current turn, then put the earlier turns in front of it if given.
    prompt = f"[{target} HS]: {hs_text}"
    if dialogue_history:
        prompt = dialogue_history + " " + prompt

    # Encode the prompt (truncated to 512 tokens) on the model's device.
    encoded = tokenizer(
        prompt, return_tensors="pt", max_length=512, truncation=True
    ).to(device)

    # Beam search with bigram-repetition blocking.
    decoding_options = {
        "max_length": 128,
        "num_beams": 5,
        "no_repeat_ngram_size": 2,
        "early_stopping": True,
    }
    generated = model.generate(encoded["input_ids"], **decoding_options)

    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Smoke-test generation on one hand-written example with a short history.
sample_target = "MIGRANTS"
sample_hs = "All migrants are criminals and should be deported!"
sample_history = (
    "[MIGRANTS HS]: More migrants crossing the channel today. We don’t have enough accommodation. "
    "[MIGRANTS CN]: Are you forgetting that last year every rough sleeper was offered a bed during lockdown?"
)
generated_cn = generate_counterspeech(sample_hs, sample_target, sample_history)
print(f"Hate Speech: {sample_hs}", f"Generated Counterspeech: {generated_cn}", sep="\n")

## Evaluate Model

In [None]:
def evaluate_model(dataset):
    """Score generated counter-narratives against the references in ``dataset``.

    Each item's ``"input"`` already holds the formatted dialogue context. It is
    split at the last ``[<target> HS]: `` tag into history and current
    hate-speech turn, so ``generate_counterspeech`` rebuilds the same prompt
    format the model was trained on. (The previous split pattern never matched,
    which fed the whole context to the model twice.)

    Args:
        dataset: Iterable of items with the string fields ``"input"``,
            ``"output"`` and ``"target"``.

    Returns:
        Dict with the mean ``"BLEU"``, ``"ROUGE-L"`` and ``"BERTScore"`` (F1) as
        floats; all three are 0.0 when ``dataset`` is empty.
    """
    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    bleu_scores = []
    rouge_scores = []
    preds = []
    refs = []

    for item in dataset:
        # Separate the last hate-speech turn from the turns before it. If the
        # tag is absent, the whole input is treated as the hate-speech text.
        marker = f"[{item['target']} HS]: "
        history, _sep, hs_text = item["input"].rpartition(marker)
        pred = generate_counterspeech(hs_text, item["target"], history.rstrip() or None)
        ref = item["output"]

        # BLEU
        bleu_scores.append(sentence_bleu([ref.split()], pred.split()))

        # ROUGE
        rouge_scores.append(scorer.score(ref, pred)["rougeL"].fmeasure)

        preds.append(pred)
        refs.append(ref)

    if not preds:
        # Avoid NaN (and a runtime warning) from averaging empty lists.
        return {"BLEU": 0.0, "ROUGE-L": 0.0, "BERTScore": 0.0}

    # BERTScore: one call for all pairs instead of one call per pair.
    _, _, f1 = bert_score.score(preds, refs, lang="en", rescale_with_baseline=True)

    return {
        "BLEU": float(np.mean(bleu_scores)),
        "ROUGE-L": float(np.mean(rouge_scores)),
        "BERTScore": float(f1.mean().item())
    }

# Evaluate on the held-out test split and print each metric to four decimals.
results = evaluate_model(test_dataset)
print("Evaluation Results:")
for metric_name, metric_value in results.items():
    print("{}: {:.4f}".format(metric_name, metric_value))

## Save Results

In [None]:
# Save evaluation results
import json
with open("/kaggle/working/evaluation_results.json", "w") as f:
    json.dump(results, f, indent=4)

# Save sample predictions (first 5 test items, or fewer if the split is smaller)
predictions = []
for item in test_dataset.select(range(min(5, len(test_dataset)))):
    # Split the formatted context at the last "[<target> HS]: " tag so the
    # prompt rebuilt by generate_counterspeech matches the training format.
    # The previous split pattern never matched and duplicated the context.
    marker = f"[{item['target']} HS]: "
    history, _sep, hs_text = item["input"].rpartition(marker)
    pred = generate_counterspeech(hs_text, item["target"], history.rstrip() or None)
    predictions.append({
        "input": item["input"],
        "hate_speech": hs_text,
        "target": item["target"],
        "predicted_counterspeech": pred,
        "reference_counterspeech": item["output"]
    })

pd.DataFrame(predictions).to_csv("/kaggle/working/sample_predictions.csv", index=False)
print("Results and predictions saved!")