In [3]:
!pip install -qU evaluate
!pip install -qU rouge_score

In [4]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup,
    EvalPrediction
)
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
import evaluate

In [5]:
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

In [6]:
# Configuration
class Config:
    model_name = "t5-small"
    max_source_length = 512
    max_target_length = 128
    batch_size = 2
    train_epochs = 1
    learning_rate = 3e-5
    weight_decay = 0.01
    warmup_steps = 250
    gradient_accumulation_steps = 2
    eval_steps = 500
    output_dir = "./models/t5_code_review"

config = Config()

In [7]:
# Create output directory
os.makedirs(config.output_dir, exist_ok=True)

In [8]:
class CodeReviewDataset(Dataset):
    def __init__(self, source_texts, target_texts, tokenizer, max_source_length, max_target_length):
        self.source_texts = source_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.source_texts)

    def __getitem__(self, idx):
        source = self.source_texts[idx]
        target = self.target_texts[idx]

        # Remove the "code2comment :" prefix if it exists
        if source.startswith("code2comment :"):
            source = source[len("code2comment :"):]

        # T5 input format: "summarize: " + source
        source_text = f"review code: {source}"

        source_encodings = self.tokenizer(
            source_text,
            max_length=self.max_source_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        target_encodings = self.tokenizer(
            target,
            max_length=self.max_target_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        input_ids = source_encodings["input_ids"].squeeze()
        attention_mask = source_encodings["attention_mask"].squeeze()
        labels = target_encodings["input_ids"].squeeze()
        # Replace padding token id's with -100 so they're ignored in the loss
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

In [9]:
def load_data(source_path, target_path):
    """Load source and target files"""
    with open(source_path, 'r', encoding='utf-8') as f:
        source_texts = f.readlines()

    with open(target_path, 'r', encoding='utf-8') as f:
        target_texts = f.readlines()

    # Remove trailing newlines and create dataframe
    source_texts = [text.strip() for text in source_texts]
    target_texts = [text.strip() for text in target_texts]

    assert len(source_texts) == len(target_texts), "Source and target files must have the same number of lines"

    data = pd.DataFrame({
        'source': source_texts,
        'target': target_texts
    })

    # Split into training and validation sets
    train_df, val_df = train_test_split(data, test_size=0.1, random_state=42)

    return train_df, val_df

def compute_metrics(predictions, labels):
    """Compute evaluation metrics with proper handling of variable-length sequences"""
    # Decode the predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 with the pad_token_id
    labels_copy = []
    for label_seq in labels:
        label_seq = np.array(label_seq)
        label_seq_copy = label_seq.copy()
        label_seq_copy[label_seq_copy == -100] = tokenizer.pad_token_id
        labels_copy.append(label_seq_copy)

    decoded_labels = tokenizer.batch_decode(labels_copy, skip_special_tokens=True)

    # ROUGE metrics
    rouge = evaluate.load("rouge")
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Extract ROUGE metrics and convert to percentages
    result = {key: value * 100 for key, value in result.items()}

    # Add BLEU score
    bleu = evaluate.load("bleu")
    bleu_result = bleu.compute(predictions=decoded_preds, references=[[l] for l in decoded_labels])
    result["bleu"] = bleu_result["bleu"] * 100

    # Calculate exact match score
    exact_match = sum(pred == label for pred, label in zip(decoded_preds, decoded_labels)) / len(decoded_preds)
    result["exact_match"] = exact_match * 100

    # Add prediction samples for logging
    result["gen_len"] = np.mean([len(pred.split()) for pred in decoded_preds])
    result["sample_predictions"] = decoded_preds[:3]
    result["sample_labels"] = decoded_labels[:3]

    return result

In [10]:
def train():
    """Main training function"""
    # Load and prepare data
    print("Loading data...")
    train_df, val_df = load_data("source.txt", "target.txt")
    print(f"Training samples: {len(train_df)}, Validation samples: {len(val_df)}")

    # Load tokenizer and model
    print(f"Loading {config.model_name} model and tokenizer...")
    global tokenizer
    tokenizer = T5Tokenizer.from_pretrained(config.model_name)
    model = T5ForConditionalGeneration.from_pretrained(config.model_name)

    # Create datasets
    train_dataset = CodeReviewDataset(
        train_df["source"].tolist(),
        train_df["target"].tolist(),
        tokenizer,
        config.max_source_length,
        config.max_target_length
    )

    val_dataset = CodeReviewDataset(
        val_df["source"].tolist(),
        val_df["target"].tolist(),
        tokenizer,
        config.max_source_length,
        config.max_target_length
    )

    # Create dataloaders
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True
    )

    val_dataloader = DataLoader(
        val_dataset,
        batch_size=config.batch_size
    )

    # Set up optimizer and scheduler
    optimizer = AdamW(
        model.parameters(),
        lr=config.learning_rate,
        weight_decay=config.weight_decay
    )

    total_steps = len(train_dataloader) * config.train_epochs // config.gradient_accumulation_steps
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config.warmup_steps,
        num_training_steps=total_steps
    )

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    model = model.to(device)

    # Training loop
    print("Starting training...")
    global_step = 0
    best_rouge_l = 0

    for epoch in range(config.train_epochs):
        model.train()
        total_loss = 0
        progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{config.train_epochs}")

        for step, batch in enumerate(progress_bar):
            # Move batch to device
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Forward pass
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_loss += loss.item()

            # Loss scaling for gradient accumulation
            loss = loss / config.gradient_accumulation_steps
            loss.backward()

            # Update weights after gradient accumulation steps
            if (step + 1) % config.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1
                progress_bar.set_postfix({"loss": total_loss / (step + 1)})

                # Evaluate during training
                if global_step % config.eval_steps == 0:
                    eval_results = evaluate_model(model, val_dataloader, device)
                    print(f"Step {global_step}: {eval_results}")

                    # Save best model - fixed key name to use 'rougeL' instead of 'rouge_l'
                    if eval_results["rougeL"] > best_rouge_l:
                        best_rouge_l = eval_results["rougeL"]
                        print(f"New best ROUGE-L: {best_rouge_l:.2f}. Saving model...")
                        model.save_pretrained(os.path.join(config.output_dir, "best_model"))
                        tokenizer.save_pretrained(os.path.join(config.output_dir, "best_model"))

        # End of epoch evaluation
        eval_results = evaluate_model(model, val_dataloader, device)
        print(f"Epoch {epoch+1} - Evaluation results: {eval_results}")

    # Save final model
    print("Training complete. Saving final model...")
    model.save_pretrained(os.path.join(config.output_dir, "final_model"))
    tokenizer.save_pretrained(os.path.join(config.output_dir, "final_model"))

    return os.path.join(config.output_dir, "best_model")

In [11]:
def evaluate_model(model, dataloader, device):
    """Evaluate model on validation data"""
    model.eval()
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Generate predictions
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=config.max_target_length,
                num_beams=4,
                early_stopping=True
            )

            # Store predictions and labels properly
            all_predictions.extend(outputs.detach().cpu().tolist())
            all_labels.extend(labels.detach().cpu().tolist())

    # Compute metrics directly (don't use EvalPrediction class)
    metrics = compute_metrics(all_predictions, all_labels)

    return metrics

In [13]:
best_model_path = train()
print(f"Best model saved to {best_model_path}")

Loading data...
Training samples: 15102, Validation samples: 1678
Loading t5-small model and tokenizer...
Using device: cuda
Starting training...


Epoch 1/1:   0%|          | 0/7551 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/839 [00:00<?, ?it/s]

Step 500: {'rouge1': np.float64(6.571076609875481), 'rouge2': np.float64(1.603737142387896), 'rougeL': np.float64(5.850327399597972), 'rougeLsum': np.float64(5.837482196119019), 'bleu': 2.490217641157689, 'exact_match': 0.0, 'gen_len': np.float64(14.763408820023837), 'sample_predictions': ['okHttpClient okHttpClient = this.okHttpClient; if (stdin!= null)  okHttpClient = this.okHttpClient.newBuilder().addNetworkInterceptor(chain ->  Response response = chain.proceed(chain.request()); if (stdin!= null)', 'This.streamLog = streamLog; this.dataCache = Caffeine.newBuilder() + (int)(sizeOf.deepSizeOf(logData.getMetadataMap()))).removalListener(this::handleEviction).build(this::handleRetrieval);', 'RunMethod = null; try  runMethod = getClass().getMethod(name, (Class[]) null);  catch (NoSuchMethodException e)  fail("Method "" + name + "" not found");  catch (IllegalAccessException e)  e.fillInStackTrace(); throw e.fillInStackTrace(); throw'], 'sample_labels': ["doesn't okHttpClient.newBuilder 

Evaluating:   0%|          | 0/839 [00:00<?, ?it/s]

Step 1000: {'rouge1': np.float64(6.501722070141805), 'rouge2': np.float64(1.148095430630572), 'rougeL': np.float64(5.844463630892017), 'rougeLsum': np.float64(5.852071790971951), 'bleu': 1.7542938435220405, 'exact_match': 0.0, 'gen_len': np.float64(6.969606674612634), 'sample_predictions': ['@Override @SneakyThrows?', "I'm not sure if this is the case, but it's not the case.", ''], 'sample_labels': ["doesn't okHttpClient.newBuilder inherit parent's hostnameVerifier?", 'Add 8 bytes for the key', 'How about "TestCase name cannot be null"']}


Evaluating:   0%|          | 0/839 [00:00<?, ?it/s]

Step 1500: {'rouge1': np.float64(7.42952380363365), 'rouge2': np.float64(1.0438050448522578), 'rougeL': np.float64(6.708787058937848), 'rougeLsum': np.float64(6.692224887235482), 'bleu': 1.5688014333261027, 'exact_match': 0.0, 'gen_len': np.float64(7.6597139451728244), 'sample_predictions': ['okHttpClient okHttpClient', "I'm not sure if this is the case.", ''], 'sample_labels': ["doesn't okHttpClient.newBuilder inherit parent's hostnameVerifier?", 'Add 8 bytes for the key', 'How about "TestCase name cannot be null"']}
New best ROUGE-L: 6.71. Saving model...


Evaluating:   0%|          | 0/839 [00:00<?, ?it/s]

Step 2000: {'rouge1': np.float64(8.5400587059089), 'rouge2': np.float64(0.9782323069058706), 'rougeL': np.float64(7.500146976975117), 'rougeLsum': np.float64(7.496655124935259), 'bleu': 1.2451351096448793, 'exact_match': 0.0, 'gen_len': np.float64(9.17103694874851), 'sample_predictions': ['okHttpClient okHttpClient', "I'm not sure if it's a config.getMaxCacheSize?", ''], 'sample_labels': ["doesn't okHttpClient.newBuilder inherit parent's hostnameVerifier?", 'Add 8 bytes for the key', 'How about "TestCase name cannot be null"']}
New best ROUGE-L: 7.50. Saving model...


Evaluating:   0%|          | 0/839 [00:00<?, ?it/s]

Step 2500: {'rouge1': np.float64(8.78027339418123), 'rouge2': np.float64(0.8010573838889528), 'rougeL': np.float64(7.710805660372482), 'rougeLsum': np.float64(7.722351475033701), 'bleu': 0.9667570739791038, 'exact_match': 0.0, 'gen_len': np.float64(9.690703218116806), 'sample_predictions': ['okHttpClient okHttpClient', "I'm not sure if this is the case, but it's not the case.", 'Is there a way to use e.fillInStackTrace?'], 'sample_labels': ["doesn't okHttpClient.newBuilder inherit parent's hostnameVerifier?", 'Add 8 bytes for the key', 'How about "TestCase name cannot be null"']}
New best ROUGE-L: 7.71. Saving model...


Evaluating:   0%|          | 0/839 [00:00<?, ?it/s]

Step 3000: {'rouge1': np.float64(8.863372250023977), 'rouge2': np.float64(0.8288356831941386), 'rougeL': np.float64(7.853465825027158), 'rougeLsum': np.float64(7.870449356824527), 'bleu': 0.9944942359831815, 'exact_match': 0.0, 'gen_len': np.float64(9.781883194278903), 'sample_predictions': ['@Override @SneakyThrows', "I'm not sure if it's a config.getMaxCacheSize?", 'I think this is a good way to use e.fillInStackTrace()'], 'sample_labels': ["doesn't okHttpClient.newBuilder inherit parent's hostnameVerifier?", 'Add 8 bytes for the key', 'How about "TestCase name cannot be null"']}
New best ROUGE-L: 7.85. Saving model...


Evaluating:   0%|          | 0/839 [00:00<?, ?it/s]

Step 3500: {'rouge1': np.float64(8.83290479301042), 'rouge2': np.float64(0.8666239861151168), 'rougeL': np.float64(7.76703040627639), 'rougeLsum': np.float64(7.775696794136798), 'bleu': 1.1607499186522807, 'exact_match': 0.0, 'gen_len': np.float64(9.178784266984506), 'sample_predictions': ['okHttpClient okHttpClient', "I'm not sure if it's a config.getMaxCacheSize?", 'Is this a good idea?'], 'sample_labels': ["doesn't okHttpClient.newBuilder inherit parent's hostnameVerifier?", 'Add 8 bytes for the key', 'How about "TestCase name cannot be null"']}


Evaluating:   0%|          | 0/839 [00:00<?, ?it/s]

Epoch 1 - Evaluation results: {'rouge1': np.float64(8.892240437588631), 'rouge2': np.float64(0.8529467081251727), 'rougeL': np.float64(7.83395023798045), 'rougeLsum': np.float64(7.848106844810386), 'bleu': 1.0663637292445702, 'exact_match': 0.0, 'gen_len': np.float64(9.077473182359952), 'sample_predictions': ['okHttpClient', "I'm not sure if it's a config.getMaxCacheSize?", 'I think this is a good idea.'], 'sample_labels': ["doesn't okHttpClient.newBuilder inherit parent's hostnameVerifier?", 'Add 8 bytes for the key', 'How about "TestCase name cannot be null"']}
Training complete. Saving final model...
Best model saved to ./models/t5_code_review/best_model
