In [None]:
pip install transformers datasets



In [None]:
from datasets import load_dataset

# Load the corpus and print its split layout.
dataset = load_dataset("samaxr/code-summary-java")
print(dataset)

# Keep only rows 0..199 of the training split.
train_data = dataset["train"]
subset = train_data.select(range(200))

# Walk the subset row by row. Nothing is collected inside the loop, so once it
# finishes `code` and `summary` simply hold the fields of the last row visited.
for item in subset:
    code, summary = item["code"], item["summary"]

DatasetDict({
    train: Dataset({
        features: ['code', 'summary'],
        num_rows: 285670
    })
    validation: Dataset({
        features: ['code', 'summary'],
        num_rows: 31741
    })
    test: Dataset({
        features: ['code', 'summary'],
        num_rows: 79352
    })
})


In [None]:

import torch
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer
from tqdm.auto import tqdm
from datasets import load_dataset

# Load the corpus, print its split layout, and keep the first 50 training rows.
dataset = load_dataset("samaxr/code-summary-java")
print(dataset)
train_data = dataset["train"]
subset = train_data.select(range(50))

# Checkpoint name shared by the tokenizer and the model.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Place the model on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# Define dataset class
class CodeSummaryDataset(Dataset):
    """Map-style dataset that tokenizes code/summary records for seq2seq training.

    The code snippet is the encoder input and the summary is the decoder
    target. The summary is deliberately kept out of the encoder input: if the
    model can read the target in its input, it only learns to copy it.

    Args:
        data: Indexable collection of records with 'code' and 'summary' keys.
        tokenizer: Tokenizer used for both the source and the target text.
        max_length: Padded/truncated length of the encoder input.
        max_target_length: Padded/truncated length of the target summary.
    """

    def __init__(self, data, tokenizer, max_length=512, max_target_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and 'labels' tensors."""
        entry = self.data[idx]
        source = self.tokenizer(
            f"code: {entry['code']}",
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        target = self.tokenizer(
            entry['summary'],
            max_length=self.max_target_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        target_ids = target["input_ids"].squeeze(0)
        # -100 is the index the loss ignores, so padding does not contribute.
        labels = target_ids.masked_fill(target_ids == self.tokenizer.pad_token_id, -100)
        return {
            "input_ids": source["input_ids"].squeeze(0),
            "attention_mask": source["attention_mask"].squeeze(0),
            "labels": labels,
        }

# Train on the real examples selected above rather than on placeholder strings.
# `train_data` is rebound to the small subset so that any later cell reading
# `train_data` also sees a small collection of real code/summary records.
train_data = subset

# Prepare training data
train_dataset = CodeSummaryDataset(train_data, tokenizer)

# Define training parameters
batch_size = 8
epochs = 10
accumulation_steps = 4  # Optimizer steps once every 4 mini-batches

# Define training loop
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    tqdm_dataloader = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}")

    for i, batch in enumerate(tqdm_dataloader):
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        # The attention mask keeps the encoder from attending to padding;
        # decoder inputs are derived from `labels` by the model itself.
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"],
        )
        loss = outputs.loss

        # Log the unscaled loss; only the value sent to backward() is divided,
        # otherwise the reported average is too small by `accumulation_steps`.
        total_loss += loss.item()
        (loss / accumulation_steps).backward()

        # Update parameters after accumulating gradients for specified steps
        # (and on the last batch so no gradients are left unapplied).
        if (i + 1) % accumulation_steps == 0 or i == len(train_dataloader) - 1:
            optimizer.step()
            optimizer.zero_grad()

    tqdm_dataloader.close()
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}")

# Save the trained model together with its tokenizer, so the directory can be
# loaded later with both T5ForConditionalGeneration and T5Tokenizer.
model.save_pretrained("code_summary_model")
tokenizer.save_pretrained("code_summary_model")

DatasetDict({
    train: Dataset({
        features: ['code', 'summary'],
        num_rows: 285670
    })
    validation: Dataset({
        features: ['code', 'summary'],
        num_rows: 31741
    })
    test: Dataset({
        features: ['code', 'summary'],
        num_rows: 79352
    })
})


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Epoch 1/10:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 1/10, Average Loss: 4.1107


Epoch 2/10:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 2/10, Average Loss: 1.3855


Epoch 3/10:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 3/10, Average Loss: 1.8745


Epoch 4/10:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 4/10, Average Loss: 1.1039


Epoch 5/10:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 5/10, Average Loss: 0.9704


Epoch 6/10:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 6/10, Average Loss: 1.0372


Epoch 7/10:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 7/10, Average Loss: 0.6611


Epoch 8/10:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 8/10, Average Loss: 0.8293


Epoch 9/10:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 9/10, Average Loss: 0.8053


Epoch 10/10:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 10/10, Average Loss: 0.7375


In [None]:
import torch
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer
from tqdm.auto import tqdm
from datasets import load_dataset


# Checkpoint name shared by the tokenizer and the model.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Place the model on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# Exclude every parameter of `model.shared` from gradient updates.
# NOTE(review): this freezes the whole module, not only the entries that
# belong to special tokens.
for embedding_param in model.shared.parameters():
    embedding_param.requires_grad = False

# Define dataset class
class CodeSummaryDataset(Dataset):
    """Map-style dataset that tokenizes code/summary records for seq2seq training.

    The code snippet is the encoder input and the summary is the decoder
    target. The summary is deliberately kept out of the encoder input: if the
    model can read the target in its input, it only learns to copy it.

    Args:
        data: Indexable collection of records with 'code' and 'summary' keys.
        tokenizer: Tokenizer used for both the source and the target text.
        max_length: Padded/truncated length of the encoder input.
        max_target_length: Padded/truncated length of the target summary.
    """

    def __init__(self, data, tokenizer, max_length=512, max_target_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and 'labels' tensors."""
        entry = self.data[idx]
        source = self.tokenizer(
            f"code: {entry['code']}",
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        target = self.tokenizer(
            entry['summary'],
            max_length=self.max_target_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        target_ids = target["input_ids"].squeeze(0)
        # -100 is the index the loss ignores, so padding does not contribute.
        labels = target_ids.masked_fill(target_ids == self.tokenizer.pad_token_id, -100)
        return {
            "input_ids": source["input_ids"].squeeze(0),
            "attention_mask": source["attention_mask"].squeeze(0),
            "labels": labels,
        }


# Prepare training data
# The examples are loaded here so this cell does not depend on whatever
# `train_data` an earlier cell happened to leave in the kernel.
train_data = load_dataset("samaxr/code-summary-java")["train"].select(range(50))
train_dataset = CodeSummaryDataset(train_data, tokenizer)

# Define training parameters
batch_size = 8
epochs = 10
accumulation_steps = 4  # Optimizer steps once every 4 mini-batches

# Define training loop
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    tqdm_dataloader = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}")

    for i, batch in enumerate(tqdm_dataloader):
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        # The attention mask keeps the encoder from attending to padding;
        # decoder inputs are derived from `labels` by the model itself.
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"],
        )
        loss = outputs.loss

        # Log the unscaled loss; only the value sent to backward() is divided,
        # otherwise the reported average is too small by `accumulation_steps`.
        total_loss += loss.item()
        (loss / accumulation_steps).backward()

        # Update parameters after accumulating gradients for specified steps
        # (and on the last batch so no gradients are left unapplied).
        if (i + 1) % accumulation_steps == 0 or i == len(train_dataloader) - 1:
            optimizer.step()
            optimizer.zero_grad()

    tqdm_dataloader.close()
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}")

# Save the trained model together with its tokenizer, so the directory can be
# loaded later with both T5ForConditionalGeneration and T5Tokenizer.
model.save_pretrained("code_summary_model")
tokenizer.save_pretrained("code_summary_model")



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Epoch 1/10:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 1/10, Average Loss: 4.9054


Epoch 2/10:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 2/10, Average Loss: 1.4791


Epoch 3/10:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 3/10, Average Loss: 0.9622


Epoch 4/10:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 4/10, Average Loss: 1.4851


Epoch 5/10:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 5/10, Average Loss: 0.9616


Epoch 6/10:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 6/10, Average Loss: 0.5953


Epoch 7/10:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 7/10, Average Loss: 0.5959


Epoch 8/10:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 8/10, Average Loss: 0.8727


Epoch 9/10:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 9/10, Average Loss: 0.6155


Epoch 10/10:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 10/10, Average Loss: 0.5599


In [None]:
!pip install rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=8da128ebb1a115f7d8ea858bb80cd3ae062c63b539819b50c6ddbd709cbd4fe8
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
!pip install sacrebleu


Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/106.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m61.4/106.7 kB[0m [31m1.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.8.2 sacrebleu-2.4.2


In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import load_dataset, load_metric

# Load dataset
dataset = load_dataset("samaxr/code-summary-java")
train_data = dataset["train"]

# Load ROUGE and BLEU metrics
rouge_metric = load_metric("rouge")
bleu_metric = load_metric("sacrebleu")

# Load the saved model
model_path = "code_summary_model"
try:
    tokenizer = T5Tokenizer.from_pretrained(model_path)
except OSError:
    # The checkpoint directory may hold model files without tokenizer files.
    # Training in this notebook never modifies the tokenizer, so the base
    # checkpoint's tokenizer is used instead of aborting here.
    print(f"No tokenizer found in '{model_path}'; falling back to the 't5-small' tokenizer.")
    tokenizer = T5Tokenizer.from_pretrained("t5-small")
# A missing or invalid checkpoint directory still raises here, which is the
# right place for that error: without the weights there is nothing to score.
model = T5ForConditionalGeneration.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define a function to preprocess the data
def preprocess_data(data, tokenizer):
    """Build model prompts and reference summaries from code/summary records.

    Args:
        data: Iterable of mappings with 'code' and 'summary' keys.
        tokenizer: Not used; kept so existing call sites keep working.

    Returns:
        A tuple ``(inputs, references)`` of parallel lists: one prompt string
        per record and the matching reference summary.
    """
    inputs = []
    references = []
    for entry in data:
        # Only the code goes into the prompt. Putting the reference summary in
        # the model input would hand the answer to the model being scored.
        inputs.append(f"code: {entry['code']}")
        references.append(entry['summary'])
    return inputs, references

# Preprocess the data
# Score a bounded slice of the held-out test split. Generating a summary for
# every row of the training split one at a time is impractical, and scores on
# training rows would not reflect performance on unseen code.
EVAL_SAMPLES = 100
eval_data = dataset["test"].select(range(EVAL_SAMPLES))
inputs, references = preprocess_data(eval_data, tokenizer)

# Calculate metrics
def calculate_metrics(model, tokenizer, inputs, references, device):
    """Generate a summary for each input and score the results with ROUGE and BLEU.

    Uses the module-level ``rouge_metric`` and ``bleu_metric`` objects.

    Args:
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer used to encode inputs and decode generated ids.
        inputs: List of prompt strings.
        references: List of reference summaries, parallel to ``inputs``.
        device: Device the encoded inputs are moved to before generation.

    Returns:
        A tuple ``(rouge_scores, bleu_score)`` as returned by the two metrics.
    """
    predictions = []
    for input_text in inputs:
        # One sequence at a time, so no padding is needed; the attention mask
        # is passed explicitly rather than left for generate() to infer.
        encoded = tokenizer(input_text, max_length=512, truncation=True, return_tensors="pt").to(device)
        output_ids = model.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            max_length=150,
            num_beams=2,
            early_stopping=True,
        )
        predictions.append(tokenizer.decode(output_ids[0], skip_special_tokens=True))

    rouge_scores = rouge_metric.compute(predictions=predictions, references=references)
    # sacrebleu expects a list of reference lists (one list per prediction),
    # so each single reference string is wrapped in its own list.
    bleu_score = bleu_metric.compute(
        predictions=predictions,
        references=[[reference] for reference in references],
    )

    return rouge_scores, bleu_score

# Run generation and scoring over the prepared inputs.
rouge_scores, bleu_score = calculate_metrics(model, tokenizer, inputs, references, device)

# Report the mid ROUGE-L F-measure and the BLEU score, four decimals each.
rouge_l_f1 = rouge_scores['rougeL'].mid.fmeasure
bleu_value = bleu_score['score']
print(f"ROUGE: {rouge_l_f1:.4f}, BLEU: {bleu_value:.4f}")



You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


OSError: code_summary_model is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`