In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW, BertForMaskedLM, BertForPreTraining, BertTokenizer, EncoderDecoderModel

In [None]:
# Mount Google Drive at /content/drive/ so files can be read through normal paths.
from google.colab import drive
drive.mount('/content/drive/')
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and build a PyDrive client from the
# application-default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): this rebinds `drive`, shadowing the `google.colab.drive` module
# imported at the top of the cell; from here on `drive.mount(...)` is no longer
# available unless the module is re-imported. The dataset cell reads the CSV
# through `project_path` on the mounted drive, so this client may be unused —
# confirm before removing.
drive = GoogleDrive(gauth)
# Base folder on the mounted Drive; the dataset path is built by appending to it.
project_path='/content/drive/My Drive/Colab Notebooks/'

In [None]:
# Read the sample dataset from the Drive project folder and show its column names
dataset_csv = project_path + 'Data/Data_sample.csv'
df = pd.read_csv(dataset_csv)
df.columns

In [None]:
# Model inputs (one text Series per experiment) and the release notes used as targets.
#
# Selecting several columns requires a *list* of labels: `df['a', 'b', 'c']`
# is a lookup of the single tuple key ('a', 'b', 'c') and raises KeyError.
combined_columns = ['pull_request_titles', 'commit_msgs', 'issue_titles']
# One string per row: the three text fields joined with a space. Missing values
# become '' so that str.join never receives NaN.
input_text_combine = df[combined_columns].fillna('').astype(str).agg(' '.join, axis=1)
release_data = df['release_content']
input_text_pr = df['pull_request_titles']
input_text_cm = df['commit_msgs']
input_text_issue = df['issue_titles']

In [None]:
# Load the pre-trained BERT tokenizer and a BERT2BERT encoder-decoder model.
#
# A sequence-to-sequence model is required here: the cells below fine-tune with
# `labels=` and then call `model.generate(...)`. `BertForPreTraining` only
# returns a loss when `next_sentence_label` is supplied as well, and it cannot
# be used with `generate()`.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased')
# BERT has no dedicated BOS/EOS tokens: decoding starts from [CLS] and stops at
# [SEP]. The start and pad ids are also needed to build decoder inputs from labels.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.sep_token_id

In [None]:
# Fine-tune BERT2BERT to turn the combined PR titles, commit messages and issue
# titles of a release into its release notes, then generate notes for one sample.

# Hugging Face tokenizers accept str / list[str], not pandas objects.
if isinstance(input_text_combine, pd.DataFrame):
    # Several columns were selected: join them into one string per row.
    source_texts = input_text_combine.fillna('').astype(str).agg(' '.join, axis=1).tolist()
else:
    source_texts = input_text_combine.fillna('').astype(str).tolist()
target_texts = release_data.fillna('').astype(str).tolist()
if not source_texts:
    raise ValueError("The dataset is empty: nothing to train on.")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Start from fresh pre-trained weights so this experiment is not influenced by
# fine-tuning done in other cells. An encoder-decoder is needed because
# BertForPreTraining returns loss=None without `next_sentence_label` and cannot
# be used with generate().
model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased')
# BERT has no BOS token: decoding starts from [CLS]. Both ids are required to
# derive the decoder inputs from `labels`.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.to(device)

# Tokenize the inputs (encoder side) and the release notes (decoder targets).
source_encodings = tokenizer(source_texts, padding=True, truncation=True, return_tensors="pt")
tokenized_data = tokenizer(target_texts, padding=True, truncation=True, return_tensors="pt")
# Padding positions must not contribute to the loss; -100 is the ignored label id.
labels = tokenized_data["input_ids"].masked_fill(tokenized_data["attention_mask"] == 0, -100)
# Iterate over aligned tensors; a DataFrame cannot be indexed by row number by DataLoader.
train_dataset = torch.utils.data.TensorDataset(
    source_encodings["input_ids"], source_encodings["attention_mask"], labels
)

# Define training parameters
# (transformers.AdamW is deprecated in favour of torch.optim.AdamW.)
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
epochs = 3

# Training loop
model.train()
for epoch in range(epochs):
    for batch in DataLoader(train_dataset, batch_size=2, shuffle=True):
        batch_input_ids, batch_attention_mask, batch_labels = (t.to(device) for t in batch)
        optimizer.zero_grad()
        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Generate release notes for the first sample (encode() takes one text, not a column).
model.eval()
sample = tokenizer(source_texts[0], truncation=True, return_tensors="pt").to(device)
input_ids = sample["input_ids"]
with torch.no_grad():
    output = model.generate(
        input_ids,
        attention_mask=sample["attention_mask"],
        max_length=50,
        num_return_sequences=1,
        decoder_start_token_id=tokenizer.cls_token_id,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.sep_token_id,
    )
generated_notes = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated Release Notes:", generated_notes)

In [None]:
# Fine-tune BERT2BERT to turn the pull-request titles of a release into its
# release notes, then generate notes for one sample.

# Hugging Face tokenizers accept str / list[str], not pandas objects.
source_texts = input_text_pr.fillna('').astype(str).tolist()
target_texts = release_data.fillna('').astype(str).tolist()
if not source_texts:
    raise ValueError("The dataset is empty: nothing to train on.")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Start from fresh pre-trained weights so this experiment is not influenced by
# fine-tuning done in other cells. An encoder-decoder is needed because
# BertForPreTraining returns loss=None without `next_sentence_label` and cannot
# be used with generate().
model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased')
# BERT has no BOS token: decoding starts from [CLS]. Both ids are required to
# derive the decoder inputs from `labels`.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.to(device)

# Tokenize the PR titles (encoder side) and the release notes (decoder targets).
source_encodings = tokenizer(source_texts, padding=True, truncation=True, return_tensors="pt")
tokenized_data = tokenizer(target_texts, padding=True, truncation=True, return_tensors="pt")
# Padding positions must not contribute to the loss; -100 is the ignored label id.
labels = tokenized_data["input_ids"].masked_fill(tokenized_data["attention_mask"] == 0, -100)
# Iterate over aligned tensors; a DataFrame cannot be indexed by row number by DataLoader.
train_dataset = torch.utils.data.TensorDataset(
    source_encodings["input_ids"], source_encodings["attention_mask"], labels
)

# Define training parameters
# (transformers.AdamW is deprecated in favour of torch.optim.AdamW.)
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
epochs = 3

# Training loop
model.train()
for epoch in range(epochs):
    for batch in DataLoader(train_dataset, batch_size=2, shuffle=True):
        batch_input_ids, batch_attention_mask, batch_labels = (t.to(device) for t in batch)
        optimizer.zero_grad()
        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Generate release notes for the first sample (encode() takes one text, not a column).
model.eval()
sample = tokenizer(source_texts[0], truncation=True, return_tensors="pt").to(device)
input_ids = sample["input_ids"]
with torch.no_grad():
    output = model.generate(
        input_ids,
        attention_mask=sample["attention_mask"],
        max_length=50,
        num_return_sequences=1,
        decoder_start_token_id=tokenizer.cls_token_id,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.sep_token_id,
    )
generated_notes_based_on_pr = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated Release Notes:", generated_notes_based_on_pr)

In [None]:
# Fine-tune BERT2BERT to turn the commit messages of a release into its release
# notes (reporting the summed loss per epoch), then generate notes for one sample.

# Hugging Face tokenizers accept str / list[str], not pandas objects.
source_texts = input_text_cm.fillna('').astype(str).tolist()
target_texts = release_data.fillna('').astype(str).tolist()
if not source_texts:
    raise ValueError("The dataset is empty: nothing to train on.")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Start from fresh pre-trained weights so this experiment is not influenced by
# fine-tuning done in other cells. An encoder-decoder is needed because
# BertForPreTraining returns loss=None without `next_sentence_label` and cannot
# be used with generate().
model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased')
# BERT has no BOS token: decoding starts from [CLS]. Both ids are required to
# derive the decoder inputs from `labels`.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.to(device)

# Tokenize the commit messages (encoder side) and the release notes (decoder targets).
source_encodings = tokenizer(source_texts, padding=True, truncation=True, return_tensors="pt")
tokenized_data = tokenizer(target_texts, padding=True, truncation=True, return_tensors="pt")
# Padding positions must not contribute to the loss; -100 is the ignored label id.
labels = tokenized_data["input_ids"].masked_fill(tokenized_data["attention_mask"] == 0, -100)
# Iterate over aligned tensors; a DataFrame cannot be indexed by row number by DataLoader.
train_dataset = torch.utils.data.TensorDataset(
    source_encodings["input_ids"], source_encodings["attention_mask"], labels
)

# Define training parameters
# (transformers.AdamW is deprecated in favour of torch.optim.AdamW.)
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
epochs = 3

# Training loop
model.train()
for epoch in range(epochs):
    total_loss = 0
    for batch in DataLoader(train_dataset, batch_size=2, shuffle=True):
        batch_input_ids, batch_attention_mask, batch_labels = (t.to(device) for t in batch)
        # Gradients are cleared once per step; a second zero_grad() after step() is redundant.
        optimizer.zero_grad()
        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss}")

# Generate release notes for the first sample (encode() takes one text, not a column).
model.eval()
sample = tokenizer(source_texts[0], truncation=True, return_tensors="pt").to(device)
input_ids = sample["input_ids"]
with torch.no_grad():
    output = model.generate(
        input_ids,
        attention_mask=sample["attention_mask"],
        max_length=50,
        num_return_sequences=1,
        decoder_start_token_id=tokenizer.cls_token_id,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.sep_token_id,
    )
generated_notes_cm = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated Release Notes:", generated_notes_cm)

In [None]:
# Fine-tune BERT2BERT to turn the issue titles of a release into its release
# notes, then generate notes for one sample.

# Hugging Face tokenizers accept str / list[str], not pandas objects.
source_texts = input_text_issue.fillna('').astype(str).tolist()
target_texts = release_data.fillna('').astype(str).tolist()
if not source_texts:
    raise ValueError("The dataset is empty: nothing to train on.")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Start from fresh pre-trained weights so this experiment is not influenced by
# fine-tuning done in other cells. An encoder-decoder is needed because
# BertForPreTraining returns loss=None without `next_sentence_label` and cannot
# be used with generate().
model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased')
# BERT has no BOS token: decoding starts from [CLS]. Both ids are required to
# derive the decoder inputs from `labels`.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.to(device)

# Tokenize the issue titles (encoder side) and the release notes (decoder targets).
source_encodings = tokenizer(source_texts, padding=True, truncation=True, return_tensors="pt")
tokenized_data = tokenizer(target_texts, padding=True, truncation=True, return_tensors="pt")
# Padding positions must not contribute to the loss; -100 is the ignored label id.
labels = tokenized_data["input_ids"].masked_fill(tokenized_data["attention_mask"] == 0, -100)
# Iterate over aligned tensors; a DataFrame cannot be indexed by row number by DataLoader.
train_dataset = torch.utils.data.TensorDataset(
    source_encodings["input_ids"], source_encodings["attention_mask"], labels
)

# Define training parameters
# (transformers.AdamW is deprecated in favour of torch.optim.AdamW.)
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
epochs = 3

# Training loop
model.train()
for epoch in range(epochs):
    for batch in DataLoader(train_dataset, batch_size=2, shuffle=True):
        batch_input_ids, batch_attention_mask, batch_labels = (t.to(device) for t in batch)
        optimizer.zero_grad()
        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Generate release notes for the first sample (encode() takes one text, not a column).
model.eval()
sample = tokenizer(source_texts[0], truncation=True, return_tensors="pt").to(device)
input_ids = sample["input_ids"]
with torch.no_grad():
    output = model.generate(
        input_ids,
        attention_mask=sample["attention_mask"],
        max_length=50,
        num_return_sequences=1,
        decoder_start_token_id=tokenizer.cls_token_id,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.sep_token_id,
    )
generated_notes_issue = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated Release Notes:", generated_notes_issue)