In [1]:
from datasets import load_dataset, load_from_disk
from transformers import RobertaForMaskedLM, Trainer, TrainingArguments
import torch
torch.cuda.empty_cache()

In [2]:
import os

# Point the Hugging Face `datasets` cache at a project-local folder.
# NOTE(review): `datasets` was already imported in the first cell, so this may be
# read too late to take effect; the explicit `cache_dir=` passed to `load_dataset`
# further down does not depend on it — confirm.
os.environ['HF_DATASETS_CACHE'] = './Datasets'
# `!export VAR=...` runs in a throw-away subshell and never reaches the kernel
# process, so the variable is set on this process's own environment instead.
# NOTE(review): should be in place before the first MPS allocation — confirm.
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'

In [3]:
from datasets import load_dataset
# Load the Python configuration of CodeSearchNet; downloads are cached under ./Datasets.
dataset_python = load_dataset('code_search_net', 'python', trust_remote_code=True, cache_dir='./Datasets')

# A second language (JavaScript) was tried here and is currently disabled:
# dataset_javascript = load_dataset('code_search_net', 'javascript', trust_remote_code=True)

# Training split. Only Python is loaded, so nothing is actually combined here.
train_dataset = dataset_python['train']
# NOTE(review): later cells refer to `combined_dataset`, but the only assignment to
# that name is this disabled (and syntactically unbalanced) line:
# combined_dataset = [combined_dataset, dataset_javascript['train']])

# Held-out test split.
test_dataset = dataset_python['test']

# Validation split.
val_dataset = dataset_python['validation']

# NOTE(review): a "display the first example" step was announced here, but no code follows.

In [4]:
val_dataset

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 23107
})

In [5]:
import os

# Read the Hugging Face access token from the environment; a literal token must
# never live in a notebook. The token that was hard-coded here has been exposed
# and should be revoked. `None` (variable unset) means anonymous access.
token = os.environ.get("HF_TOKEN")
# NOTE(review): the load report printed under this cell lists the `lm_head.*`
# weights as newly initialised, i.e. the masked-LM head starts untrained with
# this checkpoint — confirm that is acceptable for the intended task.
model = RobertaForMaskedLM.from_pretrained("microsoft/codebert-base", use_auth_token=token, cache_dir = "./Models")

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Move the model's parameters to the chosen device, then show the choice.
model.to(device)
device

device(type='cpu')

In [7]:
from transformers import AutoTokenizer

# Load the tokenizer for your model (e.g., CodeBERT)
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

# Tokenization helper for the source-code column
def tokenize_function(examples):
    """Tokenize `examples['func_code_string']` with the module-level `tokenizer`.

    Every sequence is truncated and padded to exactly 512 tokens.
    Returns whatever the tokenizer call returns.
    """
    code_strings = examples['func_code_string']
    encoding_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 512,
    }
    return tokenizer(code_strings, **encoding_options)


In [8]:
from transformers import Trainer, TrainingArguments

# Define training arguments
# NOTE(review): a later cell rebinds `training_args` with different values
# (output_dir './results', learning_rate 2e-5, gradient accumulation), so these
# settings only apply to the Trainer built directly below.
training_args = TrainingArguments(
    output_dir="output",          # Directory to save model outputs
    evaluation_strategy="epoch",  # Evaluate once per epoch
    learning_rate=5e-5,           # Learning rate
    per_device_train_batch_size=4,  # Batch size for training
    per_device_eval_batch_size=4,  # Batch size for evaluation
    num_train_epochs=3,           # Number of training epochs
    weight_decay=0.01,            # Strength of weight decay
)

# Initialize the Trainer
# NOTE(review): at this point `train_dataset` and `val_dataset` are still the raw
# splits taken from `dataset_python`; the tokenizing `map` only happens in a later
# cell, so this Trainer is not given tokenized data.
trainer = Trainer(
    model=model,                 # Model loaded with RobertaForMaskedLM.from_pretrained
    args=training_args,         # Training arguments defined just above
    train_dataset=train_dataset,  # Raw (untokenized) training split
    eval_dataset=val_dataset,     # Raw (untokenized) validation split
)




In [9]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Checkpoints and outputs go here.
    output_dir='./results',
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    # 4 examples per device per step, accumulated over 4 steps.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=4,
    # Run evaluation once per epoch.
    evaluation_strategy='epoch',
    # False switches off the Trainer's unused-column check.
    remove_unused_columns=False,
)


In [10]:
def tokenize_function(examples):
    """Tokenize code as model input and docstrings as labels.

    Args:
        examples: mapping with `func_code_string` and
            `func_documentation_string` entries, as passed to the tokenizer.

    Returns:
        dict with `input_ids` and `attention_mask` taken from the code encoding
        and `labels` taken from the docstring encoding. Every sequence is
        padded/truncated to 512 tokens; padding positions in `labels` are -100.
    """
    shared_options = dict(padding='max_length', truncation=True, max_length=512)

    input_encodings = tokenizer(examples['func_code_string'], **shared_options)
    label_encodings = tokenizer(examples['func_documentation_string'], **shared_options)

    # Padding must not contribute to the loss. -100 is the index the Hugging Face
    # model losses ignore, so every position the label attention mask marks as
    # padding is replaced with it (previously the raw pad id was kept as a target).
    # NOTE(review): a masked-LM head compares labels with predictions position by
    # position, so docstring tokens used as labels line up with unrelated code
    # tokens — confirm this training objective is what is intended.
    labels = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(label_encodings['input_ids'], label_encodings['attention_mask'])
    ]

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': labels,
    }


In [11]:
# Tokenize both splits. `remove_columns` drops the original columns inside the
# same `map` pass, so only the columns returned by `tokenize_function` remain and
# no intermediate dataset holding both the raw text and the token ids is created.
tokenized_train_data = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names,
)
tokenized_val_data = val_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=val_dataset.column_names,
)

In [12]:
device

device(type='cpu')

In [13]:
from torch.utils.data import DataLoader
import torch

def custom_collate_fn(batch):
    """Stack a list of example dicts into one dict of tensors on `device`.

    Each item of `batch` must provide `input_ids`, `attention_mask` and `labels`.
    The returned dict has the same three keys; each value is a tensor built from
    the per-example lists and moved to the module-level `device`.
    """
    field_names = ('input_ids', 'attention_mask', 'labels')
    # Gather every field across the batch first, then convert to tensors.
    gathered = {name: [example[name] for example in batch] for name in field_names}

    collated = {}
    for name in field_names:
        collated[name] = torch.tensor(gathered[name]).to(device)
    return collated


In [14]:
# Creating the DataLoader with the custom collate function (batches of 16 examples).
# NOTE(review): none of the Trainer(...) calls in this notebook receive
# `train_dataloader`, so it appears to be used for manual inspection only — confirm.
train_dataloader = DataLoader(tokenized_train_data, batch_size=16, collate_fn=custom_collate_fn)


In [15]:
# Check the first few examples in your tokenized data
print(tokenized_train_data[:3])  # Adjust slicing based on your data structure


{'input_ids': [[0, 9232, 1606, 7823, 5865, 3181, 1640, 13367, 6, 1437, 7823, 5865, 3181, 6, 48875, 5457, 7447, 3256, 50118, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 49434, 19186, 8845, 10, 47503, 7, 42, 7510, 18, 4576, 6, 8, 47505, 7, 70, 920, 35172, 25, 157, 4, 345, 16, 2628, 2368, 117, 240, 7, 486, 42, 2024, 6, 29198, 41030, 30, 4832, 119, 4774, 35, 12905, 44273, 12905, 49849, 50118, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 114, 1403, 4, 808, 35, 1403, 4, 808, 49371, 1437, 7823, 5865, 3181, 50118, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 114, 48875, 35, 50118, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 13, 364, 11, 1403, 35, 50118, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 860, 35, 50118, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 364, 4, 4917, 7823, 5865, 3181, 1640, 7823, 5865, 3181, 6, 48875, 43, 50118, 1437, 1437, 1437, 1437, 1437, 1437,

In [16]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Thin wrapper exposing an indexable collection through the torch `Dataset` base class."""

    def __init__(self, tokenized_data):
        """Keep a reference to `tokenized_data` on `self.data`; nothing is copied."""
        self.data = tokenized_data

    def __len__(self):
        """Number of items, delegated to the wrapped collection."""
        size = len(self.data)
        return size

    def __getitem__(self, idx):
        """Return the item the wrapped collection yields for `idx`."""
        record = self.data[idx]
        return record

# Wrap the tokenized training data in the CustomDataset defined above.
# NOTE(review): this rebinds `train_dataset`, which until now held the raw
# `dataset_python['train']` split. Cells that call `train_dataset.map(...)` or
# read `train_dataset.column_names` will fail if they are (re-)run after this
# one, because CustomDataset defines neither.
train_dataset = CustomDataset(tokenized_train_data)






In [17]:
# Build the Trainer on the tokenized data.
trainer = Trainer(
    model=model,  # Model loaded earlier in the notebook
    args=training_args,  # Training arguments
    train_dataset=train_dataset,  # CustomDataset wrapping tokenized_train_data
    # Evaluate on the tokenized validation data. `val_dataset` is the raw split
    # (string columns, no input_ids/labels), and `training_args` sets
    # remove_unused_columns=False, so passing it here would break at the first
    # end-of-epoch evaluation.
    eval_dataset=tokenized_val_data,
)

In [None]:

# Start training
trainer.train()

  0%|          | 0/77283 [00:00<?, ?it/s]

In [33]:
# NOTE(review): this cell duplicates the Trainer construction a few cells up;
# keep only one of the two.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # Tokenized validation data: the raw `val_dataset` split has only string
    # columns and cannot be collated into model inputs at evaluation time.
    eval_dataset=tokenized_val_data,
)

Duplicate cell

In [7]:
# Tokenize each split and discard the raw source-code column in one expression.
tokenized_train_data = (
    train_dataset
    .map(tokenize_function, batched=True)
    .remove_columns(['func_code_string'])
)
tokenized_val_data = (
    val_dataset
    .map(tokenize_function, batched=True)
    .remove_columns(['func_code_string'])
)


Map: 100%|██████████| 866629/866629 [03:34<00:00, 4035.90 examples/s]


In [None]:
# Define the tokenization function
def tokenize_function(examples):
    """Tokenize the input code and the corresponding documentation.

    Args:
        examples: mapping with `func_code_string` and
            `func_documentation_string` entries, as passed to the tokenizer.

    Returns:
        dict with `input_ids` / `attention_mask` from the code encoding and
        `labels` from the documentation encoding. Sequences are padded/truncated
        to 512 tokens; padding positions in `labels` are set to -100.
    """
    tokenizer_options = dict(padding='max_length', truncation=True, max_length=512)

    input_encodings = tokenizer(examples['func_code_string'], **tokenizer_options)
    doc_encodings = tokenizer(examples['func_documentation_string'], **tokenizer_options)

    # Replace label padding with -100, the index Hugging Face model losses ignore;
    # otherwise the pad id itself is trained on as a target at every padded position.
    labels = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(doc_encodings['input_ids'], doc_encodings['attention_mask'])
    ]

    # Return input encodings and labels
    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': labels,
    }

# Tokenize the datasets. The original columns are dropped by `map` itself via
# `remove_columns`, so the result only carries what `tokenize_function` returns
# and no second pass over the data is needed.
tokenized_train_data = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names,
)
tokenized_val_data = val_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=val_dataset.column_names,
)


In [None]:
from torch.utils.data import DataLoader

def custom_collate_fn(batch):
    """Turn a list of example dicts into a dict of stacked tensors.

    Every item of `batch` must carry `input_ids`, `attention_mask` and `labels`;
    the per-example lists are assumed to have equal length so they can be
    stacked. No device transfer happens here.
    """
    fields = ('input_ids', 'attention_mask', 'labels')
    columns = {name: [sample[name] for sample in batch] for name in fields}
    return {name: torch.tensor(values) for name, values in columns.items()}

# Batch the tokenized training data 16 examples at a time through custom_collate_fn.
# NOTE(review): this rebinds `train_dataloader`, which an earlier cell already
# created with the same arguments.
train_dataloader = DataLoader(tokenized_train_data, batch_size=16, collate_fn=custom_collate_fn)


In [None]:
print(dataset_python['train'].column_names)


In [None]:
from transformers import AutoTokenizer

import os

# Take the Hugging Face access token from the environment rather than from a
# literal in the notebook. The token previously written here is exposed and
# should be revoked. `None` (variable unset) means anonymous access.
token = os.environ.get("HF_TOKEN")
# NOTE(review): this rebinds the global `tokenizer`, replacing the one loaded
# from "microsoft/codebert-base" earlier; every later cell that uses `tokenizer`
# now gets the CodeGen vocabulary.
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-multi", use_auth_token=token, cache_dir = "./Models")


In [None]:
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably needed because the tokenizer loaded in the previous
# cell has no pad token of its own — confirm.
tokenizer.pad_token = tokenizer.eos_token
# Tokenize the datasets using the correct field name
def tokenize_function(example):
    """Tokenize `example['func_code_string']`, padded/truncated to 512 tokens.

    Returns the tokenizer's output unchanged. Redefines the earlier
    `tokenize_function`; unlike the code/docstring variant it produces no labels.
    """
    return tokenizer(example['func_code_string'], padding="max_length", truncation=True, max_length=512)

# Apply tokenization
# NOTE(review): `combined_dataset` is not assigned in any visible cell (its only
# mention is a commented-out line in the data-loading cell), so this raises
# NameError on a fresh kernel.
tokenized_datasets = combined_dataset.map(tokenize_function, batched=True)
# After tokenizing, save the dataset to a specific location
output_directory = './Datasets/tokenized'
tokenized_datasets.save_to_disk(output_directory)


In [None]:
# Define a preprocessing function to tokenize your dataset
def preprocess_function(examples):
    """Tokenize code as inputs and documentation as labels for a seq2seq model.

    Args:
        examples: mapping with `func_code_string` (model inputs) and
            `func_documentation_string` (targets). These are the CodeSearchNet
            column names shown in the dataset summary near the top of the
            notebook; the placeholder keys 'input_text' / 'target_text' used
            before are not among them.

    Returns:
        The tokenizer output for the inputs with the target token ids stored
        under 'labels'. Both sides are truncated to 512 tokens; no padding is
        applied here.
    """
    inputs = examples['func_code_string']
    targets = examples['func_documentation_string']

    # `text_target=` tokenizes the targets in the same call and places their ids
    # under 'labels'; it replaces the deprecated `as_target_tokenizer()` context manager.
    model_inputs = tokenizer(inputs, text_target=targets, max_length=512, truncation=True)
    return model_inputs

# Apply the preprocessing to your dataset
# NOTE(review): this maps `tokenize_function`, not the `preprocess_function`
# defined in this cell, so `preprocess_function` is never used — confirm which
# one was intended.
# NOTE(review): `combined_dataset` is not assigned in any visible cell.
tokenized_datasets = combined_dataset.map(tokenize_function, batched=True)
# After tokenizing, save the dataset to a specific location
# NOTE(review): same directory as the save in the previous cell; that output is overwritten.
output_directory = './Datasets/tokenized'
tokenized_datasets.save_to_disk(output_directory)


In [None]:
# Apply tokenization to the validation data.
# NOTE(review): `validation_dataset` is not assigned in any visible cell; the
# validation split loaded earlier is bound to `val_dataset` — confirm the name.
tokenized_datasets_validation = validation_dataset.map(tokenize_function, batched=True)
# After tokenizing, save the dataset to a specific location
# NOTE(review): this rebinds `output_directory` to a sub-folder of the directory
# the training data was saved to.
output_directory = './Datasets/tokenized/validation'
tokenized_datasets_validation.save_to_disk(output_directory)

In [103]:
import torch
from transformers import Trainer, TrainingArguments, AutoModelForSeq2SeqLM
# Load a pre-trained sequence-to-sequence model.
# NOTE(review): this rebinds `model`, replacing the RobertaForMaskedLM instance
# loaded earlier. The global `tokenizer` is not reloaded here, so it still comes
# from a different checkpoint than "t5-base" — confirm.
model_name = "t5-base"  # or another model suitable for code-to-text tasks
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Set up training arguments (rebinds `training_args` from the earlier cells).
training_args = TrainingArguments(
    output_dir='./results',                  # output directory for model predictions and checkpoints
    evaluation_strategy="steps",             # evaluate every N steps (not per epoch); NOTE(review): no eval_steps is given — check which interval applies
    learning_rate=5e-5,                      # learning rate
    per_device_train_batch_size=3,           # batch size for training
    per_device_eval_batch_size=3,            # batch size for evaluation
    num_train_epochs=3,                      # total number of training epochs
    weight_decay=0.01,                       # strength of weight decay
    logging_dir='./logs',                    # directory for storing logs
    logging_steps=10,                        # log every 10 steps
    save_steps=500,                          # write a checkpoint every 500 steps
    load_best_model_at_end=True,             # load the best model at the end of training
    metric_for_best_model="eval_loss"        # metric for determining the best model
)

# Function to compute metrics
def compute_metrics(pred):
    """Decode predictions and labels and score them with `calculate_bleu`.

    Args:
        pred: object exposing `predictions` (logits, or a tuple whose first item
            is the logits) and `label_ids` (token ids, with -100 at ignored
            positions).

    Returns:
        dict with a single key, "bleu", holding whatever `calculate_bleu`
        returns for the decoded predictions and labels.
    """
    # Imported locally: no visible cell imports numpy, so the bare `np` used
    # here before raised NameError.
    import numpy as np

    predictions = pred.predictions
    # Some models return several outputs, which arrive here as a tuple.
    # NOTE(review): assumes the logits are the first element — confirm for the model in use.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    pred_ids = np.asarray(predictions).argmax(axis=-1)

    # Replace -100 in the labels as we can't decode them
    labels_ids = np.asarray(pred.label_ids)
    labels_ids = np.where(labels_ids != -100, labels_ids, tokenizer.pad_token_id)

    # Decode the predictions and labels
    decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # NOTE(review): `calculate_bleu` is not defined in any visible cell; it must
    # be supplied before this function is called.
    return {"bleu": calculate_bleu(decoded_preds, decoded_labels)}

In [None]:
print(tokenized_datasets)  # to check the overall structure
print(tokenized_datasets.column_names)  # to list all column names


In [105]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1' 

In [106]:
#device = torch.device("cpu")  # Change to "cuda" when you want to run on GPU
#model.to(device)


In [107]:
from transformers import DataCollatorForSeq2Seq

# Collator wrapper around DataCollatorForSeq2Seq
def custom_data_collator(features):
    """Collate `features` with a `DataCollatorForSeq2Seq` built on the global tokenizer.

    Only the first feature is inspected: when it is not a dict, every feature
    that has a `__dict__` is replaced by `vars(feature)` and the rest are passed
    through unchanged. A new collator is constructed on every call.
    """
    first = features[0]
    if isinstance(first, dict):
        normalized = features
    else:
        normalized = [
            vars(item) if hasattr(item, '__dict__') else item
            for item in features
        ]
    collate = DataCollatorForSeq2Seq(tokenizer)
    return collate(normalized)

In [110]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    # Pass the datasets themselves. Indexing a dataset with a column name
    # (`tokenized_datasets["func_code_string"]`) returns that column's raw code
    # strings, which carry no input_ids/labels for the model to train on.
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets_validation,
    compute_metrics=compute_metrics,
    data_collator=custom_data_collator,
    tokenizer=tokenizer
)


In [None]:


# Build model-style fields directly from the pre-split token columns
def prepare_data(example):
    """Map one example to `input_ids` / `attention_mask` / `labels`.

    `input_ids` is the example's `func_code_tokens` value as-is, `attention_mask`
    is a list of 1s of the same length, and `labels` is the example's
    `func_documentation_tokens` value as-is. Nothing is tokenized or converted here.
    """
    code_tokens = example['func_code_tokens']
    all_ones = [1] * len(code_tokens)
    return dict(
        input_ids=code_tokens,
        attention_mask=all_ones,
        labels=example['func_documentation_tokens'],
    )

# Map the Python dataset: run prepare_data per example and drop every original column.
train_data_python = dataset_python['train'].map(prepare_data, remove_columns=dataset_python['train'].column_names)
val_data_python = dataset_python['validation'].map(prepare_data, remove_columns=dataset_python['validation'].column_names)

# Map the Java dataset
# NOTE(review): `dataset_java` is not assigned in any visible cell (only the
# Python configuration is loaded), so these lines raise NameError on a fresh kernel.
train_data_java = dataset_java['train'].map(prepare_data, remove_columns=dataset_java['train'].column_names)
val_data_java = dataset_java['validation'].map(prepare_data, remove_columns=dataset_java['validation'].column_names)

# Check the mapped dataset columns for Python and Java
print("Mapped Train columns (Python):", train_data_python.column_names)
print("Mapped Validation columns (Python):", val_data_python.column_names)
print("Mapped Train columns (Java):", train_data_java.column_names)
print("Mapped Validation columns (Java):", val_data_java.column_names)


In [109]:
from transformers import RobertaTokenizer

# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")

# Tokenization function
def prepare_data(example):
    """Encode one example's code tokens as inputs and its doc tokens as labels.

    Args:
        example: a single (non-batched) example with `func_code_tokens` and
            `func_documentation_tokens` (assumed to be lists of string tokens).

    Returns:
        dict with `input_ids` / `attention_mask` for the code (padded/truncated
        to 512 tokens) and `labels` for the documentation (padded/truncated to
        128 tokens, padding positions set to -100).
    """
    # A list of strings handed straight to the tokenizer is treated as a batch of
    # separate sequences (one per token). Joining the tokens yields a single
    # sequence per example, which is what the returned fields are meant to hold.
    code_text = " ".join(example['func_code_tokens'])
    doc_text = " ".join(example['func_documentation_tokens'])

    code_encoding = tokenizer(code_text, truncation=True, padding='max_length', max_length=512)
    doc_encoding = tokenizer(doc_text, truncation=True, padding='max_length', max_length=128)

    # -100 is the label index Hugging Face model losses ignore; use it for padding
    # so padded positions do not count as targets.
    labels = [
        token_id if keep else -100
        for token_id, keep in zip(doc_encoding['input_ids'], doc_encoding['attention_mask'])
    ]

    return {
        'input_ids': code_encoding['input_ids'],  # Tokenized function code
        'attention_mask': code_encoding['attention_mask'],  # Attention mask
        'labels': labels  # Tokenized documentation, padding masked
    }


In [None]:
# Train the model with the most recently constructed `trainer`.
trainer.train()

# Save the model and the tokenizer side by side under ./model.
# NOTE(review): `model` and `tokenizer` are whatever those globals are bound to
# when this cell runs; both are rebound several times in this notebook, so
# confirm they belong together before saving.
model.save_pretrained('./model')
tokenizer.save_pretrained('./model')

In [None]:
print(tokenized_datasets['func_code_string'][:10])  # Check first 10 samples
