In [20]:
from datasets import load_dataset, load_from_disk
from transformers import RobertaForMaskedLM, Trainer, TrainingArguments
import torch
from transformers import DataCollatorWithPadding
torch.cuda.empty_cache()

In [21]:
import os

os.environ['HF_DATASETS_CACHE'] = './Datasets'
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO']='0.0'

In [22]:
from datasets import load_dataset
# Load the CodeSearchNet datasets for Python and Java
train_python = load_dataset('code_search_net', 'python', trust_remote_code=True, cache_dir='./Datasets', split='train')
test_python = load_dataset('code_search_net', 'python', trust_remote_code=True, cache_dir='./Datasets', split='test')
val_python = load_dataset('code_search_net', 'python', trust_remote_code=True, cache_dir='./Datasets', split='validation')

# dataset_javascript = load_dataset('code_search_net', 'javascript', trust_remote_code=True)
print(type(train_python), train_python[1])
# Combine the training sets of the datasets for multiple programming language
train_dataset = train_python.shuffle(seed=42).select(range(100))
print(type(train_dataset))
# combined_dataset = [combined_dataset, dataset_javascript['train']])

test_dataset = test_python.shuffle(seed=42).select(range(100))

# Sample for validation
val_dataset = val_python.shuffle(seed=42).select(range(50))

# Display the first example from the combined dataset

<class 'datasets.arrow_dataset.Dataset'> {'repository_name': 'ageitgey/face_recognition', 'func_path_in_repository': 'examples/face_recognition_knn.py', 'func_name': 'predict', 'whole_func_string': 'def predict(X_img_path, knn_clf=None, model_path=None, distance_threshold=0.6):\n    """\n    Recognizes faces in given image using a trained KNN classifier\n\n    :param X_img_path: path to image to be recognized\n    :param knn_clf: (optional) a knn classifier object. if not specified, model_save_path must be specified.\n    :param model_path: (optional) path to a pickled knn classifier. if not specified, model_save_path must be knn_clf.\n    :param distance_threshold: (optional) distance threshold for face classification. the larger it is, the more chance\n           of mis-classifying an unknown person as a known one.\n    :return: a list of names and face locations for the recognized faces in the image: [(name, bounding box), ...].\n        For faces of unrecognized persons, the name \

In [23]:
train_dataset.shape,test_dataset.shape, val_dataset.shape, type(train_dataset), type(val_dataset), type(test_dataset), type(train_dataset[:3])

((100, 11),
 (100, 11),
 (50, 11),
 datasets.arrow_dataset.Dataset,
 datasets.arrow_dataset.Dataset,
 datasets.arrow_dataset.Dataset,
 dict)

### Model

In [24]:
token = "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"
model = RobertaForMaskedLM.from_pretrained("microsoft/codebert-base", use_auth_token=token, cache_dir = "./Models")

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Device setup

In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device, torch.cuda.get_device_name(0)

(device(type='cuda'), 'NVIDIA GeForce GTX 1650')

### Tokenization

In [26]:
from transformers import AutoTokenizer

# Load the tokenizer for your model (e.g., CodeBERT)
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

# Define the tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples['func_code_string'],  # Adjust the key based on your dataset structure
        padding='max_length',
        truncation=True,
        max_length=512
    )


In [28]:
code_texts = [example['func_code_string'] for example in train_python]
doc_texts = [example['func_documentation_string'] for example in train_python]

In [None]:
encoded_code = tokenizer(code_texts, padding=True, truncation=True, return_tensors="pt")
encoded_docs = tokenizer(doc_texts, padding=True, truncation=True, return_tensors="pt")

In [8]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=1000,
    remove_unused_columns=False,
    save_total_limit=2,
    fp16=True # Set this to False to avoid the column check
)




In [11]:
device

device(type='cuda')

In [13]:
# Creating the DataLoader with the custom collate function
train_dataloader = DataLoader(tokenized_train_data, batch_size=16, collate_fn=custom_collate_fn)


In [14]:
# Check the first few examples in your tokenized data
print(tokenized_train_data[:3])  # Adjust slicing based on your data structure


{'input_ids': [[0, 9232, 2310, 1640, 13367, 43, 43839, 49460, 35, 50118, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 49434, 50118, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 47416, 2469, 7471, 37127, 4333, 8210, 48128, 18400, 37127, 15389, 9264, 36714, 4394, 9085, 48823, 711, 48569, 47504, 18164, 17, 48, 47240, 7487, 41907, 14292, 48, 17, 46, 43251, 4394, 14285, 48732, 4333, 47842, 15389, 48569, 48745, 5782, 47089, 9253, 37127, 14292, 5543, 44636, 23171, 37127, 10674, 9253, 45262, 711, 45682, 50118, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 47416, 12410, 10470, 49257, 48820, 15722, 47873, 11423, 46499, 12736, 318, 12, 29802, 12, 42633, 1589, 4799, 1073, 6, 47111, 10659, 14285, 318, 12, 30597, 3786, 12, 11321, 47111, 10659, 14285, 1426, 12, 30597, 3786, 1437, 49065, 49117, 20024, 44574, 36714, 4394, 9085, 48823, 711, 47658, 9357, 42393, 15722, 27819, 45682, 50118, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 47111, 48, 11423, 36484, 2840, 4726, 36714, 10809, 2840, 48105, 46015, 10278, 47876, 

In [16]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
# Use this Dataset with the Trainer
trainer = Trainer(
    model=model,  # Your pre-initialized model
    args=training_args,  # Training arguments
    train_dataset=train_dataset,  # Use the custom Dataset
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator  # Optional evaluation dataset
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [19]:

# Start training
trainer.train()

  0%|          | 0/21 [00:07<?, ?it/s]


ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url']

### existing code 

In [10]:
# Tokenize the datasets
print(train_dataset)
print(train_dataset[0])
tokenized_train_data = train_dataset.map(tokenize_function, batched=True)
tokenized_val_data = val_dataset.map(tokenize_function, batched=True)
print(tokenized_train_data)
print(tokenized_train_data[0])
# Remove unused columns if necessary
tokenized_train_data = tokenized_train_data.remove_columns(train_dataset.column_names)
tokenized_val_data = tokenized_val_data.remove_columns(val_dataset.column_names)
print(tokenized_train_data)
print(tokenized_train_data[0])

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 100
})
{'repository_name': 'zeromake/aiko', 'func_path_in_repository': 'aiko/request.py', 'func_name': 'Request.fresh', 'whole_func_string': 'def fresh(self) -> bool:\n        """\n        检查请求缓存是否“新鲜”，也就是内容没有改变。\n        此方法用于 If-None-Match / ETag, 和 If-Modified-Since 和 Last-Modified 之间的缓存协商。\n        在设置一个或多个这些响应头后应该引用它。\n        """\n        method_str = self.method\n        if method_str != \'GET\' and method_str != \'HEAD\':\n            return False\n        s = self.ctx.status\n        if (s >= 200 and s < 300) or s == 304:\n            return fresh(\n                self.headers,\n                (self.response and self.response.headers) or {},\n            )\n        return False', 'language': 'python', 'func_code_string

In [15]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, tokenized_data):
        self.data = tokenized_data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# Wrap your tokenized data into a Dataset
#train_dataset = CustomDataset(tokenized_train_data)






In [12]:
from torch.utils.data import DataLoader
import torch

def custom_collate_fn(batch):
    # Extract the necessary components from each item in the batch
    input_ids = [item['input_ids'] for item in batch]
    attention_mask = [item['attention_mask'] for item in batch]
    labels = [item['labels'] for item in batch]

    # Convert lists of lists into tensors
    return {
        'input_ids': torch.tensor(input_ids).to(device),
        'attention_mask': torch.tensor(attention_mask).to(device),
        'labels': torch.tensor(labels).to(device),
    }


In [9]:
def tokenize_function(examples):
    input_encodings = tokenizer(
        examples['func_code_string'],
        padding='max_length',
        truncation=True,
        max_length=512
    )

    labels = tokenizer(
        examples['func_documentation_string'],
        padding='max_length',
        truncation=True,
        max_length=512
    )

    return {
        'input_ids': input_encodings['input_ids'],  # Should be list of integers
        'attention_mask': input_encodings['attention_mask'],  # Should be list of integers
        'labels': labels['input_ids'],  # Should be list of integers
    }


Duplicate cell

In [33]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [7]:
# Tokenize the datasets
tokenized_train_data = train_dataset.map(tokenize_function, batched=True)
tokenized_val_data = val_dataset.map(tokenize_function, batched=True)

# Remove the original code strings if necessary
tokenized_train_data = tokenized_train_data.remove_columns(['func_code_string'])
tokenized_val_data = tokenized_val_data.remove_columns(['func_code_string'])


Map: 100%|██████████| 866629/866629 [03:34<00:00, 4035.90 examples/s]


In [None]:
# Define the tokenization function
def tokenize_function(examples):
    # Tokenize the input code and the corresponding documentation
    input_encodings = tokenizer(
        examples['func_code_string'],  # Adjust the key based on your dataset structure
        padding='max_length',
        truncation=True,
        max_length=512
    )
    
    labels = tokenizer(
        examples['func_documentation_string'],  # Adjust this to the correct documentation key
        padding='max_length',
        truncation=True,
        max_length=512
    )

    # Return input encodings and labels
    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': labels['input_ids'],  # This is how the model expects labels
    }

# Tokenize the datasets
tokenized_train_data = train_dataset.map(tokenize_function, batched=True)
tokenized_val_data = val_dataset.map(tokenize_function, batched=True)

# Remove unused columns if necessary
tokenized_train_data = tokenized_train_data.remove_columns(train_dataset.column_names)
tokenized_val_data = tokenized_val_data.remove_columns(val_dataset.column_names)


In [None]:
from torch.utils.data import DataLoader

def custom_collate_fn(batch):
    # Assuming all inputs are lists of same length
    input_ids = [item['input_ids'] for item in batch]
    attention_mask = [item['attention_mask'] for item in batch]
    labels = [item['labels'] for item in batch]

    return {
        'input_ids': torch.tensor(input_ids),
        'attention_mask': torch.tensor(attention_mask),
        'labels': torch.tensor(labels),
    }

# Use this in your DataLoader
train_dataloader = DataLoader(tokenized_train_data, batch_size=16, collate_fn=custom_collate_fn)


In [None]:
print(dataset_python['train'].column_names)


In [None]:
from transformers import AutoTokenizer

token = "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"
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-multi", use_auth_token=token, cache_dir = "./Models")


In [None]:
tokenizer.pad_token = tokenizer.eos_token
# Tokenize the datasets using the correct field name
def tokenize_function(example):
    return tokenizer(example['func_code_string'], padding="max_length", truncation=True, max_length=512)

# Apply tokenization
tokenized_datasets = combined_dataset.map(tokenize_function, batched=True)
# After tokenizing, save the dataset to a specific location
output_directory = './Datasets/tokenized'
tokenized_datasets.save_to_disk(output_directory)


In [None]:
# Define a preprocessing function to tokenize your dataset
def preprocess_function(examples):
    inputs = examples['input_text']  # Replace with the correct key for input texts
    targets = examples['target_text']  # Replace with the correct key for target texts
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)
    
    # Tokenize the targets (for seq2seq models like T5)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=512, truncation=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Apply the preprocessing to your dataset
tokenized_datasets = combined_dataset.map(tokenize_function, batched=True)
# After tokenizing, save the dataset to a specific location
output_directory = './Datasets/tokenized'
tokenized_datasets.save_to_disk(output_directory)


In [None]:
# Apply tokenization
tokenized_datasets_validation = validation_dataset.map(tokenize_function, batched=True)
# After tokenizing, save the dataset to a specific location
output_directory = './Datasets/tokenized/validation'
tokenized_datasets_validation.save_to_disk(output_directory)

In [103]:
import torch
from transformers import Trainer, TrainingArguments, AutoModelForSeq2SeqLM
# Load your pre-trained model (e.g., T5 or CodeBERT)
model_name = "t5-base"  # or another model suitable for code-to-text tasks
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',                  # output directory for model predictions and checkpoints
    evaluation_strategy="steps",             # evaluation is done at the end of each epoch
    learning_rate=5e-5,                      # learning rate
    per_device_train_batch_size=3,           # batch size for training
    per_device_eval_batch_size=3,            # batch size for evaluation
    num_train_epochs=3,                      # total number of training epochs
    weight_decay=0.01,                       # strength of weight decay
    logging_dir='./logs',                    # directory for storing logs
    logging_steps=10,
    save_steps=500,
    load_best_model_at_end=True,             # load the best model at the end of training
    metric_for_best_model="eval_loss"        # metric for determining the best model
)

# Function to compute metrics
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions.argmax(axis=-1)
    # Replace -100 in the labels as we can't decode them
    labels_ids = np.where(labels_ids != -100, labels_ids, tokenizer.pad_token_id)
    # Decode the predictions and labels
    decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # Use a metric such as BLEU or ROUGE here
    # For simplicity, you can use the following placeholder:
    return {"bleu": calculate_bleu(decoded_preds, decoded_labels)}

In [None]:
print(tokenized_datasets)  # to check the overall structure
print(tokenized_datasets.column_names)  # to list all column names


In [105]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1' 

In [106]:
#device = torch.device("cpu")  # Change to "cuda" when you want to run on GPU
#model.to(device)


In [107]:
from transformers import DataCollatorForSeq2Seq

# Custom data collator
def custom_data_collator(features):
    if not isinstance(features[0], dict):
        # Ensure that features are converted to dicts
        features = [vars(f) if hasattr(f, '__dict__') else f for f in features]
    return DataCollatorForSeq2Seq(tokenizer)(features)

In [110]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["func_code_string"],
    eval_dataset=tokenized_datasets_validation["func_code_string"],
    compute_metrics=compute_metrics,
    data_collator=custom_data_collator,
    tokenizer=tokenizer
)


In [109]:
from transformers import RobertaTokenizer

# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")

# Tokenization function
def prepare_data(example):
    # Tokenizing code and documentation
    code_encoding = tokenizer(example['func_code_tokens'], truncation=True, padding='max_length', max_length=512)
    doc_encoding = tokenizer(example['func_documentation_tokens'], truncation=True, padding='max_length', max_length=128)

    return {
        'input_ids': code_encoding['input_ids'],  # Tokenized function code
        'attention_mask': code_encoding['attention_mask'],  # Attention mask
        'labels': doc_encoding['input_ids']  # Tokenized documentation
    }
