In [1]:
import os
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Specify the full path to the directory containing language folders on your drive
path = '/kaggle/input/13datasets/Data'  # Change this to the actual path on your drive

# Define the folder names (in lowercase) and corresponding labels for programming languages
data_dirs = ['cpp', 'json', 'java', 'javascript', 'groovy', 'python','xml','yml','sql','scala','go','php','swift']
labels = ['cpp', 'json', 'java', 'javascript', 'groovy', 'python','xml','yml','sql','scala','go','php','swift']

# Initialize the tokenizer and model for CodeBERT
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')
model = RobertaForSequenceClassification.from_pretrained('microsoft/codebert-base', num_labels=len(labels))


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:

# Function to load and filter files by UTF-8 encoding
def load_data_and_filter(path, data_dirs, labels):
    texts = []
    targets = []
    
    # Initialize a dictionary to count files per folder
    file_counts = {folder: 0 for folder in data_dirs}

    for idx, folder in enumerate(data_dirs):
        folder_path = os.path.join(path, folder)
        
        # Check if the folder exists
        if not os.path.exists(folder_path):
            print(f"Folder {folder} does not exist at {folder_path}")
            continue
        
        for filename in os.listdir(folder_path):
            abs_file = os.path.join(folder_path, filename)
            
            try:
                # Attempt to open the file with UTF-8 encoding
                with open(abs_file, encoding='utf-8', mode='r') as f:
                    texts.append(f.read())  # If the file is read without error, it's in UTF-8
                    targets.append(idx)      # Append the corresponding label
                    file_counts[folder] += 1 # Increment the count for the folder
            except UnicodeDecodeError:
                # If a UnicodeDecodeError occurs, print the filename and remove the file
                print(f"Removing non-UTF-8 file: {abs_file}")
                os.remove(abs_file)  # Remove non-UTF-8 file
    
    return texts, targets, file_counts


In [3]:

# Load and filter the dataset
texts, targets, file_counts = load_data_and_filter(path, data_dirs, labels)

# Print the number of files left in each folder after filtering
print("Number of files left in each folder after filtering non-UTF-8 files:")
for folder, count in file_counts.items():
    print(f"{folder}: {count} files")


Number of files left in each folder after filtering non-UTF-8 files:
cpp: 309 files
json: 6368 files
java: 592 files
javascript: 595 files
groovy: 86 files
python: 473 files
xml: 1985 files
yml: 6903 files
sql: 35 files
scala: 78 files
go: 97 files
php: 71 files
swift: 82 files


In [4]:

# Split the data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, targets, test_size=0.2, random_state=42)

# Tokenize the texts using the CodeBERT tokenizer
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

# Convert the tokenized data into Dataset objects
train_dataset = Dataset.from_dict({'input_ids': train_encodings['input_ids'], 'attention_mask': train_encodings['attention_mask'], 'labels': train_labels})
val_dataset = Dataset.from_dict({'input_ids': val_encodings['input_ids'], 'attention_mask': val_encodings['attention_mask'], 'labels': val_labels})

# Create a DatasetDict
dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})

# # Define training arguments for fine-tuning CodeBERT
# training_args = TrainingArguments(
#     output_dir='./results',
#     num_train_epochs=5,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     warmup_steps=500,
#     weight_decay=0.01,
#     logging_dir='./logs',
#     logging_steps=10,
#     evaluation_strategy="epoch"
# )

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,  # Increase number of epochs
    per_device_train_batch_size=16,  # Larger batch size
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",  # Evaluate more frequently
    eval_steps=500,
    learning_rate=2e-5,  # Lower learning rate
    gradient_accumulation_steps=4,  # Simulate larger batches
    fp16=True,  # Mixed precision training
    load_best_model_at_end=True,  # Load the best model based on validation
    save_total_limit=2,  # Keep only the last two models to save disk space
)


# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)



  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [5]:
# Fine-tune the model
trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112723577778323, max=1.0…

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss,Validation Loss
500,0.0174,0.047175


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=550, training_loss=0.5306421267715368, metrics={'train_runtime': 3738.3265, 'train_samples_per_second': 18.911, 'train_steps_per_second': 0.147, 'total_flos': 1.851958497429504e+16, 'train_loss': 0.5306421267715368, 'epoch': 4.97737556561086})

In [6]:

# Save the fine-tuned model and tokenizer
model.save_pretrained("/kaggle/working/fine_tuned_codebertj_")
tokenizer.save_pretrained("/kaggle/working/fine_tuned_codebertj_")



('/kaggle/working/fine_tuned_codebertj_/tokenizer_config.json',
 '/kaggle/working/fine_tuned_codebertj_/special_tokens_map.json',
 '/kaggle/working/fine_tuned_codebertj_/vocab.json',
 '/kaggle/working/fine_tuned_codebertj_/merges.txt',
 '/kaggle/working/fine_tuned_codebertj_/added_tokens.json')

In [7]:
# Evaluate the fine-tuned model on the validation set
trainer.evaluate()


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


{'eval_loss': 0.047174595296382904,
 'eval_runtime': 61.0366,
 'eval_samples_per_second': 57.916,
 'eval_steps_per_second': 1.819,
 'epoch': 4.97737556561086}

In [8]:
import shutil

# Path to the directory containing the fine-tuned model and tokenizer
model_dir = "/kaggle/working/fine_tuned_codebertj_"

# Path where the zip file will be saved
zip_file_path = "/kaggle/working/fine_tuned_codebertj_.zip"

# Zip the folder
shutil.make_archive(zip_file_path.replace('.zip', ''), 'zip', model_dir)

print(f"Model and tokenizer zipped to: {zip_file_path}")


Model and tokenizer zipped to: /kaggle/working/fine_tuned_codebertj_.zip


In [9]:
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

# Evaluate the model on the validation set
eval_results = trainer.predict(val_dataset)

# Extract predicted labels and true labels
predicted_labels = np.argmax(eval_results.predictions, axis=1)
true_labels = eval_results.label_ids

# Generate confusion matrix and classification report
conf_matrix = confusion_matrix(true_labels, predicted_labels)
class_report = classification_report(true_labels, predicted_labels, target_names=labels)

# Save confusion matrix and classification report to a text file
output_file = '/kaggle/working/results.txt'
with open(output_file, 'w') as f:
    f.write("Predicted on {} files. Results are as follows:\n\n".format(len(true_labels)))
    
    f.write("Confusion Matrix:\n")
    np.savetxt(f, conf_matrix, fmt='%d', delimiter='\t')  # Save confusion matrix with tab-separated values
    
    f.write("\nClassification Report:\n")
    f.write(class_report)

# Print confirmation
print(f"Results saved to {output_file}")


Results saved to /kaggle/working/results.txt
