In [1]:
# Mount Google Drive; every model and dataset path used below lives under
# /content/drive/MyDrive.
from google.colab import drive

drive.mount("/content/drive")


Mounted at /content/drive


In [6]:
pip install transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m96.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m107.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4


In [8]:
pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.14.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0
  Downloading GitPython-3.1.31-py3-none-any.whl (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 KB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting setproctitle
  Downloading setproctitle-1.3.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.18.0-py2.py3-none-any.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 KB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m

In [9]:
import torch
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Load the tokenizer and the saved model.
# The tokenizer comes from the public CodeBERT checkpoint; the weights come
# from the previously saved model directory on Drive.
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModelForMaskedLM.from_pretrained("/content/drive/MyDrive/codebertcustom-main (1)/model")

# Load your custom dataset: two plain-text files read line by line.
code_names_file = "/content/drive/MyDrive/codebertcustom-main (1)/arma3/data/ready/code_names.txt"
descriptions_file = "/content/drive/MyDrive/codebertcustom-main (1)/arma3/data/ready/descriptions.txt"


# Read with an explicit encoding so the result does not depend on the
# runtime's locale default.
with open(code_names_file, "r", encoding="utf-8") as f:
    code_names = f.readlines()

with open(descriptions_file, "r", encoding="utf-8") as f:
    descriptions = f.readlines()

# The two files are consumed as parallel lists (row i of one is paired with
# row i of the other), so a line-count mismatch means the pairs are misaligned.
# Fail here with a clear message instead of hitting an IndexError mid-training
# or silently training on mis-paired rows.
if len(code_names) != len(descriptions):
    raise ValueError(
        f"code_names.txt has {len(code_names)} lines but descriptions.txt has "
        f"{len(descriptions)}; the files must be line-aligned."
    )

# Tokenize the data
input_texts = [code.strip() for code in code_names]
target_texts = [desc.strip() for desc in descriptions]

# Each call tokenizes the whole list at once; padding=True pads every row to
# the longest row of that call so the result can be returned as one tensor.
inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True)
targets = tokenizer(target_texts, return_tensors="pt", padding=True, truncation=True)

# Create a dataset
class CustomDataset(torch.utils.data.Dataset):
    """Row-wise pairing of tokenized inputs with tokenized targets.

    Both ``inputs`` and ``targets`` must support ``obj["input_ids"][i]`` and
    ``obj["attention_mask"][i]``. The dataset length is taken from
    ``inputs["input_ids"]`` only; ``targets`` is not length-checked here.

    NOTE(review): this notebook feeds the dataset through
    ``DataCollatorForLanguageModeling(mlm=True)``, which presumably rebuilds
    ``labels`` from the masked ``input_ids`` -- confirm whether the target
    ``labels`` returned here are actually used during training.
    """

    def __init__(self, inputs, targets):
        self.inputs, self.targets = inputs, targets

    def __len__(self):
        # One example per row of the input ids.
        row_count = len(self.inputs["input_ids"])
        return row_count

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of four fields."""
        source = self.inputs
        target = self.targets
        return {
            "input_ids": source["input_ids"][idx],
            "attention_mask": source["attention_mask"][idx],
            # Target token ids are exposed under the "labels" key.
            "labels": target["input_ids"][idx],
            "labels_attention_mask": target["attention_mask"][idx],
        }

dataset = CustomDataset(inputs, targets)

# Create a DataLoader
# NOTE: this loader is not passed to the Trainer below (the Trainer receives
# `dataset` directly); it is kept only for manual inspection of batches.
data_loader = DataLoader(dataset, batch_size=2, shuffle=True)

# Set training arguments
# output_dir is the same directory the starting weights were loaded from, so
# the final save below replaces that saved model.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/codebertcustom-main (1)/model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=5000,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=500,
)

# Define a data collator (masked-language-modeling, 15% masking probability).
# NOTE(review): with mlm=True the collator is expected to derive `labels` from
# the masked input_ids, which would replace the description `labels` produced
# by CustomDataset -- confirm the descriptions actually contribute to training.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# Create a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the trained model. Report the failure and then re-raise: later cells
# reload the model from this directory, so continuing after a failed save
# would silently train from stale weights.
try:
    trainer.save_model("/content/drive/MyDrive/codebertcustom-main (1)/model")
except Exception as e:
    print(f"Error saving model: {e}")
    raise

Downloading (…)lve/main/config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,3.9593
1000,3.7587
1500,3.3078
2000,3.3635
2500,2.836
3000,2.9013
3500,2.8752
4000,4.1203


Now I'm going to retrain the saved model on the same dataset as above.

In [10]:
# Load the saved model
retrained_model = AutoModelForMaskedLM.from_pretrained("/content/drive/MyDrive/codebertcustom-main (1)/model")

# Give this run its own output directory. Reusing the first run's arguments
# would point output_dir (with overwrite_output_dir=True) at the base model
# folder, so any checkpoint written during retraining would land there instead
# of next to the retrained model. All other hyperparameters match the first run.
retrain_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/codebertcustom-main (1)/retrained_model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=5000,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=500,
)

# Create a new Trainer instance with the loaded model
retrainer = Trainer(
    model=retrained_model,
    args=retrain_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

# Train the model again using the same dataset
retrainer.train()

# Save the retrained model. Report the failure and then re-raise: the next
# cell loads from this directory, so a swallowed error would leave it reading
# a missing or stale model.
try:
    retrainer.save_model("/content/drive/MyDrive/codebertcustom-main (1)/retrained_model")
except Exception as e:
    print(f"Error saving retrained model: {e}")
    raise


Step,Training Loss
500,1.3585
1000,1.4537
1500,1.3171
2000,1.2912
2500,1.051
3000,1.321
3500,1.5467
4000,3.4488


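Now that the model is retrained and saved in the `retrained_model` folder, I'm going to change the learning rate. The previous runs did not set `learning_rate`, so they used the `TrainingArguments` default of 5e-5; setting it to 1e-5 therefore lowers the learning rate rather than raising it.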

Using a learning rate of 1e-5 (which is equal to 0.00001) is a more reasonable choice for fine-tuning a deep learning model. This value falls within the typical range of learning rates used for such tasks (between 1e-4 and 1e-6).

In [None]:
# Load the saved model from the specified directory
model_to_retrain = AutoModelForMaskedLM.from_pretrained("/content/drive/MyDrive/codebertcustom-main (1)/retrained_model")

# Create a new set of training arguments with the adjusted learning rate.
# Everything except output_dir and learning_rate matches the earlier runs.
adjusted_training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/codebertcustom-main (1)/retrained_model_adjusted_lr",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=5000,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=500,
    learning_rate=1e-5,  # explicit learning rate; keep it within roughly 1e-6 .. 1e-4
)

# Create a new Trainer instance with the loaded model and the updated training arguments
adjusted_lr_trainer = Trainer(
    model=model_to_retrain,
    args=adjusted_training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

# Train the model again using the same dataset
adjusted_lr_trainer.train()

# Save the model trained with the adjusted learning rate. Report the failure
# and then re-raise so the cell is visibly marked as failed instead of
# appearing to have completed with nothing written to disk.
try:
    adjusted_lr_trainer.save_model("/content/drive/MyDrive/codebertcustom-main (1)/retrained_model_adjusted_lr")
except Exception as e:
    print(f"Error saving model with adjusted learning rate: {e}")
    raise




Step,Training Loss
500,0.3263
1000,0.3655
1500,0.3677
2000,0.1939
2500,0.1479
