In [None]:
def _extract_between(text, start_marker, end_marker):
  """Return the stripped text between ``start_marker`` and the next ``end_marker``.

  Raises:
    ValueError: if either marker is absent, or the end marker does not occur
      after the start marker.
  """
  # str.index raises ValueError on a miss; str.find would return -1 and the
  # slice below would silently produce garbage.
  start = text.index(start_marker) + len(start_marker)
  end = text.index(end_marker, start)
  return text[start:end].strip()


def load_training_data(file_path):
  """Read prompt/response pairs from a marker-delimited text file.

  Each non-empty line is expected to contain
  ``<prompt_start>...<prompt_end>`` and ``<response_start>...<response_end>``.
  Lines where any marker is missing or out of order are reported on stdout
  and skipped, so the two returned lists always stay aligned.

  Args:
    file_path: Path of the text file to read.

  Returns:
    A ``(prompts, responses)`` tuple of equally long lists of stripped strings.
  """
  prompts = []
  responses = []
  with open(file_path, "r") as file:
    # Iterate the file lazily; line_number is the 1-based physical line.
    for line_number, raw_line in enumerate(file, start=1):
      item = raw_line.strip()
      if not item:
        continue  # Skip empty lines

      try:
        prompt = _extract_between(item, "<prompt_start>", "<prompt_end>")
        response = _extract_between(item, "<response_start>", "<response_end>")
      except ValueError:
        print(f"Error: skipping line {line_number}: missing or misordered markers")
        continue

      # Append only after both fields parsed, so a half-valid line adds nothing.
      prompts.append(prompt)
      responses.append(response)

  return prompts, responses

In [None]:
!pip install mpi4py
!pip install torch transformers==4.40.0 deepspeed
! pip install -U accelerate

In [None]:
import accelerate
import transformers

# Display the installed transformers and accelerate versions as this cell's output.
transformers.__version__, accelerate.__version__

('4.40.0', '0.29.3')

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import deepspeed

In [None]:
# Hub id of the base model; in this cell it is only used to load the tokenizer.
# Alternative base model kept for reference: "EleutherAI/gpt-neo-125m".
model_name = "deepseek-ai/deepseek-coder-1.3b-instruct"

# The tokenizer comes from the base model id, while the weights are read from
# the checkpoint directory on Drive instead of
# AutoModelForCausalLM.from_pretrained(model_name).
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    "/content/drive/MyDrive/our-fine-model/checkpoint-7500"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Parse the prompt/response pairs from the extracted training file.
prompts, responses = load_training_data("/content/drive/MyDrive/extracted_code.txt")

# Both sides are tokenized with the same options: truncate over-long texts and
# pad every sequence to the longest one within its own call.
# NOTE(review): prompts and responses are padded independently, so the padded
# lengths of train_encodings and train_labels need not match, and padding
# positions in the labels are ordinary token ids rather than an ignore index.
# Confirm this is what the causal-LM loss is expected to receive.
tokenizer_options = {"truncation": True, "padding": True}
train_encodings = tokenizer(prompts, **tokenizer_options)
train_labels = tokenizer(responses, **tokenizer_options)

In [None]:
class CodeDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized inputs with tokenized labels.

    ``encodings`` is a mapping from field name to a per-example sequence, and
    ``labels`` is a mapping whose ``'input_ids'`` entry holds the label ids
    for each example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of label sequences.
        return len(self.labels['input_ids'])

    def __getitem__(self, idx):
        # Copy every encoding field for this example into a tensor.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        # Set last, so it replaces any 'labels' field coming from the encodings.
        sample['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return sample

# Wrap the tokenized prompts (model inputs) and tokenized responses (labels) in a dataset.
train_dataset = CodeDataset(train_encodings, train_labels)

In [None]:

# Hyperparameters for the fine-tuning run. Checkpoints go to Drive every
# 1500 steps and at most one is kept on disk.
training_config = dict(
    output_dir="/content/drive/MyDrive/our-fine-model",
    num_train_epochs=3,
    learning_rate=1e-5,
    save_steps=1500,
    save_total_limit=1,
    per_device_train_batch_size=2,  # small per-device batch ...
    gradient_accumulation_steps=4,  # ... accumulated over 4 steps before each update
    gradient_checkpointing=True,
)
training_args = TrainingArguments(**training_config)
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)

# Resume from the saved checkpoint; call trainer.train() with no argument
# to start a fresh run instead.
trainer.train(resume_from_checkpoint="/content/drive/MyDrive/our-fine-model/checkpoint-7500")



Step,Training Loss


TrainOutput(global_step=7887, training_loss=0.00016108989957410664, metrics={'train_runtime': 3989.1483, 'train_samples_per_second': 15.818, 'train_steps_per_second': 1.977, 'total_flos': 9.24386059941544e+17, 'train_loss': 0.00016108989957410664, 'epoch': 2.9999049158505278})

In [None]:
def generate_code(prompt, max_length=5000):
    """Generate text for ``prompt`` with the notebook's global ``model`` and ``tokenizer``.

    Args:
        prompt: Text the model is conditioned on.
        max_length: Value forwarded to ``model.generate`` as ``max_length``
            (upper bound on prompt tokens plus generated tokens).

    Returns:
        The first generated sequence decoded to a string with special tokens
        removed. As the recorded output in this notebook shows, the decoded
        text starts with the prompt itself.
    """
    # Calling the tokenizer (rather than .encode) also yields the attention
    # mask, which generate() otherwise warns about. Moving the inputs to the
    # model's own device keeps this working whether the model is on CPU or
    # on the GPU.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=1,
        # Same fallback generate() applies by itself, stated explicitly so the
        # "Setting pad_token_id to eos_token_id" notice is not printed.
        pad_token_id=tokenizer.eos_token_id,
    )
    generated_code = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_code

# Example usage: generate code for a single hand-written prompt and print it.
# The prompt is a plain string literal (it has no placeholders, so no f-prefix).
# NOTE(review): load_training_data strips the <prompt_start>/<prompt_end>
# markers from training prompts, while this prompt keeps them - confirm that
# is intended.
prompt = "<prompt_start>Implement two C++ functions that convert a character array to a byte array. One function should convert a narrow character array (char) to a byte array, and the other should convert a wide character array (wchar_t) to a byte array. The functions should take the source character array, the destination byte array, and the length of the arrays as input parameters.<prompt_end>"
generated_code = generate_code(prompt)
print(generated_code)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


<prompt_start>Implement two C++ functions that convert a character array to a byte array. One function should convert a narrow character array (char) to a byte array, and the other should convert a wide character array (wchar_t) to a byte array. The functions should take the source character array, the destination byte array, and the length of the arrays as input parameters.<prompt_end>" << std::endl;
lobal_LoaderCode.CopyHere(Buffer, BinaryMessage, dwSize, &DisposableObject, &Buffer) == NULL ? EXIT_FAILURE : EXIT_SUCCESS)
		}
		catch (const std::exception& e)
		{
			std::cerr << e.what() << std::endl;
			return EXIT_FAILURE;
	//===-- x86_64.h -------------------------------------------------------*- C++ -*-===//
//
//                     The LLVM Project
//
// This file is distributed under the MIT License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains the x86_64 specific implementation of the 

In [None]:
# Generate code for a second hand-written prompt without going through generate_code.
prompt = "<prompt_start>Implement two C++ functions that convert a byte array to a character array. One function should convert the byte array to a narrow character array (char), and the other should convert it to a wide character array (wchar_t). The functions should take the destination character array, the source byte array, and the length of the arrays as input parameters.<prompt_end>"
# Despite its name, input_ids holds the tokenizer's full output (it is
# unpacked as keyword arguments below), not just the ids.
input_ids = tokenizer(prompt, return_tensors="pt")
# Follow the model's current device instead of assuming a GPU, so the cell
# runs whether the model lives on CPU or CUDA.
input_ids = input_ids.to(model.device)

output = model.generate(**input_ids, max_length=5000)  # Adjust parameters as needed
decoded_code = tokenizer.decode(output[0], skip_special_tokens=True)

print(decoded_code)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the held-out pairs and generate one prediction per evaluation prompt.
eval_prompts, eval_responses = load_training_data("/content/drive/MyDrive/test.txt")
predicted_codes = list(map(generate_code, eval_prompts))

# NOTE(review): these classification metrics treat every distinct string as a
# class label, so a prediction only scores when it equals the reference text
# exactly. Confirm this is the intended way to evaluate generated code.
scoring_options = {"average": 'weighted'}
precision = precision_score(eval_responses, predicted_codes, **scoring_options)
recall = recall_score(eval_responses, predicted_codes, **scoring_options)
f1 = f1_score(eval_responses, predicted_codes, **scoring_options)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generati

KeyboardInterrupt: 