In [1]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForQuestionAnswering, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset, Dataset
import json
import os
import accelerate

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
data_dir = "/content/drive/MyDrive/Data"
model_dir = "/content/drive/MyDrive/Models"
model_dir

'/content/drive/MyDrive/Models'

In [4]:
# ====== Config ======
# model_name = "microsoft/phi-1"
model_name = "Salesforce/codegen-350M-mono"
json_path = data_dir + "/qas_data.json"  # <-- change this
output_dir = model_dir + "/finetuned-roqeto"

In [5]:
# ====== Load Dataset ======
def load_json_dataset(json_path):
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    # Convert to Hugging Face Dataset format
    return Dataset.from_list([{
       "text": f"Question: {item['question']}\nAnswer: {item['answer']}"
    } for item in data['arxiv']])



In [6]:
dataset = load_json_dataset(json_path)

In [7]:
dataset[0]

{'text': 'Question: What is the exit velocity of gases?\nAnswer: Elementary concepts from general physics and thermodynamics have been used to\nanalyze rocket propulsion. Making some reasonable assumptions, an expression\nfor the exit velocity of the gases is found. From that expression one can\nconclude what are the desired properties for a rocket fuel.'}

In [8]:

# # Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at Salesforce/codegen-350M-mono were not used when initializing CodeGenForCausalLM: ['transformer.h.0.attn.causal_mask', 'transformer.h.1.attn.causal_mask', 'transformer.h.10.attn.causal_mask', 'transformer.h.11.attn.causal_mask', 'transformer.h.12.attn.causal_mask', 'transformer.h.13.attn.causal_mask', 'transformer.h.14.attn.causal_mask', 'transformer.h.15.attn.causal_mask', 'transformer.h.16.attn.causal_mask', 'transformer.h.17.attn.causal_mask', 'transformer.h.18.attn.causal_mask', 'transformer.h.19.attn.causal_mask', 'transformer.h.2.attn.ca

In [9]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [10]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [11]:
# Tokenize the dataset
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [12]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [13]:
# ====== Training Arguments ======
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    num_train_epochs=50,
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir=os.path.join(output_dir, "logs"),
    fp16=torch.cuda.is_available(),
    report_to="none",
)

In [14]:
# ====== Trainer ======
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(


In [15]:
# ====== Train ======
trainer.train()

Step,Training Loss
10,3.7396
20,3.7677
30,3.1926
40,2.443
50,2.352
60,1.2266
70,1.2051
80,0.8072
90,0.3604
100,0.4114


TrainOutput(global_step=1250, training_loss=0.19056953365802765, metrics={'train_runtime': 641.584, 'train_samples_per_second': 7.793, 'train_steps_per_second': 1.948, 'total_flos': 4673796833280000.0, 'train_loss': 0.19056953365802765, 'epoch': 50.0})

In [16]:
def query_model(prompt, max_new_tokens=100):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [17]:
print(query_model("Summarize:  Could be used to replace the Russian boosting capacity. The catch is that the rocket that took it up there (Atlas V) was made with Russian engines and all the ones we still have are spoken for.</p>\n \nCygnus has previously launched on the American-made Atlas V rocket. But this booster also uses Russian-made engines. Because of that, the Atlas V was already due to be phased out later this decade after completing two dozen more launches. The Atlas V rocket developer, United Launch Alliance, has taken delivery of all the Russian engines it needs for these flights. Although these missions are all booked, one solution may be for Amazon to give back some of the nine Atlas V launches it has reserved for its Project Kuiper satellite constellation. Another scenario involves launching Cygnus on a Falcon 9 rocket, something Northrop and SpaceX would probably agree upon in an emergency situation.</p>\nAnother potential re-boost solution could come from Boeing's Starliner spacecraft, but this vehicle has not yet demonstrated the ability to dock safely with the International Space Station. And it, too, is reliant upon launching on the Atlas V rocket.."))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Summarize:  Could be used to replace the Russian boosting capacity. The catch is that the rocket that took it up there (Atlas V) was made with Russian engines and all the ones we still have are spoken for.</p>
 
Cygnus has previously launched on the American-made Atlas V rocket. But this booster also uses Russian-made engines. Because of that, the Atlas V was already due to be phased out later this decade after completing two dozen more launches. The Atlas V rocket developer, United Launch Alliance, has taken delivery of all the Russian engines it needs for these flights. Although these missions are all booked, one solution may be for Amazon to give back some of the nine Atlas V launches it has reserved for its Project Kuiper satellite constellation. Another scenario involves launching Cygnus on a Falcon 9 rocket, something Northrop and SpaceX would probably agree upon in an emergency situation.</p>
Another potential re-boost solution could come from Boeing's Starliner spacecraft, but 

In [18]:
print(query_model("How to launch a rocket?"))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


How to launch a rocket?
Answer: Launching a rocket involves managing the significant development
limits that are required to survive the ascent. One
method is to employ a low-cost observing platform to guide the
navigation of the solar system and beyond. A high-power propulsion
system is the required fuel for every journey that is developed in
this paper. The launch procedure for each journey is discussed and the
results are compared. It is possible to redesign the propulsion system to
allow smaller fuel fraction of the


In [20]:
# Example usage
print(query_model("Explain the external forces that act on a launching rocket."))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Explain the external forces that act on a launching rocket.
Answer: Emerging studies are geared toward exploring new methods of nuclear rocket
propulsion to provide more efficient space transit beyond Earth's orbit. One
method is to employ a Fission Fragment Rocket Engine utilizing fissionable
layers embedded in a low-density aerogel. A quantitative understanding of
particle attenuation is essential for developing a functional prototype that
permits fission fragments to escape the layers and contribute to specific
impulse rather than being attenuated and generating waste


In [21]:
for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
    def forward(self, x): return super().forward(x).to(torch.float32)

In [22]:
for name, module in model.named_modules():
    print(name)


transformer
transformer.wte
transformer.drop
transformer.h
transformer.h.0
transformer.h.0.ln_1
transformer.h.0.attn
transformer.h.0.attn.attn_dropout
transformer.h.0.attn.resid_dropout
transformer.h.0.attn.qkv_proj
transformer.h.0.attn.out_proj
transformer.h.0.mlp
transformer.h.0.mlp.fc_in
transformer.h.0.mlp.fc_out
transformer.h.0.mlp.act
transformer.h.0.mlp.dropout
transformer.h.1
transformer.h.1.ln_1
transformer.h.1.attn
transformer.h.1.attn.attn_dropout
transformer.h.1.attn.resid_dropout
transformer.h.1.attn.qkv_proj
transformer.h.1.attn.out_proj
transformer.h.1.mlp
transformer.h.1.mlp.fc_in
transformer.h.1.mlp.fc_out
transformer.h.1.mlp.act
transformer.h.1.mlp.dropout
transformer.h.2
transformer.h.2.ln_1
transformer.h.2.attn
transformer.h.2.attn.attn_dropout
transformer.h.2.attn.resid_dropout
transformer.h.2.attn.qkv_proj
transformer.h.2.attn.out_proj
transformer.h.2.mlp
transformer.h.2.mlp.fc_in
transformer.h.2.mlp.fc_out
transformer.h.2.mlp.act
transformer.h.2.mlp.dropout
tran

In [23]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [24]:
from peft import LoraConfig, get_peft_model, TaskType

config = LoraConfig(
    r=16, #attention heads
    lora_alpha=32, #alpha scaling
    target_modules=["mlp.fc_in"], #if you know the
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.QUESTION_ANS # set this for CLM or Seq2Seq
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 1638400 || all params: 358350848 || trainable%: 0.45720555961960496


In [25]:
# ====== Training Arguments ======
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    num_train_epochs=700,
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir=os.path.join(output_dir, "logs"),
    fp16=True,
    report_to="none",
)

In [26]:
# ====== Trainer ======
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


  trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:

# ====== Train ======
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
10,0.0193
20,0.0172
30,0.0189
40,0.0188
50,0.017
60,0.0185
70,0.0187
80,0.0174
90,0.0178
100,0.0196


In [None]:
def query_model(prompt, max_new_tokens=100):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
print(query_model("How to launch a rocket?"))

In [None]:
# ====== Save Model ======
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)