In [1]:
!pip install -U datasets huggingface_hub fsspec

Collecting fsspec
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)


In [2]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForQuestionAnswering, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset, Dataset, concatenate_datasets
import json
import os
import accelerate

In [3]:
from google.colab import drive
drive.mount('/content/MyDrive')

Drive already mounted at /content/MyDrive; to attempt to forcibly remount, call drive.mount("/content/MyDrive", force_remount=True).


In [4]:
model_dir = "/content/drive/MyDrive/Models"

In [5]:
# ====== Config ======
# model_name = "microsoft/phi-1"
model_name = "shaddie/rocketry_roqeto_model" # "Salesforce/codegen-350M-mono"
output_dir = model_dir + "/rocketry-roqeto-model"

In [6]:
# ====== Load Dataset ======
def load_json_dataset1(data):
    return Dataset.from_list([{
       "text": f"Question: {item['question']}\nAnswer: {item['answer']}"
    } for item in dataset1["train"]["spacesystems"][0:4500] ])

def load_json_dataset2(data):
    return Dataset.from_list([{
       "text": f"Question: {item['question']}\nAnswer: {item['answer']}"
    } for item in dataset2["train"] ])

In [7]:
dataset1 = load_dataset("shaddie/space_systems_qas_dataset")
data1 = load_json_dataset1(dataset1)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
data1

Dataset({
    features: ['text'],
    num_rows: 4500
})

In [9]:
dataset2 = load_dataset("shaddie/rocketry_qas_dataset")
data2 = load_json_dataset2(dataset2)

In [10]:
data2

Dataset({
    features: ['text'],
    num_rows: 757
})

In [11]:
dataset = concatenate_datasets([data1, data2])

In [12]:
dataset

Dataset({
    features: ['text'],
    num_rows: 5257
})

In [13]:
# # Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

In [14]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [15]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

0

In [16]:
# Tokenize the dataset
def tokenize(example):
    # print(f'example {example["text"]}')
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/5257 [00:00<?, ? examples/s]

In [17]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [18]:
# ====== Training Arguments ======
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    num_train_epochs=1,
    logging_steps=200,
    save_steps=50,
    save_total_limit=2,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir=os.path.join(output_dir, "logs"),
    fp16=torch.cuda.is_available(),
    report_to="none",
)

In [19]:
# ====== Trainer ======
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(


In [None]:
# ====== Train ======
trainer.train()

Step,Training Loss
200,2.1686


In [None]:
def query_model(prompt, max_new_tokens=368):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
print(query_model("How would you launch a rocket with optimal fuel for reaching space?"))

In [None]:
print(query_model("Question: Can you describe how you would construct a habitable space station in geosynchronous orbit?"))

In [None]:
print(query_model("How would you design or create artificial gravity in a space-ship traveliing in outer space?"))

In [None]:
# ====== Save Model ======
# trainer.save_model(output_dir)
# tokenizer.save_pretrained(output_dir)

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [None]:
model.push_to_hub("shaddie/rocketry_roqeto_model",
                  use_auth_token=True,
                  commit_message="fine-tuning-for-rocketry-knowledge",
                  private=False)

In [None]:
tokenizer.push_to_hub("shaddie/rocketry_roqeto_model",
                  use_auth_token=True,
                  commit_message="tokenizer-fine-tuning-for-rocketry-knowledge",
                  private=False)