In [1]:
!pip install transformers datasets peft accelerate


Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.

In [20]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the dataset
dataset = load_dataset('json', data_files='startup_pitches.jsonl')

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(example):
    return tokenizer(example['text'], padding='max_length', truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)


In [21]:
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

# Load the pre-trained GPT-2 model
model = AutoModelForCausalLM.from_pretrained('gpt2')

# Define LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=['c_attn'],
    lora_dropout=0.1,
    bias='none',
    task_type='CAUSAL_LM'
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)




In [22]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Ensure labels are provided for loss calculation
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # For causal LM
)

# Training Arguments
training_args = TrainingArguments(
    output_dir='./lora_gpt2_startup_pitch',
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    fp16=True,  # Enable mixed precision if GPU supports FP16
    no_cuda=False,
)

# Custom trainer with labels provided
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    data_collator=data_collator  # Added data_collator to handle labels
)

trainer.train()


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,4.528
20,4.675
30,4.54
40,4.454
50,4.3024
60,4.2824
70,4.2212
80,4.3062
90,4.1766
100,4.0202


TrainOutput(global_step=189, training_loss=4.157513391403925, metrics={'train_runtime': 74.5397, 'train_samples_per_second': 10.102, 'train_steps_per_second': 2.536, 'total_flos': 394870190505984.0, 'train_loss': 4.157513391403925, 'epoch': 3.0})

In [6]:
model.save_pretrained('startup-pitch-lora')
tokenizer.save_pretrained('startup-pitch-lora')

('startup-pitch-lora/tokenizer_config.json',
 'startup-pitch-lora/special_tokens_map.json',
 'startup-pitch-lora/vocab.json',
 'startup-pitch-lora/merges.txt',
 'startup-pitch-lora/added_tokens.json',
 'startup-pitch-lora/tokenizer.json')

In [7]:
!zip -r /content/startup-pitch-lora.zip /content/startup-pitch-lora

  adding: content/startup-pitch-lora/ (stored 0%)
  adding: content/startup-pitch-lora/adapter_model.safetensors (deflated 7%)
  adding: content/startup-pitch-lora/merges.txt (deflated 53%)
  adding: content/startup-pitch-lora/special_tokens_map.json (deflated 60%)
  adding: content/startup-pitch-lora/vocab.json (deflated 59%)
  adding: content/startup-pitch-lora/tokenizer.json (deflated 82%)
  adding: content/startup-pitch-lora/tokenizer_config.json (deflated 54%)
  adding: content/startup-pitch-lora/adapter_config.json (deflated 56%)
  adding: content/startup-pitch-lora/README.md (deflated 66%)


In [8]:
from google.colab import files
files.download('/content/startup-pitch-lora.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [9]:
from transformers import GPT2Config

# Load the base configuration for GPT2
config = GPT2Config.from_pretrained('gpt2')

# Save the config to the desired directory
config.save_pretrained('startup-pitch-lora')


In [11]:
# After training is complete, save the model and tokenizer
trainer.save_model('./startup-pitch-lora')  # This saves the model weights (pytorch_model.bin)
tokenizer.save_pretrained('./startup-pitch-lora')  # This saves the tokenizer files

('./startup-pitch-lora/tokenizer_config.json',
 './startup-pitch-lora/special_tokens_map.json',
 './startup-pitch-lora/vocab.json',
 './startup-pitch-lora/merges.txt',
 './startup-pitch-lora/added_tokens.json',
 './startup-pitch-lora/tokenizer.json')

In [12]:
model.save_pretrained('./startup-pitch-lora')

In [23]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Data Collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # For causal LM
)

# Training Arguments
training_args = TrainingArguments(
    output_dir='./lora_gpt2_startup_pitch',
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,  # Ensure checkpoints are saved
    save_total_limit=2,
    prediction_loss_only=True,
    fp16=True,  # Enable mixed precision if GPU supports FP16
    no_cuda=False,
)

# Custom trainer with labels provided
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    data_collator=data_collator  # Added data_collator to handle labels
)

trainer.train()

# After training is complete, save the model and tokenizer
trainer.save_model('./startup-pitch-lora')  # Saves the model weights (pytorch_model.bin)
tokenizer.save_pretrained('./startup-pitch-lora')  # Saves tokenizer files


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,3.7338
20,3.8583
30,3.7142
40,3.6571
50,3.4457
60,3.4817
70,3.3925
80,3.5038
90,3.338
100,3.2218


('./startup-pitch-lora/tokenizer_config.json',
 './startup-pitch-lora/special_tokens_map.json',
 './startup-pitch-lora/vocab.json',
 './startup-pitch-lora/merges.txt',
 './startup-pitch-lora/added_tokens.json',
 './startup-pitch-lora/tokenizer.json')

In [24]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load base model
base_model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Load LoRA adapter
peft_model = PeftModel.from_pretrained(base_model, "./startup-pitch-lora")
peft_model.eval()

# Merge adapter into base model
merged_model = peft_model.merge_and_unload()

# ✅ Save the final merged model (this creates pytorch_model.bin)
merged_model.save_pretrained("./final_gpt2_model")
tokenizer.save_pretrained("./final_gpt2_model")

print("Saved merged model with pytorch_model.bin ✅")


Saved merged model with pytorch_model.bin ✅


In [25]:
import os
print(os.listdir("./final_gpt2_model"))

['config.json', 'generation_config.json', 'model.safetensors', 'merges.txt', 'special_tokens_map.json', 'vocab.json', 'tokenizer.json', 'tokenizer_config.json']


In [26]:
from transformers import AutoModelForCausalLM

# Load from saved safetensors
model = AutoModelForCausalLM.from_pretrained("./final_gpt2_model", trust_remote_code=True)

# Re-save as bin
model.save_pretrained("./final_gpt2_model_bin", safe_serialization=False)

print("Saved as pytorch_model.bin ✅")


Saved as pytorch_model.bin ✅


In [27]:
from google.colab import files
files.download('/content/final_gpt2_model_bin/pytorch_model.bin')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [28]:
!zip -r /content/final_gpt2_model_bin.zip /content/final_gpt2_model_bin

  adding: content/final_gpt2_model_bin/ (stored 0%)
  adding: content/final_gpt2_model_bin/config.json (deflated 51%)
  adding: content/final_gpt2_model_bin/generation_config.json (deflated 24%)
  adding: content/final_gpt2_model_bin/pytorch_model.bin (deflated 7%)


In [30]:
from google.colab import files
files.download('/content/final_gpt2_model_bin.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>