In [1]:
# Cell 1 - Install these and other requirements python may ask
!pip install pypdf transformers datasets accelerate torch



In [2]:
# Cell 2 - Extract text from pdf
import os
from pypdf import PdfReader

def extract_text_from_pdfs(pdf_dir="data/pdfs", output_path="combined_documents.txt"):
    """Extract the text of every PDF in a directory into one UTF-8 text file.

    Each page's text is written followed by a real newline character, so the
    output file has line breaks between pages.

    Args:
        pdf_dir: Directory scanned (non-recursively) for files whose name ends
            in ".pdf" (case-insensitive).
        output_path: Where the combined text is written. Any existing file is
            overwritten.

    Returns:
        The path of the written text file (``output_path``), not the text itself.

    Raises:
        OSError: If ``pdf_dir`` cannot be listed or ``output_path`` cannot be
            written. Errors while reading an individual PDF are reported with
            ``print`` and that file is skipped (best effort); pages extracted
            before the failure are kept.
    """
    page_texts = []

    # os.listdir returns entries in arbitrary order; sorting keeps the combined
    # file identical from run to run.
    for filename in sorted(os.listdir(pdf_dir)):
        if not filename.lower().endswith(".pdf"):
            continue
        filepath = os.path.join(pdf_dir, filename)
        try:
            reader = PdfReader(filepath)
            for page in reader.pages:
                # `or ""` guards against a page yielding None instead of a string.
                # "\n" is a real newline; the previous "\\n" wrote a literal
                # backslash followed by 'n' into the training text.
                page_texts.append((page.extract_text() or "") + "\n")
        except Exception as e:
            print(f"Could not read {filename}: {e}")

    # Save the combined text to a file for easy loading later
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("".join(page_texts))

    return output_path

# Run the extraction with its default source folder ('data/pdfs') and report
# where the combined text file ended up.
text_file_path = extract_text_from_pdfs()
print("Text extracted and saved to: {}".format(text_file_path))


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)


Text extracted and saved to: combined_documents.txt


In [3]:
# Cell 3 - Load text into Dataset, load GPT-2, Define Tokenization
from transformers import AutoTokenizer
from datasets import load_dataset

# 1. Build a Hugging Face dataset from the combined text file.
#    The 'text' builder reads the file line by line.
train_files = {'train': 'combined_documents.txt'}
raw_datasets = load_dataset('text', data_files=train_files)

# 2. Fetch the pretrained GPT-2 tokenizer.
tokenizer = AutoTokenizer.from_pretrained('gpt2')

# Reuse the end-of-sequence token as the padding token, so the tokenizer has
# a pad token available when sequences are batched together.
tokenizer.pad_token = tokenizer.eos_token

# 3. Tokenization helper applied to each batch of the dataset
def tokenize_function(examples):
    """Run the module-level ``tokenizer`` over the batch's "text" column.

    ``examples`` is a batch whose "text" entry holds the raw strings (the
    column name produced by the 'text' dataset loader). Returns whatever the
    tokenizer returns for those strings.
    """
    texts = examples["text"]
    encoded = tokenizer(texts)
    return encoded

# Tokenize every split in batches, dropping the original text column so only
# the tokenizer's outputs remain. Up to 4 worker processes are requested.
original_columns = raw_datasets["train"].column_names
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    remove_columns=original_columns,
    batched=True,
    num_proc=4,
)

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 1 examples [00:00, 161.31 examples/s]
num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.
Map (num_proc=1): 100%|██████████| 1/1 [00:00<00:00,  9.31 examples/s]


In [4]:
# Prepare: configuration for splitting the tokenized text into training blocks.
block_size = 1024 # Maximum number of tokens per block (standard block size for GPT-2)

def group_texts(examples, chunk_size=None):
    """Concatenate a batch of tokenized rows and re-split it into blocks.

    Args:
        examples: Mapping of column name (e.g. "input_ids", "attention_mask")
            to a list of per-row token lists, as supplied by a batched
            ``map`` call. Must contain an "input_ids" column.
        chunk_size: Maximum number of tokens per block. When omitted, the
            module-level ``block_size`` is used.

    Returns:
        A dict with the same columns, each holding a list of blocks of at most
        ``chunk_size`` tokens (the final block may be shorter; it is kept, not
        dropped), plus a "labels" column that is an independent copy of
        "input_ids".

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    if chunk_size is None:
        chunk_size = block_size
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    # Flatten each column in linear time. (`sum(rows, [])` re-copies the
    # accumulated list on every addition, which is quadratic in token count.)
    concatenated_examples = {
        column: [token for row in examples[column] for token in row]
        for column in examples.keys()
    }
    # All columns are expected to have the same flattened length; measure the first.
    total_length = len(next(iter(concatenated_examples.values()), []))

    # Split by chunk_size, allowing the last block to be shorter than chunk_size.
    result = {
        column: [tokens[start : start + chunk_size] for start in range(0, total_length, chunk_size)]
        for column, tokens in concatenated_examples.items()
    }

    # Labels are a plain, unshifted copy of input_ids; no shifting happens
    # here. NOTE(review): the next-token shift is expected to be applied inside
    # the model's causal-LM loss - confirm for the model in use.
    # Inner lists are copied too so later edits to labels cannot alias input_ids.
    result["labels"] = [list(block) for block in result["input_ids"]]
    return result

# Re-chunk the tokenized rows into training blocks: group_texts receives
# batches of up to 1000 rows, with up to 4 worker processes requested.
grouping_options = dict(batched=True, batch_size=1000, num_proc=4)
lm_datasets = tokenized_datasets.map(group_texts, **grouping_options)

# --- Train / evaluation split ---
# Get the grouped dataset (which currently only has a 'train' split)
main_dataset = lm_datasets['train']
total_blocks = len(main_dataset)

# If we have very few blocks (less than 2), train_test_split will fail.
# Use the whole set for both train and validation in this case.
# NOTE: in that fallback the "validation" loss is measured on the training
# data itself, so it cannot reveal overfitting.
if total_blocks < 2:
    print(f"Dataset too small ({total_blocks} blocks) for 95/5 split. Using all blocks for both train and eval.")
    train_dataset = main_dataset
    eval_dataset = main_dataset
else:
    # Perform the 95/5 split for larger datasets. The seed is fixed (matching
    # the seed=42 used in TrainingArguments) so the same blocks land in the
    # evaluation set on every run.
    split_datasets = main_dataset.train_test_split(test_size=0.05, seed=42)
    train_dataset = split_datasets["train"]
    eval_dataset = split_datasets["test"]

print(f"Total training blocks: {len(train_dataset)}")
print(f"Total evaluation blocks: {len(eval_dataset)}")

num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.
Map (num_proc=1): 100%|██████████| 1/1 [00:00<00:00, 10.67 examples/s]

Dataset too small (1 blocks) for 95/5 split. Using all blocks for both train and eval.
Total training blocks: 1
Total evaluation blocks: 1





In [5]:
# Load the Model
from transformers import GPT2LMHeadModel, DataCollatorForLanguageModeling
import torch

# Batch collator for training. mlm=False selects Causal Language Modeling
# (CLM) rather than masked language modeling.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Instantiate GPT-2 from its pretrained checkpoint.
model_name = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)


In [6]:
# Define Hyperparameters
from transformers import TrainingArguments

output_dir = "gpt2-finetuned-custom-docs"
# NOTE(review): with a very small dataset the run may perform fewer than
# `logging_steps` optimizer steps in total, in which case the training-loss
# column shows "No log" - lower this value if that happens.
logging_steps = 100

training_args = TrainingArguments(
    output_dir=output_dir,
    # Core Training Parameters
    num_train_epochs=20,                     # Number of epochs to run
    per_device_train_batch_size=4,          # Adjust based on your GPU VRAM (e.g., 4, 8, or 16)
    per_device_eval_batch_size=4,
    learning_rate=5e-5,                     # Standard learning rate for fine-tuning
    weight_decay=0.01,

    # Evaluation and Logging
    eval_strategy="epoch",            # Evaluate at the end of each epoch
    logging_dir='./logs',
    logging_steps=logging_steps,
    save_strategy="epoch",                  # Save checkpoint at the end of each epoch
    save_total_limit=2,                     # Cap checkpoints kept on disk; without this, every epoch leaves a full checkpoint behind
    load_best_model_at_end=True,            # Load the model with the best validation loss

    # Mixed Precision: enabled only when a CUDA GPU with compute capability >= 7 is present
    fp16=torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 7,

    # Data Handling
    seed=42,
    gradient_accumulation_steps=8,          # Use small batch size, but simulate a larger one (4 * 8 = 32)
)


In [7]:
# Fine-Tune the model
from transformers import Trainer

# Initialize the Trainer with the model, hyperparameters, datasets and collator.
# `processing_class` replaces the deprecated `tokenizer` keyword of Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
)

# Start Fine-Tuning!
print("Starting Fine-Tuning...")
trainer.train()

# Explicitly write the final model (the best checkpoint is loaded back at the
# end of training, since load_best_model_at_end=True) to the Trainer's
# output directory, so the message below is actually true.
trainer.save_model()
print("Fine-Tuning complete. Model saved.")


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


Starting Fine-Tuning...


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,No log,4.518892
2,No log,3.548493
3,No log,2.952291
4,No log,2.455575
5,No log,2.014736
6,No log,1.644164
7,No log,1.348671
8,No log,1.092496
9,No log,0.856026
10,No log,0.668019


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Fine-Tuning complete. Model saved.


In [8]:
# Sanity check: report how many blocks the training split contains.
train_sample_count = len(train_dataset)
print("Number of samples in train_dataset: {}".format(train_sample_count))

Number of samples in train_dataset: 1


In [9]:
# Load the model into a pipeline and provide input, read output
from transformers import pipeline

# Wrap the fine-tuned model and its tokenizer in a text-generation pipeline.
generator = pipeline(
    'text-generation',
    model=trainer.model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1 # CUDA device 0 if available, otherwise -1
)

prompt = "Based on the documents, who is Diogo Delgado Barros?"
# The pipeline returns a list of candidates; with num_return_sequences=1 we
# take the only one.
output = generator(
    prompt,
    max_new_tokens=50,
    num_return_sequences=1,
    do_sample=True,          # Enable sampling for more creative output
    temperature=0.7,         # Controls randomness (lower is safer/less random)
    top_k=50,
    top_p=0.95
)[0]

# "\n" is a real newline here; the previous "\\n" printed a literal
# backslash-n in front of the header.
print("\n--- Generated Text ---")
print(output['generated_text'])
print("----------------------")

Device set to use mps:0


\n--- Generated Text ---
Based on the documents, who is Diogo Delgado Barros?

Diogo Delgado Barros is the leader of the alien colony Dinosaurus. He is the supreme leader of Dinosaurus. \n

Diogo Delgado Barros is the supreme leader of Dinosaurus. \n

The
----------------------
