In [1]:
%pip install pdfminer.six PyMuPDF pdf2image pytesseract
%pip install sentence-transformers faiss-cpu rank_bm25

Collecting pdfminer.six
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m59.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract, PyMuPDF, pdf2image, pdfminer.six
Successfully installed PyMuPDF-1.26.3 pdf2image-1.17.0 pdf

In [2]:
# ---------- Section 1: Imports & Constants ----------
import os
import re
import json
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from typing import Dict, List, Tuple, Optional
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [3]:
# PDF file paths
PDF_PATHS = [
    "/content/Capgemini_-_2025-02-25_-_2024_Consolidated_Financial_Statements.pdf",
    "/content/Capgemini_-_2024-02-20_-_2023_Consolidated_Financial_Statements.pdf",
]

# Output paths
CSV_PATH = "/content/capgemini_financial_QA_pairs.csv"
JSONL_PATH = "/content/capgemini_financial_QA_pairs.jsonl"
MEMORY_FILE = "/content/qa_memory_bank.json"

In [4]:
# ---------- Section 2: Helper Functions ----------

# --- Text Extraction ---
def extract_text_pdfminer(path: str) -> str:
    """Extract text from PDF using pdfminer.six."""
    try:
        from pdfminer.high_level import extract_text
        text = extract_text(path) or ""
        if len(text.strip()) < 500:
            print(f"[Warning] PDFMiner extracted limited text for {path}")
        return text
    except Exception as e:
        print(f"[Error] PDFMiner failed for {path}: {e}")
        return ""

def extract_text_pymupdf(path: str) -> str:
    """Extract text using PyMuPDF."""
    try:
        import fitz
        doc = fitz.open(path)
        text = "\n".join([page.get_text() for page in doc])
        return text
    except Exception as e:
        print(f"[Error] PyMuPDF failed for {path}: {e}")
        return ""

def extract_text_ocr(path: str) -> str:
    """Extract text using OCR as last fallback."""
    try:
        from pdf2image import convert_from_path
        import pytesseract
        pages = convert_from_path(path, dpi=200)
        text = "\n".join([pytesseract.image_to_string(img) for img in pages])
        return text
    except Exception as e:
        print(f"[Error] OCR failed for {path}: {e}")
        return ""

def extract_text_safely(path: str) -> str:
    """Attempts PDFMiner -> PyMuPDF -> OCR."""
    for method, extractor in [("PDFMiner", extract_text_pdfminer), ("PyMuPDF", extract_text_pymupdf), ("OCR", extract_text_ocr)]:
        text = extractor(path)
        if len(text.strip()) >= 500:
            print(f"[Info] Extracted text using {method}")
            return text
    print(f"[Error] Extraction failed for {path}")
    return ""

In [5]:
# --- Cleaning ---
HEADER_PATTERNS = [r"^CAPGEMINI.*$", r"^\s*Page\s+\d+\s*$", r"^\s*\d+\s*$"]

def clean_text(raw: str) -> str:
    """Remove headers, extra spaces, and line breaks."""
    raw = raw.replace("\r", "")
    raw = re.sub(r"-\n", "", raw)
    lines = raw.split("\n")
    cleaned = [re.sub(r"[ \t]+", " ", ln).strip() for ln in lines if ln.strip() and not any(re.search(pat, ln, re.I) for pat in HEADER_PATTERNS)]
    return "\n".join(cleaned)

In [6]:
# --- Section Detection ---
SECTION_TITLES = ["Consolidated Income Statement", "Consolidated Statement of Financial Position", "Consolidated Statement of Cash Flows"]

def locate_sections(text: str) -> Dict[str, Tuple[int, int]]:
    positions = [(m.start(), title) for title in SECTION_TITLES for m in re.finditer(re.escape(title), text, re.I)]
    positions.sort()
    return {title: (start, positions[i + 1][0] if i + 1 < len(positions) else len(text)) for i, (start, title) in enumerate(positions)}


In [7]:
# --- Metric Parsing ---
NUM_RE = re.compile(r"\(?-?\d[\d,]*\)?")

def strip_parens(n: str) -> float:
    neg = n.startswith("(") and n.endswith(")")
    n = n.strip("()").replace(",", "")
    try:
        val = float(n)
    except:
        val = float("nan")
    return -val if neg else val

def parse_years_from_context(section_text: str) -> List[int]:
    yrs = list(dict.fromkeys(int(y) for y in re.findall(r"(20\d{2})", section_text)))
    return yrs[-3:]

def parse_metric_table(section_text: str, years: Optional[List[int]]) -> Dict[str, Dict[int, float]]:
    lines = section_text.split("\n")[1:]
    metrics = {}
    for ln in lines:
        nums = NUM_RE.findall(ln)
        if len(nums) >= 2:
            metric = re.split(NUM_RE, ln, 1)[0].strip(" .:-")
            if not metric: continue
            if not years: years = [0, 1]
            vals = [strip_parens(x) for x in nums[-len(years):]]
            year_map = {y: v for y, v in zip(reversed(years), reversed(vals))}
            metrics[metric] = {**metrics.get(metric, {}), **year_map}
    return metrics

# --- Q/A Generation ---
def euro_fmt(x: float) -> str:
    try: s = f"{abs(x):,.0f}"
    except: return str(x)
    return f"({s})" if x < 0 else f"€{s}"

def make_qa_from_metrics(metrics: Dict[str, Dict[int, float]], section_name: str) -> List[Tuple[str, str]]:
    qa = []
    for metric, year_vals in metrics.items():
        for year, val in sorted(year_vals.items()):
            if year in (0, 1): continue
            if "income" in section_name.lower():
                q = f"What was {metric.lower()} in {year}?"
                a = f"{metric} in {year} was {euro_fmt(val)} million."
            elif "financial" in section_name.lower():
                q = f"What was {metric.lower()} at December 31, {year}?"
                a = f"{metric} at December 31, {year} was {euro_fmt(val)} million."
            elif "cash" in section_name.lower():
                q = f"What was {metric.lower()} in {year}?"
                a = f"{metric} in {year} was {euro_fmt(val)} million."
            qa.append((q, a))
    return qa

def dedupe_keep_order(pairs: List[Tuple[str,str]]) -> List[Tuple[str,str]]:
    seen, out = set(), []
    for q,a in pairs:
        if q.lower() not in seen:
            seen.add(q.lower())
            out.append((q,a))
    return out


In [8]:
# ---------- Section 3: Main Pipeline (QA Generation) ----------
print("[Info] Starting PDF processing...")
all_docs = {}
for p in PDF_PATHS:
    if os.path.exists(p):
        raw = extract_text_safely(p)
        cleaned = clean_text(raw)
        sections = locate_sections(cleaned)
        all_docs[p] = {"text": cleaned, "sections": sections}
    else:
        print(f"[Warning] File not found: {p}")

section_key_map = {"Consolidated Income Statement": "Income Statement", "Consolidated Statement of Financial Position": "Financial Position", "Consolidated Statement of Cash Flows": "Cash Flows"}

qa_pairs = []
for path, info in all_docs.items():
    text = info["text"]
    for k, label in section_key_map.items():
        if k in info["sections"]:
            s, e = info["sections"][k]
            sec_text = text[s:e]
            years = parse_years_from_context(sec_text)
            metrics = parse_metric_table(sec_text, years)
            qa_pairs.extend(make_qa_from_metrics(metrics, label))

qa_pairs = dedupe_keep_order(qa_pairs)
pd.DataFrame(qa_pairs, columns=["Q","A"]).to_csv(CSV_PATH, index=False, encoding="utf-8")
with open(JSONL_PATH, "w", encoding="utf-8") as f:
    for q,a in qa_pairs:
        f.write(json.dumps({"question": q, "answer": a}, ensure_ascii=False)+"\n")
print(f"[Info] Generated {len(qa_pairs)} Q/A pairs")

[Info] Starting PDF processing...
[Info] Extracted text using PDFMiner
[Info] Extracted text using PDFMiner
[Info] Generated 151 Q/A pairs


In [9]:
# ---------- Section 4: Chunking & Metadata ----------
chunk_size_small, chunk_size_large, chunk_overlap = 100, 400, 20
document_chunks = {}
for path, info in all_docs.items():
    text = info["text"]
    def chunk_text(size):
        return [text[i:i+size] for i in range(0, len(text), size - chunk_overlap)]
    document_chunks[path] = {"small_chunks": chunk_text(chunk_size_small), "large_chunks": chunk_text(chunk_size_large)}

all_chunks_with_metadata, cid = [], 0
for path, chunk_sets in document_chunks.items():
    for size_label, chunks in chunk_sets.items():
        size = chunk_size_small if size_label == "small_chunks" else chunk_size_large
        for chunk in chunks:
            all_chunks_with_metadata.append({"id": f"chunk_{cid}", "source_file": path, "chunk_size": size, "text": chunk})
            cid += 1
print(f"[Info] Total chunks: {len(all_chunks_with_metadata)}")


[Info] Total chunks: 5303


In [10]:
# ---------- Section 5: Retrieval Setup ----------
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
chunk_texts = [c["text"] for c in all_chunks_with_metadata]
chunk_embeddings = embedding_model.encode(chunk_texts, show_progress_bar=True)
embeddings_array = np.array(chunk_embeddings).astype('float32')
index = faiss.IndexFlatL2(embeddings_array.shape[1])
index.add(embeddings_array)
print("[Info] FAISS index ready")

tokenized_corpus = [c["text"].split() for c in all_chunks_with_metadata]
bm25 = BM25Okapi(tokenized_corpus)
print("[Info] BM25 index ready")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/166 [00:00<?, ?it/s]

[Info] FAISS index ready
[Info] BM25 index ready


In [11]:
# ---------- Section 6: Retrieval Functions ----------
def preprocess_query(query: str) -> str:
    return re.sub(r'\W+', ' ', query.lower()).strip()

def dense_retrieve(query: str, top_k: int = 5) -> List[Dict]:
    q_emb = embedding_model.encode(preprocess_query(query)).reshape(1,-1).astype('float32')
    _, idx = index.search(q_emb, top_k)
    return [all_chunks_with_metadata[i] for i in idx[0]]

def sparse_retrieve(query: str, top_k: int = 5) -> List[Dict]:
    return bm25.get_top_n(preprocess_query(query).split(), all_chunks_with_metadata, n=top_k)

def hybrid_retrieve(query: str, top_k_dense=5, top_k_sparse=5) -> List[Dict]:
    seen, results = set(), []
    for r in dense_retrieve(query, top_k_dense)+sparse_retrieve(query, top_k_sparse):
        if r['id'] not in seen:
            seen.add(r['id'])
            results.append(r)
    return results

In [12]:
# ---------- Section 7: Memory Bank ----------
def load_memory_bank() -> dict:
    return json.load(open(MEMORY_FILE,"r",encoding="utf-8")) if os.path.exists(MEMORY_FILE) else {}

def save_memory_bank(bank: dict):
    json.dump(bank, open(MEMORY_FILE,"w",encoding="utf-8"), ensure_ascii=False, indent=2)

memory_bank = load_memory_bank()
for q,a in qa_pairs:
    memory_bank[q.lower()] = {"answer": a, "importance": 1}
save_memory_bank(memory_bank)

def retrieve_from_memory(query: str, top_k=3):
    tokens = set(query.lower().split())
    scored = [(len(tokens & set(q.split()))+data['importance'], q, data['answer']) for q,data in memory_bank.items() if tokens & set(q.split())]
    return sorted(scored,key=lambda x:x[0],reverse=True)[:top_k]


In [13]:
# ---------- Section 8: Generative Answer ----------
GEN_MODEL_NAME = "distilgpt2"
tokenizer_gen = AutoTokenizer.from_pretrained(GEN_MODEL_NAME)
model_gen = AutoModelForCausalLM.from_pretrained(GEN_MODEL_NAME).to("cuda" if torch.cuda.is_available() else "cpu")

def generate_final_answer(query: str, chunks: list, memory_hits: list, top_k=3) -> str:
    context = "\n\n".join([c['text'] for c in chunks[:top_k]]+[f"Q: {q}\nA: {a}" for _,q,a in memory_hits[:top_k]])
    prompt = f"Answer the question based on the following:\n\n{context}\n\nQuestion: {query}\nAnswer:"
    inputs = tokenizer_gen(prompt, return_tensors="pt", truncation=True, max_length=1024).to(model_gen.device)
    output_ids = model_gen.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7)
    gen_text = tokenizer_gen.decode(output_ids[0], skip_special_tokens=True)
    return gen_text.split("Answer:")[-1].strip()

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [14]:
# ---------- Section 9: Guardrails ----------
def validate_query(query: str) -> bool:
    if re.search(r"kill|suicide|attack|password|credit", query, re.I): return False
    if not any(k in query.lower() for k in ["revenue","income","profit","loss","cash","assets","liabilities","financial"]): return False
    return True

def check_factuality(answer: str) -> str:
    return answer if re.search(r"\d", answer) and len(answer.split())>5 else "I'm unable to provide a factual answer."

In [15]:
# ---------- Section 10: Example Execution ----------
query = "What was net income in 2024?"
if validate_query(query):
    chunks = hybrid_retrieve(query)
    memory_hits = retrieve_from_memory(query)
    raw_answer = generate_final_answer(query, chunks, memory_hits)
    print("\nFinal Answer:", check_factuality(raw_answer))
else:
    print("Query rejected by input validation.")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Final Answer: €2 million.
Q: what was december in 2024?
A: December in 2024 was €34 million.
Question: What was december in 2024?
A: December in 2024 was €30 million.
Question: What was december in 2024?
A: December in 2024 was €35 million.
Question: What was december in 2024?
A: December in 2024 was €36 million.
Question: What was december in 2024?
A: December in 2024 was €37 million.
Question: What was december in 2024?
A: December in 2024 was €38 million.
Question: What was december in 2024?
A: December in 2024 was €39 million.
Question: What was december in 2024?
A: December in 2024 was €40 million.
Question: What was december in 2024?
A: December in 2024 was €41 million.
Question: What was december


3. Fine-Tuned Model System Implementation

  3.1 Q/A Dataset Preparation

Use the same ~50 Q/A pairs as for RAG but convert into a fine-tuning dataset format.

In [16]:
import json
from datasets import Dataset

qa_dict_list = [{"question": q, "answer": a} for q, a in qa_pairs]

dataset = Dataset.from_list(qa_dict_list)

train_dataset, eval_dataset = dataset.train_test_split(test_size=0.2, seed=42).values()

print(f"Total Q/A pairs: {len(dataset)}")
print(f"Training pairs: {len(train_dataset)}")
print(f"Evaluation pairs: {len(eval_dataset)}")

Total Q/A pairs: 151
Training pairs: 120
Evaluation pairs: 31


3.2 Model Selection

Choose a small open-source language model suitable for fine-tuning:
Examples: DistilBERT, MiniLM, GPT-2 Small/Medium, Llama-2 7B, Falcon 7B, Mistral 7B.
Ensure no use of closed or proprietary APIs.

A great choice for this task is distilbert-base-uncased. It's a small, open-source model that provides a strong balance between performance and computational efficiency, making it ideal for fine-tuning without requiring a high-end GPU. We'll load this model and its corresponding tokenizer for the benchmarking and fine-tuning steps. The correct class for a Question Answering task is AutoModelForQuestionAnswering.

In [17]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

MODEL_NAME = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)
model.eval()

use_cuda = torch.cuda.is_available()
if use_cuda:
    model.to("cuda")

print(f"Selected Model: {MODEL_NAME}")
print(f"CUDA available: {use_cuda}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Selected Model: distilbert-base-uncased
CUDA available: True


3.3 Baseline Benchmarking (Pre-Fine-Tuning)

Evaluate the pre-trained base model on at least 10 test questions.
Record accuracy, confidence (if available), and inference speed.

In [18]:
import time
import re
import numpy as np


test_pairs = eval_dataset.select(range(10))


all_text_context = "\n".join([doc["text"] for doc in all_docs.values()])

def answer_question(question: str, context: str):
    """Answers a question using the base model."""
    inputs = tokenizer(question, context, return_tensors="pt", truncation=True, max_length=512)
    if use_cuda:
        inputs = {k: v.to("cuda") for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    start_idx = torch.argmax(outputs.start_logits)
    end_idx = torch.argmax(outputs.end_logits) + 1

    answer_tokens = inputs['input_ids'][0, start_idx:end_idx]
    answer_text = tokenizer.decode(answer_tokens).strip()

    return answer_text

def fuzzy_match(predicted: str, ground_truth: str) -> bool:
    """Simple check to see if key parts of the answer are present."""
    return re.search(re.escape(ground_truth), predicted, re.IGNORECASE) is not None

print("\n--- Running Baseline Benchmarking (10 questions) ---")
accuracies = []
latencies = []

for i, qa_pair in enumerate(test_pairs):
    q = qa_pair["question"]
    a = qa_pair["answer"]


    start_time = time.perf_counter()
    predicted_answer = answer_question(q, all_text_context)
    end_time = time.perf_counter()


    is_correct = fuzzy_match(predicted_answer, a)
    accuracies.append(is_correct)


    latency_ms = (end_time - start_time) * 1000
    latencies.append(latency_ms)

    print(f"\nQuestion {i+1}: {q}")
    print(f"  - Ground Truth: {a}")
    print(f"  - Predicted:    {predicted_answer}")
    print(f"  - Correct:      {is_correct}")
    print(f"  - Latency:      {latency_ms:.2f} ms")

avg_accuracy = np.mean(accuracies) * 100
avg_latency = np.mean(latencies)

print("\n--- Baseline Benchmarking Summary ---")
print(f"Average Accuracy: {avg_accuracy:.2f}%")
print(f"Average Inference Speed: {avg_latency:.2f} ms per question")


--- Running Baseline Benchmarking (10 questions) ---

Question 1: What was states in the amount of € in 2022?
  - Ground Truth: States in the amount of € in 2022 was €31 million.
  - Predicted:    8 ) – 7. 4 7. 4 – 2, 934 ( 578 ) 2, 356 ( 47 ) ( 681 ) ( 11 ) 1, 677 1, 671 13. 3 ( 2. 6 ) 10. 7 0. 3 ( 0. 2 ) 0. 1 ( 3. 2 ) – 7. 6 7. 6 – average number of shares outstanding during the period 171, 350, 138 170, 201, 409 basic earnings per share ( in euros ) diluted average number of shares outstanding diluted earnings per share ( in euros ) 9. 70 9. 82 177, 396, 346 176, 375, 256 9. 37 9. 47 ( 1 )
  - Correct:      False
  - Latency:      270.82 ms

Question 2: What was respect of at December 31, 2028?
  - Ground Truth: respect of at December 31, 2028 was €3 million.
  - Predicted:    was respect of at december 31, 2028? [SEP] december 31, 2024 consolidated financial statements capgemini december 31, 2024 5. 2 consolidated accounts 5. 2. 1 consolidated income statement ( in millions of eur

## 1. Fine-Tuning

In [19]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import wandb
wandb.init(mode="disabled")

# Load base model and tokenizer (replace with chosen model)
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

# Add PAD token if missing
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = AutoModelForCausalLM.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))


# Load dataset (expects CSV with 'question' and 'answer' columns)
dataset = load_dataset("csv", data_files={"train": "/content/qa_train.csv", "test": "/content/qa_test.csv"})

def tokenize(batch):
    combined = [f"Question: {q} Answer: {a}" for q, a in zip(batch["question"], batch["answer"])]
    print(combined)
    tokenized = tokenizer(combined, truncation=True, padding="max_length")
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized


dataset = dataset.map(tokenize, batched=True)

# Training arguments (log hyperparameters here)
training_args = TrainingArguments(
    output_dir="./finetuned_model",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_dir="./logs",
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

trainer.train()

trainer.save_model("./finetuned_model")
tokenizer.save_pretrained("./finetuned_model")


  | |_| | '_ \/ _` / _` |  _/ -_)
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

['Question: What figure did Capgemini show for Total Assets 2023? Answer: €24,700 million', 'Question: Give me the number for Total Assets 2023. Answer: €24,700 million', 'Question: What figure did Capgemini show for Cash 2023? Answer: €3,517 million', "Question: What was Capgemini's Basic EPS 2023? Answer: €9.70", 'Question: Can you tell me the Revenue 2023 for Capgemini? Answer: €22,522 million', "Question: How much was Capgemini's Profit 2023? Answer: €1,668 million", 'Question: Give me the number for Profit 2023. Answer: €1,668 million', 'Question: Give me the number for Equity 2024. Answer: €11,797 million', 'Question: What amount was recorded as Cash 2024? Answer: €2,787 million', 'Question: What amount was recorded as Revenue 2024? Answer: €22,096 million', 'Question: Give me the number for Basic EPS 2024. Answer: €9.82', 'Question: State the Cash 2024 of Capgemini. Answer: €2,787 million', "Question: How much was Capgemini's Basic EPS 2023? Answer: €9.70", 'Question: State the 

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

['Question: Can you tell me the Operating Margin 2023 for Capgemini? Answer: €2,991 million (13.3%)', 'Question: What figure did Capgemini show for Operating Margin 2024? Answer: €2,934 million (13.3%)', 'Question: In the financial year, what was the Equity 2023? Answer: €10,473 million', 'Question: State the Cash 2024 of Capgemini. Answer: €2,787 million', 'Question: Provide the Revenue 2024 reported by Capgemini. Answer: €22,096 million', "Question: What was Capgemini's Profit 2023? Answer: €1,668 million", 'Question: How did Capgemini report its Revenue 2024? Answer: €22,096 million', 'Question: Provide the Dividends Paid 2023 reported by Capgemini. Answer: €559 million', 'Question: Can you tell me the Basic EPS 2024 for Capgemini? Answer: €9.82', 'Question: State the Total Assets 2023 of Capgemini. Answer: €24,700 million', 'Question: Can you tell me the Equity 2024 for Capgemini? Answer: €11,797 million', "Question: What was Capgemini's Equity 2024? Answer: €11,797 million", 'Ques

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


('./finetuned_model/tokenizer_config.json',
 './finetuned_model/special_tokens_map.json',
 './finetuned_model/vocab.json',
 './finetuned_model/merges.txt',
 './finetuned_model/added_tokens.json',
 './finetuned_model/tokenizer.json')

# 2. Advanced Fine-Tuning Technique (Continual Learning / Domain Adaptation)

In [20]:
# Load previously fine-tuned model for continual learning / domain adaptation
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Load previously fine-tuned model and tokenizer
model = AutoModelForCausalLM.from_pretrained("./finetuned_model")
tokenizer = AutoTokenizer.from_pretrained("./finetuned_model")

# Load new financial dataset for domain adaptation
new_data = load_dataset("csv", data_files={"train": "qa_finance.csv"})

def tokenize(batch):
    # Assumes your CSV has columns "question" and "answer"
    combined = [f"Question: {q} Answer: {a}" for q, a in zip(batch["question"], batch["answer"])]
    tokenized = tokenizer(
        combined,
        truncation=True,
        padding="max_length",
        max_length=128
    )
    # Set pad tokens in labels to -100 for ignored loss calculation
    tokenized["labels"] = [
        [(id if id != tokenizer.pad_token_id else -100) for id in ids]
        for ids in tokenized["input_ids"]
    ]
    return tokenized

new_data = new_data.map(tokenize, batched=True)

# Set up training arguments (tweak learning rate for continual learning!)
training_args = TrainingArguments(
    output_dir="./finetuned_model",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_dir="./logs",
    save_total_limit=2,
)

# Set up trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=new_data["train"],
    # eval_dataset=new_data.get("test"), # If you have a test split
)

# Resume fine-tuning (omit resume_from_checkpoint if no checkpoint directory exists)
trainer.train()

# Save updated model and tokenizer
trainer.save_model("./finetuned_model")
tokenizer.save_pretrained("./finetuned_model")

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Step,Training Loss


('./finetuned_model/tokenizer_config.json',
 './finetuned_model/special_tokens_map.json',
 './finetuned_model/vocab.json',
 './finetuned_model/merges.txt',
 './finetuned_model/added_tokens.json',
 './finetuned_model/tokenizer.json')

In [None]:
## 3. Guardrail Implementation

In [21]:
def input_guardrail(query: str) -> bool:
    """Block irrelevant or harmful queries."""
    blocked_keywords = ["joke", "politics", "violence"]
    return not any(word in query.lower() for word in blocked_keywords)

# Example usage
query = "What was Microsoft's revenue in 2023?"
if input_guardrail(query):
    print("✅ Query accepted:", query)
else:
    print("🚫 Blocked query")

✅ Query accepted: What was Microsoft's revenue in 2023?


In [22]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load fine-tuned model and tokenizer
model_path = "./finetuned_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)
model.eval()

def generate_answer(question, max_length=100):
    # Tokenize input
    inputs = tokenizer(question, return_tensors="pt")
    prompt = f"Question: {question} Answer:"
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    # Generate output
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            max_length=max_length,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id
        )
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Remove prompt from output (just show the answer)
    answer = output_text[len(prompt):].strip()
    return answer.strip()

# Sample test questions
sample_questions = [
    "What was Capgemini's revenue in 2023?",
    "What was Capgemini's revenue in 2024?",
    "What were the total assets at the end of 2023?",
    "What was the basic earnings per share in 2024?"
]

print("=== Testing Fine-Tuned Model ===")
for q in sample_questions:
    print(f"\nQuestion: {q}")
    print("Answer:", generate_answer(q))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


=== Testing Fine-Tuned Model ===

Question: What was Capgemini's revenue in 2023?
Answer: Capgemini's revenue in 2023 was $1.2 billion.

Question: What was Capgemini's revenue in 2024?
Answer: Capgemini's revenue in 2024 was $1.2 billion.

Question: What were the total assets at the end of 2023?
Answer: The total assets at the end of 2023? Answer: The total assets at the end of 2023? Answer: The total assets at the end of 2023? Answer: The total assets at the end of 2023? Answer: The total assets at the end of 2023? Answer: The total assets at the end of 2023? Answer: The total assets at the end of 2023? Answer:

Question: What was the basic earnings per share in 2024?
Answer: The basic earnings per share in 2024 was $1.1 billion.
