In [None]:
## Installations; to be executed if running this notebook from Google Colab (recommended).
#!pip install pymupdf
#!pip install bitsandbytes



In [None]:
#Imports

import os

import torch
import pymupdf
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training

In [None]:
# Allocator hint intended to make PyTorch release some of its unused allocated memory.
#   (mainly useful when the same cells are re-run several times in Google Colab)
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"})

In [None]:
####### CUDA and MPS checks #######

# Report whether this machine exposes a CUDA device (and therefore VRAM).
has_cuda = torch.cuda.is_available()
cuda_report = [
    ("CUDA available:", has_cuda),
    ("Device:", torch.cuda.current_device() if has_cuda else "CPU"),
    ("CUDA device count:", torch.cuda.device_count()),
    ("Device name:", torch.cuda.get_device_name(0) if has_cuda else "No CUDA device"),
    ("Available memory:", torch.cuda.get_device_properties(0).total_memory if has_cuda else "No CUDA device"),
]
for label, value in cuda_report:
    print(label, value)


# Report whether this machine (e.g. a Mac laptop) supports Metal Performance Shaders.
mps_ok = torch.backends.mps.is_available()
print("MPS available:", mps_ok)


In [None]:
###########################################################################
########################       1. CONFIGURATION        ####################
###########################################################################

# --- Model Configuration ---
BASE_MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# Other base Models to try:
#   - meta-llama/Meta-Llama-3.1-8B-Instruct: a powerful and multilingual model, 8.03B params, Tensor type: BF16
#   - mistralai/Mistral-7B-Instruct-v0.3: 7.25B params, Tensor type: BF16
#   - microsoft/Phi-3-mini-4k-instruct: Small Language model with 3.82 billion parameters, Tensor Type: BF16
#   - google/gemma-2-2b-it: Compact/LightWeight model with 2.61 billion parameter, known for strong performance in multilingual applications. Tensor Type: BF16
#   - Qwen/Qwen2.5-7B-Instruct: 7.62B params, Tensor type: BF16
#   - openai/gpt-oss-20b: 21.5B params (but they say it works on 16GB RAM machines), Tensor type: BF16·U8
#   - malhajar/Mistral-7B-v0.1-arabic: A fine-tuned version of Mistral 7B specifically for Arabic.
#   - QCRI/Fanar-1-9B-Instruct: A model developed by Qatar's Computing Research Institute, based on a different architecture but highly specialized for Arabic.
#   - ALLaM-AI/ALLaM-7B-Instruct-preview: A Saudi-led initiative with a focus on Arabic.
#   - Mistral Saba? : 24 billion parameters and is specifically tailored for Arabic language and cultural nuances

# Folder name under which the trained LoRA adapter is saved ('/' is replaced so it is a single path component).
NEW_MODEL_NAME = (BASE_MODEL_NAME + "-tuned-for-economics").replace('/', '-')
TASK_TYPE = "CAUSAL_LM"  #Causal LM is more suitable for Generative tasks than MLM (Masked Language Modeling - e.g. what BERT uses)

# --- Hugging Face authentication ---
# On Google Colab: add the Hugging Face access token in the Secrets tab, with variable name "HF_TOKEN".
# On a local machine google.colab cannot be imported, so the token is read from the HF_TOKEN
#   environment variable instead (export it in your shell, or store it in a .env file and call
#   dotenv's load_dotenv() before this point).
try:
    from google.colab import userdata
    ACCESS_TOKEN = userdata.get('HF_TOKEN')
except ImportError:
    ACCESS_TOKEN = os.getenv("HF_TOKEN")

from huggingface_hub import login
login(token = ACCESS_TOKEN)

# --- Data Configuration ---
print("Current dir is: ", os.getcwd())
PDF_FOLDER_PATH = "./sample_data/"  # or ./docs on local machine


# --- PEFT (Parameter Efficient Fine-Tuning) & LoRA (Low Rank Adaptation) Configuration ---
# These settings configure the LoRA adapters for efficient training.
lora_config = LoraConfig(
    r=16,  # Rank of the update matrices. Lower rank means fewer parameters to train.
    lora_alpha=32,  # Alpha parameter for scaling.
    lora_dropout=0.05,  # Dropout probability for LoRA layers.
    bias="none",
    task_type=TASK_TYPE,
    # Target modules can be specific to the model architecture.
    #   For Llama 3, these are common layers to adapt.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)


# --- Training Configuration ---
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,  # A single epoch is often sufficient for domain adaptation.
    per_device_train_batch_size=2,  # Batch size per GPU. Adjust based on your VRAM.
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",  # Memory-efficient optimizer.
    save_steps=50,  # Save a checkpoint every 50 steps.
    logging_steps=10,  # Log training progress every 10 steps.
    learning_rate=2e-4,
    weight_decay=0.001,
    #fp16=True,  # Use 16-bit precision for training.
    bf16=True, # Set to True if you have a modern GPU that supports it (e.g., Ampere).
    max_grad_norm=0.3,
    max_steps=-1,  # If > 0, overrides num_train_epochs.
    # NOTE(review): the "constant" schedule presumably ignores warmup_ratio; switch to
    #   "constant_with_warmup" if a warmup phase is actually wanted - confirm before changing.
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="none"
)

# memory_reserved - memory_allocated only measures slack inside PyTorch's own allocator;
#   mem_get_info() returns (free, total) bytes for the device, which is what the label promises.
print("Free memory at the moment:",
      torch.cuda.mem_get_info(0)[0] if torch.cuda.is_available() else "No CUDA device")

In [None]:
###########################################################################
########################       2. DATA PREP         #######################
###########################################################################

def extract_text_from_pdfs(folder_path):
    """
    Extracts all text from the PDF files in a given folder and combines them.

    Args:
        folder_path: Path of the directory to scan. Only files directly inside it
            are considered (sub-folders are not searched).

    Returns:
        One string holding the text of every PDF that could be read, each document
        followed by a newline (an empty string if none could be read), or None when
        the folder does not exist or contains no PDF files.
    """

    # Note: the two sample PDF documents used in this project were downloaded from:
    #   - UN World Economic Situation and Prospects 2024:
    #       https://www.un.org/development/desa/dpad/wp-content/uploads/sites/45/WESP_2024_Web.pdf
    #   - UK Parliament - Economic Indicators Report 2025:
    #       https://researchbriefings.files.parliament.uk/documents/CBP-9040/CBP-9040.pdf


    print(f"Reading PDFs from: {folder_path}")
    if not os.path.isdir(folder_path):
        print(f"Error: Directory not found at {folder_path}")
        print("Please put some PDFs in the sample_data folder (or in the docs folder).")
        return None

    # Compare the extension case-insensitively so "REPORT.PDF" is not skipped, and sort the
    # names because os.listdir() order is arbitrary (keeps the corpus reproducible between runs).
    pdf_files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith(".pdf"))
    if not pdf_files:
        print(f"No PDF files found in {folder_path}.")
        return None

    document_texts = []
    for filename in pdf_files:
        file_path = os.path.join(folder_path, filename)
        try:
            with pymupdf.open(file_path) as doc:
                document_texts.append("".join(page.get_text() for page in doc) + "\n")
            print(f"  - Successfully processed {filename}")
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole extraction.
            print(f"Could not read {filename}: {e}")
    return "".join(document_texts)


# Extract text and create a Hugging Face Dataset
corpus_text = extract_text_from_pdfs(PDF_FOLDER_PATH)
if corpus_text:
    # Create a dataset with a single column named "text", which the trainer will use to train the model.
    dataset = Dataset.from_dict({"text": [corpus_text]})
    print(f"\nSuccessfully created dataset with {len(corpus_text)} characters.")
else:
    # Bind `dataset` even on failure: the later cells are guarded with `if (dataset):` and would
    #   otherwise stop with a NameError instead of being skipped.
    dataset = None
    print("\nData loading failure!")
    #exit()  #only works if you are executing locally, not on Google Colab.

# memory_reserved - memory_allocated only measures slack inside PyTorch's own allocator;
#   mem_get_info() returns (free, total) bytes for the device, which is what the label promises.
print("Free memory at the moment:",
      torch.cuda.mem_get_info(0)[0] if torch.cuda.is_available() else "No CUDA device")

In [None]:
###########################################################################
###################      3.  MODEL LOADING & PEFT SETUP     ###############
###########################################################################

# --- Load the model with 4-bit quantization (this significantly reduces the memory footprint) ---
if (dataset):  # prevent continuation if there were no PDFs or no text was extracted
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        # Keep the 4-bit compute dtype in step with the precision chosen in training_arguments
        #   (bf16=True there), instead of hard-coding float16 and mixing the two half-precision formats.
        bnb_4bit_compute_dtype=torch.bfloat16 if training_arguments.bf16 else torch.float16,
        bnb_4bit_use_double_quant=False,
    )

    print("\nLoading base model...")
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto" # Automatically maps the model to available devices (GPU/CPU).
    )
    # The KV cache is switched off on the model config for the training run.
    base_model.config.use_cache = False
    base_model.config.pretraining_tp = 1

    print ("Model and its layers:")
    print(base_model)

    print("Currently used device by base_model: ", base_model.device)


In [None]:
if (dataset):
    # --- Load the tokenizer ---
    tokenizer = AutoTokenizer.from_pretrained(
        BASE_MODEL_NAME,
        trust_remote_code=True)
    # Reuse the end-of-sequence token for padding, and pad on the right.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    print("Base model and tokenizer loaded successfully.")

    # --- Tokenize the dataset ---
    def tokenize_function(examples):
        """
        Tokenizes a batch of texts into sequences of at most 512 tokens.

        The dataset holds the whole corpus as a single row, so plain truncation would keep
        only its first 512 tokens and silently drop the rest. With
        return_overflowing_tokens=True every 512-token window is returned as its own
        sequence; the data collator later handles batching and padding.
        """
        encoded = tokenizer(
            examples["text"],
            truncation=True,
            max_length=512,
            return_overflowing_tokens=True,
        )
        # Bookkeeping column (chunk -> source row) that is not a model input.
        encoded.pop("overflow_to_sample_mapping", None)
        return encoded

    # batched=True together with remove_columns lets the map return more rows than it received.
    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    print(f"Tokenized dataset contains {len(tokenized_dataset)} training sequences.")

    # memory_reserved - memory_allocated only measures slack inside PyTorch's own allocator;
    #   mem_get_info() returns (free, total) bytes for the device, which is what the label promises.
    print("Free memory at the moment:",
          torch.cuda.mem_get_info(0)[0] if torch.cuda.is_available() else "No CUDA device")

In [None]:
if (dataset):
    # --- Prepare model for PEFT ---
    # The base model was loaded 4-bit quantized; PEFT's documented recipe is to run it through
    #   prepare_model_for_kbit_training() before the LoRA adapters are attached.
    base_model = prepare_model_for_kbit_training(base_model)
    peft_model = get_peft_model(base_model, lora_config)
    print("\nModel prepared for PEFT.")

    # memory_reserved - memory_allocated only measures slack inside PyTorch's own allocator;
    #   mem_get_info() returns (free, total) bytes for the device, which is what the label promises.
    print("Free memory at the moment:",
          torch.cuda.mem_get_info(0)[0] if torch.cuda.is_available() else "No CUDA device")

In [None]:
###########################################################################
########################       4. TRAINING         ########################
###########################################################################

print("\nInitializing trainer...")
# mlm=False selects causal (next-token) language modelling rather than masked-LM batches.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# --- Use the standard `Trainer` for continued pre-training ---
trainer = Trainer(
    model=peft_model,
    args=training_arguments,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


print("Starting training...")

# memory_reserved - memory_allocated only measures slack inside PyTorch's own allocator;
#   mem_get_info() returns (free, total) bytes for the device, which is what the labels promise.
print("Free memory at the beginning of training:",
      torch.cuda.mem_get_info(0)[0] if torch.cuda.is_available() else "No CUDA device")
trainer.train()
print("Training complete.")

print("Free memory after training:",
      torch.cuda.mem_get_info(0)[0] if torch.cuda.is_available() else "No CUDA device")

# --- Save the fine-tuned model ---
# This saves the LoRA adapter, not the full model.
trainer.model.save_pretrained(NEW_MODEL_NAME)
print(f"Fine-tuned model adapter saved to ./{NEW_MODEL_NAME}")



In [None]:
###########################################################################
###################     5. INFERENCE & COMPARISON       ###################
###########################################################################

# --- Test prompt used for both the base and the fine-tuned model ---
prompt = (
    "Based on recent economic trends,\n"
    "what is the outlook for global GDP growth in the coming year and what are the primary risks?"
)


# --- Define a generation function ---
def generate_response(model, tokenizer, prompt_text):
    """
    Generates a chat response from a given model and tokenizer.

    Args:
        model: Model used for generation; must expose `.device` and `.generate()`.
        tokenizer: Tokenizer providing `apply_chat_template()` and `decode()`.
        prompt_text: The user question to answer.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is stripped).
    """

    # google/gemma-2-2b-it model does not support role "system"; the instruction is sent as "assistant"
    is_gemma = BASE_MODEL_NAME == "google/gemma-2-2b-it"
    system_role = "assistant" if is_gemma else "system"

    # Create the messages payload for the chat model
    messages = []

    # google/gemma-2-2b-it insists that the conversation starts with the user role
    if is_gemma:
        messages.append({"role": "user", "content": "Please assist me!"})

    # extend(), not append(): the conversation must stay a flat list of message dicts.
    # Appending the list nested it inside `messages`, which broke the Gemma conversation and
    # only worked for other models because the nested list was read as a batch of one chat.
    messages.extend([
        {"role": system_role, "content": "You are an expert economist. Provide a detailed and insightful analysis."},
        {"role": "user", "content": prompt_text},
    ])

    #print("#debugging at generate_response(): messages:", messages)

    print("Currently used device by the model: ", model.device)

    # Apply the chat template and encode the prompt
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # Generate the response
    outputs = model.generate(
        input_ids,
        max_new_tokens=5000,  # Limit the length of the generated response.
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )

    # Decode the response: drop the prompt tokens that precede the generated ones
    response = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(response, skip_special_tokens=True)


In [None]:
# Compare the response of the base model to the response of the fine-tuned model

##### The main/simple way of doing it
print("\n--- Generating response from ORIGINAL BASE MODEL ---")
# get_peft_model() injected the LoRA layers into `base_model` in place, so after training
# `base_model` itself answers with the trained adapter applied. To see the genuine base-model
# behaviour without loading a second copy, generate through the PEFT wrapper while its
# adapter is temporarily disabled.
peft_model.eval()
with peft_model.disable_adapter():
    base_response = generate_response(peft_model, tokenizer, prompt)
print(f"Prompt: {prompt}")
print(f"Response:\n{base_response}\n")


##### A potentially alternative way of doing it:
# # --- Determine the device ---
# # This is mainly for logging; device_map="auto" handles the actual placement.
# device_string = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
# print(f"\nUsing device: {device_string.upper()}")
#
# print("\n--- Generating response from ORIGINAL BASE MODEL ---")
# # Clear memory first
# print("Free memory before clearing memory:", torch.cuda.memory_reserved(0) - torch.cuda.memory_allocated(0))
# print("Trying to clear some memory!")
# del peft_model
# del base_model
# del trainer
# if "cuda" in device_string:
#     torch.cuda.empty_cache()
# elif "mps" in device_string:
#     torch.mps.empty_cache()

# print("Free memory after clearing memory:", torch.cuda.memory_reserved(0) - torch.cuda.memory_allocated(0))
#
# original_model = AutoModelForCausalLM.from_pretrained(
#     BASE_MODEL_NAME,
#     quantization_config=bnb_config,
#     device_map="auto"
# )
#
# original_model.config.pad_token_id = tokenizer.eos_token_id
#
# base_response = generate_response(original_model, tokenizer, prompt)
# print(f"Prompt: {prompt}")
# print(f"Response:\n{base_response}\n")



In [None]:
print("\n--- Generating response from NEWLY TRAINED MODEL ---")
# The trained model is rebuilt from scratch: a fresh copy of the base checkpoint is loaded here,
# and the saved LoRA adapter is applied on top of it afterwards.
inference_load_kwargs = dict(
    quantization_config=bnb_config,
    offload_folder="offload",
    device_map="auto",
)
base_model_for_inference = AutoModelForCausalLM.from_pretrained(BASE_MODEL_NAME, **inference_load_kwargs)

In [None]:
# Attach the saved LoRA adapter (stored under NEW_MODEL_NAME) to the freshly loaded base model.
# is_trainable=False tells the PEFT library that the adapter is loaded for inference only.
trained_model = PeftModel.from_pretrained(
    base_model_for_inference,
    NEW_MODEL_NAME,
    is_trainable=False,
)
trained_model.eval()



In [None]:
# Ask the fine-tuned model the same question and show its answer.
trained_response = generate_response(trained_model, tokenizer, prompt)
print("Prompt: {}".format(prompt))
print("Response:\n{}\n".format(trained_response))

closing_note = "Comparison complete. Observe how the newly trained model's response may be more aligned with the style and content of your training documents."
print(closing_note)


In [None]:
# # Mount a Google Drive folder so that the adapted model can be downloaded from Google Colab folders:
# #    instructions from here: https://saturncloud.io/blog/how-to-download-multiple-files-or-an-entire-folder-from-google-colab/
# from google.colab import drive
# drive.mount('/content/drive')  # During the authentication step, tick every permission box so that access is fully granted.

# %cd /content/
# !zip -r my_model.zip <model-folder-name-here>/

# from google.colab import files
# files.download('my_model.zip')
