# 🛡️ UK Fraud Prevention Chatbot - LLaMA 2 Fine-Tuning

This notebook fine-tunes LLaMA 2 on UK fraud prevention Q&A data using QLoRA (4-bit quantization + LoRA) for memory efficiency in Google Colab.

## Dataset Overview
- **Total Q&A pairs**: 111
- **Training pairs**: 88
- **Validation pairs**: 23
- **Sources**: GetSafeOnline, FCA, UK Finance, Action Fraud, Which

## Training Approach
- **Model**: LLaMA 2-7B (4-bit quantized)
- **Method**: QLoRA (Quantized LoRA)
- **Format**: Alpaca instruction tuning
- **Focus**: UK-specific fraud prevention guidance

## 1. Environment Setup

In [None]:
import sys
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print(" Running in Google Colab")

    !pip install -q torch>=2.0.0 transformers>=4.35.0 datasets>=2.14.0 accelerate>=0.24.0
    !pip install -q peft>=0.6.0 bitsandbytes>=0.41.0 trl>=0.7.0
    !pip install -q scipy>=1.11.0 scikit-learn>=1.3.0 pandas>=2.0.0 numpy>=1.24.0
    !pip install -q tokenizers>=0.14.0 rouge-score>=0.1.2
    !pip install -q tqdm>=4.65.0 matplotlib>=3.7.0 seaborn>=0.12.0
    !pip install -q fsspec>=2025.3.2
    !pip install wandb
    print(" Packages installed successfully with proven compatibility")

else:
    print(" Running locally - using existing environment")

print(" Environment setup complete!")

🔧 Running in Google Colab
📦 Installing required packages with proven compatibility...
✅ Packages installed successfully with proven compatibility
✅ Environment setup complete!


In [None]:
# Import libraries with proven compatibility approach
import torch
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling
)
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    prepare_model_for_kbit_training  
)
from datasets import Dataset, DatasetDict
import json
import os
from datetime import datetime
import numpy as np
from typing import Dict, List, Any
from trl import SFTTrainer
import warnings
warnings.filterwarnings('ignore')

print(f" PyTorch version: {torch.__version__}")
print(f" Transformers version: {transformers.__version__}")
print(f" CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f" GPU: {torch.cuda.get_device_name(0)}")
    print(f" GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    print(f" CUDA version: {torch.version.cuda}")

try:
    import bitsandbytes as bnb
    print(" bitsandbytes imported successfully!")
except ImportError as e:
    print(f" bitsandbytes import issue: {e}")

print(" All libraries imported successfully!")

from huggingface_hub import notebook_login

🔥 PyTorch version: 2.6.0+cu124
🤗 Transformers version: 4.53.3
🎯 CUDA available: True
💾 GPU: Tesla T4
📊 GPU Memory: 15.8 GB
🔧 CUDA version: 12.4
✅ bitsandbytes imported successfully!
✅ All libraries imported successfully!


## 2. Data Preparation

In [None]:
# Mount Google Drive to access datasets
from google.colab import drive
drive.mount('/content/drive')

DATASET_PATH = "/content/drive/MyDrive/Dissertation/cyber-fraud-chatbot/model_training"
OUTPUT_PATH = "/content/drive/MyDrive/Dissertation/cyber-fraud-chatbot/trained_models"

In [None]:
def load_fraud_dataset(dataset_path):
  """Load and validate the fraud prevention dataset."""

  train_file = os.path.join(dataset_path, "train_fraud_qa_dataset.json")
  val_file = os.path.join(dataset_path, "val_fraud_qa_dataset.json")

  if not os.path.exists(train_file) or not os.path.exists(val_file):
      raise FileNotFoundError("Dataset files not found. Please upload them first.")

  # Load training data
  with open(train_file, 'r', encoding='utf-8') as f:
      train_data = json.load(f)

  # Load validation data
  with open(val_file, 'r', encoding='utf-8') as f:
      val_data = json.load(f)

  print(f" Loaded {len(train_data)} training examples")
  print(f" Loaded {len(val_data)} validation examples")

  # Display sample data
  print("\n Sample training example:")
  sample = train_data[0]
  print(f"Instruction: {sample['instruction'][:100]}...")
  print(f"Output: {sample['output'][:100]}...")
  print(f"Source: {sample.get('data_source', 'Unknown')}")

  return train_data, val_data

# Load the datasets
train_data, val_data = load_fraud_dataset(DATASET_PATH)

✅ Loaded 88 training examples
✅ Loaded 23 validation examples

📝 Sample training example:
Instruction: I received a letter mentioning 'Falcon North' - what does this mean for my fraud case?...
Output: If you received a letter mentioning 'Falcon North' (or Falcon East, South, or West), this means your...
Source: actionfraud


In [None]:
def format_instruction(sample):
    """Format sample into text format for SFTTrainer."""

    instruction = sample['instruction']
    input_text = sample.get('input', '')
    output = sample['output']

    # Create the formatted text following Alpaca format
    if input_text.strip():
        formatted_text = f"""Below is an instruction that describes a task, paired with an input that
provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input_text}

### Response:
{output}"""
    else:
        formatted_text = f"""Below is an instruction that describes a task. Write a response that
appropriately completes the request.

### Instruction:
{instruction}

### Response:
{output}"""

    return {"text": formatted_text}

# Format datasets - return text strings, not tokenized data
formatted_train_data = [format_instruction(sample) for sample in train_data]
formatted_val_data = [format_instruction(sample) for sample in val_data]

# Create Hugging Face datasets with text field
train_dataset = Dataset.from_list(formatted_train_data)
val_dataset = Dataset.from_list(formatted_val_data)

print(f" Formatted datasets created")
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

# Display formatted sample
print("\n Formatted sample:")
print(train_dataset[0]['text'][:300] + "...")

✅ Formatted datasets created
Training samples: 88
Validation samples: 23

📝 Formatted sample:
Below is an instruction that describes a task. Write a response that 
appropriately completes the request.

### Instruction:
I received a letter mentioning 'Falcon North' - what does this mean for my fraud case?

### Response:
If you received a letter mentioning 'Falcon North' (or Falcon East, South...


## 3. Model and Tokenizer Setup

In [None]:
# Model configuration
MODEL_NAME = "NousResearch/Llama-2-7b-chat-hf"  # Pre-trained LLaMA 2 7B Chat
MAX_LENGTH = 2048  # Maximum sequence length

# 4-bit quantization configuration for memory efficiency
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

print(f" Loading model: {MODEL_NAME}")
print(f" Max sequence length: {MAX_LENGTH}")
print(f" Using 4-bit quantization for memory efficiency")

🤖 Loading model: NousResearch/Llama-2-7b-chat-hf
📏 Max sequence length: 2048
🔢 Using 4-bit quantization for memory efficiency


In [None]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
)

# Set pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

print(f" Tokenizer loaded")
print(f"Vocabulary size: {len(tokenizer)}")
print(f"Pad token: {tokenizer.pad_token}")
print(f"EOS token: {tokenizer.eos_token}")

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

✅ Tokenizer loaded
Vocabulary size: 32001
Pad token: <unk>
EOS token: </s>


In [None]:
# Load model with quantization
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16,
)

# Enable gradient checkpointing for memory efficiency
model.gradient_checkpointing_enable()

print(f" Model loaded with 4-bit quantization")
print(f"Model device: {next(model.parameters()).device}")
print(f"Model dtype: {next(model.parameters()).dtype}")

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

✅ Model loaded with 4-bit quantization
Model device: cuda:0
Model dtype: torch.float16


## 4. LoRA Configuration

In [None]:
# LoRA configuration for parameter-efficient fine-tuning
lora_config = LoraConfig(
    r=16,  # Rank of adaptation
    lora_alpha=32,  # LoRA scaling parameter
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # LoRA dropout
    task_type=TaskType.CAUSAL_LM,
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Print trainable parameters
trainable_params = 0
all_param = 0
for _, param in model.named_parameters():
    all_param += param.numel()
    if param.requires_grad:
        trainable_params += param.numel()

print(f" LoRA applied to model")
print(f"Trainable params: {trainable_params:,} || All params: {all_param:,} || Trainable%: {100 * trainable_params / all_param:.2f}")

✅ LoRA applied to model
Trainable params: 40,554,496 || All params: 3,540,967,424 || Trainable%: 1.15


## 5. Training Configuration

In [None]:
training_args = TrainingArguments(
    output_dir=OUTPUT_PATH,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    save_steps=10,  
    logging_steps=5,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="none",
    eval_strategy="steps",
    eval_steps=10,  # Same as save_steps
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    dataloader_pin_memory=False,
    save_total_limit=3,
)

print(f" Will save every {training_args.save_steps} steps")
print(" Training configuration:")
print(f"Epochs: {training_args.num_train_epochs}")
print(f"Batch size per device: {training_args.per_device_train_batch_size}")
print(f"Gradient accumulation steps: {training_args.gradient_accumulation_steps}")
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"Learning rate: {training_args.learning_rate}")
print(f"Optimizer: {training_args.optim}")

⚙️ Updated training configuration with frequent checkpointing
💾 Will save every 10 steps
⚙️ Training configuration:
Epochs: 3
Batch size per device: 2
Gradient accumulation steps: 4
Effective batch size: 8
Learning rate: 0.0002
Optimizer: OptimizerNames.PAGED_ADAMW


In [None]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Causal LM, not masked LM
)

In [None]:
# Tokenize the text data first
def tokenize_function(examples):
    """Tokenize the formatted text"""
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
        return_tensors="pt"
    )
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized

In [54]:
# Tokenize datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_val = val_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

Map:   0%|          | 0/88 [00:00<?, ? examples/s]

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

In [None]:
# Initialize the SFT trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
print(" SFT Trainer initialized successfully!")
print(f"Training dataset size: {len(trainer.train_dataset)}")
print(f"Validation dataset size: {len(trainer.eval_dataset)}")

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


✅ SFT Trainer initialized successfully!
Training dataset size: 88
Validation dataset size: 23


## 6. Model Training

In [None]:
# Clear memory before training
import gc
torch.cuda.empty_cache()
gc.collect()

print(" Memory cleared")
if torch.cuda.is_available():
    print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"GPU memory reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

🧹 Memory cleared
GPU memory allocated: 3.77 GB
GPU memory reserved: 7.54 GB


In [None]:
# Start training
print(" Starting fine-tuning...")
print("="*60)

# Train the model
trainer.train()

print("\n Training completed successfully!")

🚀 Starting fine-tuning...
This will take approximately 2-3 hours. You can monitor progress below.
Training logs will appear here:


Step,Training Loss,Validation Loss
10,1.8634,1.868693
20,1.7832,1.79529
30,1.7579,1.783939



🎉 Training completed successfully!


## 7. Save the Fine-tuned Model

In [None]:
# Save the fine-tuned model
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Save the LoRA adapter
model_save_path = os.path.join(OUTPUT_PATH, "uk-fraud-chatbot-llama2-lora")
trainer.model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f" Fine-tuned model saved to: {model_save_path}")
print(" Saved files:")
for file in os.listdir(model_save_path):
    print(f"  - {file}")

# Save training info
training_info = {
    "model_name": MODEL_NAME,
    "training_samples": len(train_dataset),
    "validation_samples": len(val_dataset),
    "epochs": training_args.num_train_epochs,
    "learning_rate": training_args.learning_rate,
    "lora_r": lora_config.r,
    "lora_alpha": lora_config.lora_alpha,
    "max_length": MAX_LENGTH,
    "sources": ["GetSafeOnline", "FCA", "UK Finance", "Action Fraud", "Which"]
}

with open(os.path.join(OUTPUT_PATH, "training_info.json"), 'w') as f:
    json.dump(training_info, f, indent=2)

print(" Training information saved")

✅ Fine-tuned model saved to: /content/drive/MyDrive/Dissertation/cyber-fraud-chatbot/trained_models/uk-fraud-chatbot-llama2-lora
📁 Saved files:
  - README.md
  - adapter_model.safetensors
  - adapter_config.json
  - tokenizer_config.json
  - special_tokens_map.json
  - added_tokens.json
  - tokenizer.model
  - tokenizer.json
ℹ️ Training information saved


## 8. Test the Fine-tuned Model

In [None]:
# Test the fine-tuned model
def test_fraud_chatbot(question, max_length=512):
    """Test the fine-tuned model with a fraud-related question."""

    # Format the input following Alpaca format
    prompt = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{question}

### Response:
"""

    # Tokenize input
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_LENGTH
    ).to(model.device)

    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract only the generated response
    response_start = response.find("### Response:") + len("### Response:")
    generated_response = response[response_start:].strip()

    return generated_response

# Test questions
test_questions = [
    "I think I've been a victim of online shopping fraud. What should I do?",
    "Someone called me claiming to be from my bank asking for my PIN. Is this legitimate?",
    "I received a suspicious email asking me to click a link. What should I do?",
    "How do I report fraud to Action Fraud?",
]

print(" Testing the fine-tuned UK Cyber Fraud Prevention Chatbot")
print("="*60)

for i, question in enumerate(test_questions, 1):
    print(f"\n Test {i}")
    print(f" Question: {question}")
    print(f" Response: ")

    try:
        response = test_fraud_chatbot(question)
        print(response)
    except Exception as e:
        print(f"Error: {e}")

    print("-" * 50)

🧪 Testing the fine-tuned UK Fraud Prevention Chatbot

🔹 Test 1
❓ Question: I think I've been a victim of online shopping fraud. What should I do?
🤖 Response: 
hopefully, this is not a scam. If you've been a victim of online shopping fraud, there are several steps you can take to report the incident and protect your personal information. First, contact the website or platform where you made the purchase and report the incident to their customer service department. You can also file a complaint with the Federal Trade Commission (FTC) or your state's Attorney General's office. Additionally, you may want to consider placing a fraud alert on your credit report to prevent any further unauthorized activity. It's also a good idea to review your bank and credit card statements regularly to detect any suspicious activity.
--------------------------------------------------

🔹 Test 2
❓ Question: Someone called me claiming to be from my bank asking for my PIN. Is this legitimate?
🤖 Response: 
obvio

## 9. Model Evaluation

In [None]:
# Evaluate the model on validation set
eval_results = trainer.evaluate()

print(" Model Evaluation Results:")
print("=" * 40)
for key, value in eval_results.items():
    if isinstance(value, float):
        print(f"{key}: {value:.4f}")
    else:
        print(f"{key}: {value}")

# Save evaluation results
with open(os.path.join(OUTPUT_PATH, "evaluation_results.json"), 'w') as f:
    json.dump(eval_results, f, indent=2)

print(f"\n Evaluation results saved to: {OUTPUT_PATH}/evaluation_results.json")

📊 Model Evaluation Results:
eval_loss: 1.7910
eval_runtime: 288.6235
eval_samples_per_second: 0.0800
eval_steps_per_second: 0.0420
epoch: 3.0000

✅ Evaluation results saved to: /content/drive/MyDrive/Dissertation/cyber-fraud-chatbot/trained_models/evaluation_results.json


# Upload to Hugging Face

In [64]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from huggingface_hub import HfApi, create_repo
import os
import json

def upload_model_to_hub(
    local_model_path,
    repo_name,
    hf_username,
    commit_message="Upload UK Fraud Support Chatbot"
):
    repo_id = f"{hf_username}/{repo_name}"

    print(f" Uploading model to: {repo_id}")
    print(f" Local path: {local_model_path}")

    try:
        # Initialize API
        api = HfApi()

        create_repo(
            repo_id=repo_id,
            repo_type="model",
            exist_ok=True,
            private=False  
        )

        # Upload all files in the model directory
        print(" Uploading model files...")
        api.upload_folder(
            folder_path=local_model_path,
            repo_id=repo_id,
            repo_type="model",
            commit_message=commit_message
        )

        # Create model card
        model_card_content = f"""---
library_name: peft
base_model: meta-llama/Llama-2-7b-chat-hf
tags:
- fraud-prevention
- uk-specific
- victim-support
- fine-tuned
- llama2
- peft
- lora
license: llama2
language:
- en
datasets:
- custom
pipeline_tag: text-generation
---

# UK Fraud Support Chatbot

A specialized LLaMA 2-7B model fine-tuned for UK fraud victim support, trained on comprehensive fraud prevention data from official UK sources.

## Model Details

- **Base Model**: NousResearch/Llama-2-7b-chat-hf
- **Fine-tuning Method**: QLoRA (4-bit quantization + LoRA)
- **Training Data**: 111 Q&A pairs from official UK fraud prevention sources
- **Specialization**: UK fraud victim support and guidance

## Usage

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load base model
base_model = AutoModelForCausalLM.from_pretrained(
    "NousResearch/Llama-2-7b-chat-hf",
    torch_dtype=torch.float16,
    device_map="auto"
)

# Load LoRA adapter
model = PeftModel.from_pretrained(base_model, "{repo_id}")
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf")

# Generate response
system_prompt = "You are a specialized UK fraud victim support assistant..."
prompt = f"<s>[INST] <<SYS>>\\n{{system_prompt}}\\n<</SYS>>\\n\\nYour question here [/INST]"
```

## Training Data Sources

- Action Fraud (UK's national fraud reporting centre)
- GetSafeOnline (UK's leading online safety resource)
- Financial Conduct Authority (FCA)
- UK Finance
- Which (consumer protection)

"""

        # Upload model card
        api.upload_file(
            path_or_fileobj=model_card_content.encode(),
            path_in_repo="README.md",
            repo_id=repo_id,
            repo_type="model",
            commit_message="Add model card"
        )

        return repo_id

    except Exception as e:
        print(f" Upload failed: {e}")
        return None

def main():
    # Configuration - UPDATING THESE VALUES
    LOCAL_MODEL_PATH = "/content/drive/MyDrive/Dissertation/cyber-fraud-chatbot/trained_models/uk-fraud-chatbot-llama2-lora" 
    HF_USERNAME = "misee" 
    REPO_NAME = "uk-fraud-chatbot-llama2"  
    
    # Check if model path exists
    if not os.path.exists(LOCAL_MODEL_PATH):
        print(f" Model path not found: {LOCAL_MODEL_PATH}")
        return

    # List files in model directory
    print(f" Files in model directory:")
    for file in os.listdir(LOCAL_MODEL_PATH):
        print(f"  - {file}")

    # Upload model
    repo_id = upload_model_to_hub(
        local_model_path=LOCAL_MODEL_PATH,
        repo_name=REPO_NAME,
        hf_username=HF_USERNAME
    )

    if repo_id:
        print(f"\n SUCCESS! Model is now available at:")
        print(f"🔗 https://huggingface.co/{repo_id}")

if __name__ == "__main__":
    main()