<a href="https://colab.research.google.com/github/pranav-85/cricket_chatbot/blob/main/finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tuning Mistral-7B on cricket data

## 1. Data Cleaning and Preprocessing

In [None]:
import re
import json
import os
import PyPDF2
# !pip install PyMuPDF
# !pip install PyPDF2

: 

In [None]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page in a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages that yielded any text, joined with newlines,
        or ``None`` when the file could not be read or holds no
        non-whitespace text.
    """
    try:
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            # Extract each page exactly once: extraction is the expensive
            # step, so do not call it a second time just to filter.
            page_texts = (page.extract_text() for page in reader.pages)
            text = "\n".join(page_text for page_text in page_texts if page_text)
        return text if text.strip() else None  # Ensure non-empty text
    except Exception as e:
        # Best effort: one unreadable PDF must not abort the whole run.
        print(f"Error extracting text from {pdf_path}: {e}")
        return None

def clean_text(text):
    """Collapse whitespace and drop non-ASCII characters.

    Args:
        text: Raw text to normalize.

    Returns:
        The text with every whitespace run replaced by a single space,
        all non-ASCII characters removed, and leading/trailing spaces
        stripped.
    """
    text = re.sub(r"\s+", " ", text)  # Normalize whitespace
    text = re.sub(r"[^\x00-\x7F]+", "", text)  # Remove non-ASCII characters
    # Deleting a non-ASCII run that sat between two spaces leaves a double
    # space behind, so collapse the spaces once more.
    text = re.sub(r" {2,}", " ", text)
    return text.strip()

def chunk_text(text, chunk_size=1024):
    """Cut ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    The pieces are returned in order; the final one may be shorter than
    ``chunk_size``. An empty ``text`` yields an empty list.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        end = start + chunk_size
        pieces.append(text[start:end])
    return pieces

def categorize_file(filename, directory):
    """Pick a dataset category from the source directory and the file name.

    Files from a directory ending in "icc" are matched (case-insensitively,
    first match wins) against keyword patterns; files from a directory
    ending in "mcc_rules" are always "mcc_rules"; everything else is
    "general".
    """
    name = filename.lower()

    if directory.endswith("icc"):
        # Checked in order, so an earlier pattern takes precedence.
        icc_patterns = (
            (r"rules|playing-conditions", "cricket_rules"),
            (r"report|financial|annual", "annual_reports"),
            (r"media|press|guide", "media_guide"),
        )
        for pattern, category in icc_patterns:
            if re.search(pattern, name):
                return category
        return "general"

    if directory.endswith("mcc_rules"):
        return "mcc_rules"

    return "general"

def apply_template(category, text, filename):
    """Build the instruction/input/output record for one text chunk.

    Args:
        category: Category name as returned by ``categorize_file``; an
            unknown category falls back to the "general" template.
        text: The chunk of document text, stored under "input".
        filename: Name of the source file; used to derive the topic in
            the MCC instruction.

    Returns:
        A dict with the keys "instruction", "input" and "output"
        ("output" is always an empty string).
    """
    # MCC law files are named after the law with hyphens (for example
    # "leg-before-wicket.txt"), so hyphens as well as underscores are
    # turned into spaces to get a readable topic.
    topic = (
        filename.replace(".txt", "")
        .replace("mcc_", "")
        .replace("_", " ")
        .replace("-", " ")
    )

    instructions = {
        "cricket_rules": "Explain the following ICC rule in cricket.",
        "mcc_rules": f"Summarize the MCC rule for {topic}.",
        "annual_reports": "Summarize the ICC Annual Report.",
        "media_guide": "Extract key details from the ICC Media Guide.",
        "general": "Provide information on the following document.",
    }
    instruction = instructions.get(category, instructions["general"])
    return {"instruction": instruction, "input": text, "output": ""}

def prepare_finetuning_dataset(root_dir, output_file):
    """Build the fine-tuning dataset from ICC PDFs and MCC rule text files.

    Reads every ``.txt`` and ``.pdf`` file in the "icc" and "mcc_rules"
    subdirectories of ``root_dir``, cleans and chunks the text, wraps each
    chunk in its category's prompt template and writes all records to
    ``output_file`` as a JSON list.

    Args:
        root_dir: Folder that contains the "icc" and "mcc_rules"
            subdirectories. Missing or empty subdirectories are skipped
            with a warning.
        output_file: Path of the JSON file to write. It is always written,
            even when no records were produced.
    """

    dataset = []
    directories = ["icc", "mcc_rules"]  # Subdirectories inside 'data' folder

    for subdir in directories:
        dir_path = os.path.join(root_dir, subdir)

        # Step 1: Verify directory exists (a plain file with that name is
        # not usable either, so test for a directory specifically)
        if not os.path.isdir(dir_path):
            print(f"Warning: Directory '{dir_path}' does not exist. Skipping...")
            continue

        # Step 2: List files in directory; sorted because os.listdir gives
        # no ordering guarantee and the dataset should be reproducible
        files = sorted(os.listdir(dir_path))
        if not files:
            print(f"Warning: No files found in '{dir_path}'. Skipping...")
            continue

        print(f"\nFound {len(files)} files in '{dir_path}':")
        for file in files:
            print(f"  - {file}")

        # Step 3: Process each file
        for filename in files:
            file_path = os.path.join(dir_path, filename)
            lowered = filename.lower()  # accept ".PDF" / ".TXT" too

            # Report unsupported entries (other extensions, nested folders)
            # as such instead of mislabelling them as empty files.
            if not lowered.endswith((".txt", ".pdf")):
                print(f"Skipping unsupported file: {filename}")
                continue

            try:
                if lowered.endswith(".txt"):
                    with open(file_path, "r", encoding="utf-8") as text_file:
                        content = text_file.read()
                else:
                    content = extract_text_from_pdf(file_path)

                if not content:
                    print(f"Skipping empty file: {filename}")
                    continue

                # Clean and chunk text
                cleaned_text = clean_text(content)
                chunks = chunk_text(cleaned_text)

                # Categorize and apply prompt template to each chunk
                category = categorize_file(filename, subdir)

                for chunk in chunks:
                    template = apply_template(category, chunk, filename)
                    dataset.append({
                        "filename": filename,
                        "category": category,
                        "instruction": template["instruction"],
                        "input": template["input"],
                        "output": template["output"]
                    })

            except Exception as e:
                # Best effort: one bad file must not abort the whole build.
                print(f"Error processing '{filename}': {e}")

    # Step 4: Save dataset to JSON file
    with open(output_file, "w", encoding="utf-8") as json_file:
        json.dump(dataset, json_file, indent=4)

    print("\nDataset preparation completed. Output saved as:", output_file)




In [None]:
# Root folder holding the 'icc' and 'mcc_rules' subdirectories.
PATH = 'c:\\Users\\msaip\\Projects\\python_chatbot\\data'
# Write the dataset into the data folder itself, so it lands at the
# location DATASET_PATH points to regardless of the working directory.
prepare_finetuning_dataset(PATH, os.path.join(PATH, "finetune_dataset.json"))


Found 126 files in 'c:\Users\msaip\Projects\python_chatbot\data\icc':
  - annual-report_1.pdf
  - annual-report_10.pdf
  - annual-report_11.pdf
  - annual-report_12.pdf
  - annual-report_13.pdf
  - annual-report_14.pdf
  - annual-report_15.pdf
  - annual-report_16.pdf
  - annual-report_17.pdf
  - annual-report_18.pdf
  - annual-report_19.pdf
  - annual-report_2.pdf
  - annual-report_20.pdf
  - annual-report_21.pdf
  - annual-report_22.pdf
  - annual-report_23.pdf
  - annual-report_24.pdf
  - annual-report_25.pdf
  - annual-report_26.pdf
  - annual-report_27.pdf
  - annual-report_28.pdf
  - annual-report_29.pdf
  - annual-report_3.pdf
  - annual-report_30.pdf
  - annual-report_31.pdf
  - annual-report_32.pdf
  - annual-report_33.pdf
  - annual-report_4.pdf
  - annual-report_5.pdf
  - annual-report_6.pdf
  - annual-report_7.pdf
  - annual-report_8.pdf
  - annual-report_9.pdf
  - code-of-conduct_1.pdf
  - code-of-conduct_2.pdf
  - decision-review-system_1.pdf
  - decision-review-system_2

incorrect startxref pointer(1)


Error extracting text from c:\Users\msaip\Projects\python_chatbot\data\icc\playing-handbook_12.pdf: EOF marker not found
Skipping empty file: playing-handbook_12.pdf

Found 44 files in 'c:\Users\msaip\Projects\python_chatbot\data\mcc_rules':
  - appeals.txt
  - batsman-out-of-his-her-ground.txt
  - batsman-s-innings;-runners.txt
  - boundaries.txt
  - bowled.txt
  - bye-and-leg-bye.txt
  - caught.txt
  - covering-the-pitch.txt
  - dead-ball.txt
  - declaration-and-forfeiture.txt
  - fielders-absence;-substitutes.txt
  - hit-the-ball-twice.txt
  - hit-wicket.txt
  - innings.txt
  - intervals.txt
  - law-appendices.txt
  - leg-before-wicket.txt
  - no-ball.txt
  - obstructing-the-field.txt
  - players-conduct.txt
  - practice-on-the-field.txt
  - preamble-to-the-laws-spirit-of-cricket.txt
  - preparation-and-maintenance-of-the-playing-area.txt
  - run-out.txt
  - scoring-runs.txt
  - start-of-play;-cessation-of-play.txt
  - stumped.txt
  - the-ball.txt
  - the-bat.txt
  - the-creases.txt

## 2. Fine-tune Mistral-7B

In [None]:
!pip install -q transformers bitsandbytes trl peft sentencepiece wandb
!pip install -q setuptools
import os
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig, HfArgumentParser, pipeline, logging
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

: 

In [None]:
!huggingface-cli login

^C


In [None]:
# Base model identifier, dataset location and the name for the fine-tuned model.
MODEL_NAME = 'mistralai/Mistral-7B-v0.1'
# NOTE(review): absolute, machine-specific Windows path — adjust when running elsewhere (e.g. on Colab).
DATASET_PATH = 'c:\\Users\\msaip\\Projects\\python_chatbot\\data\\finetune_dataset.json'
FINETUNED_MODEL = 'mistral-7b-finetuned-cricket'

# QLoRA parameters
# NOTE(review): presumably consumed by a LoraConfig built in a later cell — confirm.
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1

# bitsandbytes parameters (used when the BitsAndBytesConfig is built)
use_4bit = True  # passed as load_in_4bit
bnb_4bit_compute_type = 'float16'  # name of a torch attribute, resolved with getattr(torch, ...)
bnb_4bit_quant_type = 'nf4'  # passed as bnb_4bit_quant_type
use_nested_quant = False  # passed as bnb_4bit_use_double_quant

# TrainingArguments parameters
# NOTE(review): presumably forwarded to transformers.TrainingArguments in a later cell — confirm.
output_dir = './results'
num_train_epochs = 1
fp16 = True
bf16 = False
per_device_train_batch_size = 1
per_device_eval_batch_size = 1
gradient_accumulation_steps = 1
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 0
logging_steps = 25

# SFT parameters
# NOTE(review): presumably passed to SFTTrainer / the model loader in a later cell — confirm.
max_seq_length = None
packing = False
device_map = {"": 0}

In [None]:
# !pip uninstall torch torchvision torchaudio -y
# !python -m pip install --upgrade pip setuptools wheel
!pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cu121


Looking in indexes: https://download.pytorch.org/whl/cu121


ERROR: Could not find a version that satisfies the requirement torch (from versions: none)
ERROR: No matching distribution found for torch


In [None]:
# Set the allocator option before this cell makes any CUDA call.
# NOTE(review): the variable is presumably only honored if the CUDA caching
# allocator has not been initialized yet; if memory was already allocated
# in this session, restart the kernel and run this cell first — confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# Release cached, unused GPU memory held by the allocator.
torch.cuda.empty_cache()

In [None]:
# dataset = json.load(open(DATASET_PATH, 'r'))
# Resolve the dtype name (e.g. 'float16') to the torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_type)

# 4-bit quantization settings for loading the base model.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    # The keyword is `bnb_4bit_compute_dtype`; the previous
    # `bnb_4bit_compute_type` is not a BitsAndBytesConfig parameter, so
    # the requested compute dtype was never applied.
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Hint at bf16 when the GPU has compute capability >= 8. Only query the
# device when CUDA is usable: on a CPU-only torch build
# get_device_capability() raises "Torch not compiled with CUDA enabled".
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("GPU supports bfloat16: consider training with bf16=True")
        print("=" * 80)


AssertionError: Torch not compiled with CUDA enabled