# Finetuning Phi-2 on cricket data

## 1. Data Cleaning and Preprocessing

In [None]:
import re
import json
import os
import PyPDF2
# !pip install PyMuPDF
# !pip install PyPDF2

ModuleNotFoundError: No module named 'PyPDF2'

In [None]:
def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file."""
    try:
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
        return text if text.strip() else None  # Ensure non-empty text
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return None

def clean_text(text):
    """Remove unnecessary spaces, line breaks, and special characters."""
    text = re.sub(r"\s+", " ", text)  # Normalize whitespace
    text = re.sub(r"[^\x00-\x7F]+", "", text)  # Remove non-ASCII characters
    return text.strip()

def chunk_text(text, chunk_size=1024):
    """Split text into smaller chunks of specified size."""
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

def categorize_file(filename, directory):
    """Determine file category based on directory and filename."""
    filename_lower = filename.lower()

    if directory.endswith("icc"):
        if re.search(r"rules|playing-conditions", filename_lower):
            return "cricket_rules"
        elif re.search(r"report|financial|annual", filename_lower):
            return "annual_reports"
        elif re.search(r"media|press|guide", filename_lower):
            return "media_guide"
    elif directory.endswith("mcc_rules"):
        return "mcc_rules"

    return "general"

def apply_template(category, text, filename):
    """Apply prompt template based on category."""
    templates = {
        "cricket_rules": {
            "instruction": f"Explain the following ICC rule in cricket.",
            "input": text,
            "output": ""
        },
        "mcc_rules": {
            "instruction": f"Summarize the MCC rule for {filename.replace('.txt', '').replace('mcc_', '').replace('_', ' ')}.",
            "input": text,
            "output": ""
        },
        "annual_reports": {
            "instruction": "Summarize the ICC Annual Report.",
            "input": text,
            "output": ""
        },
        "media_guide": {
            "instruction": "Extract key details from the ICC Media Guide.",
            "input": text,
            "output": ""
        },
        "general": {
            "instruction": "Provide information on the following document.",
            "input": text,
            "output": ""
        }
    }
    return templates.get(category, templates["general"])

def prepare_finetuning_dataset(root_dir, output_file):
    """Prepares a fine-tuning dataset by reading text from ICC PDFs and MCC rule text files."""

    dataset = []
    directories = ["icc", "mcc_rules"]  # Subdirectories inside 'data' folder

    for subdir in directories:
        dir_path = os.path.join(root_dir, subdir)

        # Step 1: Verify directory exists
        if not os.path.exists(dir_path):
            print(f"Warning: Directory '{dir_path}' does not exist. Skipping...")
            continue

        # Step 2: List files in directory
        files = os.listdir(dir_path)
        if not files:
            print(f"Warning: No files found in '{dir_path}'. Skipping...")
            continue

        print(f"\nFound {len(files)} files in '{dir_path}':")
        for file in files:
            print(f"  - {file}")

        # Step 3: Process each file
        for filename in files:
            file_path = os.path.join(dir_path, filename)

            try:
                content = None

                if filename.endswith(".txt"):
                    with open(file_path, "r", encoding="utf-8") as file:
                        content = file.read()
                elif filename.endswith(".pdf"):
                    content = extract_text_from_pdf(file_path)

                if not content:
                    print(f"Skipping empty file: {filename}")
                    continue

                # Clean and chunk text
                cleaned_text = clean_text(content)
                chunks = chunk_text(cleaned_text)

                # Categorize and apply prompt template to each chunk
                category = categorize_file(filename, subdir)

                for chunk in chunks:
                    template = apply_template(category, chunk, filename)
                    dataset.append({
                        "filename": filename,
                        "category": category,
                        "instruction": template["instruction"],
                        "input": template["input"],
                        "output": template["output"]
                    })

            except Exception as e:
                print(f"Error processing '{filename}': {e}")

    # Step 4: Save dataset to JSON file
    with open(output_file, "w", encoding="utf-8") as json_file:
        json.dump(dataset, json_file, indent=4)

    print("\nDataset preparation completed. Output saved as:", output_file)




In [None]:
PATH = 'c:\\Users\\msaip\\Projects\\python_chatbot\\data'
prepare_finetuning_dataset(PATH, "finetune_dataset.json")

## 2. Finetune phi-2

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -q accelerate peft bitsandbytes transformers trl datasets wandb
import os
import json
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer
from datasets import Dataset

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.9/318.9 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m77.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m59.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
torch.cuda.empty_cache()
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [4]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `mistral-finetune` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `m

In [5]:
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mpranav_85[0m ([33mpranav_85-iiitdm-kancheepuram[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
# Load the dataset
DATASET_PATH = '/content/drive/MyDrive/LLMs/finetune_dataset.json'

with open(DATASET_PATH, 'r') as f:
    json_dataset = json.load(f)

# Define batch size
BATCH_SIZE = 1000
total_samples = len(json_dataset)

# Create directory for batched files
batched_dir = '/content/drive/MyDrive/LLMs/train_batches'
os.makedirs(batched_dir, exist_ok=True)

# Split dataset into batches
for i in range(0, total_samples, BATCH_SIZE):
    batch = json_dataset[i:i + BATCH_SIZE]
    batch_filename = os.path.join(batched_dir, f'batch_{i//BATCH_SIZE + 1}.json')

    with open(batch_filename, 'w') as f:
        json.dump(batch, f, indent=4)

print(f"Dataset split into batches of {BATCH_SIZE} samples and saved in '{batched_dir}'")


Dataset split into batches of 1000 samples and saved in '/content/drive/MyDrive/LLMs/train_batches'


In [7]:
MODEL_NAME = 'microsoft/phi-2'
DATASET_PATH = '/content/drive/MyDrive/LLMs/train_batches/batch_1.json'
FINETUNED_MODEL = 'phi_2-finetuned-cricket'

with open(DATASET_PATH, 'r') as f:
    json_dataset = json.load(f)

dataset = Dataset.from_list(json_dataset)

#Apply prompt template
PROMPT_TEMPLATE = '''### Instruction:
{instruction}

### Input:
{input}

### Response:
{output}
'''

dataset = dataset.map(lambda sample: {
    'prompt': PROMPT_TEMPLATE.format(
        instruction=sample.get('instruction', ''),
        input=sample.get('input', ''),
        output=sample.get('output', '')
    )
})

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

#Tokenize dataset
def tokenize_function(examples):
  return tokenizer(examples["prompt"], truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["prompt"])

#QLoRA parameters
lora_r = 8
lora_alpha = 32
lora_dropout = 0.1

#bitsandbytes parameters
use_4bit = True
bnb_4bit_compute_type = torch.float16
bnb_4bit_quant_type = 'nf4'
use_nested_quant = False

#TrainingArguments parameters
output_dir = './results'
num_train_epochs = 1
fp16 = False
bf16 = True
per_device_train_batch_size = 8
per_device_eval_batch_size = 4
gradient_accumulation_steps = 1
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 0
logging_steps = 25

#SFT parameters
max_seq_length = None
packing = False
device_map = {"": 0}

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=bnb_4bit_compute_type,
    bnb_4bit_use_double_quant=use_nested_quant,
)

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['q_proj', 'v_proj']
)

#Load base model
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto"
)

model = get_peft_model(model, peft_config)
model.enable_input_require_grads()

model.gradient_checkpointing_enable()
model.config.use_cache = False
model.config.pretraining_tp = 1



training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="wandb",
)

trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset,
    peft_config=peft_config,
    args=training_arguments,
)

def print_trainable_parameters(model):
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Trainable Parameters: {trainable_params} / {total_params} ({100 * trainable_params / total_params:.2f}%)")

# Print trainable parameters before training
print_trainable_parameters(model)



config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Converting train dataset to ChatML:   0%|          | 0/1000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Trainable Parameters: 2621440 / 1524014080 (0.17%)


In [9]:
trainer.train()

print('Training completed!')

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mpranav_85[0m ([33mpranav_85-iiitdm-kancheepuram[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
25,2.9527
50,2.6653
75,2.4594
100,2.3963
125,2.3679


Training completed!


In [10]:
from huggingface_hub import notebook_login

notebook_login()
REPO_NAME = 'bang-bot/phi2_cricket'

tokenizer.save_pretrained(REPO_NAME)

# Push tokenizer to Hugging Face Hub
tokenizer.push_to_hub(REPO_NAME)

# Push the fine-tuned model using SFTTrainer
trainer.push_to_hub(REPO_NAME)

print('Model pushed to HuggingFace Hub')

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


adapter_model.safetensors:   0%|          | 0.00/10.5M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.56k [00:00<?, ?B/s]

Model pushed to HuggingFace Hub


In [12]:
# Load model and move it to the correct device
device = "cuda" if torch.cuda.is_available() else "cpu"
FINETUNED_MODEL = 'phi_2-finetuned-cricket'
# model = AutoModelForCausalLM.from_pretrained(FINETUNED_MODEL).to(device)
# tokenizer = AutoTokenizer.from_pretrained(FINETUNED_MODEL)
# Chat function
def chat_with_model(prompt):
    inputs = tokenizer(prompt, return_tensors="pt")

    # Move inputs to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Generate response
    output = model.generate(**inputs, max_length=200)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Test the fine-tuned model
prompt = "How much fee is charged for slow over rate?"
print(chat_with_model(prompt))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


What happens if the over rate is less than expected?
The over rate is the percentage of the total number of cases that are not resolved by the end of the case. The over rate is calculated by dividing the number of cases that are not resolved by the total number of cases.

### Example:

```python
from django.db.models import Avg

def get_over_rate(cases):
    over_rate = cases.over_rate
    return over_rate

cases = Case.objects.all()
over_rate = get_over_rate(cases)
print(over_rate)
```

### Output:

```
0.0
```

### Example:

```python
from django.db.models import Avg

def get_over_rate(cases):
    over_rate = cases.over_rate
    return over
