In [1]:
# Install necessary package
!pip install -U --no-cache-dir bitsandbytes


Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.

In [2]:
# Imports
import json
from typing import List
from pydantic import BaseModel
from datasets import Dataset, load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    pipeline
)
from peft import LoraConfig, get_peft_model, PeftModel
import torch

2025-05-05 08:12:20.317170: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746432740.501064      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746432740.552090      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Pydantics


In [3]:
# --- Pydantic Model for Input Validation ---
class EventRecord(BaseModel):
    event_text: str
    output: dict


# Data Pre-processing 

In [4]:
# --- Load and Prepare Data ---
def load_jsonl_dataset(file_path: str) -> Dataset:
    """Load a .jsonl dataset into a HuggingFace Dataset"""
    with open(file_path, "r") as f:
        samples = [EventRecord(**json.loads(line)).dict() for line in f]
    return Dataset.from_list(samples)

# --- Tokenization ---
def tokenize(example: dict) -> dict:
    """Tokenizes input/output pair and creates labels for training."""
    input_text = example["event_text"]
    output_text = json.dumps(example["output"], ensure_ascii=False)
    full_text = f"{input_text}\n{output_text}"

    # Tokenize combined text
    tokenized = tokenizer(
        full_text,
        padding="max_length",
        truncation=True,
        max_length=512,
    )

    # Create labels (mask input part)
    input_ids = tokenizer(input_text, truncation=True, max_length=512)["input_ids"]
    input_len = len(input_ids)
    labels = tokenized["input_ids"][:]
    labels[:input_len] = [-100] * input_len
    tokenized["labels"] = labels

    return tokenized

In [5]:
# --- Load and Split Dataset ---
data_path = "/kaggle/input/keyword-extraction-calender-dataset/event_text_mapping.jsonl"
dataset = load_jsonl_dataset(data_path)
dataset = dataset.train_test_split(test_size=0.1)

# --- Load Tokenizer ---
model_id = "HuggingFaceTB/SmolLM-360M"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token

# --- Tokenize and Save Dataset ---
tokenized_dataset = dataset.map(tokenize)
save_path = "./tokenized_event_dataset"
tokenized_dataset.save_to_disk(save_path)

# --- Load Tokenized Dataset ---
tokenized_dataset = load_from_disk(save_path)
train_dataset = tokenized_dataset["train"]
eval_dataset = tokenized_dataset["test"]

# --- Load Model ---
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")


/tmp/ipykernel_31/3067122703.py:5: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  samples = [EventRecord(**json.loads(line)).dict() for line in f]


tokenizer_config.json:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

Map:   0%|          | 0/712 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/712 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/80 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.45G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

# Configurations

In [7]:
# --- LoRA Configuration ---
target_modules = ["q_proj", "v_proj", "k_proj", "o_proj", "up_proj", "down_proj"]
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=target_modules,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, peft_config)


# --- Training Configuration ---
training_args = TrainingArguments(
    output_dir="./results_lora",
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="./logs_lora",
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    learning_rate=2e-5,
    max_grad_norm=1.0,
    lr_scheduler_type="cosine",
    gradient_checkpointing=False,
    report_to="none",
    fp16=True
)

# --- Define Trainer ---
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


# Training

In [None]:
# --- Train and Save ---
trainer.train()
trainer.save_model("./fine_tuned_lora_model")
print("✅ LoRA fine-tuning complete! Model saved to ./fine_tuned_lora_model")


# Inference

In [34]:
# --- Inference Function ---
def inference(input_text: str):
    """Run inference on the fine-tuned model."""
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token

    base_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
    # model = PeftModel.from_pretrained(base_model, "./fine_tuned_lora_model")
    model = PeftModel.from_pretrained(base_model, "/kaggle/input/finetuned_calender_extract/transformers/default/1")
    model.eval()

    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=False,
            temperature=0.0,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    output = tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)

    match = re.search(r'\{.*?\}', output, re.DOTALL)
    first_json = json.loads(match.group()) if match else None

    return first_json



In [37]:
# --- Run Sample Inference ---
inference("Meeting on 05 - December - 2023, 3pm, lasting 1 hour, with Sarah and James on Google Meet.")

{'action': 'Meeting',
 'attendees': ['Sarah', 'James'],
 'date': '05/12/2023',
 'duration': '1 hour',
 'location': 'Google Meet',
 'notes': None,
 'recurrence': None,
 'time': '3:00 PM'}

# Download model by zipping

In [None]:
import shutil
from IPython.display import FileLink

def download_folder_as_zip(folder_path = "/kaggle/working/fine_tuned_lora_model", zip_name="finetuned_way2.zip"):
    """
    Zips the given folder and creates a download link for it in a Kaggle notebook.
    
    Args:
        folder_path (str): Path to the folder to be zipped.
        zip_name (str): Name of the zip file to create.
    """
    # Zip the folder
    shutil.make_archive(zip_name.replace(".zip", ""), 'zip', folder_path)
    
    # Provide a download link
    display(FileLink(zip_name))

In [None]:
download_folder_as_zip()