In [13]:
# fine-tuning Phi-2 using the instruction-labeled dataset
import os


In [14]:
import json
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
import torch

In [15]:
# Load training and evaluation datasets
# Each split lives in its own JSON-Lines file; the training path must not
# alias the evaluation file, otherwise the model would be trained on the very
# data it is later evaluated on.
# NOTE(review): the loading cell below reads "train_phi2.jsonl" /
# "eval_phi2.jsonl" by relative name instead of using these variables —
# consider wiring them together.
train_path = "/content/train_phi2.jsonl"
eval_path = "/content/eval_phi2.jsonl"



In [16]:
import pandas as pd
from datasets import Dataset

# Read both JSON-Lines splits into DataFrames (train first, then eval) ...
train_df, eval_df = (
    pd.read_json(jsonl_file, lines=True)
    for jsonl_file in ("train_phi2.jsonl", "eval_phi2.jsonl")
)

# ... and wrap each frame as a Hugging Face Dataset for the Trainer.
train_dataset, eval_dataset = (
    Dataset.from_pandas(split_df) for split_df in (train_df, eval_df)
)



In [17]:
# Load tokenizer and set pad token
# `model_id` is the Hub identifier; it is reused later when the model weights are loaded.
model_id = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token as the padding token so that the
# padding="max_length" tokenization further down has a pad token to use.
# NOTE(review): with pad == EOS, a collator that masks pad positions in the
# labels presumably also masks genuine EOS tokens — confirm this is acceptable.
tokenizer.pad_token = tokenizer.eos_token



In [18]:
# Tokenization function
def tokenize_function(example):
    """Tokenize the ``"prompt"`` field of ``example`` to a fixed length.

    The text is truncated and padded so that every encoding is exactly
    512 tokens long.

    Args:
        example: Mapping with a ``"prompt"`` entry (a dataset row as passed
            in by ``Dataset.map``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(example["prompt"], **encoding_options)



In [19]:

# Tokenize both splits. batched=True hands the tokenizer a list of prompts per
# call instead of a single row, which is considerably faster and yields the
# same columns because every sequence is padded/truncated to a fixed length.
train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)



Map:   0%|          | 0/4290 [00:00<?, ? examples/s]

Map:   0%|          | 0/1073 [00:00<?, ? examples/s]

In [20]:
# Load model
# Weights are loaded as bfloat16 when a CUDA device is available and as
# float32 otherwise; device_map="auto" leaves device placement to the library.
# NOTE(review): bfloat16 is selected whenever CUDA is present, without
# checking that the GPU actually supports it — confirm on the target hardware.
if torch.cuda.is_available():
    model_dtype = torch.bfloat16
else:
    model_dtype = torch.float32

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=model_dtype,
)



config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [22]:
# Define training arguments
# NOTE(review): no evaluation strategy is set, so `eval_dataset` is presumably
# only used if `trainer.evaluate()` is called explicitly — confirm intent.
training_args = TrainingArguments(
    output_dir="./phi2_finetuned",
    save_strategy="epoch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
)

# Define trainer
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of
# Trainer is deprecated and emits a FutureWarning on construction.
# The collator is built with mlm=False, i.e. causal-LM rather than masked-LM
# batches.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)



  trainer = Trainer(


In [None]:
# Train and save
# Run the fine-tuning loop, then write the resulting model to the output
# directory.
finetuned_dir = "./phi2_finetuned"
trainer.train()
trainer.save_model(finetuned_dir)



In [None]:
# Inference on sample inputs
model.eval()  # switch off training-only behaviour such as dropout
sample_inputs = [
    "I want to cancel my subscription now!",
    "Great service as always, thank you!",
    "I need to update my email address.",
    "Why was I charged extra this month?"
]

for sample in sample_inputs:
    # Use real newlines: a doubled backslash would embed the two characters
    # "\" and "n" in the prompt instead of a line break.
    prompt = f"Classify the following:\n{sample}\nReturn JSON:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=128,
            # Explicit pad id avoids the per-call "setting pad_token_id" warning.
            pad_token_id=tokenizer.eos_token_id,
        )
    # The returned sequence starts with the prompt tokens; decode only what was
    # generated after them. This is robust even if the model repeats the
    # "Return JSON:" marker in its answer.
    prompt_length = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    print(f"\n Input: {sample}")
    print("Output:", completion.strip())