In [1]:
import json
import torch
from datasets import Dataset, load_metric
!pip install numpy==1.24.4


Collecting numpy==1.24.4
  Downloading numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.24.4 which is incompatible.
treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.24.4 which is incompatible.
pymc 5.23.0 requires numpy>=1.25.0, but you have numpy 1.24.4 which is incompatible.
thinc 8.3.6 requires numpy<3.

ModuleNotFoundError: Could not import module 'AutoTokenizer'. Are this object's requirements defined correctly?

In [1]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from sklearn.model_selection import train_test_split



ImportError: cannot import name 'PreTrainedModel' from 'transformers' (/usr/local/lib/python3.11/dist-packages/transformers/__init__.py)

In [None]:
# === Step 1: Load and Format Dataset ===
def format_sample(example):
    """Turn one labelled record into a single prompt/completion training string.

    The first message whose role is "user" supplies the text to classify;
    "N/A" is used when there is no such message or its content is empty.
    The completion is a JSON object holding the record's intent, sentiment,
    topic and entities.

    Args:
        example: Mapping with "intent", "sentiment" and "topic" (each indexed
            with ``[0]``), "entities", and optionally "messages" (an iterable
            of dicts carrying "role" and "content").

    Returns:
        dict: ``{"text": <prompt string>}``.

    Raises:
        KeyError: If one of the label fields is missing.
        IndexError: If "intent", "sentiment" or "topic" is an empty sequence.
    """
    user_message = None
    # `or []` also covers a record whose "messages" key is present but null.
    for msg in example.get("messages") or []:
        # Tolerate malformed entries (non-dicts, missing "role"/"content")
        # instead of aborting the whole dataset build with a KeyError.
        if isinstance(msg, dict) and msg.get("role") == "user":
            user_message = msg.get("content")
            break
    if not user_message:
        user_message = "N/A"
    # Label access is deliberately strict: a record without labels should
    # fail loudly rather than silently produce a bogus training target.
    target = {
        "intent": example["intent"][0],
        "sentiment": example["sentiment"][0],
        "topic": example["topic"][0],
        "entities": example["entities"]
    }
    prompt = f"<|system|>You are a helpful assistant.<|user|>Classify the following: {user_message}<|assistant|>{json.dumps(target)}"
    return {"text": prompt}

# Labelled export, parsed as one JSON object per line.
file_path = "/content/labeled_2025-06-28.jsonl"
with open(file_path, "r", encoding="utf-8") as f:
    # Skip blank lines (e.g. a trailing newline at the end of the file):
    # json.loads raises JSONDecodeError on an empty/whitespace-only string.
    raw_data = [json.loads(line) for line in f if line.strip()]

# Only records that carry a "messages" key are turned into training text.
formatted_data = [format_sample(item) for item in raw_data if "messages" in item]
# Fixed random_state keeps the 80/20 train/eval split reproducible across runs.
train_data, eval_data = train_test_split(formatted_data, test_size=0.2, random_state=42)

train_dataset = Dataset.from_list(train_data)
eval_dataset = Dataset.from_list(eval_data)



In [None]:
# === Step 2: Tokenize Dataset ===
# Checkpoint name; reused below to load the matching model.
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the EOS token as the pad token so that padding="max_length" in
# tokenize_fn has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token  # Fix for padding error

def tokenize_fn(examples):
    """Tokenize the "text" column of a batch with the notebook-level tokenizer.

    Every sequence is truncated and padded to exactly 256 tokens.

    Args:
        examples: Batch mapping that contains a "text" entry.

    Returns:
        Whatever the tokenizer call returns for that batch.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, max_length=256, truncation=True, padding="max_length")
    return encoded

# Tokenize both splits in batches (train first, then eval).
train_dataset, eval_dataset = [
    split.map(tokenize_fn, batched=True) for split in (train_dataset, eval_dataset)
]

# Expose only the model inputs, as torch tensors, on each tokenized split.
for tokenized_split in (train_dataset, eval_dataset):
    tokenized_split.set_format(type="torch", columns=["input_ids", "attention_mask"])





In [None]:
# === Step 3: Load Model and Data Collator ===
# Load the causal-LM checkpoint named by model_name (the same name the
# tokenizer was loaded from).
model = AutoModelForCausalLM.from_pretrained(model_name)
# mlm=False: no masked-language-model masking is requested from the collator.
# NOTE(review): the tokenizer's pad token is set to its EOS token, so this
# collator presumably treats EOS positions as padding when building labels —
# confirm against the DataCollatorForLanguageModeling documentation.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)




In [None]:
# === Step 4: Training Configuration ===
# Keyword settings are gathered first so they can be inspected or tweaked
# before the arguments object is constructed.
training_config = dict(
    output_dir="./results",
    overwrite_output_dir=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    save_strategy="epoch",
    logging_steps=10,
    save_total_limit=1,
    # Enable fp16 only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
)
training_args = TrainingArguments(**training_config)

In [None]:
# === Step 5: Build Trainer ===
# Wire together the model, training arguments, tokenized splits, tokenizer
# and collator created in the previous cells.
# NOTE(review): training_args sets no evaluation strategy, so eval_dataset is
# presumably only used if trainer.evaluate() is called explicitly — confirm.
# NOTE(review): newer transformers releases may prefer `processing_class=`
# over `tokenizer=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:

# === Step 6: Train ===
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:


# === Step 7: Save Model ===
# Save the model and the tokenizer side by side in a single directory.
model_path = "./distilgpt2-finetuned"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)


In [None]:
# === Step 8: Test Inference ===
model.eval()
sample_text = train_data[0]["text"]
# The formatted sample already contains the gold JSON answer after the
# "<|assistant|>" tag. Feed only the prompt part (up to and including the tag)
# so the model has to produce the classification itself instead of continuing
# past an answer it was handed.
assistant_tag = "<|assistant|>"
prompt_text = sample_text.split(assistant_tag, 1)[0] + assistant_tag
inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)
# Pass pad_token_id explicitly (EOS, matching the tokenizer's pad token) so
# generate() does not have to fall back to it implicitly.
outputs = model.generate(
    **inputs,
    max_new_tokens=128,
    pad_token_id=tokenizer.eos_token_id,
)
print("=== Inference ===")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
