In [1]:
!pip install transformers datasets trl torch accelerate bitsandbytes peft

Collecting transformers
  Using cached transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting datasets
  Using cached datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting trl
  Using cached trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Collecting torch
  Using cached torch-2.6.0-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting accelerate
  Using cached accelerate-1.4.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Using cached bitsandbytes-0.45.3-py3-none-win_amd64.whl.metadata (5.1 kB)
Collecting peft
  Using cached peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting filelock (from transformers)
  Using cached filelock-3.17.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers)
  Using cached huggingface_hub-0.29.2-py3-none-any.whl.metadata (13 kB)
Collecting numpy>=1.17 (from transformers)
  Using cached numpy-2.2.3-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting pyyaml>=5.1 (from transformers)
  

In [None]:
!pip install setuptools





In [1]:
import pandas as pd

# Load the dataset (assuming it's saved as 'Dataset.xlsx')
df = pd.read_excel("Dataset.xlsx")

# Basic dataset info
print(f"Total dialogues: {len(df)}")
print("Columns:", df.columns.tolist())

# Preview first few rows
df.head(3)


Total dialogues: 21434
Columns: ['npc_role', 'player_input', 'npc_response', 'emotion']


Unnamed: 0,npc_role,player_input,npc_response,emotion
0,Mechanic,"Hey, I heard you're the best when it comes to ...","Absolutely, I know my way around an engine lik...",Confidence
1,Mechanic,"Hey, I heard you're the best when it comes to ...",No doubt about it. I take pride in delivering ...,Confidence
2,Mechanic,"Hey, I heard you're the best when it comes to ...",You can count on me—I've handled tougher engin...,Confidence


In [2]:
print(df['npc_role'].value_counts().head(5))


npc_role
Tourist                    3743
Mechanic                   3079
Office Corporate Worker    3042
Security Guard             2669
Sweeper                    2346
Name: count, dtype: int64


In [3]:
df['emotion'].nunique()

40

In [4]:
df['player_length'] = df['player_input'].str.split().str.len()
df['response_length'] = df['npc_response'].str.split().str.len()
print("\nAverage player_input length:", df['player_length'].mean())
print("Average npc_response length:", df['response_length'].mean())


Average player_input length: 9.274610432023888
Average npc_response length: 11.670523467388263


In [5]:
import json
from datasets import Dataset

# Create JSON Schema prompt for each example using the given column names
def create_prompt(row):
    data = {
        "npc_role": row["npc_role"],
        "player_input": row["player_input"],
        "emotion": row["emotion"],
        "npc_response": row["npc_response"]
    }
    return json.dumps(data)

df["text"] = df.apply(create_prompt, axis=1)

# Convert DataFrame to Hugging Face Dataset (using only the JSON schema text)
dataset = Dataset.from_pandas(df[["text"]])
print("\nFirst 3 examples from the dataset:")
for i in range(3):
    print(dataset[i])


  from .autonotebook import tqdm as notebook_tqdm



First 3 examples from the dataset:
{'text': '{"npc_role": "Mechanic", "player_input": "Hey, I heard you\'re the best when it comes to fixing engines.", "emotion": "Confidence", "npc_response": "Absolutely, I know my way around an engine like the back of my hand. Trust me, you\'re in good hands."}'}
{'text': '{"npc_role": "Mechanic", "player_input": "Hey, I heard you\'re the best when it comes to fixing engines.", "emotion": "Confidence", "npc_response": "No doubt about it. I take pride in delivering top-notch work every time."}'}
{'text': '{"npc_role": "Mechanic", "player_input": "Hey, I heard you\'re the best when it comes to fixing engines.", "emotion": "Confidence", "npc_response": "You can count on me\\u2014I\'ve handled tougher engines than this one."}'}


In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

model_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the model on CPU (no quantization; bitsandbytes is not used)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cpu"
)

# Set up LoRA configuration for parameter-efficient fine-tuning
lora_config = LoraConfig(
    r=16,             # Low-rank dimension (adjustable)
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Attach LoRA adapters to the model
model = get_peft_model(model, lora_config)
print("\nTrainable parameters:")
model.print_trainable_parameters()


The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.



Trainable parameters:
trainable params: 921,600 || all params: 135,436,608 || trainable%: 0.6805


In [7]:
# Tokenization function for the JSON schema text
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_dataset = tokenized_dataset.filter(lambda x: len(x["input_ids"]) > 0)
print("\nSample tokenized output:")
print(tokenized_dataset[0])


Map: 100%|██████████| 21434/21434 [00:01<00:00, 18083.20 examples/s]
Filter: 100%|██████████| 21434/21434 [00:01<00:00, 14135.98 examples/s]


Sample tokenized output:
{'input_ids': [39428, 4413, 83, 79, 10305, 1799, 476, 6407, 2410, 286, 1002, 476, 15284, 79, 4525, 1799, 476, 22234, 28, 339, 3984, 346, 2316, 260, 1450, 645, 357, 2216, 288, 21276, 9396, 14069, 476, 391, 6430, 1799, 476, 19168, 1667, 1002, 476, 4413, 83, 79, 7639, 1799, 476, 42686, 8234, 28, 339, 699, 957, 970, 1130, 354, 2327, 702, 260, 1056, 282, 957, 1369, 30, 10306, 549, 28, 346, 2316, 281, 1123, 3288, 1270, 109], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}





In [8]:
# Custom data collator to pad and create labels for causal LM training
def custom_data_collator(features):
    batch = tokenizer.pad(features, return_tensors="pt")
    batch["input_ids"] = batch["input_ids"].long()
    if "attention_mask" in batch:
        batch["attention_mask"] = batch["attention_mask"].long()
    batch["labels"] = batch["input_ids"].clone()
    return batch


In [9]:
import json

# Create and save a DeepSpeed configuration file for efficient training
deepspeed_config = {
  "zero_optimization": {
    "stage": 2,
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": True,
      "buffer_count": 4
    },
    "contiguous_gradients": True,
    "overlap_comm": False,
    "allgather_partitions": True,
    "reduce_scatter": True
  },
  "fp16": {
    "enabled": True
  }
}

with open("deepspeed_config.json", "w") as f:
    json.dump(deepspeed_config, f, indent=2)

print("DeepSpeed config saved as 'deepspeed_config.json'.")


DeepSpeed config saved as 'deepspeed_config.json'.


In [11]:
# Split the tokenized dataset into training and validation sets (90% train, 10% eval)
split_datasets = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_datasets["train"]
eval_dataset = split_datasets["test"]

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./SmolLM2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    save_steps=500,
    logging_steps=50,
    learning_rate=5e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=custom_data_collator,
)

# Train the model with tqdm progress bars automatically shown by the Trainer,
# then save the fine-tuned model and tokenizer.
trainer.train()
trainer.save_model("./SmolLM2-finetuned")
tokenizer.save_pretrained("./SmolLM2-finetuned")


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
0,1.3983,1.233498
1,1.2994,1.167496
2,1.2829,1.147907


('./SmolLM2-finetuned\\tokenizer_config.json',
 './SmolLM2-finetuned\\special_tokens_map.json',
 './SmolLM2-finetuned\\vocab.json',
 './SmolLM2-finetuned\\merges.txt',
 './SmolLM2-finetuned\\added_tokens.json',
 './SmolLM2-finetuned\\tokenizer.json')

In [12]:
from transformers import pipeline

import json

# Define a function to generate an NPC response using JSON schema,
# extracting only the 'npc_response' field from the output.
def generate_npc_response(npc_role: str, player_input: str, emotion: str, generator, max_length: int = 100) -> str:
    prompt_dict = {
        "npc_role": npc_role,
        "player_input": player_input,
        "emotion": emotion,
        "npc_response": ""
    }
    prompt = json.dumps(prompt_dict)
    output = generator(prompt, max_length=max_length, num_return_sequences=1)
    generated_text = output[0]["generated_text"]
    try:
        generated_json = json.loads(generated_text)
    except json.JSONDecodeError:
        # Attempt to extract JSON substring if full decoding fails
        start = generated_text.find("{")
        end = generated_text.rfind("}") + 1
        json_str = generated_text[start:end]
        try:
            generated_json = json.loads(json_str)
        except Exception:
            return "Error in JSON decoding"
    return generated_json.get("npc_response", "").strip()

# Initialize the text-generation pipeline using the fine-tuned model
generator = pipeline("text-generation", model="./SmolLM2-finetuned", tokenizer="./SmolLM2-finetuned")

# Generate a sample NPC response and print only the npc_response in JSON format
npc_response = generate_npc_response(
    npc_role="Mechanic",
    player_input="My car's engine is making weird sounds; can you help?",
    emotion="Confidence",
    generator=generator
)
print(json.dumps({"NPC_Response": npc_response}, indent=2))


Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{
  "NPC_Response": ""
}


In [13]:
import math
from nltk.translate.bleu_score import sentence_bleu
import nltk
nltk.download('punkt')

# Evaluate model performance using perplexity and BLEU score

# 1. Compute Perplexity using the Trainer's evaluation (ensure your evaluation dataset is set up)
eval_results = trainer.evaluate()
eval_loss = eval_results.get("eval_loss", eval_results.get("loss"))
perplexity = math.exp(eval_loss)
print("\nValidation Perplexity:", perplexity)

# 2. Compute average BLEU score on 10 samples from the original dataset
bleu_scores = []
for i in range(10):
    sample = json.loads(dataset[i]["text"])
    prompt_dict = {
        "npc_role": sample["npc_role"],
        "player_input": sample["player_input"],
        "emotion": sample["emotion"],
        "npc_response": ""
    }
    prompt = json.dumps(prompt_dict)
    output = generator(prompt, max_length=100, num_return_sequences=1)
    generated_text = output[0]["generated_text"]
    try:
        generated_json = json.loads(generated_text)
    except:
        start = generated_text.find("{")
        end = generated_text.rfind("}") + 1
        json_str = generated_text[start:end]
        try:
            generated_json = json.loads(json_str)
        except:
            generated_json = {"npc_response": ""}
    reference = sample["npc_response"].split() 
    hypothesis = generated_json.get("npc_response", "").split()
    bleu = sentence_bleu([reference], hypothesis)
    bleu_scores.append(bleu)

avg_bleu = sum(bleu_scores) / len(bleu_scores)
print("Average BLEU score on 10 samples:", avg_bleu)


ModuleNotFoundError: No module named 'nltk'