In [1]:
import torch
import os
import json
import math
import evaluate
from torch.utils.data import Dataset, DataLoader, IterableDataset
import torch.nn.functional as F
from datasets import load_dataset, Dataset as HFDataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling, get_scheduler
from huggingface_hub import login
# If you encounter ModuleNotFoundError for '_bz2', you need to install the bzip2 system library.
# For Ubuntu/Debian: sudo apt-get install libbz2-dev
# For MacOS (Homebrew): brew install bzip2
# After installing, you may need to reinstall Python or rebuild it with bzip2 support.
# If using conda: conda install bzip2
# Then reinstall the affected Python packages if needed.
login("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")

In [2]:
# loading gpt2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Training parameters
max_length = 256
max_steps = 1200
learning_rate = 1e-4
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 10
weight_decay = 0.1
warmup_steps = 120
save_steps = 200
logging_steps = 50

print(f"Training configuration:")
print(f"Max length: {max_length}")
print(f"Max steps: {max_steps}")
print(f"Learning rate: {learning_rate}")
print(f"Batch size: {batch_size}")
print(f"Epochs: {num_train_epochs}")
print(f"Weight decay: {weight_decay}")
print(f"Warmup steps: {warmup_steps}")

Using device: cuda
Training configuration:
Max length: 256
Max steps: 1200
Learning rate: 0.0001
Batch size: 8
Epochs: 10
Weight decay: 0.1
Warmup steps: 120


In [3]:
# Loading adolescent chatbot dataset
def load_adolescent_chatbot_dataset(file_path):
    """Load and format the adolescent chatbot dataset"""
    conversations = []
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line.strip())
            # Format as conversation: user message + assistant response
            conversation = ""
            for message in data['messages']:
                if message['role'] == 'user':
                    conversation += f"User: {message['content']}\n"
                elif message['role'] == 'assistant':
                    conversation += f"Assistant: {message['content']}\n"
            conversations.append(conversation.strip())
    
    return conversations

# Load the dataset
dataset_path = "adolescent_chatbot_train.jsonl"
conversations = load_adolescent_chatbot_dataset(dataset_path)

print(f"Loaded {len(conversations)} conversations")
print(f"Sample conversation:\n{conversations[0][:200]}...")

# Split into train/validation (80/20)
train_size = int(0.8 * len(conversations))
train_conversations = conversations[:train_size]
val_conversations = conversations[train_size:]

print(f"Training samples: {len(train_conversations)}")
print(f"Validation samples: {len(val_conversations)}")

# Convert to HuggingFace Dataset format
train_dataset = HFDataset.from_dict({"text": train_conversations})
val_dataset = HFDataset.from_dict({"text": val_conversations})

Loaded 600 conversations
Sample conversation:
User: How does your relationship with your mother affect your autonomy?
Assistant: In this case, the respondent scored 30, 27, and 32 for mother-related items, and 27 for autonomy. This suggests that ...
Training samples: 480
Validation samples: 120


In [4]:
# Dataset class for adolescent chatbot training
class AdolescentChatbotDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length=256):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text = self.dataset[idx]["text"]

        # tokenize dataset with proper padding and truncation
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
        }

# dataset instances
train_dataset_custom = AdolescentChatbotDataset(train_dataset, tokenizer, max_length)
val_dataset_custom = AdolescentChatbotDataset(val_dataset, tokenizer, max_length)

print(f"Train dataset size: {len(train_dataset_custom)}")
print(f"Validation dataset size: {len(val_dataset_custom)}")

# tokenization
sample = train_dataset_custom[0]
print(f"Sample input_ids shape: {sample['input_ids'].shape}")
print(f"Sample attention_mask shape: {sample['attention_mask'].shape}")
print(f"Sample text: {tokenizer.decode(sample['input_ids'][:50])}")

Train dataset size: 480
Validation dataset size: 120
Sample input_ids shape: torch.Size([256])
Sample attention_mask shape: torch.Size([256])
Sample text: User: How does your relationship with your mother affect your autonomy?
Assistant: In this case, the respondent scored 30, 27, and 32 for mother-related items, and 27 for autonomy. This suggests that perceived maternal support or control may be


In [5]:
# DataLoaders for training and validation
train_dataloader = DataLoader(
    train_dataset_custom,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
    pin_memory=True if device == "cuda" else False
)

val_dataloader = DataLoader(
    val_dataset_custom,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
    pin_memory=True if device == "cuda" else False
)

print(f"Train batches: {len(train_dataloader)}")
print(f"Validation batches: {len(val_dataloader)}")

# Test batch loading
sample_batch = next(iter(train_dataloader))
print(f"Batch input_ids shape: {sample_batch['input_ids'].shape}")
print(f"Batch attention_mask shape: {sample_batch['attention_mask'].shape}")

Train batches: 60
Validation batches: 15
Batch input_ids shape: torch.Size([8, 256])
Batch attention_mask shape: torch.Size([8, 256])


In [6]:
def estimate_loss_and_accuracy(model, eval_dataloader, device):
    # estimate the loss and accuracy of a model on an evaluation dataset.

    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_tokens = 0
    num_batches = 0

    with torch.no_grad():
        for batch in eval_dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = input_ids.clone()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            total_loss += loss.item()

            # Calculate accuracy
            predictions = torch.argmax(logits, dim=-1)

            mask = (labels != tokenizer.pad_token_id) & (attention_mask == 1)
            correct = ((predictions == labels) & mask).sum().item()
            total_correct += correct
            total_tokens += mask.sum().item()

            num_batches += 1

    model.train()

    # Calculate average metrics
    avg_loss = total_loss / num_batches if num_batches > 0 else float("inf")
    accuracy = total_correct / total_tokens if total_tokens > 0 else 0.0

    return avg_loss, accuracy

# evaluation function
print("Testing evaluation function...")
model_test = GPT2LMHeadModel.from_pretrained("gpt2")
model_test.to(device)
test_loss, test_acc = estimate_loss_and_accuracy(model_test, val_dataloader, device)
print(f"Initial test loss: {test_loss:.4f}, accuracy: {test_acc:.4f}")
# del model_test

Testing evaluation function...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Initial test loss: 10.2425, accuracy: 0.0197


In [7]:
# tokenization function for the datasets
def tokenize_function(examples):
    # dataset tokenizaiton for training
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt"
    )

print("Tokenizing datasets...")
tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_val = val_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

print(f"Tokenized train dataset: {len(tokenized_train)} samples")
print(f"Tokenized validation dataset: {len(tokenized_val)} samples")

# data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False, # will try the masked modelling later
    pad_to_multiple_of=8,  # optimize gopu memory
    return_tensors="pt",
)

print("Data collator configured for causal language modeling")

# adding labels to the tokenized datasets
def add_labels(examples):
    examples["labels"] = examples["input_ids"].copy()
    return examples

print("Adding labels to datasets...")
tokenized_train = tokenized_train.map(add_labels, batched=True)
tokenized_val = tokenized_val.map(add_labels, batched=True)

print(f"Enhanced tokenized train dataset: {len(tokenized_train)} samples")
print(f"Enhanced tokenized validation dataset: {len(tokenized_val)} samples")

Tokenizing datasets...


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Tokenized train dataset: 480 samples
Tokenized validation dataset: 120 samples
Data collator configured for causal language modeling
Adding labels to datasets...


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Enhanced tokenized train dataset: 480 samples
Enhanced tokenized validation dataset: 120 samples


In [8]:
# loading GPT-2 model
print("Loading GPT-2 model...")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.to(device)

# resize token embeddings
model.resize_token_embeddings(len(tokenizer))

# training arguments for adolescent chatbot dataset
training_args = TrainingArguments(
    output_dir="./gpt2_adolescent_chatbot_train_output",
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=save_steps,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    num_train_epochs=num_train_epochs,
    max_steps=max_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    warmup_steps=warmup_steps,
    logging_dir="./logs",
    logging_steps=logging_steps,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=True if device == "cuda" else False,
    optim="adamw_torch",
    report_to="none",
    remove_unused_columns=False,
    dataloader_pin_memory=True if device == "cuda" else False,
    label_smoothing_factor=0.1,
    include_inputs_for_metrics=True,
    prediction_loss_only=False,
)

print("Training arguments configured:")
print(f"  Output directory: {training_args.output_dir}")
print(f"  Batch size: {training_args.per_device_train_batch_size}")
print(f"  Learning rate: {training_args.learning_rate}")
print(f"  Max steps: {training_args.max_steps}")
print(f"  Warmup steps: {training_args.warmup_steps}")
print(f"  Weight decay: {training_args.weight_decay}")
print(f"  FP16: {training_args.fp16}")

Loading GPT-2 model...


Using `include_inputs_for_metrics` is deprecated and will be removed in version 5 of 🤗 Transformers. Please use `include_for_metrics` list argument instead.


Training arguments configured:
  Output directory: ./gpt2_adolescent_chatbot_train_output
  Batch size: 8
  Learning rate: 0.0001
  Max steps: 1200
  Warmup steps: 120
  Weight decay: 0.1
  FP16: True


In [9]:
# optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
    betas=(0.9, 0.999),
    eps=1e-8
)

lr_scheduler = get_scheduler(
    name="cosine",
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=max_steps
)

print(f"Optimizer configured with learning rate: {learning_rate}")
print(f"Scheduler configured with warmup steps: {warmup_steps}")

# Load accuracy metric
accuracy_metric = evaluate.load("accuracy")

# convert logits to predicted token IDs
def preprocess_logits_for_metrics(logits, labels):
    """Preprocess logits for metric computation"""
    return torch.argmax(logits, dim=-1)

# accuracy and other metrics
def compute_metrics(eval_preds):

    if hasattr(eval_preds, 'predictions') and hasattr(eval_preds, 'label_ids'):
        predictions = eval_preds.predictions
        labels = eval_preds.label_ids
    elif isinstance(eval_preds, tuple):
        if len(eval_preds) == 2:
            predictions, labels = eval_preds
        elif len(eval_preds) == 3:
            predictions, labels, _ = eval_preds
        else:
            print(f"Unexpected eval_preds format with {len(eval_preds)} elements")
            predictions = eval_preds[0]
            labels = eval_preds[1]
    else:
        print(f"eval_preds is not a tuple: {type(eval_preds)}")
        return {"accuracy": 0.0, "num_tokens": 0}

    #----------------------------------------------------------------------
    print(f"Predictions shape: {predictions.shape}, Labels shape: {labels.shape}")
    #----------------------------------------------------------------------

    if predictions.shape != labels.shape:
        min_length = min(predictions.shape[1], labels.shape[1])
        predictions = predictions[:, :min_length]
        labels = labels[:, :min_length]

    # for easier processing
    predictions_flat = predictions.reshape(-1)
    labels_flat = labels.reshape(-1)

    # creating mask for non-padded tokens
    mask = labels_flat != -100

    # apply mask
    predictions_masked = predictions_flat[mask]
    labels_masked = labels_flat[mask]

    #----------------------------------------------------------------------
    print(f"Predictions shape: {predictions.shape}, Labels shape: {labels.shape}")
    #----------------------------------------------------------------------

    # accuracy calculation
    if len(predictions_masked) == 0:
        print("No valid token found.")
        return {"accuracy": 0.0, "num_tokens": 0}

    # token-level accuracy
    correct = (predictions_masked == labels_masked).sum().item() if hasattr(predictions_masked, 'sum') else sum(predictions_masked == labels_masked)
    total = len(predictions_masked)
    accuracy = correct / total

    return {
        "accuracy": accuracy,
        "num_tokens": total
    }

print("Metrics computation functions configured")

Optimizer configured with learning rate: 0.0001
Scheduler configured with warmup steps: 120
Metrics computation functions configured


In [10]:
# Initialize trainer
print("Initializing trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    optimizers=(optimizer, lr_scheduler),
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics,
)

print("Trainer initialized successfully!")

# model info
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

# start training
print("\n" + "="*50)
print("STARTING TRAINING")
print("="*50)

try:
    # train model
    trainer.train()

    # save the final model
    trainer.save_model("./gpt2_adolescent_chatbot_final")
    tokenizer.save_pretrained("./gpt2_adolescent_chatbot_final")

    print("\n" + "="*50)
    print("TRAINING COMPLETED SUCCESSFULLY!")
    print("="*50)

    print("\nRunning final evaluation...")
    final_eval_results = trainer.evaluate()
    print(f"Final evaluation results: {final_eval_results}")

except Exception as e:
    print(f"Training failed with error: {e}")
    import traceback
    traceback.print_exc()

Initializing trainer...
Trainer initialized successfully!
Total parameters: 124,439,808
Trainable parameters: 124,439,808

STARTING TRAINING


  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,Num Tokens
100,1.7448,1.628745,0.0,6080
200,1.6075,1.584407,0.0,6080
300,1.5812,1.57679,0.0,6080
400,1.5708,1.567137,0.0,6080
500,1.559,1.565491,0.0,6080
600,1.556,1.556952,0.0,6080
700,1.5502,1.560713,0.0,6080
800,1.5492,1.560583,0.0,6080
900,1.546,1.561579,0.0,6080
1000,1.5414,1.562045,0.0,6080


Predictions shape: (120, 256), Labels shape: (120, 256)
Predictions shape: (120, 256), Labels shape: (120, 256)
Predictions shape: (120, 256), Labels shape: (120, 256)
Predictions shape: (120, 256), Labels shape: (120, 256)
Predictions shape: (120, 256), Labels shape: (120, 256)
Predictions shape: (120, 256), Labels shape: (120, 256)
Predictions shape: (120, 256), Labels shape: (120, 256)
Predictions shape: (120, 256), Labels shape: (120, 256)
Predictions shape: (120, 256), Labels shape: (120, 256)
Predictions shape: (120, 256), Labels shape: (120, 256)
Predictions shape: (120, 256), Labels shape: (120, 256)
Predictions shape: (120, 256), Labels shape: (120, 256)
Predictions shape: (120, 256), Labels shape: (120, 256)
Predictions shape: (120, 256), Labels shape: (120, 256)
Predictions shape: (120, 256), Labels shape: (120, 256)
Predictions shape: (120, 256), Labels shape: (120, 256)
Predictions shape: (120, 256), Labels shape: (120, 256)
Predictions shape: (120, 256), Labels shape: (12

KeyboardInterrupt: 

In [None]:
# evaluation and text generation
print("Post-training evaluation and testing...")

# perplexity calculation from evaluation loss
try:
    eval_results = trainer.evaluate()
    eval_loss = eval_results["eval_loss"]
    perplexity = math.exp(eval_loss)
    print(f"Final Evaluation Loss: {eval_loss:.4f}")
    print(f"Perplexity: {perplexity:.2f}")
    print(f"Accuracy: {eval_results.get('eval_accuracy', 'N/A')}")
except Exception as e:
    print(f"Evaluation failed: {e}")

# test text generation with adolescent psychology prompts
test_prompts = [
    "User: How does peer pressure affect teenagers?\nAssistant:",
    "User: What are the signs of depression in adolescents?\nAssistant:",
    "User: How can teenagers manage stress during exams?\nAssistant:",
    "User: Why do teenagers take risks?\nAssistant:"
]

print("\n" + "="*50)
print("TESTING TEXT GENERATION")
print("="*50)

# Set model to evaluation mode
model.eval()

for i, prompt in enumerate(test_prompts, 1):
    print(f"\nTest {i}:")
    print(f"Prompt: {prompt}")

    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # response
    with torch.no_grad():
        output_ids = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=inputs["input_ids"].shape[1] + 100,
            num_return_sequences=1,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2,
            early_stopping=True
        )

    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    response = generated_text[len(prompt):].strip()
    print(f"Generated Response: {response}")
    print("-" * 50)

In [None]:
# Interactive chatbot testing
print("Loading trained model for interactive testing...")

# Load the fine-tuned model
try:
    model_path = "./gpt2_adolescent_chatbot_final"
    model = GPT2LMHeadModel.from_pretrained(model_path)
    tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    model.to(device)
    model.eval()
    print("Model loaded successfully!")
except:
    print("Using current model from training...")

def generate_adolescent_response(user_message, max_length=200, temperature=0.7):
    """Generate response for adolescent psychology questions"""
    
    # Format the input like the training data
    prompt = f"User: {user_message}\nAssistant:"
    
    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Generate response
    with torch.no_grad():
        output_ids = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=inputs["input_ids"].shape[1] + max_length,
            num_return_sequences=1,
            do_sample=True,
            temperature=temperature,
            top_k=50,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2,
            early_stopping=True,
            eos_token_id=tokenizer.eos_token_id
        )
    
    # Decode and extract response
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    response = generated_text[len(prompt):].strip()
    
    # Clean up response (remove extra newlines, etc.)
    response = response.split('\n')[0].strip()
    
    return response

# Interactive testing loop
print("\n" + "="*50)
print("INTERACTIVE ADOLESCENT PSYCHOLOGY CHATBOT")
print("="*50)
print("Ask questions about adolescent psychology. Type 'exit' to quit.")

try:
    while True:
        user_input = input("\nYou: ").strip()
        
        if user_input.lower() in ['exit', 'quit', 'stop']:
            print("Chatbot session ended.")
            break
        
        if not user_input:
            continue
            
        # Generate response
        response = generate_adolescent_response(user_input)
        print(f"Assistant: {response}")
        
except KeyboardInterrupt:
    print("\nChatbot session interrupted.")
except Exception as e:
    print(f"Error in chatbot: {e}")
    
print("Interactive testing completed.")