In [2]:
import json
from datasets import Dataset
from transformers import AutoTokenizer

# Load your dataset
with open('hierarchical_books.json', 'r', encoding='utf-8') as f:
    books_data = json.load(f)

def prepare_dataset(books_data):
    """Convert your book data into the required format"""
    processed_data = []
    
    for book in books_data:
        # Create input text from available metadata
        input_components = []
        
        # Add title and author
        if book.get('title') and book.get('author'):
            input_components.append(f"Title: {book['title']} by {book['author']}")
        
        # Add description (main content for classification)
        if book.get('description'):
            input_components.append(f"Description: {book['description']}")
        
        # Add additional metadata if available
        if book.get('pages'):
            input_components.append(f"Pages: {book['pages']}")
        
        if book.get('releaseDate'):
            input_components.append(f"Release Date: {book['releaseDate']}")
        
        input_text = " ".join(input_components)
        
        # Create target text from tags and genre_category
        target_components = []
        
        # Add genre_category if available
        if book.get('genre_category'):
            target_components.append(book['genre_category'])
        
        # Add tags (split by comma and clean)
        if book.get('tags'):
            tags = [tag.strip() for tag in book['tags'].split(',') if tag.strip()]
            target_components.extend(tags)
        
        # Remove duplicates while preserving order
        seen = set()
        unique_targets = []
        for item in target_components:
            if item not in seen:
                seen.add(item)
                unique_targets.append(item)
        
        target_text = ", ".join(unique_targets)
        
        processed_data.append({
            "input_text": input_text,
            "target_text": target_text,
            "book_id": book.get('id'),
            "title": book.get('title')
        })
    
    return processed_data

# Process your data
processed_books = prepare_dataset(books_data)
print(f"Processed {len(processed_books)} books")


Processed 3422 books


In [3]:
tokenizer = AutoTokenizer.from_pretrained("KamilAin/bart-base-booksum")

def preprocess_function(examples):
    # Tokenize inputs (book metadata + description)
    model_inputs = tokenizer(
        examples["input_text"], 
        max_length=1024, 
        truncation=True, 
        padding="max_length"
    )
    
    # Tokenize targets (genres/tags)
    labels = tokenizer(
        examples["target_text"], 
        max_length=50,  # Increased for multiple tags
        truncation=True, 
        padding="max_length"
    )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Convert to Dataset and tokenize
dataset = Dataset.from_list(processed_books)
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Train/validation split
train_test = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test["train"]
eval_dataset = train_test["test"]

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(eval_dataset)}")


Map: 100%|██████████| 3422/3422 [00:00<00:00, 4050.35 examples/s]

Training samples: 2737
Validation samples: 685





In [9]:
from transformers import (
    Seq2SeqTrainingArguments, 
    Seq2SeqTrainer, 
    AutoModelForSeq2SeqLM,
    AutoTokenizer
)
import torch
import numpy as np

# Load model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("KamilAin/bart-base-booksum")
tokenizer = AutoTokenizer.from_pretrained("KamilAin/bart-base-booksum")

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./book-genre-classifier",
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=100,
    save_steps=1000,
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=3,
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    warmup_steps=100,
    fp16=torch.cuda.is_available(),
    generation_max_length=50,
    generation_num_beams=4,
)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    
    # Handle nested predictions
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    
    # Replace -100 with pad token id
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    
    # Decode
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Clean
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]
    
    # Metrics
    exact_matches = sum(pred == label for pred, label in zip(decoded_preds, decoded_labels))
    exact_match_accuracy = exact_matches / len(decoded_preds)
    
    return {"exact_match_accuracy": exact_match_accuracy}

# Initialize trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Start training
print("Starting training...")
trainer.train()

# Evaluate
print("Evaluating model...")
eval_results = trainer.evaluate()
print(f"Final evaluation results: {eval_results}")

# Save model
trainer.save_model("./finetuned-bart-book-classifier")
tokenizer.save_pretrained("./finetuned-bart-book-classifier")
print("Model saved successfully!")


  trainer = Seq2SeqTrainer(


Starting training...


Step,Training Loss,Validation Loss,Exact Match Accuracy
500,0.2618,0.22588,0.013139
1000,0.1951,0.202037,0.014599
1500,0.1758,0.185301,0.018978
2000,0.1684,0.17946,0.026277
2500,0.1417,0.179568,0.024818
3000,0.1217,0.185594,0.026277


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Evaluating model...


Final evaluation results: {'eval_loss': 0.1794600784778595, 'eval_exact_match_accuracy': 0.026277372262773723, 'eval_runtime': 530.8065, 'eval_samples_per_second': 1.29, 'eval_steps_per_second': 0.324, 'epoch': 5.0}
Model saved successfully!


In [10]:
def predict_book_genres(book_description, title=None, author=None, model_path="./finetuned-bart-book-classifier"):
    """Predict genres for a new book"""
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
    import torch
    
    # Load the fine-tuned model
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    
    # Prepare input text
    input_components = []
    if title and author:
        input_components.append(f"Title: {title} by {author}")
    input_components.append(f"Description: {book_description}")
    
    input_text = " ".join(input_components)
    
    # Tokenize
    inputs = tokenizer(
        input_text, 
        return_tensors="pt", 
        truncation=True, 
        max_length=1024
    )
    
    # Generate
    with torch.no_grad():
        summary_ids = model.generate(
            **inputs, 
            max_length=50, 
            num_beams=4,
            early_stopping=True,
            no_repeat_ngram_size=2
        )
    
    predicted_genres = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return [genre.strip() for genre in predicted_genres.split(",") if genre.strip()]

# Example usage after training
# test_description = "A young orphan discovers a magical world beyond her wildest dreams..."
# predicted_genres = predict_book_genres(test_description, title="Example Book", author="Example Author")
# print(f"Predicted genres: {predicted_genres}")


new trial more epochs changed variation to test since accuracy is increasing 

In [None]:
from transformers import (
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    EarlyStoppingCallback,
)
import evaluate
import torch
import numpy as np

# Load model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("KamilAin/bart-base-booksum")
tokenizer = AutoTokenizer.from_pretrained("KamilAin/bart-base-booksum")

# Load ROUGE metric for soft evaluation

# Load ROUGE metric from evaluate
rouge = evaluate.load("rouge")

# Training arguments (improved)
training_args = Seq2SeqTrainingArguments(
    output_dir="./book-genre-classifier_new",
    evaluation_strategy="steps",
    eval_steps=250,
    logging_steps=100,
    save_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=3,
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    warmup_steps=300,
    fp16=torch.cuda.is_available(),
    generation_max_length=75,
    generation_num_beams=4,
)

# Metric computation
def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    if isinstance(predictions, tuple):
        predictions = predictions[0]

    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    exact_matches = sum(pred == label for pred, label in zip(decoded_preds, decoded_labels))
    exact_match_accuracy = exact_matches / len(decoded_preds)

    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    rouge_l = rouge_result["rougeL"].mid.fmeasure

    return {
        "exact_match_accuracy": exact_match_accuracy,
        "rougeL": rouge_l,
    }

# Initialize trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# Start training
print("Starting training...")
trainer.train()

# Evaluate
print("Evaluating model...")
eval_results = trainer.evaluate()
print(f"Final evaluation results: {eval_results}")

# Save model
trainer.save_model("./finetuned-bart-book-classi" \
"                                                       fier_new")
tokenizer.save_pretrained("./finetuned-bart-book-classifier_new")
print("Model saved successfully!")


  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'train_dataset' is not defined