In [6]:
import json
from datasets import Dataset
from transformers import AutoTokenizer

# Read the raw book records from the JSON export sitting next to the notebook
with open('hierarchical_books.json', mode='r', encoding='utf-8') as f:
    raw_json = f.read()
    books_data = json.loads(raw_json)
del raw_json  # drop the temporary text so only `books_data` (and `f`) remain, as before

def _build_input_text(book):
    """Join the book's available metadata fields into one model input string."""
    parts = []

    title = book.get('title')
    author = book.get('author')
    if title and author:
        parts.append(f"Title: {title} by {author}")
    elif title:
        # Keep the title even when the author is unknown; dropping it would
        # throw away one of the few fields available for such a record.
        parts.append(f"Title: {title}")

    # Description is the main content for classification
    if book.get('description'):
        parts.append(f"Description: {book['description']}")

    # Optional extra metadata
    if book.get('pages'):
        parts.append(f"Pages: {book['pages']}")
    if book.get('releaseDate'):
        parts.append(f"Release Date: {book['releaseDate']}")

    return " ".join(parts)


def _build_target_text(book):
    """Build the comma-separated label string from genre_category and tags."""
    labels = []

    if book.get('genre_category'):
        labels.append(book['genre_category'])

    raw_tags = book.get('tags')
    if raw_tags:
        # Tags may be a single comma-separated string or already a list/tuple
        # of strings; normalise both shapes to an iterable of candidates.
        if isinstance(raw_tags, str):
            raw_tags = raw_tags.split(',')
        labels.extend(
            tag.strip()
            for tag in raw_tags
            if isinstance(tag, str) and tag.strip()
        )

    # dict.fromkeys drops duplicates while preserving first-seen order
    return ", ".join(dict.fromkeys(labels))


def prepare_dataset(books_data):
    """Convert raw book records into input/target text pairs.

    Args:
        books_data: Iterable of dicts. The keys read (all optional) are
            'title', 'author', 'description', 'pages', 'releaseDate',
            'genre_category', 'tags' and 'id'. 'tags' may be a
            comma-separated string or a list/tuple of strings.

    Returns:
        A list with one dict per input book, in the same order, holding:
        - "input_text": the present metadata fields joined by single spaces
        - "target_text": genre_category followed by the cleaned tags,
          de-duplicated in first-seen order and joined by ", "
        - "book_id": the book's 'id' (None when absent)
        - "title": the book's 'title' (None when absent)
        Books lacking metadata or labels still produce a record, with an empty
        string in the corresponding text field.
    """
    return [
        {
            "input_text": _build_input_text(book),
            "target_text": _build_target_text(book),
            "book_id": book.get('id'),
            "title": book.get('title'),
        }
        for book in books_data
    ]

# Convert every loaded book into an input/target record and report how many were produced
processed_books = prepare_dataset(books_data)
print(f"Processed {len(processed_books)} books")


Processed 3359 books


In [7]:
# Load the tokenizer for the same checkpoint name that the model cell loads
tokenizer = AutoTokenizer.from_pretrained("KamilAin/bart-base-booksum")

def preprocess_function(examples):
    """Tokenize input/target text and attach loss-ready labels.

    Uses the module-level ``tokenizer``. Works both on a batch (as called by
    ``Dataset.map(..., batched=True)``) and on a single example.

    Args:
        examples: Mapping with "input_text" and "target_text"; each value is
            either one string or a list of strings.

    Returns:
        The tokenizer output for the inputs (padded/truncated to 1024 tokens)
        with an added "labels" entry: the target token ids padded/truncated to
        50 tokens, with every padding position replaced by -100.
    """
    # Tokenize inputs (book metadata + description)
    model_inputs = tokenizer(
        examples["input_text"],
        max_length=1024,
        truncation=True,
        padding="max_length"
    )

    # Tokenize targets (genres/tags)
    labels = tokenizer(
        examples["target_text"],
        max_length=50,  # Increased for multiple tags
        truncation=True,
        padding="max_length"
    )

    pad_id = tokenizer.pad_token_id

    def mask_padding(token_ids):
        # -100 is the index the seq2seq cross-entropy loss ignores; leaving the
        # pad id in place would train the model to predict padding.
        return [-100 if token_id == pad_id else token_id for token_id in token_ids]

    target_ids = labels["input_ids"]
    if isinstance(examples["target_text"], str):
        # Single example: input_ids is one flat list of token ids
        model_inputs["labels"] = mask_padding(target_ids)
    else:
        # Batched call: input_ids is a list of per-example id lists
        model_inputs["labels"] = [mask_padding(row) for row in target_ids]
    return model_inputs

# Wrap the processed records in a Dataset, then tokenize them batch by batch
dataset = Dataset.from_list(processed_books)
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Hold out 20% of the books for validation; the fixed seed keeps the split repeatable
train_test = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, eval_dataset = train_test["train"], train_test["test"]

# Report the size of each split
print(
    f"Training samples: {len(train_dataset)}",
    f"Validation samples: {len(eval_dataset)}",
    sep="\n",
)


Map: 100%|██████████| 3359/3359 [00:01<00:00, 1766.36 examples/s]

Training samples: 2687
Validation samples: 672





In [None]:
from transformers import TrainingArguments, Seq2SeqTrainingArguments, Trainer, AutoModelForSeq2SeqLM
import torch

# Load the seq2seq checkpoint that will be fine-tuned
model = AutoModelForSeq2SeqLM.from_pretrained("KamilAin/bart-base-booksum")

# Optimized training arguments for your dataset size.
# Seq2SeqTrainingArguments is used because `predict_with_generate` is not a
# parameter of the base TrainingArguments; passing it there raises a TypeError.
# NOTE(review): generation during evaluation only takes effect when these
# arguments are handed to Seq2SeqTrainer; the trainer is not built in this
# cell, so confirm which trainer class is used.
training_args = Seq2SeqTrainingArguments(
    output_dir="./book-genre-classifier",
    # NOTE(review): newer transformers releases rename this to `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=100,
    save_steps=1000,  # a multiple of eval_steps, as load_best_model_at_end requires
    learning_rate=3e-5,  # Slightly higher for smaller dataset
    per_device_train_batch_size=4,  # Adjust based on GPU memory
    per_device_eval_batch_size=4,
    num_train_epochs=5,  # More epochs for smaller dataset
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=3,
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    warmup_steps=100,
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
)
