In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer
import torch
import datasets
from datasets import Dataset
import accelerate
import pandas as pd

df = pd.read_csv("new_updated_data.csv")

In [5]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import pandas as pd

# Load dataset
df = pd.read_csv("new_updated_data.csv")

# Ensure descriptions are strings and handle missing values
df["description"] = df["description"].fillna("").astype(str)

# Ensure tropes are lists of labels
df["tags"] = df["tags"].apply(lambda x: x.split(", ") if isinstance(x, str) else [])

# Split data into training and testing
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["description"].tolist(), df["tags"].tolist(), test_size=0.2, random_state=42
)

# Check for any unexpected non-string values
print(f"Sample training text: {train_texts[:5]}")  # Should print valid book descriptions

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("KamilAin/bart-base-booksum")

# Tokenize descriptions only
train_encodings = tokenizer(
    train_texts, truncation=True, padding=True, max_length=512
)
test_encodings = tokenizer(
    test_texts, truncation=True, padding=True, max_length=512
)

# Print sample output
print(train_encodings.keys())  # Should print: dict_keys(['input_ids', 'attention_mask'])


Sample training text: ['At a time when computers were a short step removed from mechanical data processors, Licklider was writing treatises on "human-computer symbiosis," "computers as communication devices," and a now not-so-unfamiliar "Intergalactic Network." His ideas became so influential, his passion so contagious, that Waldrop coined him "computing\'s Johnny Appleseed." In a simultaneously compelling personal narrative and comprehensive historical exposition, Waldrop tells the story of the man who not only instigated the work that led to the internet, but also shifted our understanding of what computers were and could be.', "Winner of the Pulitzer Prize, this book applies Godel's seminal contribution to modern mathematics to the study of the human mind and the development of artificial intelligence.", '', "The Gene: An Intimate History is a book written by Siddhartha Mukherjee, an Indian-born American physician and oncologist. It was published on 17 May 2016 by Scribner. The book

In [6]:
from sklearn.preprocessing import MultiLabelBinarizer

# Convert tags (tropes) into multi-hot encoding
mlb = MultiLabelBinarizer()
mlb.fit(df["tags"])  # Learn all unique tags
train_labels = mlb.transform(train_labels)  # Convert to binary labels
test_labels = mlb.transform(test_labels)

# Print sample labels
print("Unique Tags:", mlb.classes_)  # Check all unique tropes
print("Sample Encoded Labels:", train_labels[:5])  # One-hot encoded output

Unique Tags: [' American' ' Ancient' ' Australian' ' China' ' Chinese' ' Classified'
 ' Dutch' ' English' ' Folk Tales' ' Greek' ' Human' ' Korean'
 ' Latin American' ' Legends & Mythology' ' Mind & Spirit' ' Mythical'
 ' Personal' ' Subterranean' ' Swiss (German)' ' Viking' ' artistic'
 'Abandoned children' 'Abduction' 'Ability' 'Abnormalities'
 'Absentee fathers' 'Abuse' 'Abuse of administrative power' 'Abused'
 'Abused teenagers' 'Abused wives' 'Abused women' 'Abusive men'
 'Academic writing' 'Accident victims' 'Accidents' 'Action'
 'Action and adventure' 'Activism' 'Actresses' 'Addicts' 'Adolescence'
 'Adolescent psychology' 'Adoptees' 'Adoption' 'Adult Fiction' 'Adultery'
 'Adulthood' 'Adventure' 'Adventure and adventurers' 'Adventure stories'
 'Advice' 'African American authors' 'African American families'
 'African American gay men' 'African American girls'
 'African American teenage girls' 'African Americans'
 'African Americans in mass media' 'Africans' 'Aging' 'Agriculture'
 

In [7]:
import torch
from datasets import Dataset, DatasetDict

# Convert tokenized data into dataset format
train_dataset = Dataset.from_dict({
    "input_ids": train_encodings["input_ids"],
    "attention_mask": train_encodings["attention_mask"],
    "labels": train_labels.tolist(),
})

test_dataset = Dataset.from_dict({
    "input_ids": test_encodings["input_ids"],
    "attention_mask": test_encodings["attention_mask"],
    "labels": test_labels.tolist(),
})

# Convert to Hugging Face DatasetDict format
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

In [3]:
import optuna
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from transformers import AutoModelForSeq2SeqLM
from datasets import DatasetDict

# Define objective function for Optuna
def objective(trial):
    # Sample hyperparameters
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-4)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    weight_decay = trial.suggest_loguniform("weight_decay", 1e-5, 0.1)
    
    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=10,  # Higher number since early stopping is enabled
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        logging_dir="./logs",
        report_to="none",  # Prevents logging to external services
    )
    
    # Load model
    model = AutoModelForSeq2SeqLM.from_pretrained("KamilAin/bart-base-booksum")
    
    # Define Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],  # Stops if no improvement for 2 epochs
    )
    
    # Train model
    trainer.train()
    
    # Evaluate model on validation set
    eval_results = trainer.evaluate()
    
    return eval_results["eval_loss"]

# Run Optuna optimization
study = optuna.create_study(direction="minimize")  # Minimize validation loss
study.optimize(objective, n_trials=10)  # Run 10 trials (increase if needed)

# Get best hyperparameters
best_params = study.best_params
print(f"Best Hyperparameters: {best_params}")


[I 2025-02-24 12:27:46,603] A new study created in memory with name: no-name-8b8eefdc-d085-4f53-bc6b-7cbe0db24ee4
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-4)
  weight_decay = trial.suggest_loguniform("weight_decay", 1e-5, 0.1)
[W 2025-02-24 12:27:47,480] Trial 0 failed with parameters: {'learning_rate': 1.9225378797319263e-05, 'batch_size': 16, 'weight_decay': 0.004201119575329948} because of the following error: NameError("name 'dataset' is not defined").
Traceback (most recent call last):
  File "c:\Users\maheit\dev\book\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\maheit\AppData\Local\Temp\ipykernel_2492\1754616738.py", line 39, in objective
    train_dataset=dataset["train"],
NameError: name 'dataset' is not defined. Did you mean: 'Dataset'?
[W 2025-02-24 12:27:47,481] Trial 0 failed with value None.


NameError: name 'dataset' is not defined