In [33]:
import pandas as pd
import re
import os
from utilities import read_xml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from utilities import clean_description
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DistilBertTokenizerFast, DistilBertForSequenceClassification
from datasets import Dataset, DatasetDict
import torch

In [34]:
# Import
PATH_DEV = os.path.join(os.getcwd(), 'origdata', 'BlurbGenreCollection_EN_dev.txt')
PATH_TEST = os.path.join(os.getcwd(), 'origdata', 'BlurbGenreCollection_EN_test.txt')
PATH_TRAIN = os.path.join(os.getcwd(), 'origdata', 'BlurbGenreCollection_EN_train.txt')

df_train = read_xml(PATH_TRAIN)
df_test = read_xml(PATH_TEST)
df_dev = read_xml(PATH_DEV)

frames = [df_train, df_test, df_dev]
df = pd.concat(frames).reset_index(drop=True)

In [35]:
# 166 topics reduced to 7
reduced_topics = [
    "Fiction", "Children’s Books", "Nonfiction",
    "Poetry", "Humor", "Classics", "Young Adult"
]

def assign_primary_topic(topic_string):
    # Split topic-string into a list of individual topics
    topics = [t.strip() for t in re.split(r',\s*', topic_string)]
    for reduced_topic in reduced_topics:
        for t in topics:
            if reduced_topic.lower() in t.lower():
                return reduced_topic
    return "Other"

df['TOPIC_MAIN'] = df['TOPICS'].apply(assign_primary_topic)
df = df[df['TOPIC_MAIN'] != "Other"]

In [36]:
# Clean Descriptions
df['DESCRIPTION'] = df['DESCRIPTION'].fillna('').apply(clean_description)
df.drop(77427, inplace=True)

In [37]:
df['word_count'] = df['DESCRIPTION'].str.split().str.len()

In [38]:
df[df['word_count'] ==0] # no zero word descriptions found

Unnamed: 0,TITLE,AUTHOR,PUBLISHED,ISBN,PAGE_NUM,URL,TOPICS,COPYRIGHT,DESCRIPTION,DATE,LANGUAGE,TOPIC_MAIN,word_count


In [39]:
train_df, test_df = train_test_split(df[['DESCRIPTION', 'TOPIC_MAIN']], test_size=0.2, random_state=42)

In [40]:
print(train_df.shape) # (73515, 3)
print(test_df.shape) # (18379, 3)

(73514, 2)
(18379, 2)


In [41]:
train_df

Unnamed: 0,DESCRIPTION,TOPIC_MAIN
63724,"This daring, star-packed collection is the fan...",Fiction
14608,"With her soft heart and angelic face, Madeline...",Fiction
72346,NEW YORK TIMES BESTSELLER * A deeply affecting...,Fiction
12024,"ONE CHILL EVENING IN BETHLEHEM, young Naomi he...",Children’s Books
16203,All original stories about the return of Cthul...,Fiction
...,...,...
6265,"Following a wild and raging storm, the Swiss f...",Children’s Books
54886,A family-focused guidebook to Italy for travel...,Fiction
76820,A single volume of the most beautiful texts by...,Poetry
860,The Truth Chasers Book Three. Someone's trying...,Fiction


In [42]:
test_df

Unnamed: 0,DESCRIPTION,TOPIC_MAIN
62081,This lexicon of modern Western philosophical c...,Fiction
18176,"Emily-and her new band, the Strangers-won the ...",Children’s Books
19140,A perfect graudation gift all about growing up...,Children’s Books
16534,Even a bookish big sister is drawn in by the p...,Children’s Books
21974,Where Can You Find the Kind of Love You Truly ...,Fiction
...,...,...
458,"In a small, dusty town in India, Sripathi Rao ...",Fiction
25026,The Limits to Capital provides one of the best...,Fiction
73035,How climate change will affect our political t...,Fiction
43431,When a woman's body washes up on the shore of ...,Fiction


In [43]:
print(df['DESCRIPTION'].isnull().sum())

0


In [44]:
# Convert categorical labels (topics) to numerical values
label_encoder = LabelEncoder()
train_df['label'] = label_encoder.fit_transform(train_df['TOPIC_MAIN'])
test_df['label'] = label_encoder.transform(test_df['TOPIC_MAIN'])

dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df[['DESCRIPTION', 'label']]),
    'test': Dataset.from_pandas(test_df[['DESCRIPTION', 'label']])
})

# Load  pre-trained DistilBERT tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    texts = [str(text) if text is not None else "" for text in batch['DESCRIPTION']]
    # tokenize text with padding and truncate for consistent input length
    return tokenizer(texts, padding="max_length", truncation=True)

# Apply function to entire dataset
dataset = dataset.map(tokenize, batched=True)

# Remove 'DESCRIPTION' column, as it's not needed after tokenization
dataset = dataset.remove_columns(['DESCRIPTION'])

# Set the format of the dataset to PyTorch tensors
dataset.set_format("torch")

                                                                   

In [45]:
# Setup device for GPU usage
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")
else:
    print("No GPU available. Training will run on CPU.")

GPU: NVIDIA GeForce RTX 4070 Ti is available.


In [46]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [48]:
# Load the pre-trained DistilBERT model for sequence classification
# + specify num_labels (number of unique topics)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(label_encoder.classes_))

training_args = TrainingArguments(
    output_dir="./results", # directory where model/checkpoints are saved
    evaluation_strategy="epoch", # evaluate at the end of epoch
    save_strategy="epoch", # save model each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16, # consumes about 7GB of VRAM
    per_device_eval_batch_size=16,
    num_train_epochs=2, # two training epochs
    weight_decay=0.01,# weight decay to prevent overfitting
    logging_dir="./logs",
    logging_steps=10, # log each ten steps
    no_cuda=False, # force gpu usage
    fp16=True, # mixed precision to speed up training
)

class CustomTrainer(Trainer):
    # Override create_optimizer method to use AdamW optimizer with fused kernel for efficiency
    def create_optimizer(self):
        if self.optimizer is None:
            # Create the AdamW optimizer and pass in model parameters and the learning rate
            self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=training_args.learning_rate, fused=True)
        return self.optimizer

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"]
)

# Start training the model
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,0.1006,0.137772
2,0.0407,0.147777


TrainOutput(global_step=9190, training_loss=0.1401493052330772, metrics={'train_runtime': 1237.0915, 'train_samples_per_second': 118.85, 'train_steps_per_second': 7.429, 'total_flos': 1.947745869232128e+16, 'train_loss': 0.1401493052330772, 'epoch': 2.0})

In [65]:
# Load saved model from results directory
saved_model = DistilBertForSequenceClassification.from_pretrained("./results/checkpoint-9190", num_labels=len(label_encoder.classes_))

# Set device to GPU if available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to chosen device (GPU or CPU)
saved_model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [77]:
def predict_topic(description):
    # Tokenize the description
    inputs = tokenizer(description, padding=True, truncation=True, return_tensors="pt")

    # Move inputs to the device (GPU or CPU)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Get the model's logits (raw prediction scores)
    with torch.no_grad():
        logits = saved_model(**inputs).logits

    # Get the predicted label (index of the maximum logit)
    predicted_label_idx = torch.argmax(logits, dim=-1).item()

    # Convert the label index to the topic name using the label_encoder
    predicted_label = label_encoder.inverse_transform([predicted_label_idx])[0]

    return predicted_label

def predict_top_3(description):
    # Tokenize the input description
    inputs = tokenizer(description, padding=True, truncation=True, return_tensors="pt")

    # Move inputs to the device (GPU or CPU)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Get the model's logits (raw prediction scores)
    with torch.no_grad():
        logits = saved_model(**inputs).logits

    # Get the top 3 predictions based on the logits (softmax to get probabilities)
    probs = torch.nn.functional.softmax(logits, dim=-1)

    # Get top 3 predictions
    top_3_values, top_3_indices = torch.topk(probs, 3, dim=-1)

    # Convert to numpy and get the predicted labels
    top_3_values = top_3_values.cpu().numpy().flatten()  # Get probabilities
    top_3_indices = top_3_indices.cpu().numpy().flatten()  # Get indices of the topics

    top_3_topics = label_encoder.inverse_transform(top_3_indices)

    return list(zip(top_3_topics, top_3_values))

In [78]:
new_description = "A thrilling adventure set in a fantastical world."
predicted_topic = predict_topic(new_description)
predicted_top_3 = predict_top_3(new_description)

print(f"The predicted topic for the description is: {predicted_topic}")
for topic, prob in predicted_top_3:
    print(f"{topic}: {prob:.4f} %")

The predicted topic for the description is: Children’s Books
Children’s Books: 0.9812 %
Fiction: 0.0184 %
Humor: 0.0001 %
