In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from utilities import load_data_basiccleanup
from transformers import Trainer, TrainingArguments, DistilBertTokenizerFast, DistilBertForSequenceClassification
from datasets import Dataset, DatasetDict
import torch
import warnings
warnings.filterwarnings("ignore", category=FutureWarning) # supress FutureWarning

## Load Data

In [21]:
# Load clean data
df = load_data_basiccleanup()



In [22]:
# Look at topic distribution
df['TOPIC_MAIN'].value_counts()

TOPIC_MAIN
Fiction             62596
Children’s Books    19643
Young Adult          3749
Classics             3199
Humor                1450
Poetry               1256
Name: count, dtype: int64

In [23]:
# Ensure data is consistent
print(f"{df['TOPIC_MAIN'].value_counts().sum()} == {df.shape[0]}")

91893 == 91893


In [24]:
# Split dataset into train and test
train_df, test_df = train_test_split(df[['DESCRIPTION', 'TOPIC_MAIN']], test_size=0.2, random_state=42)

In [25]:
print(train_df.shape) # (73515, 3)
print(test_df.shape) # (18379, 3)

(73514, 2)
(18379, 2)


In [26]:
train_df

Unnamed: 0,DESCRIPTION,TOPIC_MAIN
63724,"This daring, star-packed collection is the fan...",Fiction
14608,"With her soft heart and angelic face, Madeline...",Fiction
72346,NEW YORK TIMES BESTSELLER * A deeply affecting...,Fiction
12024,"ONE CHILL EVENING IN BETHLEHEM, young Naomi he...",Children’s Books
16203,All original stories about the return of Cthul...,Fiction
...,...,...
6265,"Following a wild and raging storm, the Swiss f...",Children’s Books
54886,A family-focused guidebook to Italy for travel...,Fiction
76820,A single volume of the most beautiful texts by...,Poetry
860,The Truth Chasers Book Three. Someone's trying...,Fiction


In [27]:
test_df

Unnamed: 0,DESCRIPTION,TOPIC_MAIN
62081,This lexicon of modern Western philosophical c...,Fiction
18176,"Emily-and her new band, the Strangers-won the ...",Children’s Books
19140,A perfect graudation gift all about growing up...,Children’s Books
16534,Even a bookish big sister is drawn in by the p...,Children’s Books
21974,Where Can You Find the Kind of Love You Truly ...,Fiction
...,...,...
458,"In a small, dusty town in India, Sripathi Rao ...",Fiction
25026,The Limits to Capital provides one of the best...,Fiction
73035,How climate change will affect our political t...,Fiction
43431,When a woman's body washes up on the shore of ...,Fiction


In [28]:
# Convert categorical labels (topics) to numerical values
label_encoder = LabelEncoder()
train_df['label'] = label_encoder.fit_transform(train_df['TOPIC_MAIN'])
test_df['label'] = label_encoder.transform(test_df['TOPIC_MAIN'])
train_df.head()

Unnamed: 0,DESCRIPTION,TOPIC_MAIN,label
63724,"This daring, star-packed collection is the fan...",Fiction,2
14608,"With her soft heart and angelic face, Madeline...",Fiction,2
72346,NEW YORK TIMES BESTSELLER * A deeply affecting...,Fiction,2
12024,"ONE CHILL EVENING IN BETHLEHEM, young Naomi he...",Children’s Books,0
16203,All original stories about the return of Cthul...,Fiction,2


In [29]:
# Convert Pandas DataFrames to HuggingFace Dataset-format
dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df[['DESCRIPTION', 'label']]),
    'test': Dataset.from_pandas(test_df[['DESCRIPTION', 'label']])
})
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['DESCRIPTION', 'label', '__index_level_0__'],
        num_rows: 73514
    })
    test: Dataset({
        features: ['DESCRIPTION', 'label', '__index_level_0__'],
        num_rows: 18379
    })
})


In [30]:
# Load pre-trained DistilBERT tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    texts = [str(text) if text is not None else "" for text in batch['DESCRIPTION']]
    # tokenize text with padding and truncate for consistent input length
    return tokenizer(texts, padding="max_length", truncation=True)

# Apply function to the dataset
dataset = dataset.map(tokenize, batched=True)

# Remove 'DESCRIPTION' column, as it's not needed after tokenization
dataset = dataset.remove_columns(['DESCRIPTION'])

# Set the format of the dataset to PyTorch tensors
dataset.set_format("torch")

                                                                   

## Setup Model Training

In [31]:
# Check GPU availability
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")
else:
    print("No GPU available. Training will run on CPU.")

GPU: NVIDIA GeForce RTX 4070 Ti is available.


In [32]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device) # cuda == GPU usage

cuda


## Train model

In [33]:
# Load the pre-trained DistilBERT model for sequence classification
# + specify num_labels (number of unique topics)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(label_encoder.classes_))

training_args = TrainingArguments(
    output_dir="./results", # directory where model/checkpoints are saved
    evaluation_strategy="epoch", # evaluate at the end of epoch
    save_strategy="epoch", # save model each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16, # consumes about 7GB of VRAM
    per_device_eval_batch_size=16,
    num_train_epochs=2, # two training epochs
    weight_decay=0.01,# weight decay to prevent overfitting
    logging_dir="./logs",
    logging_steps=10, # log each ten steps
    no_cuda=False, # force gpu usage
    fp16=True, # mixed precision to speed up training
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"]
)

# Start training the model
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.2308,0.218945
2,0.0697,0.231831


TrainOutput(global_step=9190, training_loss=0.22332028450365296, metrics={'train_runtime': 1289.9374, 'train_samples_per_second': 113.981, 'train_steps_per_second': 7.124, 'total_flos': 1.9477806026563584e+16, 'train_loss': 0.22332028450365296, 'epoch': 2.0})

In [34]:
# Save final model
model.save_pretrained("./final_model")
tokenizer.save_pretrained("./final_model")

('./final_model\\tokenizer_config.json',
 './final_model\\special_tokens_map.json',
 './final_model\\vocab.txt',
 './final_model\\added_tokens.json',
 './final_model\\tokenizer.json')

## Test trained model

In [35]:
# Load saved model
saved_model = DistilBertForSequenceClassification.from_pretrained("./final_model", num_labels=len(label_encoder.classes_))

# Load saved tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("./final_model")

# Set device to GPU if available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to chosen device (GPU or CPU)
saved_model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [36]:
def predict_topic(description):
    # Tokenize the description
    inputs = tokenizer(description, padding=True, truncation=True, return_tensors="pt")

    # Move inputs to the device (GPU or CPU)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Get the model's logits (raw prediction scores)
    with torch.no_grad():
        logits = saved_model(**inputs).logits

    # Get the predicted label (index of the maximum logit)
    predicted_label_idx = torch.argmax(logits, dim=-1).item()

    # Convert the label index to the topic name using the label_encoder
    predicted_label = label_encoder.inverse_transform([predicted_label_idx])[0]

    return predicted_label

def predict_top_3(description):
    # Tokenize the input description
    inputs = tokenizer(description, padding=True, truncation=True, return_tensors="pt")

    # Move inputs to the device (GPU or CPU)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Get the model's logits (raw prediction scores)
    with torch.no_grad():
        logits = saved_model(**inputs).logits

    # Get the top 3 predictions based on the logits (softmax to get probabilities)
    probs = torch.nn.functional.softmax(logits, dim=-1)

    # Get top 3 predictions
    top_3_values, top_3_indices = torch.topk(probs, 3, dim=-1)

    # Convert to numpy and get the predicted labels
    top_3_values = top_3_values.cpu().numpy().flatten()  # Get probabilities
    top_3_indices = top_3_indices.cpu().numpy().flatten()  # Get indices of the topics

    top_3_topics = label_encoder.inverse_transform(top_3_indices)

    return list(zip(top_3_topics, top_3_values))

## Test model prediction

In [37]:
new_description = "A thrilling adventure set in a fantastical world."
predicted_topic = predict_topic(new_description)
predicted_top_3 = predict_top_3(new_description)

print(f"The predicted topic for the description is: {predicted_topic}")
for topic, prob in predicted_top_3:
    print(f"{topic}: {prob:.4f} %")

The predicted topic for the description is: Children’s Books
Children’s Books: 0.9850 %
Young Adult: 0.0091 %
Fiction: 0.0054 %
