In [1]:
import inspect
import os
import re

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
from datasets import Dataset, DatasetDict
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

from utilities import read_xml


In [2]:
# Resolve the three BlurbGenreCollection split files under ./origdata
origdata_dir = os.path.join(os.getcwd(), 'origdata')
PATH_TRAIN = os.path.join(origdata_dir, 'BlurbGenreCollection_EN_train.txt')
PATH_TEST = os.path.join(origdata_dir, 'BlurbGenreCollection_EN_test.txt')
PATH_DEV = os.path.join(origdata_dir, 'BlurbGenreCollection_EN_dev.txt')

# Parse every split and stack them (train, test, dev order) into one frame;
# the notebook draws its own train/test split further down.
df = pd.concat([read_xml(path) for path in (PATH_TRAIN, PATH_TEST, PATH_DEV)])


In [3]:
# Genres the classifier is trained on, in priority order: when a book carries
# several of them, the one listed first wins.
reduced_topics = [
    "Fiction", "Children’s Books", "Nonfiction",
    "Poetry", "Humor", "Classics", "Young Adult"
]


def _fold_topic(text):
    """Lower-case ``text`` and replace typographic apostrophes with ASCII ones."""
    return text.lower().replace("’", "'").replace("‘", "'")


def assign_primary_topic(topic_string):
    """Map a comma-separated topic string onto one of ``reduced_topics``.

    The topic string is split on commas and each reduced topic is tried in
    list order; the first one that occurs as a whole word/phrase inside any
    of the topics is returned (case-insensitive, apostrophe style ignored).

    Whole-word matching matters: a plain substring test finds "fiction"
    inside "nonfiction", which would label every Nonfiction book as
    "Fiction" because "Fiction" is tried first.

    Parameters
    ----------
    topic_string : str or missing value
        Comma-separated topics of one book. Anything that is not a string
        (e.g. ``None`` or ``NaN`` from a missing field) is treated as having
        no topics.

    Returns
    -------
    str
        The matching entry of ``reduced_topics`` (original spelling), or
        ``"Other"`` when nothing matches.
    """
    if not isinstance(topic_string, str):
        return "Other"

    topics = [_fold_topic(t.strip()) for t in re.split(r',\s*', topic_string)]
    for reduced_topic in reduced_topics:
        # Look-arounds instead of \b so the phrase must not be glued to a
        # neighbouring word character on either side.
        whole_phrase = re.compile(
            r'(?<!\w)' + re.escape(_fold_topic(reduced_topic)) + r'(?!\w)'
        )
        if any(whole_phrase.search(t) for t in topics):
            return reduced_topic
    return "Other"

# Label every book with its primary genre, then discard the books that did
# not map onto any of the reduced genres.
df['TOPIC_MAIN'] = df['TOPICS'].map(assign_primary_topic)
df = df.loc[df['TOPIC_MAIN'].ne("Other")]


In [4]:
def clean_description(text):
    """Normalise a blurb to lower-case words separated by single spaces.

    Digit runs and every character that is neither a word character nor
    whitespace are removed (underscores count as word characters and are
    kept); whitespace runs are then collapsed and the ends trimmed.
    """
    substitutions = (
        (r'\d+', ''),       # drop digit runs
        (r'[^\w\s]', ''),   # drop punctuation and other symbols
        (r'\s+', ' '),      # collapse any whitespace run to one space
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Missing blurbs are replaced by empty strings first so the cleaner always
# receives a str.
df['DESCRIPTION_CLEAN'] = (
    df['DESCRIPTION']
    .fillna('')
    .map(clean_description)
)


In [5]:
# Hold out 20% of the labelled blurbs for evaluation. The split is stratified
# on the genre so both parts keep the same class proportions; without it a
# rare genre can by chance be missing from the training part, and the label
# encoder (fitted on the training part only) would then reject it at test time.
train_df, test_df = train_test_split(
    df[['DESCRIPTION_CLEAN', 'TOPIC_MAIN']],
    test_size=0.2,
    random_state=42,
    stratify=df['TOPIC_MAIN'],
)


In [6]:
# Encode genre names as integer class ids. The encoder is fitted on the
# training split and reused unchanged for the test split. `.assign` builds new
# frames instead of writing into the objects returned by the split.
label_encoder = LabelEncoder()
train_df = train_df.assign(label=label_encoder.fit_transform(train_df['TOPIC_MAIN']))
test_df = test_df.assign(label=label_encoder.transform(test_df['TOPIC_MAIN']))

# preserve_index=False: the frames carry a shuffled, non-contiguous pandas
# index after filtering and splitting; without this flag it would be exported
# as an extra `__index_level_0__` column in each dataset.
dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df[['DESCRIPTION_CLEAN', 'label']], preserve_index=False),
    'test': Dataset.from_pandas(test_df[['DESCRIPTION_CLEAN', 'label']], preserve_index=False)
})

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize(batch):
    """Tokenize the cleaned blurbs of one batch with the BERT tokenizer.

    Texts longer than the tokenizer's limit are truncated and every example
    is padded to that same maximum length, so all rows have equal size.

    NOTE(review): padding to the maximum length makes every training step
    pay for full-length sequences; dynamic padding would be cheaper but
    requires the Trainer to use a padding data collator — change both
    together.
    """
    return tokenizer(batch['DESCRIPTION_CLEAN'], padding="max_length", truncation=True)


# Tokenize both splits, drop the raw text (the model only consumes token
# tensors and labels) and have the datasets return torch tensors.
dataset = dataset.map(tokenize, batched=True)
dataset = dataset.remove_columns(['DESCRIPTION_CLEAN'])
dataset.set_format("torch")


Map:   0%|          | 0/73515 [00:00<?, ? examples/s]

Map:   0%|          | 0/18379 [00:00<?, ? examples/s]

In [7]:
# Pre-trained BERT encoder with a new classification head that has one output
# per genre known to the label encoder.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_encoder.classes_),
)

# The "evaluate every epoch" switch is spelled `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. Use whichever
# keyword the installed TrainingArguments accepts so the cell runs on both.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",          # checkpoints are written here
    save_strategy="epoch",           # checkpoint once per epoch, in step with evaluation
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    **{eval_strategy_kwarg: "epoch"},
)

# The held-out split doubles as the per-epoch evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"]
)

trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/18380 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Score the held-out split. `trainer.predict` returns the raw logits together
# with the gold label ids it evaluated against, so both sides of the
# comparison come from the same pass over the data.
predictions = trainer.predict(dataset["test"])
pred_labels = predictions.predictions.argmax(axis=1)
true_labels = predictions.label_ids

# Pin the full label set explicitly: the report rows and the confusion-matrix
# axes then always line up with the class names, even if some genre is never
# predicted.
class_names = list(label_encoder.classes_)
class_ids = list(range(len(class_names)))

print("Classification Report:")
print(classification_report(true_labels, pred_labels, labels=class_ids, target_names=class_names))

cm = confusion_matrix(true_labels, pred_labels, labels=class_ids)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("BERT Confusion Matrix")
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()
