In [29]:
import torch
# Report whether CUDA is usable and, when it is, the name of device 0.
gpu_ready = torch.cuda.is_available()
print(gpu_ready)
if gpu_ready:
    print(torch.cuda.get_device_name(0))
else:
    print("No GPU found")

True
NVIDIA GeForce GTX 1660


In [6]:
pip install transformers datasets accelerate evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.6
Note: you may need to restart the kernel to use updated packages.


In [30]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import Dataset
import torch
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the processed CSV; the 'Description' and 'Category' columns are used below.
file_path = "../data/processed/full_2k.csv"
df = pd.read_csv(file_path)

# Encode each category as an integer id and record how many distinct ids exist.
le = LabelEncoder()
df['label'] = le.fit_transform(df['Category'])
num_classes = len(le.classes_)

# Step 1: hold out 10% of the rows as the test split, stratified by label.
X_temp, X_test, y_temp, y_test = train_test_split(
    df['Description'],
    df['label'],
    test_size=0.1,
    stratify=df['label'],
    random_state=13,
)

# Step 2: take 0.1111111 (about 1/9) of the remaining 90% for validation,
# which is roughly 10% of the full data; the rest becomes the training split.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.1111111,
    stratify=y_temp,
    random_state=13,
)

# Pair every split's texts with its labels in a two-column frame ('text', 'label').
train_df, val_df, test_df = (
    pd.DataFrame({'text': texts, 'label': targets})
    for texts, targets in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [31]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(batch):
    """Tokenize the "text" entries of a batch.

    Every example is truncated and padded to a fixed length of 256 tokens.
    """
    encode_options = dict(truncation=True, padding="max_length", max_length=256)
    return tokenizer(batch["text"], **encode_options)


def _build_torch_dataset(frame):
    """Convert a pandas frame into a tokenized HF Dataset that yields torch tensors."""
    dataset = Dataset.from_pandas(frame)
    # batch_size equals the dataset length, so the whole split is tokenized in one batch.
    dataset = dataset.map(tokenize, batched=True, batch_size=len(dataset))
    # Expose only the model inputs and the label, formatted as torch tensors.
    dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    return dataset


# Same conversion for each split, in train / validation / test order.
train_ds = _build_torch_dataset(train_df)
val_ds = _build_torch_dataset(val_df)
test_ds = _build_torch_dataset(test_df)

Map:   0%|          | 0/27543 [00:00<?, ? examples/s]

Map:   0%|          | 0/3443 [00:00<?, ? examples/s]

Map:   0%|          | 0/3443 [00:00<?, ? examples/s]

In [32]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint named by `model_name` with a sequence-classification head
# that has one output per encoded label (`num_classes`).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
from transformers import TrainingArguments, Trainer
from evaluate import load

metric = load("accuracy")


def compute_metrics(eval_pred):
    """Return {"accuracy": value} for an evaluation batch.

    `eval_pred` unpacks into (logits, labels); the predicted class is the
    argmax over the last axis of the logits.
    """
    logits, labels = eval_pred
    predicted = logits.argmax(-1)
    scores = metric.compute(predictions=predicted, references=labels)
    return {"accuracy": scores["accuracy"]}

training_args = TrainingArguments(
    # Output locations for run artefacts and logs.
    output_dir="./bert_runs",
    logging_dir="./logs",
    # Optimisation hyperparameters.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Evaluate and save once per epoch, and request that the best model
    # (ranked by "accuracy") is loaded when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Assemble the Trainer from the model, the training arguments, the train and
# validation datasets, and the metric function.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of
# Trainer.__init__ is deprecated in favour of it and emits a FutureWarning.
# NOTE(review): `processing_class` needs a transformers release that already
# issues that deprecation warning — confirm the pinned version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [34]:
# Fine-tune the model with the configured Trainer.
trainer.train()

# Score the trained model on the held-out test split and report its accuracy.
results = trainer.evaluate(eval_dataset=test_ds)
print(f"Test accuracy: {results['eval_accuracy']:.4f}")

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6238,0.55759,0.806274
2,0.4578,0.538438,0.817891
3,0.3149,0.551876,0.821086


Test accuracy: 0.8222


In [35]:
from sklearn.metrics import classification_report
# Run inference over the test split once and reuse the output for both the
# predicted classes and the reference labels (previously predict() ran twice).
test_output = trainer.predict(test_ds)
preds = test_output.predictions.argmax(-1)
labels = test_output.label_ids

# Per-class precision/recall/F1, with rows named after the original category strings.
print(classification_report(labels, preds, target_names=le.classes_))

              precision    recall  f1-score   support

   Biography       0.68      0.74      0.70       212
    Business       0.93      0.94      0.94       254
     Cooking       0.98      0.99      0.99       289
     General       0.47      0.40      0.43       227
     History       0.78      0.74      0.76       204
    Juvenile       0.93      0.95      0.94       911
    Literary       0.61      0.54      0.58       213
     Mystery       0.66      0.71      0.69       227
    Religion       0.92      0.90      0.91       346
     Romance       0.84      0.87      0.86       330
   Thrillers       0.70      0.71      0.71       230

    accuracy                           0.82      3443
   macro avg       0.77      0.77      0.77      3443
weighted avg       0.82      0.82      0.82      3443

