In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m56.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m59.9 MB/s[0m eta [36m0:00:00[0m
Col

In [2]:
import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import numpy as np
import random
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

In [3]:
def set_seed(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    if is_torch_available():
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    if is_tf_available():
        import tensorflow as tf

        tf.random.set_seed(seed)

set_seed(1)

Initial analysis on dataset

In [4]:
# Loading 20 Newsgroups dataset
from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train')

from pprint import pprint
pprint(list(newsgroups_train.target_names))

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [5]:
newsgroups_train.filenames.shape

(11314,)

In [6]:
print(newsgroups_train.filenames[:10])

['/root/scikit_learn_data/20news_home/20news-bydate-train/rec.autos/102994'
 '/root/scikit_learn_data/20news_home/20news-bydate-train/comp.sys.mac.hardware/51861'
 '/root/scikit_learn_data/20news_home/20news-bydate-train/comp.sys.mac.hardware/51879'
 '/root/scikit_learn_data/20news_home/20news-bydate-train/comp.graphics/38242'
 '/root/scikit_learn_data/20news_home/20news-bydate-train/sci.space/60880'
 '/root/scikit_learn_data/20news_home/20news-bydate-train/talk.politics.guns/54525'
 '/root/scikit_learn_data/20news_home/20news-bydate-train/sci.med/58080'
 '/root/scikit_learn_data/20news_home/20news-bydate-train/comp.sys.ibm.pc.hardware/60249'
 '/root/scikit_learn_data/20news_home/20news-bydate-train/comp.os.ms-windows.misc/10008'
 '/root/scikit_learn_data/20news_home/20news-bydate-train/comp.sys.mac.hardware/50502']


In [7]:
unique_values = list(set(newsgroups_train.target))
print(unique_values)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]


In [8]:
# To check if class occurrences are comparable

from collections import Counter

class_frequencies = Counter(newsgroups_train.target)

for class_value, frequency in class_frequencies.items():
    print(f'Class {class_value}: {frequency} occurrences')

Class 7: 594 occurrences
Class 4: 578 occurrences
Class 1: 584 occurrences
Class 14: 593 occurrences
Class 16: 546 occurrences
Class 13: 594 occurrences
Class 3: 590 occurrences
Class 2: 591 occurrences
Class 8: 598 occurrences
Class 19: 377 occurrences
Class 6: 585 occurrences
Class 0: 480 occurrences
Class 12: 591 occurrences
Class 5: 593 occurrences
Class 10: 600 occurrences
Class 9: 597 occurrences
Class 15: 599 occurrences
Class 17: 564 occurrences
Class 18: 465 occurrences
Class 11: 595 occurrences


Next section includes, preprocessing (tokenization), model implementation, training and evaluation of the Base Uncased BERT model

In [9]:
model_name = "bert-base-uncased"
# setting max sequence length for each data point
max_length = 512

In [10]:
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [11]:
def load_data(test_size=0.2):

  dataset = fetch_20newsgroups(subset="all", shuffle=True, remove=("headers", "footers", "quotes"))
  documents = dataset.data
  labels = dataset.target

  return train_test_split(documents, labels, test_size=test_size), dataset.target_names


(train_texts, test_texts, train_labels, test_labels), target_names = load_data()

In [12]:
# Remaining train data split into validation and train data
train_texts, valid_texts, train_labels, valid_labels = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42)

In [13]:
# tokenize the dataset
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
valid_encodings = tokenizer(valid_texts, truncation=True, padding=True, max_length=max_length)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=max_length)

In [14]:
print(train_encodings[:5])

[Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]


In [15]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

# convert our tokenized data into a Custom Dataset
train_dataset = CustomDataset(train_encodings, train_labels)
test_dataset = CustomDataset(test_encodings, test_labels)
valid_dataset = CustomDataset(valid_encodings, valid_labels)

In [16]:
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(target_names)).to("cuda")

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# DataLoaders
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

train_batch_size = 8
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=train_batch_size)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=train_batch_size)
valid_dataloader = DataLoader(valid_dataset, sampler=RandomSampler(valid_dataset), batch_size=train_batch_size)

In [18]:
# Training the model on train set

from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import torch

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(3):

  # Training loop
  model.train()
  total_loss = 0.0

  for batch in train_dataloader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    optimizer.zero_grad()
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    total_loss += loss.item()

    loss.backward()
    optimizer.step()

  print(f"Epoch {epoch+1}, Training Loss: {total_loss}")


  # Validation loop
  model.eval()
  val_loss = 0.0
  num_val_steps = 0
  val_predictions = []

  with torch.no_grad():
    for batch in valid_dataloader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)

      outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
      loss = outputs.loss
      val_loss += loss.item()
      num_val_steps += 1

      logits = outputs.logits
      predictions = logits.argmax(dim=1).cpu().numpy()
      val_predictions.extend(predictions)

  avg_val_loss = val_loss / num_val_steps
  val_accuracy = accuracy_score(valid_labels, val_predictions)
  print(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy:.2%}")


Epoch 1, Training Loss: 2465.319599688053
Epoch 1, Validation Loss: 1.0332684209397365, Validation Accuracy: 4.94%
Epoch 2, Training Loss: 1303.1971308365464
Epoch 2, Validation Loss: 0.8798447947563797, Validation Accuracy: 4.77%
Epoch 3, Training Loss: 906.9374239481986
Epoch 3, Validation Loss: 0.8778640803947689, Validation Accuracy: 4.94%


In [19]:
# Evaluate the model on the test set
model.eval()
test_loss = 0.0
num_test_steps = 0
test_predictions = []

with torch.no_grad():
  for batch in test_dataloader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    test_loss += loss.item()
    num_test_steps += 1

    logits = outputs.logits
    predictions = logits.argmax(dim=1).cpu().numpy()
    test_predictions.extend(predictions)


avg_test_loss = test_loss / num_test_steps
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Test Loss: {avg_test_loss}, Test Accuracy: {test_accuracy:.2%}")

Test Loss: 0.8423283352690228, Test Accuracy: 74.99%


In [20]:
# Calculate other metrics
from sklearn.metrics import f1_score, precision_score, recall_score

test_precision = precision_score(test_labels, test_predictions, average='weighted')
test_recall = recall_score(test_labels, test_predictions, average='weighted')
test_f1 = f1_score(test_labels, test_predictions, average='weighted')

print(f"Test Precision: {test_precision}")
print(f"Test Recall: {test_recall}")
print(f"Test F1 Score: {test_f1}")
print(f"Test Loss: {avg_test_loss}, Test Accuracy: {test_accuracy:.2%}")

Test Precision: 0.7615671637582642
Test Recall: 0.749867374005305
Test F1 Score: 0.74968333483584
Test Loss: 0.8423283352690228, Test Accuracy: 74.99%


Next section includes model predictions on some random texts

In [21]:
def get_prediction(text):
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to("cuda")
    outputs = model(**inputs)
    probs = outputs[0].softmax(1)
    return target_names[probs.argmax()]

In [22]:
text = """
The first thing is first.
If you purchase a Macbook, you should not encounter performance issues that will prevent you from learning to code efficiently.
However, in the off chance that you have to deal with a slow computer, you will need to make some adjustments.
Having too many background apps running in the background is one of the most common causes.
The same can be said about a lack of drive storage.
For that, it helps if you uninstall xcode and other unnecessary applications, as well as temporary system junk like caches and old backups.
"""
print(get_prediction(text))

comp.sys.mac.hardware


In [23]:
text = """
A black hole is a place in space where gravity pulls so much that even light can not get out.
The gravity is so strong because matter has been squeezed into a tiny space. This can happen when a star is dying.
Because no light can get out, people can't see black holes.
They are invisible. Space telescopes with special tools can help find black holes.
The special tools can see how stars that are very close to black holes act differently than other stars.
"""
print(get_prediction(text))

sci.space
