In [1]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification
from utils import FileNames
import random
# import os


In [2]:
# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [3]:
text_data = FileNames()
text_data.get_df()

The labels and corresponding indexes are:
(1, 'black')
(2, 'blue')
(3, 'green')
(4, 'other')


Unnamed: 0,_label,_filename
0,1,take to recycle rgba
1,1,take to recycle rgba
2,1,take to recycle rgba
3,1,take to recycle rgba
4,1,take to recycle rgba
...,...,...
5316,4,worn shoe
5317,4,worn tshirt
5318,4,wrench
5319,4,xbox controller


In [4]:
targets = text_data.get_label_classes()
targets

['black', 'blue', 'green', 'other']

In [5]:
df = text_data.get_df()
df.shape

(5321, 2)

In [6]:
# Checking for duplicates
df.loc[df["_filename"].duplicated()]

Unnamed: 0,_label,_filename
1,1,take to recycle rgba
2,1,take to recycle rgba
3,1,take to recycle rgba
4,1,take to recycle rgba
17,1,beef jerky bag
...,...,...
5282,4,water bottle
5290,4,wine bottle
5295,4,winter jacket
5297,4,wired headphones


In [7]:
# dropping duplicates
df = df.drop_duplicates(subset="_filename")
df.loc[df["_filename"].duplicated()]

Unnamed: 0,_label,_filename


In [8]:
df.shape

(3961, 2)

In [9]:
data_tuples = text_data.get_tuples(df)
random.shuffle(data_tuples)

In [10]:
texts = [line for _, line in data_tuples]
labels = [int(label)-1 for label, _ in data_tuples]

In [11]:
len(labels)

3961

In [12]:
len(texts)

3961

In [13]:
labels

[3,
 0,
 2,
 1,
 1,
 2,
 2,
 2,
 2,
 1,
 0,
 0,
 2,
 3,
 1,
 2,
 1,
 1,
 3,
 1,
 1,
 1,
 3,
 1,
 3,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 3,
 0,
 2,
 1,
 2,
 0,
 0,
 0,
 1,
 1,
 3,
 0,
 1,
 1,
 2,
 3,
 2,
 1,
 3,
 1,
 0,
 2,
 1,
 2,
 2,
 0,
 1,
 1,
 0,
 0,
 2,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 2,
 1,
 1,
 2,
 1,
 3,
 3,
 2,
 2,
 3,
 1,
 1,
 1,
 0,
 2,
 1,
 1,
 1,
 3,
 0,
 1,
 1,
 1,
 2,
 2,
 0,
 3,
 2,
 1,
 0,
 3,
 1,
 3,
 0,
 2,
 1,
 3,
 3,
 2,
 2,
 1,
 1,
 0,
 0,
 3,
 3,
 2,
 2,
 1,
 3,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 3,
 1,
 3,
 1,
 3,
 1,
 2,
 2,
 3,
 1,
 3,
 2,
 1,
 0,
 1,
 1,
 2,
 2,
 2,
 0,
 1,
 1,
 3,
 1,
 3,
 3,
 0,
 1,
 2,
 1,
 2,
 1,
 2,
 3,
 1,
 1,
 1,
 3,
 0,
 1,
 1,
 1,
 0,
 0,
 3,
 1,
 1,
 1,
 0,
 2,
 3,
 3,
 3,
 2,
 0,
 1,
 1,
 2,
 3,
 2,
 1,
 1,
 1,
 3,
 0,
 3,
 2,
 1,
 2,
 1,
 3,
 1,
 2,
 0,
 1,
 3,
 1,
 1,
 0,
 3,
 1,
 1,
 1,
 1,
 3,
 0,
 2,
 1,
 2,
 1,
 2,
 1,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 2,
 0,
 3,
 1,
 0,
 3,
 1,
 1,
 1,
 0,
 3,
 3,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 3,
 1,


In [14]:
# Example data
# texts = ['This is a positive sentence.', 'This is a negative sentence.']
# labels = [1, 0]

# Tokenize the texts
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

# Create a TensorDataset
dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], torch.tensor(labels))

print(f'Len of Dataset: {len(dataset)}')

# Split the dataset into training and testing sets
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

Len of Dataset: 3961


In [15]:
# Define the optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()

# Train the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [16]:
def evaluate(test_loader):
    # Test loop
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits
            _, predicted = torch.max(logits, 1)
            
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
    return correct/total

In [18]:
for epoch in range(5):
    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)
    
    # Train loop
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
    # Test loop
    eval_accuracy = evaluate(test_loader)
            
    print(f'Epoch {epoch+1} Accuracy: { eval_accuracy }')

Epoch 1 Accuracy: 0.78562421185372
Epoch 2 Accuracy: 0.8058007566204287
Epoch 3 Accuracy: 0.7831021437578815
Epoch 4 Accuracy: 0.7843631778058008
Epoch 5 Accuracy: 0.7793190416141236
