Import the necessary libraries

In [21]:
!pip install transformers



In [22]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import pandas as pd

The `load_data` function reads a CSV file containing products and their corresponding tags. It returns a list of texts and a list of labels, where 1 represents a positive product (a row whose `match` column equals "product") and 0 represents a negative one.

In [25]:
def load_data(data_file):
    """Read the products CSV and return parallel lists of texts and labels.

    Args:
        data_file: Path (or file-like object) accepted by ``pandas.read_csv``.
            The file must contain a ``product`` column (the text) and a
            ``match`` column (the tag).

    Returns:
        A ``(texts, labels)`` tuple. ``texts`` is a list of ``str`` product
        texts; ``labels`` is a list of ints, 1 where ``match`` equals
        ``"product"`` and 0 otherwise. Rows whose product is missing, empty
        or whitespace-only are left out of both lists.

    Raises:
        KeyError: If the ``product`` or ``match`` column is absent.
    """
    df = pd.read_csv(data_file)
    # Filter into a new frame instead of mutating a column selection in place:
    # a chained ``df[col].replace(..., inplace=True)`` does not update ``df``
    # when pandas Copy-on-Write is active.
    kept = df.dropna(subset=['product'])
    # read_csv already parses empty fields as NaN, so blank-but-not-empty
    # values are the ones that would otherwise reach the tokenizer.
    kept = kept[kept['product'].astype(str).str.strip() != '']
    # The tokenizer only accepts strings; a product column that pandas parsed
    # as numbers would otherwise yield ints/floats here.
    texts = kept['product'].astype(str).tolist()
    labels = [1 if match == "product" else 0 for match in kept['match'].tolist()]
    return texts, labels

In [26]:
# CSV file holding the product texts and their tags (relative path).
data_file = "products_data.csv"
# texts and labels are parallel lists: labels[i] is the 0/1 label of texts[i].
texts, labels = load_data(data_file)

This Dataset class organizes the products and their tags for our BERT model. It takes care of tokenizing the text, padding or truncating it to the maximum sequence length, and returning a neat package of input IDs, attention masks, and labels for our model to learn from.

In [27]:
class TextClassificationDataset(Dataset):
    """Dataset that pairs each text with its label and tokenizes on access.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors='pt', max_length=...,
            padding='max_length', truncation=True)``; its result must expose
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length forwarded to the tokenizer for padding/truncation.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of samples (the number of texts)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (each flattened
            to 1-D) and ``'label'`` (the label wrapped in a tensor).
        """
        encoded = self.tokenizer(
            self.texts[idx],
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # The tokenizer output carries a leading batch axis; flatten removes it
        # so the DataLoader can stack samples itself.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(self.labels[idx])
        return sample

Create a custom BERT classifier built on top of the pretrained BERT model, which encodes the text. We then add a dropout layer to keep things in check (regularization) and a linear layer to help us classify the text.
BERTClassifier takes input IDs and attention masks and runs them through BERT and the extra layers we added. The classifier returns its output as class scores (logits).

In [28]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification layer.

    Args:
        bert_model_name: Name or path handed to ``BertModel.from_pretrained``.
        num_classes: Number of output scores produced per input.
    """

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        # The head maps the encoder's hidden size to one score per class.
        hidden_size = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the class scores (logits) for the given token IDs and mask."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Classify from BERT's pooled output, with dropout applied first.
        return self.fc(self.dropout(encoded.pooler_output))

This function takes the model, data loader, optimizer, scheduler, and device as its arguments. The function puts the model into training mode and then runs through each batch of data from the data loader. For each batch, it clears the optimizer’s gradients, gets the input IDs, attention masks, and labels, and feeds them to the model. It then computes the cross-entropy loss, backpropagates it, and steps the optimizer and the learning-rate scheduler.

In [38]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning class scores.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer over the model's parameters; stepped once per batch.
        scheduler: Learning-rate scheduler; stepped once per batch, after the
            optimizer.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        float: Mean of the per-batch cross-entropy losses, or NaN when
        ``data_loader`` yields no batches. Callers may ignore it.
    """
    model.train()
    # The loss module is stateless, so build it once rather than per batch.
    criterion = nn.CrossEntropyLoss()
    loss_sum = 0.0
    num_batches = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        # Accumulate a detached tensor so the autograd graph is not kept alive
        # and no host synchronisation is forced on every step.
        loss_sum = loss_sum + loss.detach()
        num_batches += 1
    if num_batches == 0:
        return float('nan')
    return float(loss_sum) / num_batches

Building evaluation method

In [30]:
def evaluate(model, data_loader, device):
    """Score ``model`` on every batch of ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning class scores.
        data_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(accuracy, report)`` as produced by ``accuracy_score`` and
        ``classification_report`` over all collected labels and predictions.
    """
    model.eval()
    predicted, expected = [], []

    # Gradients are not needed for scoring.
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            # The predicted class is the index of the highest score per row.
            best_class = torch.max(logits, dim=1).indices
            predicted += best_class.cpu().tolist()
            expected += labels.cpu().tolist()

    accuracy = accuracy_score(expected, predicted)
    report = classification_report(expected, predicted)
    return accuracy, report

The `evaluate` function above acts as our evaluation method. For each batch, it gets the input IDs, attention masks, and labels and feeds them to the model. The model then gives its best predictions, which are compared to the actual labels.
Finally, the function calculates the accuracy score and a classification report to let us know how well the model did in understanding product descriptions.
The `predict_match` function below classifies a single product text and returns "positive" when the predicted class is 1 and "negative" otherwise.

In [31]:
def predict_match(text, model, tokenizer, device, max_length=128):
    """Classify a single text and return the predicted label as a string.

    Args:
        text: Text handed to ``tokenizer``.
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning class scores; it is switched to eval mode.
        tokenizer: Callable returning ``'input_ids'`` and ``'attention_mask'``
            tensors for ``text``.
        device: Device the encoded tensors are moved to.
        max_length: Length forwarded to the tokenizer for padding/truncation.

    Returns:
        str: ``"positive"`` when the highest-scoring class is 1, otherwise
        ``"negative"``.
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )

    # Inference only, so no gradients are tracked.
    with torch.no_grad():
        logits = model(
            input_ids=encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        )

    predicted_class = torch.max(logits, dim=1).indices.item()
    if predicted_class == 1:
        return "positive"
    return "negative"

Set up essential parameters for fine-tuning the BERTClassifier, including the BERT model name, number of classes, maximum input sequence length, batch size, number of training epochs, and learning rate, to help the model effectively understand products and their tags.

In [32]:
# Set up parameters
bert_model_name = 'bert-base-uncased'  # pretrained checkpoint name used for both the tokenizer and the BERT model
num_classes = 2  # output size of the classifier's final linear layer
max_length = 128  # length every text is padded/truncated to by the tokenizer
batch_size = 16  # samples per DataLoader batch
num_epochs = 4  # number of passes of the training loop
learning_rate = 2e-5  # learning rate passed to the optimizer

Loading and splitting the data

In [33]:
# Hold out 20% of the samples for validation; the fixed random_state makes the split reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

Initialize tokenizer, dataset, and data loader

In [34]:
# Load the tokenizer for the same pretrained checkpoint the model uses.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
# Wrap each split in the dataset class, which tokenizes a sample when it is indexed.
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
# Only the training loader shuffles; the validation loader keeps the dataset order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

Set up the device and model

In [35]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Build the classifier and move its parameters to the chosen device.
model = BERTClassifier(bert_model_name, num_classes).to(device)

Set up optimizer and learning rate scheduler

In [36]:
# transformers.AdamW is deprecated in favour of the PyTorch implementation, so
# build the optimizer from torch.optim. eps and weight_decay are passed
# explicitly to keep the values the deprecated class used by default
# (torch.optim.AdamW would otherwise use eps=1e-8 and weight_decay=0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)
# The scheduler is stepped once per batch, so the schedule spans every batch of every epoch.
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



Training the model

In [42]:
# Each epoch trains on the full training loader, then reports validation metrics.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

Epoch 1/4


ValueError: ignored

In [None]:
# Persist only the learned parameters (the state dict), not the model object itself;
# reloading requires constructing a BERTClassifier first and calling load_state_dict.
torch.save(model.state_dict(), "bert_classifier.pth")

Evaluating our model’s performance

In [None]:
# Test product prediction
test_text = "Big bed - sonoma"
match = predict_match(test_text, model, tokenizer, device)
# Echo the text that was actually classified, so the printed input and the
# prediction below always refer to the same sample.
print(test_text)
print(f"Predicted product: {match}")