In [2]:
import pandas as pd
import numpy as np

# Read the records from idmanual.json in the Kaggle input directory.
df = pd.read_json("/kaggle/input/trademark-dataset/idmanual.json")

# Restrict the frame to rows whose 'status' column equals 'A'.
is_status_a = df['status'] == 'A'
df = df.loc[is_status_a]

# Model input: the 'description' column of each remaining row.
X = df['description'].values

# Targets: map every distinct class_id onto an integer index. The fitted
# encoder is kept so integer predictions can be mapped back to class ids.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['class_id'].values)

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [3]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW

class TrademarkDataset(Dataset):
    """Dataset that pairs each text with its label and tokenizes on access.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length passed to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of samples, taken from the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with the raw text, token ids, attention mask and label."""
        sample_text = self.texts[idx]
        sample_label = self.labels[idx]

        # Options forwarded to the tokenizer: pad/truncate to max_len and
        # request PyTorch tensors without token type ids.
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(sample_text, **tokenizer_options)

        item = {'text': sample_text}
        # The tokenizer output is flattened to 1-D so that the default
        # DataLoader collation stacks samples along a new batch dimension.
        for key in ('input_ids', 'attention_mask'):
            item[key] = encoded[key].flatten()
        item['label'] = torch.tensor(sample_label, dtype=torch.long)
        return item

# Every description is padded or truncated to this many tokens.
max_len = 128

# Tokenizer loaded from the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap each split so that samples are tokenized lazily on access.
train_dataset = TrademarkDataset(
    texts=X_train, labels=y_train, tokenizer=tokenizer, max_len=max_len
)
val_dataset = TrademarkDataset(
    texts=X_val, labels=y_val, tokenizer=tokenizer, max_len=max_len
)

# Batches of 16; only the training loader shuffles its samples.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=16)

# Define the model
class TrademarkClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear layer over ``n_classes``.

    Args:
        n_classes: Number of output units of the final linear layer.
    """

    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.drop = nn.Dropout(p=0.3)
        # The head's input width follows the encoder's configured hidden size.
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the linear head's output for the encoder's pooled representation."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.out(self.drop(encoded.pooler_output))

# Pick the device once; everything below reuses this single value so the
# model and the loss module can never end up on different devices.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# One output unit per class known to the fitted label encoder.
model = TrademarkClassifier(len(label_encoder.classes_))
model = model.to(device)

# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss().to(device)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [4]:
import wandb

# Authenticate with Weights & Biases.
wandb.login()

# Open a run in the "trademark-classification" project using the "fork" start method.
wandb_settings = wandb.Settings(start_method="fork")
wandb.init(settings=wandb_settings, project="trademark-classification")

# Register the model with wandb using log="all".
wandb.watch(model, log="all")

# Training and evaluation functions
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"label"`` tensors.
        loss_fn: Callable taking ``(outputs, labels)`` and returning a scalar loss.
        optimizer: Optimizer whose ``step``/``zero_grad`` are called per batch.
        device: Device the batch tensors are moved to.
        scheduler: Optional learning-rate scheduler. When not ``None`` its
            ``step()`` is called once per batch, right after ``optimizer.step()``.
        n_examples: Denominator used to turn the correct count into an accuracy.

    Returns:
        Tuple ``(accuracy, mean_loss)``: ``accuracy`` is a float64 tensor equal
        to ``correct / n_examples``; ``mean_loss`` is the mean of the per-batch
        losses, or NaN when ``data_loader`` yields no batches.
    """
    model = model.train()
    losses = []
    # Start from a tensor (not the int 0) so `.double()` below is valid even
    # when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["label"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        preds = outputs.argmax(dim=1)
        loss = loss_fn(outputs, labels)

        correct_predictions += (preds == labels).sum()
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        # Previously the scheduler argument was accepted but silently ignored.
        if scheduler is not None:
            scheduler.step()
        optimizer.zero_grad()

    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / n_examples, mean_loss

def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` over ``data_loader`` with gradient tracking disabled.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"label"`` tensors.
        loss_fn: Callable taking ``(outputs, labels)`` and returning a scalar loss.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used to turn the correct count into an accuracy.

    Returns:
        Tuple ``(accuracy, mean_loss)``: ``accuracy`` is a float64 tensor equal
        to ``correct / n_examples``; ``mean_loss`` is the mean of the per-batch
        losses, or NaN when ``data_loader`` yields no batches.

    Note:
        The model is switched to eval mode and left in that mode on return.
    """
    model = model.eval()
    losses = []
    # Start from a tensor (not the int 0) so `.double()` below is valid even
    # when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["label"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            preds = outputs.argmax(dim=1)
            loss = loss_fn(outputs, labels)

            correct_predictions += (preds == labels).sum()
            losses.append(loss.item())

    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / n_examples, mean_loss

# Train for a fixed number of epochs, validating after each one and keeping
# a checkpoint of the weights with the highest validation accuracy so far.
num_epochs = 5
best_accuracy = 0

for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}')
    print('-' * 10)

    # One optimisation pass over the training split (no LR scheduler is used).
    train_acc, train_loss = train_epoch(
        model=model,
        data_loader=train_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device=device,
        scheduler=None,
        n_examples=len(X_train),
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')

    # Score the held-out split with the weights just updated.
    val_acc, val_loss = eval_model(
        model=model,
        data_loader=val_loader,
        loss_fn=loss_fn,
        device=device,
        n_examples=len(X_val),
    )
    print(f'Val   loss {val_loss} accuracy {val_acc}')

    epoch_metrics = {
        "train_loss": train_loss,
        "train_acc": train_acc,
        "val_loss": val_loss,
        "val_acc": val_acc,
    }
    wandb.log(epoch_metrics)

    if val_acc > best_accuracy:
        # The fitted label encoder is stored next to the weights so the
        # checkpoint carries the mapping back to the original class ids.
        checkpoint = {
            'model_state_dict': model.state_dict(),
            'label_encoder': label_encoder,
        }
        torch.save(checkpoint, 'trademark_classifier.pth')
        best_accuracy = val_acc

wandb.finish()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mrahulradhesh[0m ([33mrahulradhesh-vellore-institute-of-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch 1/5
----------
Train loss 1.2826075333568872 accuracy 0.6807053889492434
Val   loss 0.5547815478122484 accuracy 0.853071948261924
Epoch 2/5
----------
Train loss 0.40770890011106564 accuracy 0.891766251484298
Val   loss 0.43709824150292764 accuracy 0.8822756669361358
Epoch 3/5
----------
Train loss 0.22314368232914525 accuracy 0.939011141709406
Val   loss 0.4215591257251498 accuracy 0.8882376717865804
Epoch 4/5
----------
Train loss 0.13537663136885209 accuracy 0.9621030292311968
Val   loss 0.40234047631313347 accuracy 0.8999595796281326
Epoch 5/5
----------
Train loss 0.0947614638765381 accuracy 0.9735984436977337
Val   loss 0.42021978340752614 accuracy 0.9019805982215036


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_acc,▁▆▇██
train_loss,█▃▂▁▁
val_acc,▁▅▆██
val_loss,█▃▂▁▂

0,1
train_acc,0.9736
train_loss,0.09476
val_acc,0.90198
val_loss,0.42022


In [20]:
def predict(text, model, tokenizer, max_len, encoder=None):
    """Predict the original class label for a single piece of text.

    Args:
        text: Text to classify.
        model: Module called as ``model(input_ids, attention_mask)``. It is
            switched to eval mode and left in that mode.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length passed to the tokenizer as ``max_length``.
        encoder: Optional object with ``inverse_transform`` used to map the
            predicted index back to a label. Defaults to the notebook-level
            ``label_encoder``.

    Returns:
        The first element of ``encoder.inverse_transform`` applied to the
        predicted class index.
    """
    if encoder is None:
        encoder = label_encoder

    # Send inputs to wherever the model's weights live instead of relying on a
    # notebook-level `device` variable that may be missing or out of date.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoding['input_ids'].to(target_device)
    attention_mask = encoding['attention_mask'].to(target_device)

    model.eval()
    with torch.no_grad():
        output = model(input_ids, attention_mask)
    prediction = output.argmax(dim=1)

    return encoder.inverse_transform(prediction.cpu().numpy())[0]

# Try the trained model on a few hand-written sample inputs.
sample_text1 = "Laptop carrying cases"
# Pass sample_text1 here: the name `sample_text` is never defined in this
# notebook, so the previous call only worked with leftover kernel state.
predicted_class1 = predict(sample_text1, model, tokenizer, max_len)
print(f"Predicted class for '{sample_text1}': {predicted_class1}")

sample_text2 = "Chennai Super Kings"
predicted_class2 = predict(sample_text2, model, tokenizer, max_len)
print(f"Predicted class for '{sample_text2}': {predicted_class2}")

sample_text3 = "Ankle Fracture"
predicted_class3 = predict(sample_text3, model, tokenizer, max_len)
print(f"Predicted class for '{sample_text3}': {predicted_class3}")

sample_text4 = "Sports Wear"
predicted_class4 = predict(sample_text4, model, tokenizer, max_len)
print(f"Predicted class for '{sample_text4}': {predicted_class4}")

sample_text5 = "Shortage of Money"
predicted_class5 = predict(sample_text5, model, tokenizer, max_len)
print(f"Predicted class for '{sample_text5}': {predicted_class5}")

Predicted class for 'Laptop carrying cases': 009
Predicted class for 'Chennai Super Kings': 028
Predicted class for 'Ankle Fracture': 010
Predicted class for 'Sports Wear': 025
Predicted class for 'Shortage of Money': 036
