In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [13]:
# Install necessary libraries
!pip install torch transformers pandas sklearn flask docker

Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[15 lines of output][0m
  [31m   [0m The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
  [31m   [0m rather than 'sklearn' for pip commands.
  [31m   [0m 
  [31m   [0m Here is how to fix this error in the main use cases:
  [31m   [0m - use 'pip install scikit-learn' rather than 'pip install sklearn'
  [31m   [0m - replace 'sklearn' by 'scikit-learn' in your pip requirements files
  [31m   [0m   (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
  [31m   [0m - if the 'sklearn' package is used by one of your dependencies,
  [31m   [0m   it would be great if you take some time to track which package uses
  [31m   [0m   'sklearn' instead of 'scikit-lea

### Step 1: Data Loading and Preprocessing

1. **Load the Data**:
   - Load the dataset from a JSON file into a Pandas DataFrame.

2. **Filter Active Entries**:
   - Keep only active entries (`status == 'A'`).

3. **Encode Labels**:
   - Use `LabelEncoder` to convert class IDs into numeric labels.

4. **Tokenize Descriptions**:
   - Tokenize the `description` column using BERT's tokenizer, limiting to 128 tokens.

5. **Split the Dataset**:
   - Split the data into training and validation sets (80/20 split).


In [14]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer


with open('/kaggle/input/idmanual/idmanual.json', 'r') as file:
    data = json.load(file)


df = pd.DataFrame(data)


df = df[df['status'] == 'A']


label_encoder = LabelEncoder()
df['class_id_encoded'] = label_encoder.fit_transform(df['class_id'])


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
df['input_ids'] = df['description'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=128, truncation=True))


train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

### Step 1: Model Definition and Initialization

1. **Importing Libraries**:
   - We import PyTorch and the pre-trained BERT model from the `transformers` library.

2. **Defining the Model**:
   - We create a `TrademarkClassifier` class using BERT as the base model.
   - The model includes a dropout layer to prevent overfitting and a linear layer to output predictions for the number of trademark classes.

3. **Setting Up the Model**:
   - We determine the number of unique class labels (`num_labels`) and use this to define the output size of the model.
   - We initialize the model with the correct number of labels.

4. **Moving the Model to GPU**:
   - The model is moved to GPU (if available) for faster training and inference.



In [15]:
import torch
import torch.nn as nn
from transformers import BertModel

class TrademarkClassifier(nn.Module):
    def __init__(self, num_labels):
        super(TrademarkClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        self.linear = nn.Linear(768, num_labels)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        dropout_output = self.dropout(pooled_output)
        logits = self.linear(dropout_output)
        return logits

num_labels = len(df['class_id_encoded'].unique())
model = TrademarkClassifier(num_labels)

# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

TrademarkClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, eleme

### Creating DataLoaders

1. **Import Required Modules**:
   - We import `DataLoader` and `TensorDataset` from PyTorch, and `pad_sequence` for handling varying input lengths.

2. **Function to Create DataLoader**:
   - **Input IDs**: Convert tokenized input descriptions into tensors.
   - **Padding**: Pad the input sequences so they all have the same length.
   - **Labels**: Convert the encoded class labels into tensors.
   - **TensorDataset**: Combine the input IDs and labels into a dataset.
   - **DataLoader**: Create a DataLoader to efficiently manage batches during training.

3. **Initialize DataLoaders**:
   - We create `train_loader` and `val_loader` using the training and validation data.


In [16]:
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence


def create_data_loader(df, batch_size=16):
    
    input_ids = [torch.tensor(ids) for ids in df['input_ids'].tolist()]
    
    
    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=0)
    
    
    labels = torch.tensor(df['class_id_encoded'].values)
    
   
    dataset = TensorDataset(input_ids_padded, labels)
    
    
    return DataLoader(dataset, batch_size=batch_size)

train_loader = create_data_loader(train_df)
val_loader = create_data_loader(val_df)


### Step: Training and Evaluating the Model

1. **Import Necessary Modules**:
   - Import `Adam` for optimization and `accuracy_score` for evaluating model performance.

2. **Training Function (`train_model`)**:
   - **Optimizer and Loss**: Initialize the Adam optimizer and CrossEntropyLoss.
   - **Training Loop**:
     - Iterate over epochs and batches.
     - Move input data and labels to GPU.
     - Perform a forward pass, compute loss, backpropagate, and update weights.
     - Track and print the average loss per epoch.

3. **Evaluation Function (`evaluate_model`)**:
   - **Evaluation Mode**: Set the model to evaluation mode.
   - **Prediction and Accuracy**:
     - Move data to GPU.
     - Make predictions and compute accuracy using `accuracy_score`.
     - Print the validation accuracy after each epoch.

4. **Running the Training**:
   - Call `train_model` to start training the model for 3 epochs and evaluate it after each epoch.


In [8]:
from torch.optim import Adam
from sklearn.metrics import accuracy_score


def train_model(model, train_loader, val_loader, epochs=3):
    optimizer = Adam(model.parameters(), lr=2e-5)
    criterion = nn.CrossEntropyLoss()

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            input_ids, labels = batch
            
            # Move tensors to GPU
            input_ids = input_ids.to(device)
            labels = labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=(input_ids > 0))
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
        
        print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')
        evaluate_model(model, val_loader)


def evaluate_model(model, val_loader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids, labels = batch
            
            
            input_ids = input_ids.to(device)
            labels = labels.to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=(input_ids > 0))
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    accuracy = accuracy_score(all_labels, all_preds)
    print(f'Validation Accuracy: {accuracy}')

# Train the model
train_model(model, train_loader, val_loader, epochs=3)


Epoch 1, Loss: 1.2562365964741695
Validation Accuracy: 0.8498383185125303
Epoch 2, Loss: 0.339635901520497
Validation Accuracy: 0.8769199676637025
Epoch 3, Loss: 0.1392535182100434
Validation Accuracy: 0.8966248989490704


In [17]:
def predict_class(description, model, tokenizer, label_encoder, max_length=128, device='cpu'):
    
    
    input_ids = torch.tensor([tokenizer.encode(description, add_special_tokens=True, max_length=max_length, truncation=True)])
    
    
    input_ids = input_ids.to(device)
    
    
    model.eval()
    
    with torch.no_grad():
        
        outputs = model(input_ids=input_ids, attention_mask=(input_ids > 0))
        prediction = torch.argmax(outputs, dim=1).item()
    
    
    class_id = label_encoder.inverse_transform([prediction])[0]
    return class_id


In [18]:

test_description = "DVD recorders"

# Predict the class ID
predicted_class = predict_class(test_description, model, tokenizer, label_encoder, device=device)

print(f"Predicted Class ID: {predicted_class}")

Predicted Class ID: 040


In [19]:
# List of new descriptions
descriptions = [
    "Scientific apparatus for measuring DNA concentration",
    "Wireless adapters for computers",
    "Notebook computer carrying cases",
    "DVD recorders"
]

# Predict and print class IDs for each description
for desc in descriptions:
    predicted_class = predict_class(desc, model, tokenizer, label_encoder, device=device)
    print(f"Description: {desc}")
    print(f"Predicted Class ID: {predicted_class}\n")


Description: Scientific apparatus for measuring DNA concentration
Predicted Class ID: 010

Description: Wireless adapters for computers
Predicted Class ID: 010

Description: Notebook computer carrying cases
Predicted Class ID: 010

Description: DVD recorders
Predicted Class ID: 040



In [20]:

different_description = "Cosmetic applicators for applying makeup"

# Predict the class ID
predicted_class = predict_class(different_description, model, tokenizer, label_encoder, device=device)

print(f"Predicted Class ID: {predicted_class}")


Predicted Class ID: 010


In [21]:

different_description = "Cheese Burgers"

# Predict the class ID
predicted_class = predict_class(different_description, model, tokenizer, label_encoder, device=device)

print(f"Predicted Class ID: {predicted_class}")


Predicted Class ID: 010


In [22]:
import torch
import pickle
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertModel


label_encoder = LabelEncoder()
df['class_id_encoded'] = label_encoder.fit_transform(df['class_id'])


num_labels = len(label_encoder.classes_)


class TrademarkClassifier(nn.Module):
    def __init__(self, num_labels):
        super(TrademarkClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        self.linear = nn.Linear(768, num_labels)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        dropout_output = self.dropout(pooled_output)
        logits = self.linear(dropout_output)
        return logits


model = TrademarkClassifier(num_labels)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


model_save_path = "trademark_classifier_model.pt"
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")


with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)
print("Label Encoder saved to label_encoder.pkl")


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
print("Tokenizer saved to tokenizer.pkl")



Model saved to trademark_classifier_model.pt
Label Encoder saved to label_encoder.pkl
Tokenizer saved to tokenizer.pkl


In [23]:
import torch
import pickle
from flask import Flask, request, jsonify


class TrademarkClassifier(nn.Module):
    def __init__(self, num_labels):
        super(TrademarkClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        self.linear = nn.Linear(768, num_labels)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        dropout_output = self.dropout(pooled_output)
        logits = self.linear(dropout_output)
        return logits


with open("label_encoder.pkl", "rb") as f:
    label_encoder = pickle.load(f)


with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)


num_labels = len(label_encoder.classes_)


model = TrademarkClassifier(num_labels)
model.load_state_dict(torch.load("trademark_classifier_model.pt"))


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


TrademarkClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, eleme