In [None]:
import os

import pandas as pd
import torch
import torch.nn as nn
from pymongo import MongoClient
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel

In [None]:
# Data Preparation

# Connect to MongoDB and load data.
# A real connection string usually embeds credentials, so it is taken from the
# MONGODB_URI environment variable instead of being committed in the notebook.
# The literal fallback is the original placeholder and is only used when the
# variable is not set.
client = MongoClient(os.environ.get("MONGODB_URI", "use client link"))
db = client.bumblebee

# Each collection is read in full and materialised as a DataFrame.
train_data = pd.DataFrame(list(db.attribute_train_data.find()))
val_data = pd.DataFrame(list(db.attribute_val_data.find()))
train_solution = pd.DataFrame(list(db.attribute_train_solution.find()))
val_solution = pd.DataFrame(list(db.attribute_val_solution.find()))

# Merge data with solutions (inner join on the shared 'indoml_id' key).
train_merged = pd.merge(train_data, train_solution, on='indoml_id')
val_merged = pd.merge(val_data, val_solution, on='indoml_id')

# Drop unnecessary columns: the join key is not needed after merging.
train_merged = train_merged.drop(columns=['indoml_id'])
val_merged = val_merged.drop(columns=['indoml_id'])

# Combine the category levels into one string for tokenization.
# Missing levels are skipped, so a row with only some levels filled in still
# produces a clean space-separated string.
for df in [train_merged, val_merged]:
    df['category'] = df[['L0_category', 'L1_category', 'L2_category', 'L3_category', 'L4_category']].apply(lambda x: ' '.join(x.dropna()), axis=1)

In [None]:
# Tokenizer

# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint. The same
# checkpoint name is passed to BertModel.from_pretrained in TransformerModel.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Custom Dataset
class CustomDataset(Dataset):
    """Dataset that tokenizes a product title and its category string.

    Args:
        dataframe: pandas DataFrame with a 'title' and a 'category' column.
        tokenizer: tokenizer exposing ``encode_plus`` (e.g. a BertTokenizer).
        max_len: every encoded sequence is padded or truncated to this length.

    Each item is a dict with:
        'title_text':     the title as a plain string,
        'input_ids':      1-D tensor of ``max_len`` token ids for the title,
        'attention_mask': 1-D tensor of ``max_len`` mask values for the title,
        'labels':         1-D long tensor of ``max_len`` token ids for the category.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.title = dataframe['title']
        self.category = dataframe['category']
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def _encode(self, text):
        """Tokenize ``text`` to a fixed length of ``self.max_len`` as PyTorch tensors."""
        return self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

    def __getitem__(self, index):
        # Samplers hand out positions in range(len(self)), so the lookup must be
        # positional; label-based `series[index]` breaks as soon as the frame's
        # index is not exactly 0..n-1 (e.g. after filtering rows).
        title = str(self.title.iloc[index])
        category = str(self.category.iloc[index])

        title_encoding = self._encode(title)

        # The category is padded to the same fixed length as the title. Without
        # padding every example has a different label length and the default
        # DataLoader collate function cannot stack them into a batch.
        # NOTE(review): padded positions hold the tokenizer's padding id; if they
        # must be excluded from a loss they need masking — confirm with the
        # training code.
        label_ids = self._encode(category)['input_ids'].flatten().long()

        return {
            'title_text': title,
            'input_ids': title_encoding['input_ids'].flatten(),
            'attention_mask': title_encoding['attention_mask'].flatten(),
            'labels': label_ids
        }

# Hyper-parameters for tokenization and batching
MAX_LEN = 128
BATCH_SIZE = 32

# Wrap the merged frames in datasets that tokenize on access
train_dataset = CustomDataset(dataframe=train_merged, tokenizer=tokenizer, max_len=MAX_LEN)
val_dataset = CustomDataset(dataframe=val_merged, tokenizer=tokenizer, max_len=MAX_LEN)

# Batch iterators: training batches are reshuffled, validation keeps its order
train_dataloader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Transformer Model Definition

class TransformerModel(nn.Module):
    """Pretrained 'bert-base-uncased' encoder with a dropout + linear head.

    Args:
        num_labels: number of output scores produced per input sequence.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(p=0.3)
        self.linear = nn.Linear(
            in_features=self.bert.config.hidden_size,
            out_features=num_labels,
        )

    def forward(self, input_ids, attention_mask):
        """Return the head's logits for the given token ids and attention mask."""
        encoder_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # The second element of the encoder output is what this model treats as
        # the pooled, per-sequence representation.
        pooled = encoder_outputs[1]
        return self.linear(self.dropout(pooled))

# Initialize model
# The head is sized to the tokenizer's vocabulary, so the network produces one
# score per vocabulary entry for every input sequence.
# NOTE(review): CustomDataset builds 'labels' as a sequence of token ids per
# example, while this head yields a single score vector per example — confirm
# the intended target format before training.
model = TransformerModel(num_labels=len(tokenizer.vocab))

In [None]:
# Training Function

def train_model(model, dataloader, optimizer, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)`` that
            returns logits accepted by ``nn.CrossEntropyLoss``.
        dataloader: iterable of dict batches with 'input_ids', 'attention_mask'
            and 'labels' tensors. It does not need to support ``len()``.
        optimizer: optimizer already bound to ``model``'s parameters.
        device: device the model and every batch are moved to.

    Returns:
        float: sum of the per-batch losses divided by the number of batches,
        or 0.0 when the dataloader yields no batches.
    """
    model = model.to(device)
    model.train()

    # The criterion holds no per-batch state, so build it once for the epoch.
    criterion = nn.CrossEntropyLoss()

    total_loss = 0.0
    num_batches = 0
    for batch in dataloader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)

        total_loss += loss.item()
        num_batches += 1

        loss.backward()
        optimizer.step()

    # Counting batches (rather than calling len()) avoids a ZeroDivisionError on
    # an empty loader and works for iterables that have no length.
    if num_batches == 0:
        return 0.0
    return total_loss / num_batches

In [None]:
# Evaluation Function

def evaluate_model(model, dataloader, device):
    """Evaluate ``model`` on ``dataloader`` without updating any weights.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)`` that
            returns logits accepted by ``nn.CrossEntropyLoss``.
        dataloader: iterable of dict batches with 'input_ids', 'attention_mask'
            and 'labels' tensors. It does not need to support ``len()``.
        device: device the model and every batch are moved to.

    Returns:
        tuple(float, float): ``(avg_loss, avg_accuracy)``. ``avg_loss`` is the
        mean of the per-batch losses. ``avg_accuracy`` is the number of correct
        predictions divided by the number of predictions over the whole loader.
        Both are 0.0 when the dataloader yields no batches.
    """
    model = model.to(device)
    model.eval()

    # The criterion holds no per-batch state, so build it once.
    criterion = nn.CrossEntropyLoss()

    total_loss = 0.0
    num_batches = 0
    num_correct = 0
    num_compared = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            total_loss += criterion(outputs, labels).item()
            num_batches += 1

            preds = torch.argmax(outputs, dim=1).flatten()
            matches = preds == labels
            # Accumulate raw counts: averaging per-batch accuracies would give a
            # short final batch the same weight as a full one.
            num_correct += int(matches.sum().item())
            num_compared += matches.numel()

    avg_loss = total_loss / num_batches if num_batches else 0.0
    avg_accuracy = num_correct / num_compared if num_compared else 0.0

    return avg_loss, avg_accuracy

In [None]:
# Training Loop

EPOCHS = 5
LEARNING_RATE = 2e-5

# Train on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

# One training pass followed by one validation pass per epoch.
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')

    train_loss = train_model(
        model=model,
        dataloader=train_dataloader,
        optimizer=optimizer,
        device=device,
    )
    print(f'Train loss: {train_loss}')

    val_loss, val_accuracy = evaluate_model(
        model=model,
        dataloader=val_dataloader,
        device=device,
    )
    print(f'Validation loss: {val_loss}, Validation accuracy: {val_accuracy}')

In [None]:
# Save the Model

MODEL_PATH = 'path/to/saved/model.pth'

# torch.save does not create missing folders, so make sure the checkpoint's
# parent directory exists first. A bare file name has no parent to create.
model_dir = os.path.dirname(MODEL_PATH)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

# Only the weights (state dict) are stored, not the model object itself.
torch.save(model.state_dict(), MODEL_PATH)

In [None]:
# Inference

# Load the trained model for inference.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU machine can still be loaded on a CPU-only one.
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
# The inputs below are moved to `device`, so the model has to live there too.
model.to(device)
model.eval()

# Dummy input, replace with actual data
input_text = "product description"

# Tokenize the input with the same settings the dataset uses for titles
input_encoding = tokenizer.encode_plus(
    input_text,
    add_special_tokens=True,
    max_length=MAX_LEN,
    return_token_type_ids=False,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids = input_encoding['input_ids'].to(device)
attention_mask = input_encoding['attention_mask'].to(device)

# Inference: no gradients are needed, and the prediction is the index of the
# highest-scoring output of the classification head.
with torch.no_grad():
    output = model(input_ids, attention_mask)
    prediction = torch.argmax(output, dim=1).cpu().item()

print("Prediction:", prediction)
