<a href="https://colab.research.google.com/github/sofinvalery/ml-notebooks/blob/main/inline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import numpy as np

In [None]:
# Load the raw listings; the file is semicolon-separated.
df = pd.read_csv('meatinfo.csv', sep=';')

# Count rows per product type and keep only the types with at least 500 rows.
product_counts = df['mtype'].value_counts()
filtered_products = product_counts.loc[product_counts.ge(500)].index.tolist()

is_frequent_type = df['mtype'].isin(filtered_products)
df_filtered = df.loc[is_frequent_type]

# 80/20 split, stratified on 'mtype'; random_state pins the shuffle.
train_df, test_df = train_test_split(
    df_filtered,
    test_size=0.2,
    stratify=df_filtered['mtype'],
    random_state=42,
)

In [None]:
# Run on the GPU when PyTorch reports one as available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, on access.

    Args:
        texts: indexable sequence of raw texts.
        labels: indexable sequence of integer class ids, aligned with ``texts``.
        tokenizer: callable invoked as ``tokenizer(text, add_special_tokens=...,
            max_length=..., padding=..., truncation=..., return_tensors='pt')``
            whose result exposes ``'input_ids'`` and ``'attention_mask'``.
        max_len: length every encoded sequence is padded or truncated to.

    Raises:
        ValueError: if ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Fail early: a mismatch would otherwise show up as an IndexError in
        # the middle of an epoch, or as extra labels that are silently ignored.
        if len(texts) != len(labels):
            raise ValueError(
                f'texts and labels must have the same length, '
                f'got {len(texts)} and {len(labels)}'
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return it with its label.

        Returns:
            dict with the raw ``'text'``, the flattened ``'input_ids'`` and
            ``'attention_mask'`` tensors, and ``'label'`` as a ``torch.long``
            tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'text': text,
            # return_tensors='pt' yields a leading batch axis for the single
            # text (presumably shape (1, max_len)); flatten() removes it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Fit the class-name <-> integer-id mapping on the training split only, then
# reuse the fitted encoder for the test split.
le = LabelEncoder()
train_labels = le.fit_transform(train_df['mtype'])
test_labels = le.transform(test_df['mtype'])

# NOTE(review): 'bert-base-uncased' is an English, lower-casing checkpoint,
# while the sample listings used for inference in this notebook are Russian.
# Presumably a multilingual or Russian checkpoint would tokenize them better;
# confirm. MeatClassifier loads the same checkpoint name, so the tokenizer and
# the model have to be changed together.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 64  # every text is padded or truncated to this many tokens

train_dataset = TextDataset(train_df['text'].tolist(), train_labels, tokenizer, max_len)
test_dataset = TextDataset(test_df['text'].tolist(), test_labels, tokenizer, max_len)

# Only the training batches are shuffled; the test order stays fixed.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=16)

class MeatClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification layer.

    Args:
        n_classes: number of output classes (size of the last logits axis).
        pretrained_name: checkpoint identifier passed to
            ``BertModel.from_pretrained``. Defaults to ``'bert-base-uncased'``.
            Token ids fed to ``forward`` are expected to come from the
            tokenizer of the same checkpoint.
        dropout: dropout probability applied to the pooled encoder output
            before the linear layer. Defaults to ``0.3``.
    """

    def __init__(self, n_classes, pretrained_name='bert-base-uncased', dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_name)
        self.drop = nn.Dropout(p=dropout)
        # The linear layer maps the encoder's hidden size to one logit per class.
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class scores (logits) for a batch.

        Args:
            input_ids: token ids; assumed to be a (batch, seq_len) integer
                tensor as produced by the notebook's DataLoader.
            attention_mask: mask tensor passed to the encoder alongside
                ``input_ids``.

        Returns:
            Output of the linear layer applied to the dropped-out
            ``'pooler_output'`` of the encoder.
        """
        encoder_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled_output = encoder_output['pooler_output']
        return self.out(self.drop(pooled_output))

# One output logit per class known to the fitted label encoder.
n_classes = len(le.classes_)

# Build the classifier and move its parameters to the selected device.
model = MeatClassifier(n_classes).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Loss applied to the classifier's raw logits and the integer class ids.
criterion = nn.CrossEntropyLoss()

# Adam over every parameter of the model (encoder included), lr = 1e-5.
optimizer = optim.Adam(params=model.parameters(), lr=1e-5)

def train_epoch(model, data_loader, criterion, optimizer, device):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` that
            returns per-class logits.
        data_loader: iterable of batches, each a dict with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors; must expose a sized
            ``.dataset`` attribute.
        criterion: loss callable invoked as ``criterion(logits, labels)``.
        optimizer: optimizer that updates the model's parameters.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        ``(accuracy, mean_loss)``. ``accuracy`` is a 0-dim float64 tensor:
        correct predictions divided by ``len(data_loader.dataset)``.
        ``mean_loss`` is the plain mean of the per-batch loss values (not
        weighted by batch size).

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    model = model.train()
    losses = []
    correct_predictions = 0

    # Gradients left on the parameters by earlier code would otherwise be
    # accumulated into the first optimizer step of this epoch.
    optimizer.zero_grad()

    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        labels = d['label'].to(device)

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        # Predicted class = index of the largest logit in each row.
        correct_predictions += torch.sum(torch.argmax(outputs, dim=1) == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    if not losses:
        # Without any batch, correct_predictions is still the int 0 and has no
        # .double(); report the real problem instead of an AttributeError.
        raise ValueError('data_loader yielded no batches; nothing to train on')

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

def eval_model(model, data_loader, criterion, device):
    """Evaluate ``model`` on every batch of ``data_loader`` without gradients.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` that
            returns per-class logits.
        data_loader: iterable of batches, each a dict with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors; must expose a sized
            ``.dataset`` attribute.
        criterion: loss callable invoked as ``criterion(logits, labels)``.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        ``(accuracy, mean_loss)``. ``accuracy`` is a 0-dim float64 tensor:
        correct predictions divided by ``len(data_loader.dataset)``.
        ``mean_loss`` is the plain mean of the per-batch loss values (not
        weighted by batch size).

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    # Evaluation mode; the model is left in this mode when the function returns.
    model = model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            labels = d['label'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            # Predicted class = index of the largest logit in each row.
            correct_predictions += torch.sum(torch.argmax(outputs, dim=1) == labels)
            losses.append(loss.item())

    if not losses:
        # Without any batch, correct_predictions is still the int 0 and has no
        # .double(); report the real problem instead of an AttributeError.
        raise ValueError('data_loader yielded no batches; nothing to evaluate')

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

# Number of full passes over the training data.
num_epochs = 5

for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}')

    # One optimization pass over the training split.
    train_acc, train_loss = train_epoch(
        model=model,
        data_loader=train_loader,
        criterion=criterion,
        optimizer=optimizer,
        device=device,
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')

    # Score the held-out split after every epoch (reported as "Val").
    val_acc, val_loss = eval_model(
        model=model,
        data_loader=test_loader,
        criterion=criterion,
        device=device,
    )
    print(f'Val loss {val_loss} accuracy {val_acc}')


Epoch 1/5
Train loss 0.3571030331170526 accuracy 0.8864638783269962
Val loss 0.14389702589341521 accuracy 0.9452554744525548
Epoch 2/5
Train loss 0.11972414164967074 accuracy 0.9565019011406845
Val loss 0.1255292418600485 accuracy 0.9464720194647203
Epoch 3/5
Train loss 0.10118596702972722 accuracy 0.9606083650190114
Val loss 0.11465022342846937 accuracy 0.9534671532846716
Epoch 4/5
Train loss 0.0938478569832116 accuracy 0.96212927756654
Val loss 0.13675276894185656 accuracy 0.9504257907542579
Epoch 5/5
Train loss 0.08852013012146749 accuracy 0.9637262357414449
Val loss 0.11177253656064441 accuracy 0.9540754257907543


In [None]:
# Hand-written sample listings used as a quick sanity check of the trained model.
test_texts = [
    "Говядина блочная 2 сорт в наличии ООО \"АгроСоюз\" реализует блочную говядину 2 сорт (80/20); Свободный объем 8 тонн Самовывоз или доставка. Все подробности по телефону.;",
    "Куриная разделка Продам кур и куриную разделку гост и халяль по хорошей цене .Тел:;",
    "Говяжью мукозу Продам говяжью мукозу в охл и замороженном виде. Есть объем."
]

# Use dedicated names so the held-out `test_dataset` / `test_loader` built
# earlier are not overwritten; otherwise re-running the training cell would
# evaluate on these few dummy-labelled samples.
demo_dataset = TextDataset(
    texts=test_texts,
    # Placeholder labels: TextDataset needs one per text, but they are never
    # read in the prediction loop below.
    labels=[0]*len(test_texts),
    tokenizer=tokenizer,
    max_len=max_len
)

demo_loader = DataLoader(demo_dataset, batch_size=1, shuffle=False)

model = model.eval()
predictions = []

with torch.no_grad():
    for d in demo_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask)
        # Same prediction rule as in train_epoch / eval_model: index of the
        # largest logit in each row.
        preds = torch.argmax(outputs, dim=1)
        predictions.extend(preds.cpu().numpy())

# Map the predicted integer ids back to the original class names.
pred_labels = le.inverse_transform(predictions)
for text, label in zip(test_texts, pred_labels):
    print(f'Text: {text}\nPredicted label: {label}\n')


Text: Говядина блочная 2 сорт в наличии ООО "АгроСоюз" реализует блочную говядину 2 сорт (80/20); Свободный объем 8 тонн Самовывоз или доставка. Все подробности по телефону.;
Predicted label: Говядина

Text: Куриная разделка Продам кур и куриную разделку гост и халяль по хорошей цене .Тел:;
Predicted label: Кура

Text: Говяжью мукозу Продам говяжью мукозу в охл и замороженном виде. Есть объем.
Predicted label: Говядина

