In [10]:
import os
import shutil
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
extract_to = "/content/data"
os.makedirs(extract_to, exist_ok=True)

In [5]:
zip_path_bert  = "/content/drive/MyDrive/fast_russian_embeddings.json"
shutil.copy(zip_path_bert,  os.path.join(extract_to, 'fast_russian_embeddings.json'))

'/content/data/fast_russian_embeddings.json'

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
def load_embeddings(json_path):
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    df = pd.DataFrame(data)
    return df

# Укажите путь к вашему файлу с эмбеддингами
df = load_embeddings('/content/data/fast_russian_embeddings.json')

In [14]:
# Кодируем текстовые метки в целочисленные
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

# Формируем массивы признаков и целевых меток
X = np.vstack(df['embedding'].values).astype(np.float32)
y = df['label_encoded'].values

# 3. Разбиение на train / val / test (70% / 15% / 15%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

# 4. Создание DataLoader-ов
batch_size = 32

train_ds = TensorDataset(
    torch.from_numpy(X_train),
    torch.from_numpy(y_train)
)
val_ds = TensorDataset(
    torch.from_numpy(X_val),
    torch.from_numpy(y_val)
)
test_ds = TensorDataset(
    torch.from_numpy(X_test),
    torch.from_numpy(y_test)
)

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=batch_size)
test_loader  = DataLoader(test_ds,  batch_size=batch_size)

In [16]:
# 5. Определение сбалансированной нейросети
class BalancedNN(nn.Module):
    def __init__(self, input_dim: int, num_classes: int):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, num_classes)
        )

    def forward(self, x):
        return self.model(x)

input_dim   = X_train.shape[1]
num_classes = len(le.classes_)
model = BalancedNN(input_dim, num_classes).to(device)

# 6. Настройка критерия с весами для учёта дисбаланса
class_counts = np.bincount(y_train)
# Инверсия частот: чем реже класс, тем выше вес
class_weights = 1.0 / class_counts
class_weights = torch.tensor(class_weights, dtype=torch.float32, device=device)

criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=1e-5)

# 7. Цикл обучения с сохранением лучших весов
num_epochs     = 20
best_val_acc   = 0.0
save_path      = 'balanced_nn_weights.pth'

for epoch in range(1, num_epochs + 1):
    # --- Training ---
    model.train()
    train_loss = 0.0
    correct = total = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * X_batch.size(0)
        preds = outputs.argmax(dim=1)
        correct += (preds == y_batch).sum().item()
        total += y_batch.size(0)

    train_loss /= total
    train_acc   = correct / total

    # --- Validation ---
    model.eval()
    val_loss = 0.0
    correct = total = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)

            val_loss += loss.item() * X_batch.size(0)
            preds = outputs.argmax(dim=1)
            correct += (preds == y_batch).sum().item()
            total += y_batch.size(0)

    val_loss /= total
    val_acc   = correct / total

    print(f"Epoch {epoch}/{num_epochs} | "
          f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | "
          f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    # Сохраняем модель, если точность на валидации улучшилась
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), save_path)

print(f"\nTraining complete. Best validation accuracy: {best_val_acc:.4f}")
print(f"Best model weights saved to '{save_path}'")

Epoch 1/20 | Train Loss: 1.0986, Train Acc: 0.1525 | Val Loss: 1.0943, Val Acc: 0.1624
Epoch 2/20 | Train Loss: 1.0890, Train Acc: 0.4004 | Val Loss: 1.0810, Val Acc: 0.5431
Epoch 3/20 | Train Loss: 1.0706, Train Acc: 0.5659 | Val Loss: 1.0547, Val Acc: 0.6071
Epoch 4/20 | Train Loss: 1.0377, Train Acc: 0.6035 | Val Loss: 1.0150, Val Acc: 0.6080
Epoch 5/20 | Train Loss: 0.9986, Train Acc: 0.6142 | Val Loss: 0.9757, Val Acc: 0.6257
Epoch 6/20 | Train Loss: 0.9651, Train Acc: 0.6198 | Val Loss: 0.9444, Val Acc: 0.6245
Epoch 7/20 | Train Loss: 0.9395, Train Acc: 0.6234 | Val Loss: 0.9229, Val Acc: 0.6346
Epoch 8/20 | Train Loss: 0.9228, Train Acc: 0.6251 | Val Loss: 0.9082, Val Acc: 0.6295
Epoch 9/20 | Train Loss: 0.9110, Train Acc: 0.6233 | Val Loss: 0.8981, Val Acc: 0.6305
Epoch 10/20 | Train Loss: 0.9027, Train Acc: 0.6268 | Val Loss: 0.8910, Val Acc: 0.6318
Epoch 11/20 | Train Loss: 0.8968, Train Acc: 0.6257 | Val Loss: 0.8857, Val Acc: 0.6318
Epoch 12/20 | Train Loss: 0.8913, Train A

In [20]:
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

from sentence_transformers import SentenceTransformer

# 0. Параметры
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
EMBEDDING_MODEL_DIR  = 'embedding_model'       # если хотим сохранить локально
WEIGHTS_PATH         = 'balanced_nn_weights_bert.pth'
LABELS_PATH          = 'label_classes.npy'
BATCH_SIZE           = 32
NUM_EPOCHS           = 20
LR                   = 1e-5

# 1. Загрузка данных (как в вашем тренировочном скрипте)
def load_embeddings(json_path):
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    df = pd.DataFrame(data)
    return df

df = load_embeddings('/content/data/fast_russian_embeddings.json')
le = LabelEncoder().fit(df['label'])
df['label_encoded'] = le.transform(df['label'])

X = np.vstack(df['embedding'].values).astype(np.float32)
y = df['label_encoded'].values

# 2. Разбиение
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

# 3. DataLoader с WeightedRandomSampler (чтобы батчи были сбалансированы)
class_counts   = np.bincount(y_train)
class_weights  = 1.0 / class_counts
sample_weights = class_weights[y_train]
sampler = torch.utils.data.WeightedRandomSampler(
    weights=torch.tensor(sample_weights, dtype=torch.double),
    num_samples=len(sample_weights),
    replacement=True
)

train_ds = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
val_ds   = TensorDataset(torch.from_numpy(X_val),   torch.from_numpy(y_val))

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, sampler=sampler)
val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False)

# 4. Определение модели
class BalancedNN(nn.Module):
    def __init__(self, input_dim: int, num_classes: int):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, num_classes)
        )
    def forward(self, x):
        return self.model(x)

device     = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_dim  = X_train.shape[1]
n_classes  = len(le.classes_)
model      = BalancedNN(input_dim, n_classes).to(device)

# 5. Критерий с весами классов
weight_tensor = torch.tensor(class_weights, dtype=torch.float32, device=device)
criterion     = nn.CrossEntropyLoss(weight=weight_tensor)
optimizer     = optim.Adam(model.parameters(), lr=LR)

# 6. Тренировочный цикл
best_val_acc = 0.0

for epoch in range(1, NUM_EPOCHS+1):
    model.train()
    total_loss = correct = total = 0
    for Xb, yb in train_loader:
        Xb, yb = Xb.to(device), yb.to(device)
        optimizer.zero_grad()
        logits = model(Xb)
        loss   = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * Xb.size(0)
        preds = logits.argmax(dim=1)
        correct += (preds==yb).sum().item()
        total += yb.size(0)
    train_acc = correct/total

    model.eval()
    correct=total=0
    with torch.no_grad():
        for Xb, yb in val_loader:
            Xb, yb = Xb.to(device), yb.to(device)
            preds = model(Xb).argmax(dim=1)
            correct += (preds==yb).sum().item()
            total += yb.size(0)
    val_acc = correct/total

    print(f"Epoch {epoch}/{NUM_EPOCHS} — Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}")

    # сохраняем лучшую модель
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), WEIGHTS_PATH)

print(f"Training finished. Best Val Acc = {best_val_acc:.4f}")
print(f"Model weights saved to {WEIGHTS_PATH}")

# 7. Сохраняем список классов из LabelEncoder
np.save(LABELS_PATH, le.classes_)
print(f"Label classes saved to {LABELS_PATH}: {le.classes_}")

# 8. (опционально) Кешируем/сохраняем embedding-модель локально
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)
embedder.save(EMBEDDING_MODEL_DIR)
print(f"Embedding model saved to ./{EMBEDDING_MODEL_DIR}/")

Epoch 1/20 — Train Acc: 0.3380, Val Acc: 0.1506
Epoch 2/20 — Train Acc: 0.3332, Val Acc: 0.1506
Epoch 3/20 — Train Acc: 0.3351, Val Acc: 0.1506
Epoch 4/20 — Train Acc: 0.3345, Val Acc: 0.1513
Epoch 5/20 — Train Acc: 0.3455, Val Acc: 0.1601
Epoch 6/20 — Train Acc: 0.3617, Val Acc: 0.1773
Epoch 7/20 — Train Acc: 0.3837, Val Acc: 0.1897
Epoch 8/20 — Train Acc: 0.4034, Val Acc: 0.2018
Epoch 9/20 — Train Acc: 0.4166, Val Acc: 0.2088
Epoch 10/20 — Train Acc: 0.4283, Val Acc: 0.2164
Epoch 11/20 — Train Acc: 0.4337, Val Acc: 0.2182
Epoch 12/20 — Train Acc: 0.4348, Val Acc: 0.2222
Epoch 13/20 — Train Acc: 0.4380, Val Acc: 0.2245
Epoch 14/20 — Train Acc: 0.4420, Val Acc: 0.2260
Epoch 15/20 — Train Acc: 0.4481, Val Acc: 0.2283
Epoch 16/20 — Train Acc: 0.4533, Val Acc: 0.2304
Epoch 17/20 — Train Acc: 0.4539, Val Acc: 0.2406
Epoch 18/20 — Train Acc: 0.4603, Val Acc: 0.2546
Epoch 19/20 — Train Acc: 0.4710, Val Acc: 0.2658
Epoch 20/20 — Train Acc: 0.4782, Val Acc: 0.2802
Training finished. Best Val A

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding model saved to ./embedding_model/
