In [1]:
import numpy as np
def binary_cross_entropy_loss(y_true, y_pred):
    """
    Кросс-энтропийный лосс для бинарной классификации с softmax.
    
    Args:
        y_true: Истинные метки (0 или 1), форма (batch_size, ).
        y_pred: Предсказанные вероятности, форма (batch_size, 2).
        
    Returns:
        Средний лосс по батчу.
    """
    # Выбираем вероятность положительного класса (p_1)
    p1 = y_pred 
    loss = -np.mean(y_true * np.log(p1 + 1e-8) + (1 - y_true) * np.log(1 - p1 + 1e-8))
    return loss

from transformers import BertTokenizer
from transformers import BertModel
import numpy as np
model = BertModel.from_pretrained('DeepPavlov/rubert-base-cased')
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
import pandas as pd
import tiktoken
from sklearn.model_selection import train_test_split

encoder = tiktoken.get_encoding("o200k_base")
data = pd.read_csv('/Users/a.konstantinov/Documents/less/Neura/lab5/russian_comments_from_2ch_pikabu.csv')

data['tok_len'] = [len(tokenizer(i)['input_ids']) for i in data.comment]

data = data[data.tok_len < 450]

In [3]:
mas = np.zeros((len(data),2))
for i, v in enumerate(data.toxic):
    mas[i,int(v)] = 1



In [4]:

X_train,X_test, y_train, y_test = train_test_split(data['comment'],mas,  train_size = 0.8)

In [5]:
X_test[:2]

13629             какая задняя мысль в покупке шлюхи????\n
10479    Да и я тоже про данный препарат хотела написат...
Name: comment, dtype: object

In [6]:
tokenizer.encode(list(X_test[:2]))

[101, 100, 100, 102]

In [7]:
data

Unnamed: 0,comment,toxic,translated,tok_len
0,"Верблюдов-то за что? Дебилы, бл...\n",1.0,"Camels, for what? Morons, bl ...",17
1,"Хохлы, это отдушина затюканого россиянина, мол...",1.0,"Ukrainians, this is an outlet vent zatyukanog...",37
2,Собаке - собачья смерть\n,1.0,Dog - Dog Death,8
3,"Страницу обнови, дебил. Это тоже не оскорблени...",1.0,"Refresh the page, moron. This is also not an ...",48
4,"тебя не убедил 6-страничный пдф в том, что Скр...",1.0,did not convince you of the 6-page pdf that S...,34
...,...,...,...,...
14407,Вонючий совковый скот прибежал и ноет. А вот и...,1.0,Smelly soviet cattle came running and aching....,27
14408,А кого любить? Гоблина тупорылого что-ли? Или ...,1.0,And whom to love? Goblin blunt or what? Or so...,35
14409,"Посмотрел Утомленных солнцем 2. И оказалось, ч...",0.0,I watched Burnt by the Sun 2. And it turned o...,87
14410,КРЫМОТРЕД НАРУШАЕТ ПРАВИЛА РАЗДЕЛА Т.К В НЕМ Н...,1.0,CRIMEA THREAD VIOLATES THE RULES OF SECTION. ...,134


In [8]:
inputs = tokenizer(list(X_train)[:10],  padding=True, truncation=True, max_length=1819, return_tensors="pt")
outputs = model(**inputs)

In [9]:
outputs.last_hidden_state.shape

torch.Size([10, 57, 768])

In [10]:
from transformer import Block_encoder,Classification
block_size = 452
n_emb = 768
heads = 2
batch_size = 32

for _ in range(3):
    loss = 0
    # Перемешивание данных
    indices = np.arange(len(data))
    np.random.shuffle(indices)
    for batch_start in range(0, len(data), batch_size):
        batch_end = batch_start + batch_size
        X = list(X_train)[batch_start:batch_end]
        y = np.array(y_train)[batch_start:batch_end]
        encoder = Block_encoder(n_emb=n_emb, n_head=heads)
        cl = Classification(n_emb, 2)
        inputs = tokenizer(X,  padding=True, truncation=True, max_length=block_size, return_tensors="pt")
        print('t')
        outputs = model(**inputs)
        size = outputs.last_hidden_state
        text = size.detach().numpy()
        itog = encoder.forward(text)
        itog = cl.forward(itog)
        loss += binary_cross_entropy_loss(itog, y)
        ds = cl.backward(itog - y)
        encoder.backward(ds)
        print(loss)



In [59]:
train(X_train[:10], y_train[:10], model,tokenizer)

9.422330951444264


In [48]:
np.array(y_train[:10]).shape

(10, 2)

In [41]:
y_train

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [8]:

text = "Hello, how are you?"
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
size = outputs.last_hidden_state






Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


array([[[ 0.01770019, -0.03874434, -0.0306534 , ..., -0.01961146,
          0.01876126,  0.09435405],
        [ 0.00142463, -0.00667082,  0.00998321, ...,  0.00620277,
          0.01432333,  0.00816661],
        [-0.02518048, -0.04990216, -0.03309439, ..., -0.02977574,
         -0.02272186, -0.00451382],
        ...,
        [-0.00321451, -0.06673963,  0.0010987 , ..., -0.03235276,
          0.02098892,  0.08524102],
        [ 0.00573047, -0.00529849,  0.00156429, ..., -0.00729951,
          0.00242065,  0.01445389],
        [ 0.02229708,  0.03077694,  0.03430222, ..., -0.05498369,
          0.04315628,  0.01171194]]])

(1, 2)

In [9]:
print(loss)

9.210340366976187


In [17]:
fc = Fullyconnected(768, 2)
sm = Softmax()
itog = fc.forward(np.mean(itog, axis = 1))
sm.forward(itog)

array([[0.3655728, 0.6344272]])

In [20]:
 k = np.ones((1, 9, 768)).transpose(1,2,0)

In [21]:
import numpy as np 
np.dot(k.T,  np.ones((1,9,3072)))

array([[[[9., 9., 9., ..., 9., 9., 9.]],

        [[9., 9., 9., ..., 9., 9., 9.]],

        [[9., 9., 9., ..., 9., 9., 9.]],

        ...,

        [[9., 9., 9., ..., 9., 9., 9.]],

        [[9., 9., 9., ..., 9., 9., 9.]],

        [[9., 9., 9., ..., 9., 9., 9.]]]])

(1, 9, 768)

ValueError: shapes (768,9,1) and (1,9,3072) not aligned: 1 (dim 2) != 9 (dim 1)

In [14]:
X_train,X_test, y_train, y_test = train_test_split(data['comment'],data['toxic'],  train_size = 0.8)

In [6]:
data.toxic.unique()

array([1., 0.])

In [23]:
import torch 
from torch import nn

em = nn.Embedding(3,3)
em(torch.tensor([0,2]))

tensor([[ 0.3470,  0.4323,  0.2609],
        [-1.5009, -0.2066, -1.5735]], grad_fn=<EmbeddingBackward0>)