In [3]:
# Print the NVIDIA GPUs visible to this runtime (`-L` lists the devices).
!nvidia-smi -L

In [None]:
from IPython.display import clear_output

In [None]:
!pip install simpletransformers==0.61.14
!pip install gdown
clear_output()

In [None]:
!gdown --fuzzy https://drive.google.com/file/d/1FjOIk4tXZhWOo1He6ZncBjvGBqe1T2eX/view?usp=sharing

In [None]:
!gdown --fuzzy https://drive.google.com/file/d/1WHBkle7Y92RBkwk61wq8rOPBYEJmvrV6/view?usp=sharing -O data_new.zip
!unzip data_new.zip

In [None]:
!gdown --fuzzy https://drive.google.com/file/d/1UMCm8SLKG5t4gTwDfM3XqVxy72HIvA_z/view?usp=sharing
!unzip -u RuBERT-tiny_0.9981.zip

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm
from zipfile import ZipFile
import torch
import shutil
import transformers

from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.model_selection import train_test_split
import logging

from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import f1_score

In [2]:
# Show the installed transformers version for reproducibility.
transformers.__version__

## Тренировка

In [None]:
# Load the Ext2 table, turn literal '\N' placeholders into NaN and drop every
# row that has at least one missing value.
Ext2 = pd.read_csv("db2/Ext2.csv").replace('\\N', np.nan).dropna() # from the plant

In [None]:
# Build one text prompt per unique (fish, volume, unit) combination.
unique_rows = Ext2[["fish", "volume", "unit"]].drop_duplicates()
fish_clean = unique_rows["fish"].str.strip()
volume_as_text = unique_rows["volume"].astype(str)

# Only the prompt and its unit are kept; stripping the fish name can create new
# duplicates, hence the second de-duplication.
df = pd.DataFrame({
    "text": "рыба: " + fish_clean + " [SEP] количество кг тонна: " + volume_as_text,
    "unit": unique_rows["unit"],
}).drop_duplicates()

In [None]:
# (rows, columns) of the de-duplicated text/unit frame.
df.shape

In [None]:
# Display the frame: one `text` prompt per row together with its `unit`.
df

In [None]:
# Unit name -> integer class id expected by the classifier.
to_columns = {"тонна": 0, "кг": 1}

# simpletransformers expects the columns to be called "text" and "labels".
df.columns = ["text", "labels"]
# Direct dict lookup on purpose: an unexpected unit raises KeyError instead of
# silently becoming NaN.
df["labels"] = df["labels"].map(to_columns.__getitem__)

In [None]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(df, test_size=0.1, random_state=42)
train_df_text, val_df_text = split_parts

In [None]:
# Shapes of the training and validation splits.
train_df_text.shape, val_df_text.shape

In [None]:
# Keep INFO-level logging in general but silence the noisier transformers logger.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
BATCH_SIZE = 256  # shared by training and evaluation

# Single-epoch fine-tuning with evaluation during training; the best checkpoint
# is kept (the export cell later reads it from outputs/best_model).
model_args = ClassificationArgs(num_train_epochs=1,
                                overwrite_output_dir=True,
                                evaluate_during_training=True,
                                evaluate_during_training_verbose=True,
                                reprocess_input_data=True,
                                train_batch_size=BATCH_SIZE,
                                eval_batch_size=BATCH_SIZE,
                                save_model_every_epoch=True,
                                save_best_model=True,
                                max_seq_length=256,
                                use_multiprocessing=True,
                                manual_seed=42)

# Binary classifier (0 = "тонна", 1 = "кг") on top of rubert-tiny.
# use_cuda follows the actual hardware so the cell also runs on a CPU-only
# runtime instead of failing; this matches the device selection used in the
# prediction part of the notebook.
text_model = ClassificationModel("bert", "cointegrated/rubert-tiny-bilingual-nli",
                                 args=model_args,
                                 num_labels=2,
                                 use_cuda=torch.cuda.is_available())

In [None]:
# Freeze the whole network first ...
for param in text_model.model.parameters():
    param.requires_grad = False

# ... then re-enable training for the embeddings, encoder layers 1 and 2 and
# the classification head. Everything else (e.g. encoder layer 0) stays frozen.
bert = text_model.model.bert
trainable_parts = (
    bert.embeddings,
    bert.encoder.layer[1],
    bert.encoder.layer[2],
    text_model.model.classifier,
)
for part in trainable_parts:
    for param in part.parameters():
        param.requires_grad = True

In [None]:
# Display the torch module wrapped by the simpletransformers model.
text_model.model

In [None]:
# Fine-tune on the training split; the validation split is passed as eval_df
# for the evaluation enabled via evaluate_during_training in model_args.
text_model.train_model(train_df_text, eval_df=val_df_text)

In [None]:
# Run the fine-tuned model on the validation texts; `prediction` holds the
# predicted labels, `raw_output` the raw model outputs.
prediction, raw_output = text_model.predict(val_df_text["text"].to_list())

In [None]:
# F1 on the validation split. With sklearn's defaults this is the binary F1 of
# the positive class, label 1 (i.e. "кг" in the label mapping used above).
# `score` is reused later in the name of the exported archive.
score = f1_score(val_df_text["labels"], prediction)
print(score)

In [None]:
def embed_bert_cls(text, model, tokenizer):
    """Return the L2-normalised first-token embedding of `text`.

    Args:
        text: input passed straight to `tokenizer` (padding and truncation on).
        model: callable exposing `.device` and returning an object with a
            `last_hidden_state` tensor.
        tokenizer: callable returning a mapping of name -> tensor
            (`return_tensors='pt'`).

    Returns:
        NumPy vector for the first item of the batch only.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')

    # Move every tokenizer output to the device the model lives on.
    device_inputs = {}
    for name, tensor in encoded.items():
        device_inputs[name] = tensor.to(model.device)

    with torch.no_grad():
        hidden_states = model(**device_inputs).last_hidden_state

    # Position 0 of each sequence is the first token's vector.
    first_token_vectors = torch.nn.functional.normalize(hidden_states[:, 0, :])
    return first_token_vectors[0].cpu().numpy()

In [None]:
# Pack the best checkpoint into an archive whose name carries the validation F1.
zip_filename = f"RuBERT-tiny_{round(score, 4)}.zip"
path = "outputs/best_model"
with ZipFile(zip_filename, "w") as archive:
    # Entries keep the "outputs/best_model/" prefix inside the archive.
    for entry in os.listdir(path):
        archive.write(f"{path}/{entry}")

In [None]:
# List the working directory, e.g. to check that the archive was written.
!ls

## ЧАСТЬ 2 (Предсказания)

In [4]:
# Load the new data, build the same text prompt as in training and keep one
# row per distinct prompt so every prompt is scored only once.
df = (
    pd.read_csv("new_ext2.csv")
    .replace("\\N", np.nan)
    .assign(fish=lambda frame: frame["fish"].str.strip())
    .assign(text=lambda frame: "рыба: " + frame["fish"]
            + " [SEP] количество кг тонна: " + frame["volume"].astype(str))
    .drop_duplicates(subset=["text"])
)

In [5]:
# De-duplicate on (text, unit). When the cells are run in order, `text` is
# already unique after the previous cell, so this does not remove further rows.
df = df.drop_duplicates(subset=["text", "unit"])

In [6]:
# Preview the first three rows.
df.head(3)

In [7]:
# (rows, columns) of the de-duplicated prediction frame.
df.shape

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tokenizer and classifier are both restored from the saved best checkpoint.
tokenizer = AutoTokenizer.from_pretrained("outputs/best_model")
model = AutoModelForSequenceClassification.from_pretrained("outputs/best_model").to(device)

In [9]:
def gen_batch(inputs, batch_size):
    """Yield consecutive slices of `inputs` with at most `batch_size` items.

    Args:
        inputs: any sliceable sequence that supports `len()`.
        batch_size: positive integer; the last batch may be shorter.

    Yields:
        `inputs[start:start + batch_size]` for start = 0, batch_size, ...

    Raises:
        ValueError: if `batch_size` is not positive (raised on first iteration;
            previously such a value made the generator loop forever).
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    for batch_start in range(0, len(inputs), batch_size):
        yield inputs[batch_start: batch_start + batch_size]

In [10]:
# Tokenizer settings shared by every prediction batch.
kwargs = dict(return_tensors='pt',
              padding=True,
              truncation=True,
              max_length=256)

def get_predict(batch):
    """Return the probability of class 0 for every text in `batch`.

    Uses the module-level `tokenizer` and `model`. Class 0 is "тонна" in the
    label mapping used for training in this notebook.

    Args:
        batch: list of strings to classify.

    Returns:
        1-D NumPy array with one class-0 probability per input text.
    """
    tokenized = tokenizer(batch, **kwargs)
    # Inference only: no_grad avoids building an autograd graph regardless of
    # whether the parameters have been frozen elsewhere.
    with torch.no_grad():
        model_outputs = model(**{k: v.to(model.device) for k, v in tokenized.items()})
        # Explicit class dimension; relying on the implicit dim is deprecated.
        probabilities = torch.nn.functional.softmax(model_outputs.logits, dim=-1)
    return probabilities[:, 0].cpu().numpy()

In [11]:
# Inference only: disable gradients for every parameter and switch the model
# to evaluation mode.
model.requires_grad_(False)
model = model.eval()

In [12]:
# Score every text in fixed-size batches and collect one value per row.
texts = df["text"].to_list()
predict_batch_size = 256
# Ceil division: the last batch may be partial.
n_batches = (len(texts) + predict_batch_size - 1) // predict_batch_size

predictions = []
# A known total lets the bar show progress and ETA; the context manager closes
# the bar when the loop ends or fails.
with tqdm(total=n_batches) as pbar:
    for batch in gen_batch(texts, batch_size=predict_batch_size):
        predictions.extend(get_predict(batch))
        pbar.update(1)

In [13]:
# Attach the scores to the frame; this relies on `predictions` being in the
# same row order as the `text` column it was computed from.
df["predictions"] = predictions

In [14]:
# Sanity check: number of rows without a prediction.
df["predictions"].isna().sum()

In [15]:
# Preview the first five rows including the new `predictions` column.
df.head(5)

In [16]:
# Last five rows whose recorded unit is "тонна".
df[df["unit"] == "тонна"].tail(5)

In [17]:
# Full prompt text of the first row whose recorded unit is "тонна".
df[df["unit"] == "тонна"].iloc[0]["text"]

In [18]:
# Class id -> unit name (inverse of the mapping used for training).
to_columns = {0: "тонна", 1: "кг"}

# `predictions` holds the class-0 probability: a row is labelled "тонна" only
# when that probability is at least 0.95, otherwise it is labelled "кг".
is_tonne = df["predictions"] >= 0.95
df["temp"] = np.where(is_tonne, to_columns[0], to_columns[1])

In [19]:
# Allow up to 400 characters per cell so long `text` values are not truncated
# when frames are displayed.
pd.set_option('max_colwidth', 400)

In [20]:
# Display the frame with the scores and the thresholded unit in `temp`.
df

In [21]:
# Only rows without any missing value are compared.
df_temp = df.dropna()

# Rows where the recorded unit disagrees with the model and the volume is positive.
disagrees = (df_temp["unit"] != df_temp["temp"]) & (df_temp["volume"] > 0)

# Show the 20 disagreements with the highest class-0 probability (ascending
# sort, then reversed).
(
    df_temp[disagrees]
    .rename(columns={"temp": "bert"})
    .sort_values(by="predictions")[::-1]
    .head(20)
)

In [22]:
# How many rows received each predicted unit.
df["temp"].value_counts()

In [23]:
# Reload the complete file (no de-duplication this time) and rebuild the same
# text prompt so the predictions can be joined back on it.
df_full = (
    pd.read_csv("new_ext2.csv")
    .replace("\\N", np.nan)
    .assign(fish=lambda frame: frame["fish"].str.strip())
    .assign(text=lambda frame: "рыба: " + frame["fish"]
            + " [SEP] количество кг тонна: " + frame["volume"].astype(str))
)

In [24]:
# Display the full frame with the rebuilt `text` column.
df_full

In [25]:
# Left-join the predicted unit onto every original row via the text prompt,
# then drop rows that are identical in every column.
predicted_units = df[["text", "temp"]]
df_full_merged = (
    df_full
    .merge(predicted_units, on="text", how="left")
    .drop_duplicates()
)

In [26]:
# Display the merged frame (original columns plus the predicted unit in `temp`).
df_full_merged

In [27]:
# Keep only rows that have a recorded unit, for comparison with the prediction.
df_full_merged_dropna = df_full_merged.dropna(subset=["unit"])

In [28]:
# Rows with a positive volume where the recorded unit differs from the
# predicted one; only the unit, the prompt and the prediction are shown.
df_full_merged_dropna.loc[(df_full_merged_dropna["unit"] != df_full_merged_dropna["temp"]) & (df_full_merged_dropna["volume"] > 0), ["unit", "text", "temp"]]

In [29]:
# Final layout: the prediction is exposed as `bert`; the helper `text` column
# and the originally recorded `unit` column are dropped.
# Not re-runnable as-is: a second run fails because the columns are already gone.
df_full_merged = df_full_merged.rename({"temp": "bert"}, axis=1).drop(["text", "unit"], axis=1)

In [30]:
# Display the final frame that is about to be written to disk.
df_full_merged

In [31]:
# Write the result to ext2_new_bert.csv without the pandas index column.
df_full_merged.to_csv("ext2_new_bert.csv", index=False)