In [2]:
import torch
# from torch.utils.data import Dataset, DataLoader
from transformers import Trainer, TrainingArguments
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import pandas as pd
import numpy as np
import wandb as wb

In [5]:
!pip install datasets
wb.init(project="VK_Assistant_Task")



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [6]:
from datasets import load_dataset



ds = load_dataset("Cohere/miracl-ru-queries-22-12")

In [9]:
ds

DatasetDict({
    train: Dataset({
        features: ['query_id', 'query', 'positive_passages', 'negative_passages', 'emb'],
        num_rows: 4683
    })
    dev: Dataset({
        features: ['query_id', 'query', 'positive_passages', 'negative_passages', 'emb'],
        num_rows: 1252
    })
    testA: Dataset({
        features: ['query_id', 'query', 'positive_passages', 'negative_passages', 'emb'],
        num_rows: 911
    })
    testB: Dataset({
        features: ['query_id', 'query', 'positive_passages', 'negative_passages', 'emb'],
        num_rows: 718
    })
})

In [10]:
train = ds["train"]
train

Dataset({
    features: ['query_id', 'query', 'positive_passages', 'negative_passages', 'emb'],
    num_rows: 4683
})

In [11]:
dev = ds["dev"]
dev

Dataset({
    features: ['query_id', 'query', 'positive_passages', 'negative_passages', 'emb'],
    num_rows: 1252
})

In [15]:
train_queries = train["query"]

wb.log({"train_queries_count": len(train_queries)})


In [10]:
train_positive_passages = train["positive_passages"]
train_negative_passages = train["negative_passages"]
wb.log({"positive response":len(train_positive_passages)})
wb.log({"negative response":len(train_positive_passages)})


(4683, 4683)

In [11]:
train_df = pd.DataFrame(columns=['question', 'answer', 'isRelevant'])

In [12]:
for i in range(len(train_queries)):
    positive_answers = [x["text"] for x in train_positive_passages[i]]
    negative_answers = [x["text"] for x in train_negative_passages[i]]

    positive_data = pd.DataFrame({
        'question': [train_queries[i]] * len(positive_answers),
        'answer': positive_answers,
        'isRelevant': [1] * len(positive_answers)
    })

    negative_data = pd.DataFrame({
        'question': [train_queries[i]] * len(negative_answers),
        'answer': negative_answers,
        'isRelevant': [0] * len(negative_answers)
    })

    train_df = pd.concat([train_df, positive_data, negative_data], ignore_index=True)
    wb.log({
        'question': train_queries[i],
        'positive_answers': positive_answers,
        'negative_answers': negative_answers,
        'positive_count': len(positive_answers),
        'negative_count': len(negative_answers)
    })
    
    wb.log({"train_df": wb.Table(dataframe=train_df)})

In [13]:
train_df.head(20)

Unnamed: 0,question,answer,isRelevant
0,Когда был спущен на воду первый миноносец «Спо...,Зачислен в списки ВМФ СССР 19 августа 1952 год...,1
1,Когда был спущен на воду первый миноносец «Спо...,Стерегу́щий — русский миноносец типа «Сокол». ...,0
2,Когда был спущен на воду первый миноносец «Спо...,Эскадренный миноносец заложен в 1900 году на Н...,0
3,Когда был спущен на воду первый миноносец «Спо...,10 октября 1937 года эсминец был спущен на вод...,0
4,Когда был спущен на воду первый миноносец «Спо...,В 1901 году миноносец «Бодрый» был зачислен в ...,0
5,Когда был спущен на воду первый миноносец «Спо...,Миноносец «Бдительный» был заказан по судостро...,0
6,Когда был спущен на воду первый миноносец «Спо...,Спе́шный — первый фрегат одноимённой серии из ...,0
7,Когда был спущен на воду первый миноносец «Спо...,Зачислен в списке судов Черноморского флота 11...,0
8,Когда был спущен на воду первый миноносец «Спо...,Зачислен в списке судов Черноморского флота 11...,0
9,Когда был спущен на воду первый миноносец «Спо...,Миноносец «Разящий» 15 апреля 1905 года был за...,0


In [14]:
train_df.shape

(33921, 3)

In [15]:
val_positive_passages = dev["positive_passages"]
val_negative_passages = dev["negative_passages"]
val_queries = dev["query"]
m = len(val_queries)
val_df = pd.DataFrame(columns=['question', 'answer', 'isRelevant'])
for i in range(m):
    positive_answers = [x["text"] for x in val_positive_passages[i]]
    negative_answers = [x["text"] for x in val_negative_passages[i]]

    positive_data = pd.DataFrame({
        'question': [val_queries[i]] * len(positive_answers),
        'answer': positive_answers,
        'isRelevant': [1] * len(positive_answers)
    })

    negative_data = pd.DataFrame({
        'question': [val_queries[i]] * len(negative_answers),
        'answer': negative_answers,
        'isRelevant': [0] * len(negative_answers)
    })
    wb.log({
        'question': val_queries[i],
        'positive_answers': positive_answers,
        'negative_answers': negative_answers,
        "positive_data": wb.Table(dataframe=positive_data),
        "negative_data": wb.Table(dataframe=negative_data),  
        'positive_count': len(positive_answers),
        'negative_count': len(negative_answers)
    })
    val_df = pd.concat([val_df, positive_data, negative_data], ignore_index=True)

In [16]:
val_df.head(20)

Unnamed: 0,question,answer,isRelevant
0,Когда начался Кари́бский кризис?,"Кари́бский кризис — исторический термин, опред...",1
1,Когда начался Кари́бский кризис?,"Политическое, дипломатическое и военное против...",1
2,Когда начался Кари́бский кризис?,В октябре 1962 года произошел Карибский кризис...,1
3,Когда начался Кари́бский кризис?,"Первоначально, после победы революции на Кубе ...",0
4,Когда начался Кари́бский кризис?,"Невозможно однозначно утверждать, стало ли уда...",0
5,Когда начался Кари́бский кризис?,"В пятницу, 26 октября, в 13:00 по вашингтонско...",0
6,Когда начался Кари́бский кризис?,После окончания Третьей Мировой (1962—1965) сл...,0
7,Когда начался Кари́бский кризис?,"Кризис Штата Аден или Аденский Кризис, или, ка...",0
8,Когда начался Кари́бский кризис?,"Суэцкий кризис (Суэцкая война, Синайская война...",0
9,Когда начался Кари́бский кризис?,"На фоне революции в Тунисе, приведшей 14 январ...",0


In [17]:

val_df.shape

(13100, 3)

In [18]:
train_df["isRelevant"].mean(), val_df["isRelevant"].mean()

(0.29480262963945636, 0.2717557251908397)

In [19]:


import json

data = []
c = 0
with open('/content/drive/MyDrive/dataset.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        data.append(json.loads(line))

In [20]:
data = [d for d in data if len(d["answer"]) > 300]
for d in data:
    d["question"] = d["question"].replace("\n", "")
    d["answer"] = d["answer"].replace("\n", "")
print(data[:5])

[{'description': '', 'question': 'Почему у ноутбука очень часто включается вентилятор?', 'answer': 'Если у вас открыто 20 вкладок это особо не нагружает ноутбук, просто забивается оперативная память. Открытое видео нагружает видео процессор. Часто включается вентилятор понятие очень растяжимое тк зависит от множества факторов, какое устройство, на сколько часто итп. В некоторых ноутбуках вентилятор работает на постоянной основе просто на разных оборотах. Всё зависит от того на сколько горячий чипсет на материнской плате. Современные ноутбуки многие вообще без вентиляторов. На сколько я понимаю у вас на самом деле никакой проблемы нет, а максимум у вас забилась система охлаждения из-за этого затруднён отвод тепла и её нужно просто почистить и возможно заменить термопасту и термопрокладки.Надеюсь ответ был полезен. Буду благодарен за оценку ответа и если остались вопросы пишите в комментариях с удовольствием отвечу. Подписывайтесь на наши ответы и узнаете много полезного.'}, {'descriptio

In [21]:
yandexq_df = pd.DataFrame()
yandexq_df["question"] = [d["question"] for d in data]
yandexq_df["answer"] = [d["answer"] for d in data]
yandexq_df["isRelevant"] = [1 for d in data]

In [22]:
print(yandexq_df.shape)
yandexq_df.head()

(414233, 3)


Unnamed: 0,question,answer,isRelevant
0,Почему у ноутбука очень часто включается венти...,Если у вас открыто 20 вкладок это особо не наг...,1
1,"Как снять заднее сиденье на ""Киа Рио Х Лайн""?",Необходимость в этой операции обычно возникает...,1
2,Может ли бес заставлять молиться?,"И может, и активно так поступает. Это одна из ...",1
3,Где найти работу для фрилансера в 13 лет?,Вы не поверите — на фриланс-биржах. В своё вре...,1
4,"Почему правильнее говорить ""гомосексуалы"", а н...",Суффикс -ист- несёт в себе значение человека с...,1


In [23]:
yandexq_df_sampled = yandexq_df.sample(n=33000, random_state=42)  # берем 33000 случайных строк

In [24]:
print(yandexq_df_sampled.shape)
yandexq_df_sampled.head()

(33000, 3)


Unnamed: 0,question,answer,isRelevant
370901,Иисус творил чудеса или колдовал?,"Если обратиться к первоисточникам, то первые п...",1
217657,"Как ответить на мамины слова: ""Если будешь игр...","- Если ты сидишь целыми сутками, залипая в одн...",1
64692,"Может ли социальная сеть ""Одноклассники"" отклю...",Это крайне маловероятно. Та же Visa для работе...,1
247124,"Если Гэвин О'Коннор, режиссёр «Воина», решит п...","По месту нахождения ответчика - компании, кото...",1
309992,Куда нужно обращаться за имущественным вычетом?,Добрый деньПолучить налоговый вычет вы можете ...,1


In [25]:
from sklearn.model_selection import train_test_split

train_yandexq_df_sampled, val_yandexq_df_sampled = train_test_split(yandexq_df_sampled, test_size=2/7, random_state=42)

In [26]:
train_yandexq_df_sampled.shape

(23571, 3)

In [27]:
val_yandexq_df_sampled.shape

(9429, 3)

In [28]:
train_df = pd.concat([train_df, train_yandexq_df_sampled], ignore_index=True)
val_df = pd.concat([val_df, val_yandexq_df_sampled], ignore_index=True)

In [29]:
train_df.shape, val_df.shape

((57492, 3), (22529, 3))

In [30]:
train_df["isRelevant"].mean(), val_df["isRelevant"].mean()

(0.583924719961038, 0.5765457854321098)

In [31]:
def apply_augmentations(df,
                        symmetrize=True,
                        speech_garbage=0,
                        drop_symbol=0,
                        drop_token=0,
                        double_token=0,
                        insert_random_symbol=0,
                        swap_tokens=0,
                        siblings=0):
    wb.config.update({
        "symmetrize": symmetrize,
        "speech_garbage": speech_garbage,
        "drop_symbol": drop_symbol,
        "drop_token": drop_token,
        "double_token": double_token,
        "insert_random_symbol": insert_random_symbol,
        "swap_tokens": swap_tokens,
        "siblings": siblings
    })
    cyrillic_letters = 'абвгдежзийклмнопрстуфхцчшщъыьэюя'
    sibling_letters = { 'а': 'a','В': 'B', 'е': 'e', 'о': 'o', 'р': 'p', 'с': 'c', 'Т': 'T', 'у': 'y', 'х': 'x'}


    augmentation_probas = [speech_garbage, drop_symbol, drop_token, double_token, insert_random_symbol, swap_tokens, siblings]

    if sum(augmentation_probas) <= 0:
        if symmetrize:
            raise RuntimeError("To symmetrize the classes at least one augmentation must be applied")
        else:
            return df


    df_0 = df[df["isRelevant"] == 0]
    df_1 = df[df["isRelevant"] == 1]
    new_rows = []

    wb.log({
        "initial_total_samples": len(df),
        "initial_relevant_samples": len(df_1),
        "initial_irrelevant_samples": len(df_0)
    })
    augmentation_counts = {
        "speech_garbage": 0,
        "drop_symbol": 0,
        "drop_token": 0,
        "double_token": 0,
        "insert_random_symbol": 0,
        "swap_tokens": 0,
        "siblings": 0
    }

    if symmetrize:
        while df_0.shape[0] + len(new_rows) < df_1.shape[0]:
            df_0 = df_0.drop_duplicates()
            if speech_garbage > 0:
                if np.random.rand() < speech_garbage:
                    new_row = df_0.sample().copy()
                    text = new_row['question']
                    text_split = text.iloc[0].split()
                    if text_split:
                        insert_index = np.random.randint(0, len(text_split) + 1)
                        random_word = np.random.choice(["ээ", "мм", "ну", "кхм-кхм"])
                        text_split.insert(insert_index, random_word)
                    text = " ".join(text_split)
                    new_row["question"] = text
                    if text_split:
                      if type(new_row["question"].iloc[0]) != type("aboba"):
                          print(type(new_row["question"].iloc[0]), 1)
                      new_rows.append(new_row)

            if drop_symbol > 0:
                if np.random.rand() < drop_symbol:
                    new_row = df_0.sample().copy()
                    for _ in range (10):
                        symbol_to_drop = np.random.choice(list(cyrillic_letters))
                        new_row['answer'] = new_row['answer'].str.replace(symbol_to_drop, '', regex=False)
                    if type(new_row["answer"].iloc[0]) != type("aboba"):
                        print(type(new_row["answer"].iloc[0]), 2)
                    new_rows.append(new_row)

            if drop_token > 0:
                if np.random.rand() < drop_token:
                    new_row = df_0.sample().copy()
                    tokens = new_row['answer'].iloc[0].split()
                    drop_index = np.random.randint(0, len(tokens) + 1)
                    if len(tokens) > 0:
                        drop_index = np.random.randint(0, len(tokens))
                        tokens.pop(drop_index)
                    new_row['answer'] = ' '.join(tokens)
                    if tokens:
                      if type(new_row["answer"].iloc[0]) != type("aboba"):
                          print(type(new_row["answer"].iloc[0]), 3)
                      new_rows.append(new_row)

            if double_token > 0:
                if np.random.rand() < double_token:
                    new_row = df_0.sample().copy()
                    tokens = new_row['answer'].iloc[0].split()
                    for _ in range(5):
                        if tokens:
                            duplicate_index = np.random.randint(0, len(tokens))
                            tokens.insert(duplicate_index + 1, tokens[duplicate_index])
                    new_row['answer'] = ' '.join(tokens)
                    if tokens:
                        if type(new_row["answer"].iloc[0]) != type("aboba"):
                            print(type(new_row["answer"].iloc[0]), 4)
                        new_rows.append(new_row)

            if insert_random_symbol > 0:
                if np.random.rand() < insert_random_symbol:
                    new_row = df_0.sample().copy()
                    for _ in range(10):
                        random_symbol = np.random.choice(list(cyrillic_letters))
                        insert_index = np.random.randint(0, len(new_row['answer']))
                        new_row['answer'] = new_row['answer'][:insert_index] + random_symbol + new_row['answer'][insert_index:]

                    if type(new_row["answer"].iloc[0]) != type("aboba"):
                        print(type(new_row["answer"].iloc[0]), 5)
                    new_rows.append(new_row)

            if swap_tokens > 0:
                if np.random.rand() < swap_tokens:
                    new_row = df_0.sample().copy()
                    tokens = new_row['answer'].iloc[0].split()
                    for _ in range(3):
                        if len(tokens) > 1:
                            swap_index = np.random.randint(0, len(tokens) - 1)
                            tokens[swap_index], tokens[swap_index + 1] = tokens[swap_index + 1], tokens[swap_index]
                    new_row['answer'] = ' '.join(tokens)
                    if tokens:
                        if type(new_row["answer"].iloc[0]) != type("aboba"):
                            print(type(new_row["answer"].iloc[0]), 6)
                        new_rows.append(new_row)


            if siblings > 0:
                if np.random.rand() < siblings:
                    new_row = df_0.sample().copy()
                    answer = new_row['answer'].iloc[0]
                    new_answer = []

                    for char in answer:
                        if char in sibling_letters and np.random.rand() < siblings:
                            new_answer.append(sibling_letters[char])
                        else:
                            new_answer.append(char)

                    new_row['answer'] = ''.join(new_answer)
                    if new_answer:
                        if type(new_row["answer"].iloc[0]) != type("aboba"):
                            print(type(new_row["answer"].iloc[0]), 7)
                        new_rows.append(new_row)


        while df_1.shape[0] + len(new_rows) < df_0.shape[0]:
            if speech_garbage > 0:
                if np.random.rand() < speech_garbage:
                    new_row = df_1.sample().copy()
                    text = new_row['question']
                    text_split = text.iloc[0].split()
                    if text_split:
                        insert_index = np.random.randint(0, len(text_split) + 1)
                        random_word = np.random.choice(["ээ", "мм", "ну", "кхм-кхм"])
                        text_split.insert(insert_index, random_word)
                    text = " ".join(text_split)
                    new_row["question"] = text
                    if text_split:
                        if type(new_row["question"].iloc[0]) != type("aboba"):
                            print(type(new_row["question"].iloc[0]), 8)
                        new_rows.append(new_row)

            if drop_symbol > 0:
                if np.random.rand() < drop_symbol:
                    new_row = df_1.sample().copy()
                    for _ in range (10):
                        symbol_to_drop = np.random.choice(list(cyrillic_letters))
                        new_row['answer'] = new_row['answer'].str.replace(symbol_to_drop, '', regex=False)
                    if type(new_row["answer"].iloc[0]) != type("aboba"):
                        print(type(new_row["answer"].iloc[0]), 9)
                    new_rows.append(new_row)

            if drop_token > 0:
                if np.random.rand() < drop_token:
                    new_row = df_1.sample().copy()
                    tokens = new_row['answer'].iloc[0].split()
                    drop_index = np.random.randint(0, len(tokens) + 1)
                    if len(tokens) > 0:
                        drop_index = np.random.randint(0, len(tokens))
                        tokens.pop(drop_index)
                    new_row['answer'] = ' '.join(tokens)
                    if tokens:
                        if type(new_row["answer"].iloc[0]) != type("aboba"):
                            print(type(new_row["answer"].iloc[0]), 10)
                        new_rows.append(new_row)

            if double_token > 0:
                if np.random.rand() < double_token:
                    new_row = df_1.sample().copy()
                    tokens = new_row['answer'].iloc[0].split()
                    for _ in range(5):
                        if tokens:
                            duplicate_index = np.random.randint(0, len(tokens))
                            tokens.insert(duplicate_index + 1, tokens[duplicate_index])
                    new_row['answer'] = ' '.join(tokens)
                    if tokens:
                        if type(new_row["answer"].iloc[0]) != type("aboba"):
                            print(type(new_row["answer"].iloc[0]), 11)
                        new_rows.append(new_row)

            if insert_random_symbol > 0:
                if np.random.rand() < insert_random_symbol:
                    new_row = df_1.sample().copy()
                    for _ in range(10):
                        random_symbol = np.random.choice(list(cyrillic_letters))
                        insert_index = np.random.randint(0, len(new_row['answer']))
                        new_row['answer'] = new_row['answer'].str[:insert_index] + random_symbol + new_row['answer'].str[insert_index:]
                    if type(new_row["answer"].iloc[0]) != type("aboba"):
                        print(type(new_row["answer"].iloc[0]), 12)
                    new_rows.append(new_row)

            if swap_tokens > 0:
                if np.random.rand() < swap_tokens:
                    new_row = df_1.sample().copy()
                    tokens = new_row['answer'].iloc[0].split()
                    for _ in range(3):
                        if len(tokens) > 1:
                            swap_index = np.random.randint(0, len(tokens) - 1)
                            tokens[swap_index], tokens[swap_index + 1] = tokens[swap_index + 1], tokens[swap_index]
                    new_row['answer'] = ' '.join(tokens)
                    if tokens:
                        if type(new_row["answer"].iloc[0]) != type("aboba"):
                            print(type(new_row["answer"].iloc[0]), 13)
                        new_rows.append(new_row)

            if siblings > 0:
                if np.random.rand() < siblings:
                    new_row = df_1.sample().copy()
                    answer = new_row['answer'].iloc[0]
                    new_answer = []

                    for char in answer:
                        if char in sibling_letters and np.random.rand() < siblings:
                            new_answer.append(sibling_letters[char])
                        else:
                            new_answer.append(char)

                    new_row['answer'] = ''.join(new_answer)
                    if new_answer:
                        if type(new_row["answer"].iloc[0]) != type("aboba"):
                            print(type(new_row["answer"].iloc[0]), 14)
                        new_rows.append(new_row)

    if new_rows:
        new_df = pd.concat(new_rows).reset_index(drop=True)
        df = pd.concat([df, new_df], ignore_index=True)
    df = df.drop_duplicates()
    wb.log({
        "final_total_samples": len(df),
        "final_relevant_samples": len(df[df["isRelevant"] == 1]),
        "final_irrelevant_samples": len(df[df["isRelevant"] == 0]),
        "augmentations_applied": augmentation_counts
    })
    wb.log({"augmented_data_sample": wb.Table(dataframe=df.sample(min(100, len(df))))})
    return df






In [32]:
train_df.tail(20)
print(train_df.shape)
train_df[train_df.isna()].shape

(57492, 3)


(57492, 3)

In [33]:
# train_df = apply_augmentations(train_df,
#                                speech_garbage=0.5,
#                                 drop_symbol=0.6,
#                                 drop_token=0.7,
#                                 double_token=0.7,
#                                 insert_random_symbol=0.5,
#                                 swap_tokens=0.6,
#                                 siblings=0.6)

In [34]:
!pip install pymorphy3

Collecting pymorphy3
  Downloading pymorphy3-2.0.2-py3-none-any.whl.metadata (1.8 kB)
Collecting dawg-python>=0.7.1 (from pymorphy3)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl.metadata (7.0 kB)
Collecting pymorphy3-dicts-ru (from pymorphy3)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl.metadata (2.0 kB)
Downloading pymorphy3-2.0.2-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl (8.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m83.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymorphy3-dicts-ru, dawg-python, pymorphy3
Successfully installed dawg-python-0.7.2 pymorphy3-2.0.2 pymorphy3-dicts-ru-2.4.417150.4580142


In [35]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import pipeline
import pymorphy3
import random

try:
    russian_stopwords = stopwords.words("russian")
except LookupError:
    nltk.download('stopwords')
    russian_stopwords = stopwords.words("russian")

try:
    word_tokenize("Пример текста")
except LookupError:
    nltk.download('punkt')

morph = pymorphy3.MorphAnalyzer()

# Проверка доступности CUDA
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Используется CUDA")
else:
    device = torch.device("cpu")
    print("CUDA не доступна, используется CPU")

# device = torch.device("cpu")

# Инициализируем пайплайн для перевода
translator = pipeline("translation_ru_to_en", model="Helsinki-NLP/opus-mt-ru-en", device = device)
back_translator = pipeline("translation_en_to_ru", model="Helsinki-NLP/opus-mt-en-ru", device = device)


def augment_answers(answer, num_augmentations=1):
    augmented_answers = []
    for _ in range(num_augmentations):
        augmented_answer = answer
        # Back Translation
        translated = translator(augmented_answer, batch_size=16)[0]["translation_text"]
        back_translated = back_translator(translated, batch_size=16)[0]["translation_text"]
        augmented_answers.append(back_translated)
        wb.log({
        "answer_augmentations": wb.Table(
            columns=["original", "augmented", ""],
            data=[[answer, aug] for aug, aug_type in zip(augmented_answers)]
        )
    })
    

    return augmented_answers


def augment_questions(question, num_augmentations=2):
    augmented_questions = []
    for _ in range(num_augmentations):
        augmented_question = question
        # Back Translation (с большей вероятностью, чем для ответов)
        translated = translator(augmented_question, batch_size=16)[0]["translation_text"]
        back_translated = back_translator(translated, batch_size=16)[0]["translation_text"]
        augmented_questions.append(back_translated)

        # Synonym Replacement (требует словарь синонимов)
        words = word_tokenize(augmented_question)
        for i in range(len(words)):
            if words[i] not in russian_stopwords:
                try:
                    synonyms = morph.parse(words[i])[0].lexeme
                    if synonyms:
                        synonym = random.choice([s.word for s in synonyms])
                        words[i] = synonym
                except:
                    pass
        
        augmented_questions.append(" ".join(words))

        # Random Insertion/Deletion
        words = word_tokenize(augmented_question)
        if len(words) > 1:
            # if random.random() < 0.5:  # Insertion
            if random.random() < 0: # пока так, без Insertion
                insert_index = random.randint(0, len(words))
                # Вставка случайного слова (нужен словарь слов)
                # words.insert(insert_index, random.choice(russian_words))
            else:  # Deletion
                del words[random.randint(0, len(words) - 1)]
        augmented_questions.append(" ".join(words))
    wb.log({
        "question_augmentations": wb.Table(
            columns=["original", "augmented"],
            data=[[question, aug] for aug in zip(augmented_questions)]
        )
    })

    return augmented_questions

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Используется CUDA


config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]

In [36]:
# Пример использования:
example_question = "Кто первым полетел в космос?"
augmented_questions = augment_questions(example_question)
print(f"Аугментированные вопросы: {augmented_questions}")

example_answer = "Первым космонавтом стал Юрий Алексеевич Гагарин из СССР. Он полетел в космос 12 апреля 1961 года."
augmented_answers = augment_answers(example_answer)
print(f"Аугментированные ответы: {augmented_answers}")

Аугментированные вопросы: ['Кто совершил первый полет в космос?', 'кто наипервейшими полетевшая в космосы ?', 'Кто первым полетел в космос', 'Кто совершил первый полет в космос?', 'ком первейшая полетевшим в космосов ?', 'Кто первым полетел в космос']
Аугментированные ответы: ['Первым астронавтом стал Юрий Алексеевич Гагарин из СССР, который прилетел в космос 12 апреля 1961 года.']


In [37]:
def apply_augmentations_df(df):
    new_rows = []
    processed_questions = {}  # Словарь для хранения обработанных вопросов
    
      # Initialize counters for logging
    total_original_rows = len(df)
    total_augmented_rows = 0
    skipped_long_answers = 0
    unique_questions_augmented = 0
    
    # Create a W&B Table for logging examples
    augmentation_examples = wb.Table(columns=["original_question", "augmented_question", "original_answer", "augmented_answer"])

    for i in range(len(df)):
        question = df.loc[i, "question"]
        answer = df.loc[i, "answer"]

        if i % 10 == 0:
            print(i)
            wb.log({"augmentation_progress": i / len(df)})

        # Проверяем длину ответа
        if len(answer) > 550:
            skipped_long_answers+=1
            continue  # Пропускаем ответ, если он слишком длинный

        # Проверяем, был ли вопрос уже обработан
        if question not in processed_questions:
            # Аугментация вопроса
            augmented_questions = augment_questions(question)
            processed_questions[question] = augmented_questions
            unique_questions_augmented += 1
        else:
            augmented_questions = processed_questions[question]

        # Аугментация ответа
        augmented_answers = augment_answers(df.loc[i, "answer"])

        for q in augmented_questions:
            for a in augmented_answers:
                new_rows.append({'question': q, 'answer': a, 'isRelevant': df.loc[i, "isRelevant"]})
                total_augmented_rows+=1
    wb.log({
        "total_original_rows": total_original_rows,
        "total_augmented_rows": total_augmented_rows,
        "total_final_rows": len(df),
        "skipped_long_answers": skipped_long_answers,
        "unique_questions_augmented": unique_questions_augmented,
        "augmentation_ratio": total_augmented_rows / total_original_rows if total_original_rows > 0 else 0,
        "augmentation_examples": augmentation_examples
    })

    wb.log({"augmented_data_sample": wb.Table(dataframe=df.sample(min(100, len(df))))})


    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
    return df

In [38]:
# train_df = apply_augmentations_df(train_df)
# слишком долго работает, надо оптимизировать

In [39]:
train_df.to_csv('train_df.csv')
val_df.to_csv('val_df.csv')

In [40]:
train_df.tail(20)
print(train_df.shape)
train_df[train_df.isna()].shape

(57492, 3)


(57492, 3)

In [41]:
train_df = train_df.sample(frac=1).reset_index(drop=True)
train_texts = train_df.copy()
train_texts = train_df.sample(frac=1).reset_index(drop=True)
valid_texts = val_df.copy()

In [42]:
train_texts['text'] = train_df['question'] + " [SEP] " + train_df['answer']
valid_texts['text'] = val_df['question'] + " [SEP] " + val_df['answer']
train_texts = train_texts.drop(["question", "answer", "isRelevant"], axis=1)
valid_texts = valid_texts.drop(["question", "answer", "isRelevant"], axis=1)

train_labels = train_df["isRelevant"]
valid_labels = val_df["isRelevant"]

In [43]:
train_on_gpu = torch.cuda.is_available()

if not train_on_gpu:
    print('CUDA is not available.  Training on CPU ...')
else:
    print('CUDA is available!  Training on GPU ...')

device = torch.device("cuda")




CUDA is available!  Training on GPU ...


In [44]:
#tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
tokenizer = DistilBertTokenizer.from_pretrained('cointegrated/rubert-tiny')
#model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
model = DistilBertForSequenceClassification.from_pretrained('cointegrated/rubert-tiny', num_labels=2, dropout=0.5)

#TODO
# change to BertForSequenceClassification

tokenizer_config.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/241k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/468k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizer'.
You are using a model of type bert to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/47.7M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.word_embeddings.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'transformer.layer.0.attention.k_lin.bias', 'transformer.layer.0.attention.k_lin.weight', 'transformer.layer.0.attention.out_lin.bias', 'transformer.layer.0.attention.out_lin.weight', 'transformer.layer.0.attention.q_lin.bias', 'transformer.layer.0.attention.q_lin.weight', 'transformer.layer.0.attention.v_lin.bias', 'transformer.layer.0.attention.v_lin.weight', 'transformer.layer.0.ffn.lin1.bias', 'transformer.layer.0.ffn.lin1.weight', 'transformer.layer.0.ffn.lin2.bias', 'transformer.layer.0.ffn.lin2.weight', 'transformer.layer.0.output_layer_norm.bias', 'transformer.layer.0.output_layer_norm.weight', 'transfor

Freeze:

In [45]:
# for param in model.distilbert.parameters():
#     param.requires_grad = False

In [46]:
model.to("cuda")

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(29564, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.5, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-2): 3 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=312, out_features=312, bias=True)
            (k_lin): Linear(in_features=312, out_features=312, bias=True)
            (v_lin): Linear(in_features=312, out_features=312, bias=True)
            (out_lin): Linear(in_features=312, out_features=312, bias=True)
          )
          (sa_layer_norm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.5, inplace=False)
 

In [47]:
train_encodings = tokenizer(train_texts["text"].tolist(), truncation=True, padding=True)
valid_encodings = tokenizer(valid_texts["text"].tolist(), truncation=True, padding=True)



In [48]:
class RelevantDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Переносим данные на GPU
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [49]:
train_dataset = RelevantDataset(train_encodings, train_labels.tolist())
valid_dataset = RelevantDataset(valid_encodings, valid_labels.tolist())



In [50]:
# training_args = TrainingArguments(
#     output_dir='./results',
#     num_train_epochs=3,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     warmup_steps=500,
#     weight_decay=0.01,
#     logging_dir='./logs',
#     eval_strategy='steps',
#     eval_steps=500,
# )

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,  # увеличено число эпох
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,  # зафиксированная скорость обучения
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="steps",  # оценка на каждом шаге
    eval_steps=500,
    lr_scheduler_type='cosine',
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_strategy="steps",  # сохранение на каждом шаге
    save_steps=500,  # save_steps кратно eval_steps
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)



In [51]:
loss_hist = pd.DataFrame(trainer.state.log_history)

In [52]:
trainer.train()

Step,Training Loss,Validation Loss
500,0.6873,0.687197
1000,0.5799,0.454809
1500,0.4457,0.426446
2000,0.4224,0.440009
2500,0.4394,0.433788
3000,0.4423,0.45748
3500,0.4173,0.436793
4000,0.4242,0.421995
4500,0.4252,0.457845
5000,0.4333,0.42041


TrainOutput(global_step=21561, training_loss=0.41971848613780943, metrics={'train_runtime': 4098.8635, 'train_samples_per_second': 42.079, 'train_steps_per_second': 5.26, 'total_flos': 3727711193112576.0, 'train_loss': 0.41971848613780943, 'epoch': 3.0})

In [53]:
torch.save(model.state_dict(), "next_model.pt")

In [66]:
model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(29564, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.5, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-2): 3 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=312, out_features=312, bias=True)
            (k_lin): Linear(in_features=312, out_features=312, bias=True)
            (v_lin): Linear(in_features=312, out_features=312, bias=True)
            (out_lin): Linear(in_features=312, out_features=312, bias=True)
          )
          (sa_layer_norm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.5, inplace=False)
 

In [54]:
loss_hist




In [55]:
question = "Кто первым полетел в космос?"
# answer = "Первым космонавтом стал Юрий Алексеевич Гагарин из СССР. Он полетел в космос 12 апреля 1961 года."

# абсолютно рандомные комментарии
# answer = "Вымышленный мир манги «Берсерк» напоминает Европу приблизительно XIV—XV веков — позднее Средневековье (также много доспехов начала XVI века, что уже раннее Новое время). Основное место действия — королевство Мидланд"
answer = "АБОБА"

In [None]:
# model.eval()
# def predict(question, answer):
#     inputs = tokenizer(f"{question}[SEP]{answer}", return_tensors='pt', truncation=True, padding=True)
#     outputs = model(**inputs)
#     predictions = torch.argmax(outputs.logits, dim=-1)
#     return 'Relevant' if predictions.item() == 1 else 'Not Relevant'

In [None]:
# print(predict(question, answer))

In [60]:
# загрузка обученной модели
model2 = DistilBertForSequenceClassification.from_pretrained('cointegrated/rubert-tiny', num_labels=2)
state_dict2 = torch.load("next_model.pt")
model2.load_state_dict(state_dict2)
model2.eval()

You are using a model of type bert to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.word_embeddings.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'transformer.layer.0.attention.k_lin.bias', 'transformer.layer.0.attention.k_lin.weight', 'transformer.layer.0.attention.out_lin.bias', 'transformer.layer.0.attention.out_lin.weight', 'transformer.layer.0.attention.q_lin.bias', 'transformer.layer.0.attention.q_lin.weight', 'transformer.layer.0.attention.v_lin.bias', 'transformer.layer.0.attention.v_lin.weight', 'transformer.layer.0.ffn.lin1.bias', 'transformer.layer.0.ffn.lin1.weight', 'transformer.layer.0.ffn

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(29564, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-2): 3 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=312, out_features=312, bias=True)
            (k_lin): Linear(in_features=312, out_features=312, bias=True)
            (v_lin): Linear(in_features=312, out_features=312, bias=True)
            (out_lin): Linear(in_features=312, out_features=312, bias=True)
          )
          (sa_layer_norm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [61]:
def predict2(question, answer):
    inputs = tokenizer(f"{question}[SEP]{answer}", return_tensors='pt', truncation=True, padding=True)
    outputs = model2(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)
    return 1 if predictions.item() == 1 else 0

In [62]:
question = "Где находится Санкт-Петербург?"
answer1 = "В России"
answer2 = "в Казахстане"
print(predict2(question, answer1))
print(predict2(question, answer2))

0
1


In [67]:
predictions = []

In [68]:
from tqdm import tqdm

In [69]:
for text in tqdm(valid_texts["text"]):
    qu = text.split("[SEP]")[0]
    an = text.split("[SEP]")[1]
    predictions.append(predict2(qu, an))

100%|██████████| 22529/22529 [08:57<00:00, 41.94it/s]


In [70]:
len(predictions), len(valid_labels), len(valid_texts)

(22529, 22529, 22529)

In [71]:
from sklearn.metrics import classification_report, roc_auc_score

In [72]:
print(classification_report(valid_labels.tolist(), predictions))

              precision    recall  f1-score   support

           0       0.72      0.93      0.81      9540
           1       0.93      0.74      0.82     12989

    accuracy                           0.82     22529
   macro avg       0.83      0.83      0.82     22529
weighted avg       0.84      0.82      0.82     22529



In [73]:
print(roc_auc_score(valid_labels.tolist(), predictions))

0.8328016344421735


In [74]:
train_texts = train_texts.drop_duplicates()

In [75]:
train_texts.shape

(57409, 1)