In [None]:
from IPython.display import clear_output

!pip install transformers accelerate
!pip install datasets
!pip install wandb
# !pip install nlpaug
# !pip install sacremoses
# !pip install sentencepiece
# !pip install langdetect

clear_output()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from torch import cuda
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pad_sequence
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    BertModel,
    DataCollatorWithPadding,
    BertForSequenceClassification,
    BertForTokenClassification,
    AutoModelForMaskedLM,
    BertTokenizer,
    AutoModelForTokenClassification,
    pipeline,
    TrainingArguments,
    Trainer,
    BertConfig,
    EarlyStoppingCallback
)

from sklearn.model_selection import train_test_split
from google.colab import drive, output
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm
import locale
from torch.nn import BCELoss, BCEWithLogitsLoss
# import wandb
import random

# wandb.login()

def getpreferredencoding(do_setlocale=True):
    """Report UTF-8 as the preferred encoding regardless of the runtime locale.

    Drop-in replacement for ``locale.getpreferredencoding``. ``do_setlocale``
    is accepted only to keep the signature compatible; it is ignored.
    """
    return "UTF-8"


# Monkey-patch the stdlib hook so every later lookup returns UTF-8.
locale.getpreferredencoding = getpreferredencoding

# Mount Google Drive at /content/drive; force_remount=True re-mounts even if a
# mount already exists.
drive.mount('/content/drive', force_remount=True)
# Hide the mount output.
clear_output()

In [None]:
# --- Experiment configuration ---------------------------------------------
# Project folder (relative path into the mounted Drive).
WORK_FOLDER = 'drive/MyDrive/ML/GPT or Human'

# Default number of epochs and per-device batch size for the training runs.
EPOCHS = 10
BATCH_SIZE = 16

# Use the selected GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE_NUM = 0
DEVICE = "cpu" if not cuda.is_available() else f"cuda:{DEVICE_NUM}"

In [None]:
# Load the raw training table from the project folder.
df = pd.read_csv(f'{WORK_FOLDER}/data/train.csv')

In [None]:
# Text label -> class id (1 = AI-written answer, 0 = human-written answer).
LABEL_TO_ID = {'ai_answer': 1, 'hu_answer': 0}

# Drop the identifier columns. errors='ignore' keeps the cell re-runnable:
# a second execution no longer raises because the columns are already gone.
df = df.drop(columns=['q_id', 'line_id'], errors='ignore')

# Encode the labels. Values that are already encoded pass through unchanged,
# so re-running the cell does not turn every label into NaN.
df['label'] = df['label'].map(lambda value: LABEL_TO_ID.get(value, value))

# Fail loudly on anything that is neither a known text label nor a class id.
assert df['label'].isin([0, 1]).all(), "unexpected values in the 'label' column"

df.head()

Unnamed: 0,q_title,label,ans_text
0,"Какие комплектующие должны быть в компьютере, ...",0,"Да ничего особенного. :)\nКорпус должен быть, ..."
1,"Какие комплектующие должны быть в компьютере, ...",1,Здравствуйте! Спасибо за интересный вопрос. Дл...
2,Loading a Reusable UITableViewCell from a Nib,1,"To load a reusable UITableViewCell from a Nib,..."
3,Loading a Reusable UITableViewCell from a Nib,0,"Actually, since you are building the cell in I..."
4,How can I change UIButton title color?,0,You can use -[UIButton setTitleColor:forState:...


In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable.
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# Checkpoint name shared by the tokenizer and the classification models below.
multi_lan_model = "bert-base-multilingual-cased"
multi_tokenizer = BertTokenizer.from_pretrained(multi_lan_model)
# Hide the download output.
clear_output()

In [None]:
# Encode both splits and wrap them as TensorDatasets.
# The Trainer cells below read `train_dataset` and `val_dataset`, so both have
# to be created here; with this code commented out a fresh kernel fails with a
# NameError as soon as the first Trainer is built.
tokenized_train = multi_tokenizer(
    train_df["ans_text"].tolist(),
    padding=True,
    truncation=True,
    return_tensors="pt",
)
train_input_ids, train_masks = (
    tokenized_train["input_ids"],
    tokenized_train["attention_mask"],
)
# Labels get a trailing axis so each sample's label is a tensor that the
# collator can stack.
train_labels = torch.tensor(train_df["label"].values).unsqueeze(-1)

tokenized_valid = multi_tokenizer(
    valid_df["ans_text"].tolist(),
    padding=True,
    truncation=True,
    return_tensors="pt",
)
valid_input_ids, valid_masks = (
    tokenized_valid["input_ids"],
    tokenized_valid["attention_mask"],
)
valid_labels = torch.tensor(valid_df["label"].values).unsqueeze(-1)

# Sample layout: (input_ids, attention_mask, labels).
train_dataset = TensorDataset(train_input_ids, train_masks, train_labels)
val_dataset = TensorDataset(valid_input_ids, valid_masks, valid_labels)

In [None]:
# Inspect the encoded validation split (input_ids, token_type_ids, attention_mask).
tokenized_valid

{'input_ids': tensor([[   101,  11220,  10454,  ...,    121,    119,    102],
        [   101,    107, 105104,  ...,      0,      0,      0],
        [   101,    523,    117,  ...,      0,      0,      0],
        ...,
        [   101,  69345,  15597,  ...,  10297,  12709,    102],
        [   101,  11469,  13708,  ...,      0,      0,      0],
        [   101,    516,  35865,  ...,      0,      0,      0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

## *Бейзлайн модель*

In [None]:
# Baseline: multilingual BERT with the stock two-class classification head.
# It has to be instantiated here because the Trainer cells right below use
# `base_model`; with this code commented out they fail on a fresh kernel.
base_model = BertForSequenceClassification.from_pretrained(
    multi_lan_model,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

base_model.to(DEVICE)
# Hide the download / initialisation output.
clear_output()

In [None]:
class MyDataCollator:
    """Collate tuple samples into the keyword batch passed to the model.

    Every sample is indexed as ``(input_ids, attention_mask, labels)``; each
    position is stacked along a new leading batch dimension.
    """

    # Batch key for each position inside a sample tuple.
    _FIELDS = ("input_ids", "attention_mask", "labels")

    def __call__(self, batch):
        """Return a dict holding one stacked tensor per field name."""
        return {
            field: torch.stack([sample[position] for sample in batch])
            for position, field in enumerate(self._FIELDS)
        }

def f1_metric(preds):
    """Compute the F1 score for one evaluation pass.

    Args:
        preds: object exposing ``predictions`` (per-class scores, arg-maxed
            over the last axis here) and ``label_ids`` (reference labels).

    Returns:
        A dict with the single key ``"F1"``.
    """
    predicted_classes = preds.predictions.argmax(-1)
    score = f1_score(preds.label_ids, predicted_classes)
    return {"F1": score}


training_args = TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir=f"{WORK_FOLDER}/Runs/Base",
    logging_dir=f"{WORK_FOLDER}/Runs",
    # Schedule and batch sizes.
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Step-based evaluation (every 500 steps) and step-based checkpointing,
    # keeping at most two checkpoints.
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_total_limit=2,
    # Reload the best checkpoint at the end, ranked by the "F1" metric
    # (higher is better).
    load_best_model_at_end=True,
    metric_for_best_model="F1",
    greater_is_better=True,
    # Keep the collator's batch keys as they are and pass inputs to the metric.
    remove_unused_columns=False,
    include_inputs_for_metrics=True,
)

In [None]:
# Baseline run: fine-tune `base_model` with the default `training_args`.
trainer = Trainer(
    args=training_args,
    model=base_model,
    data_collator=MyDataCollator(),
    compute_metrics=f1_metric,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()



Step,Training Loss,Validation Loss,F1
500,0.1628,0.19227,0.95534
1000,0.026,0.163972,0.976143


Step,Training Loss,Validation Loss,F1
500,0.1628,0.19227,0.95534
1000,0.026,0.163972,0.976143
1500,0.0,0.131455,0.983017
2000,0.0,0.141592,0.983017


TrainOutput(global_step=2410, training_loss=0.03918642372654218, metrics={'train_runtime': 3817.0127, 'train_samples_per_second': 10.1, 'train_steps_per_second': 0.631, 'total_flos': 1.0142931184128e+16, 'train_loss': 0.03918642372654218, 'epoch': 10.0})

#### *Другие гиперпараметры у базовой модели*

In [None]:
# Second baseline configuration: more epochs, weight decay and a much smaller
# learning rate. The shared `training_args` object is modified in place, so
# these values stay in effect for later cells until they are overwritten.
training_args.learning_rate = 1e-6
training_args.weight_decay = 0.01
training_args.num_train_epochs = 15
training_args.output_dir = f"{WORK_FOLDER}/Runs/Base2"

# NOTE(review): this cell only builds the trainer; no train() call is visible here.
trainer = Trainer(
    args=training_args,
    model=base_model,
    data_collator=MyDataCollator(),
    compute_metrics=f1_metric,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Early stopping with a patience of 2 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

*Скор упал*

## *Другой классификатор*

In [None]:
class TwoLayerMLPClassifier(nn.Module):
    """Two-layer classification head: Linear -> ReLU -> Dropout -> Linear.

    Args:
        hidden_size: width of the incoming features and of the hidden layer.
        num_classes: number of output logits.
        dropout_rate: dropout probability applied after the activation.
    """

    def __init__(self, hidden_size, num_classes, dropout_rate=0.1):
        super().__init__()
        # Attribute names define the state_dict keys; the creation order of the
        # two Linear layers is kept so random initialisation is unchanged.
        self.layer1 = nn.Linear(hidden_size, hidden_size)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.layer2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Map features ``x`` to ``num_classes`` logits."""
        hidden = self.activation(self.layer1(x))
        return self.layer2(self.dropout(hidden))

In [None]:
# Replace the model's classification head with the two-layer MLP defined above.
base_model.classifier = TwoLayerMLPClassifier(base_model.config.hidden_size, 2)

# Run settings (the shared `training_args` object is modified in place).
training_args.learning_rate = 1e-6
training_args.weight_decay = 0.01
training_args.num_train_epochs = 15
training_args.output_dir = f"{WORK_FOLDER}/Runs/Class2"

trainer = Trainer(
    args=training_args,
    model=base_model,
    data_collator=MyDataCollator(),
    compute_metrics=f1_metric,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Early stopping with a patience of 2 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

In [None]:
# Start fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()



Step,Training Loss,Validation Loss,F1
500,0.1644,0.079431,0.981057
1000,0.0166,0.061808,0.984985
1500,0.0073,0.070297,0.984


Step,Training Loss,Validation Loss,F1
500,0.1644,0.079431,0.981057
1000,0.0166,0.061808,0.984985
1500,0.0073,0.070297,0.984
2000,0.0045,0.078709,0.984


TrainOutput(global_step=2000, training_loss=0.04819664597511292, metrics={'train_runtime': 3109.9205, 'train_samples_per_second': 18.594, 'train_steps_per_second': 1.162, 'total_flos': 8475491924656128.0, 'train_loss': 0.04819664597511292, 'epoch': 8.3})

*^ Это пока лучшая модель*

## *Дисбаланс классов + бейзлайн / другой классификатор*

In [None]:
# Class distribution of the training split.
train_df['label'].value_counts()

1    1932
0    1923
Name: label, dtype: int64

In [None]:
# Balance the training split by topping up whichever class is rarer.
label_counts = train_df['label'].value_counts()
# Look the counts up by label: value_counts() orders by frequency, so
# positional unpacking would silently swap the numbers if class 0 were larger.
class_count_0 = int(label_counts.get(0, 0))
class_count_1 = int(label_counts.get(1, 0))
target_count = max(class_count_0, class_count_1)

class_0 = train_df[train_df['label'] == 0]
class_1 = train_df[train_df['label'] == 1]


def _oversample(frame, target, seed=42):
    """Return `frame` extended to `target` rows.

    Every original row is kept; only the missing rows are drawn (with
    replacement) as duplicates. Sampling the whole class with replacement
    would instead drop a large share of the original rows.
    """
    missing = target - len(frame)
    if missing <= 0 or frame.empty:
        return frame
    extra = frame.sample(missing, replace=True, random_state=seed)
    return pd.concat([frame, extra], axis=0)


class_0_over = _oversample(class_0, target_count)
class_1_over = _oversample(class_1, target_count)

# Shuffle with a fixed seed so the balanced frame is reproducible.
train_df_balanced = pd.concat([
    class_0_over, class_1_over
    ], axis=0).sample(frac=1, random_state=42)

In [None]:
# Check that both classes now have the same number of rows.
train_df_balanced['label'].value_counts()

0    1932
1    1932
Name: label, dtype: int64

In [None]:
# Encode the class-balanced training texts as PyTorch tensors (padded and truncated).
tokenized_train = multi_tokenizer(
    train_df_balanced["ans_text"].tolist(),
    return_tensors="pt",
    truncation=True,
    padding=True,
)
train_input_ids = tokenized_train["input_ids"]
train_masks = tokenized_train["attention_mask"]

# Labels get a trailing axis; sample layout is (input_ids, attention_mask, labels).
train_labels = torch.tensor(train_df_balanced["label"].values).unsqueeze(-1)
train_dataset = TensorDataset(train_input_ids, train_masks, train_labels)

In [None]:
# Fresh baseline model for the class-balanced experiment.
base_model = BertForSequenceClassification.from_pretrained(
    multi_lan_model,
    num_labels=2,
    output_hidden_states=False,
    output_attentions=False,
)

base_model.to(DEVICE)
clear_output()

# Run settings (the shared `training_args` object is modified in place).
training_args.learning_rate = 1e-6
training_args.weight_decay = 0.01
training_args.num_train_epochs = 30
training_args.output_dir = f"{WORK_FOLDER}/Runs/Base4"

trainer = Trainer(
    args=training_args,
    model=base_model,
    data_collator=MyDataCollator(),
    compute_metrics=f1_metric,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Early stopping with a patience of 5 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

trainer.train()



Step,Training Loss,Validation Loss,F1
500,0.4077,0.333023,0.893773


In [None]:
# Continue training from a saved checkpoint of the Base2 run.
# NOTE(review): the trainer built above writes to Runs/Base4 but resumes from a
# Base2 checkpoint — confirm this mix is intended.
trainer.train(resume_from_checkpoint=f"{WORK_FOLDER}/Runs/Base2/checkpoint-2000")

Step,Training Loss,Validation Loss


TrainOutput(global_step=2410, training_loss=3.891109384565433e-06, metrics={'train_runtime': 634.6782, 'train_samples_per_second': 60.739, 'train_steps_per_second': 3.797, 'total_flos': 1.0142931184128e+16, 'train_loss': 3.891109384565433e-06, 'epoch': 10.0})

In [None]:
# Evaluate the current model on the trainer's eval dataset.
trainer.evaluate()

{'eval_loss': 0.1908539980649948,
 'eval_F1': 0.9752229930624381,
 'eval_runtime': 32.6148,
 'eval_samples_per_second': 29.557,
 'eval_steps_per_second': 1.87,
 'epoch': 20.0}

In [None]:
# Save the current model under the Base2 run folder.
trainer.save_model(f"{WORK_FOLDER}/Runs/Base2/maybe_better_cls_balanced")

In [None]:
# Optional experiment (disabled): freeze the encoder and train only the head.
# for param in base_model.base_model.parameters():
#     param.requires_grad = False

# Replace the classification head with the two-layer MLP.
base_model.classifier = TwoLayerMLPClassifier(base_model.config.hidden_size, 2)

# Run settings (the shared `training_args` object is modified in place).
training_args.learning_rate = 1e-6
training_args.weight_decay = 0.01
training_args.num_train_epochs = 30
training_args.output_dir = f"{WORK_FOLDER}/Runs/Class4"

trainer = Trainer(
    args=training_args,
    model=base_model,
    data_collator=MyDataCollator(),
    compute_metrics=f1_metric,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Early stopping with a patience of 5 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mlisabeth-shevtsova[0m ([33mliza-i-pivko[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,F1
500,0.6074,0.40549,0.875
1000,0.3548,0.251672,0.920152
1500,0.1839,0.213309,0.939013
2000,0.0983,0.174511,0.952102
2500,0.0587,0.219478,0.951267
3000,0.0457,0.246394,0.948693


Step,Training Loss,Validation Loss,F1
500,0.6074,0.40549,0.875
1000,0.3548,0.251672,0.920152
1500,0.1839,0.213309,0.939013
2000,0.0983,0.174511,0.952102
2500,0.0587,0.219478,0.951267
3000,0.0457,0.246394,0.948693


In [None]:
# Audible "cell finished" cue: play the beep three times in the Colab front end.
BEEP_JS = 'new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()'
for beep_number in range(3):
    output.eval_js(BEEP_JS)

## *Языки*

In [None]:
from langdetect import detect

In [None]:
# langdetect is randomised internally, so the same text can get different
# language codes between runs; pinning the factory seed makes the result
# repeatable.
from langdetect import DetectorFactory
DetectorFactory.seed = 0

# One language code per answer, followed by the code distribution.
train_langs = train_df_balanced['ans_text'].apply(detect)
train_langs.value_counts()

en    2170
ru    1691
ca       3
Name: ans_text, dtype: int64

In [None]:
# Attach the detected language, folding every non-Russian code into 'en'.
train_df_balanced['lang'] = train_langs.where(train_langs == 'ru', 'en')

train_df_balanced.head()

Unnamed: 0,q_title,label,ans_text,lang
2246,Can multithreading be implemented on a single ...,1,"Yes, multithreading can be implemented on a si...",en
1498,How do I get the intersection between two arra...,1,"Well, well, well, looks like someone needs to ...",en
2880,"Есть ли что-то из классической литературы, что...",0,Примерно раз в 5 лет перечитываю всего Оскара ...,ru
1031,iOS app error - Can't add self as subview,0,I am speculating based on something similar th...,en
1313,Какую книгу для подростков 14-15 лет вы считае...,0,"Помню как (правда, это было уже на пороге 16 л...",ru


In [None]:
# Check which language codes remain after the mapping.
train_df_balanced.lang.unique()

array(['en', 'ru'], dtype=object)

## *Augmentation*

In [None]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action

In [None]:
# Back-translation augmenter for Russian answers (ru -> en, then en -> ru).
back_translation_aug_ru = naw.BackTranslationAug(
    to_model_name='Helsinki-NLP/opus-mt-en-ru',
    from_model_name='Helsinki-NLP/opus-mt-ru-en',
    batch_size=8,
    device=DEVICE,
)

clear_output()

# Copy the Russian rows so the balanced frame keeps its original texts.
class_rus = train_df_balanced.loc[train_df_balanced['lang'] == 'ru'].copy()
aug_rus = back_translation_aug_ru.augment(class_rus['ans_text'].tolist())

In [None]:
# Audible "cell finished" cue: play the beep three times in the Colab front end.
BEEP_JS = 'new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()'
for beep_number in range(3):
    output.eval_js(BEEP_JS)

In [None]:
# Release cached GPU memory before the next augmenter is loaded.
torch.cuda.empty_cache()

In [None]:
# Back-translation augmenter for English answers (en -> ru, then ru -> en).
back_translation_aug_en = naw.BackTranslationAug(
    to_model_name='Helsinki-NLP/opus-mt-ru-en',
    from_model_name='Helsinki-NLP/opus-mt-en-ru',
    batch_size=8,
    device=DEVICE,
)
# Copy the English rows so the balanced frame keeps its original texts.
class_eng = train_df_balanced.loc[train_df_balanced['lang'] == 'en'].copy()
aug_eng = back_translation_aug_en.augment(class_eng['ans_text'].tolist())

In [None]:
# Release cached GPU memory once augmentation is done.
torch.cuda.empty_cache()

In [None]:
# Audible "cell finished" cue: play the beep three times in the Colab front end.
BEEP_JS = 'new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()'
for beep_number in range(3):
    output.eval_js(BEEP_JS)

In [None]:
# Replace the answers in the per-language copies with their back-translated
# versions; every other column (including the label) stays as it was.
class_rus = class_rus.assign(ans_text=aug_rus)
class_eng = class_eng.assign(ans_text=aug_eng)

In [None]:
# Original balanced rows plus both back-translated copies, shuffled and
# re-indexed from zero.
train_df_balanced_aug = pd.concat([train_df_balanced, class_rus, class_eng]).sample(frac=1).reset_index(drop=True)
# index=False: the index is a fresh 0..n-1 range with no information in it;
# writing it would add a spurious "Unnamed: 0" column when the file is reloaded.
train_df_balanced_aug.to_csv(f"{WORK_FOLDER}/train_df_balanced_aug.csv", index=False)

In [None]:
# Reload the cached augmented training set, so the augmentation cells above
# need not be re-run.
train_df_balanced_aug = pd.read_csv(f"{WORK_FOLDER}/train_df_balanced_aug.csv")

In [None]:
# Encode the augmented training texts as PyTorch tensors (padded and truncated).
tokenized_train = multi_tokenizer(
    train_df_balanced_aug["ans_text"].tolist(),
    return_tensors="pt",
    truncation=True,
    padding=True,
)
train_input_ids = tokenized_train["input_ids"]
train_masks = tokenized_train["attention_mask"]
# Labels get a trailing axis; sample layout is (input_ids, attention_mask, labels).
train_labels = torch.tensor(train_df_balanced_aug["label"].values).unsqueeze(-1)
train_dataset = TensorDataset(train_input_ids, train_masks, train_labels)

# Fresh baseline model for the augmentation experiment.
base_model = BertForSequenceClassification.from_pretrained(
    multi_lan_model,
    num_labels=2,
    output_hidden_states=False,
    output_attentions=False,
)

base_model.to(DEVICE)
clear_output()

# NOTE(review): "Runs/Base4" is also the output_dir of the class-balanced
# baseline run, and only the epoch count is set here, so other fields of
# `training_args` keep whatever earlier cells assigned — confirm both are intended.
training_args.num_train_epochs = 10
training_args.output_dir = f"{WORK_FOLDER}/Runs/Base4"

trainer = Trainer(
    args=training_args,
    model=base_model,
    data_collator=MyDataCollator(),
    compute_metrics=f1_metric,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Early stopping with a patience of 3 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [None]:
# Start fine-tuning on the augmented data; the returned TrainOutput is shown
# as the cell result.
trainer.train()



Step,Training Loss,Validation Loss,F1
500,0.2453,0.12521,0.964567
1000,0.0571,0.25238,0.960938
1500,0.0271,0.136579,0.974155
2000,0.0145,0.190755,0.973081
2500,0.014,0.276504,0.967552


Step,Training Loss,Validation Loss,F1
500,0.2453,0.12521,0.964567
1000,0.0571,0.25238,0.960938
1500,0.0271,0.136579,0.974155
2000,0.0145,0.190755,0.973081
2500,0.014,0.276504,0.967552
3000,0.0024,0.298018,0.966535


TrainOutput(global_step=3000, training_loss=0.06007338134447734, metrics={'train_runtime': 4884.3315, 'train_samples_per_second': 15.822, 'train_steps_per_second': 0.989, 'total_flos': 1.262933065728e+16, 'train_loss': 0.06007338134447734, 'epoch': 6.21})

In [None]:
# Audible "cell finished" cue: play the beep three times in the Colab front end.
BEEP_JS = 'new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()'
for beep_number in range(3):
    output.eval_js(BEEP_JS)

## *SCRIPT FOR INFERENCE*

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import BertForSequenceClassification, BertTokenizer
from torch.utils.data import TensorDataset, DataLoader


class TwoLayerMLPClassifier(nn.Module):
    """Two-layer classification head: Linear -> ReLU -> Dropout -> Linear.

    Args:
        hidden_size: width of the incoming features and of the hidden layer.
        num_classes: number of output logits.
        dropout_rate: dropout probability applied after the activation.
    """

    def __init__(self, hidden_size, num_classes, dropout_rate=0.1):
        super().__init__()
        # Attribute names define the state_dict keys used when a checkpoint is
        # loaded, so they are kept exactly as they are.
        self.layer1 = nn.Linear(hidden_size, hidden_size)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.layer2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Map features ``x`` to ``num_classes`` logits."""
        hidden = self.activation(self.layer1(x))
        return self.layer2(self.dropout(hidden))


class CustomBertForSequenceClassification(BertForSequenceClassification):
    """BERT sequence classifier whose head is a ``TwoLayerMLPClassifier``."""

    def __init__(self, config):
        super().__init__(config)
        # Swap the head created by the parent class for the two-layer MLP,
        # sized from the encoder's hidden width and the number of labels.
        encoder_width = self.bert.config.hidden_size
        self.classifier = TwoLayerMLPClassifier(encoder_width, self.num_labels)


# Checkpoint name used to load the tokenizer.
MULTI_LANG_BERT = "bert-base-multilingual-cased"

if __name__ == "__main__":
    # Use the GPU when one is available and fall back to the CPU otherwise,
    # instead of crashing on a hard-coded "cuda".
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # NOTE(review): this reads train.csv although the frame is treated as the
    # test set and written out as a submission — confirm the intended file.
    df_test = pd.read_csv(f"{WORK_FOLDER}/data/train.csv")
    multi_tokenizer = BertTokenizer.from_pretrained(
        MULTI_LANG_BERT, model_max_length=512)
    tokenized_test = multi_tokenizer(
        df_test["ans_text"].tolist(),
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    test_input_ids, test_masks = (
        tokenized_test["input_ids"],
        tokenized_test["attention_mask"],
    )
    test_dataset = TensorDataset(test_input_ids, test_masks)
    # shuffle is left at its default (off), so predictions come out in the
    # same order as the rows of df_test.
    loader = DataLoader(test_dataset, batch_size=256)

    # Load the fine-tuned weights, including the custom MLP head.
    model = CustomBertForSequenceClassification.from_pretrained(
        f"{WORK_FOLDER}/Runs/Class2/checkpoint-1000",
        num_labels=2,
        output_attentions=False,
        output_hidden_states=False
    )
    model.to(device)
    # Inference only: make sure dropout is switched off.
    model.eval()

    predictions = []
    with torch.no_grad():
        # Separate loop names so the full test tensors are not overwritten by
        # the last mini-batch.
        for batch_input_ids, batch_masks in loader:
            outputs = model(
                input_ids=batch_input_ids.to(device),
                attention_mask=batch_masks.to(device),
            )
            # Predicted class id = index of the largest logit.
            predictions.append(outputs.logits.argmax(-1).cpu())

    # Map class ids back to the text labels and write the submission file.
    df_test["label"] = torch.cat(predictions, dim=0).numpy()
    df_test["label"] = df_test["label"].map(
        {1: 'ai_answer', 0: 'hu_answer'})
    df_test[["line_id", "label"]].to_csv(
        f"{WORK_FOLDER}/data/submission.csv", sep=",", index=False)


In [None]:
# Release cached GPU memory after inference.
torch.cuda.empty_cache()