In [1]:
import math
from tqdm import tqdm
from datetime import datetime
import pandas as pd
import numpy as np
import os
import torch
import sentence_transformers
import matplotlib.pyplot as plt
from matplotlib import ticker
%matplotlib inline
import torch.nn as nn

from transformers import BertTokenizer, BertModel, pipeline, BartModel
from transformers import RobertaTokenizer, RobertaModel, AutoConfig
from transformers import AutoTokenizer, AutoModel

from sentence_transformers.util import cos_sim
from sentence_transformers import SentenceTransformer, models
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.metrics.pairwise import cosine_similarity
import torch.nn.functional as F
from DeBERTa import deberta
from collections import OrderedDict

import textstat

In [2]:
# Run on the first CUDA GPU when PyTorch reports one as available, otherwise on the CPU.
# The models and every batch in the cells below are moved to this device.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(device)

cuda:0


In [3]:
# Identifier passed as `pre_trained` when a DeBERTa encoder is built.
model_name = 'base'
# Default `input_size` of STSBertModel, i.e. the width of the pooled embedding;
# presumably the hidden size of the 'base' encoder — TODO confirm.
size = 768

In [4]:
def config_debert(config_path=r"C:\Users\shmak\.~DeBERTa\assets\latest\deberta-base\model_config.json",
                  dropout_prob=0.0):
    """
    Load the DeBERTa model configuration and override its dropout settings.

    :param config_path: path to the DeBERTa ``model_config.json``. The default is the
        machine-specific location the notebook was originally run with; pass another
        path when running anywhere else.
    :param dropout_prob: value written to both ``attention_probs_dropout_prob`` and
        ``hidden_dropout_prob``. The default 0.0 disables both dropouts.
    :return: configuration object loaded from ``config_path`` with the dropout overridden
    """
    configuration = deberta.ModelConfig()
    configuration = configuration.from_json_file(config_path)

    configuration.attention_probs_dropout_prob = dropout_prob
    configuration.hidden_dropout_prob = dropout_prob

    return configuration

In [5]:
# Notebook-level tokenizer; SentenseData.get_batch_text uses it to encode every summary.
# NOTE(review): this loads the BERT vocabulary while the encoder built below is DeBERTa —
# confirm that the produced token ids match the encoder's pretrained embeddings.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# tokenizer = RobertaTokenizer.from_pretrained()

In [6]:
# DeBERTa model
#
# Alternative kept for reference: loading the tokenizer shipped with the DeBERTa package.
# vocab_path, vocab_type = deberta.load_vocab(pretrained_id='large')
# tokenizer = deberta.tokenizers[vocab_type](vocab_path)

# Standalone encoder built from the notebook configuration and the `model_name` weights.
# NOTE(review): nothing in the cells shown here reads `embeddings_model`
# (STSBertModel builds its own encoder) — confirm it is still needed.
embeddings_model = deberta.DeBERTa(config=config_debert(), pre_trained=model_name)

08/17/2023 15:38:44|INFO|logging|00| Loaded pretrained model file C:\Users\shmak\.~DeBERTa/assets/latest/deberta-base\pytorch_model.bin


## Data

In [7]:
# Input csv files, given relative to the working directory of the notebook.
# NOTE: the backslash separators make these paths Windows-specific.
# The two *_train paths are the default arguments of SentenseData.
summaries_train_path = r"dataset\summaries_train_merged.csv"
summaries_test_path = r"dataset\summaries_test.csv"
prompts_train_path = r"dataset\prompts_train.csv"
prompts_test_path = r"dataset\prompts_test.csv"

In [8]:
def norm_score(df: pd.DataFrame) -> pd.DataFrame:
    """
    Min-max normalize scores to the range 0 to 1.

    The input is left untouched: a new object is returned instead of shifting and
    scaling the caller's data in place. The scalar comparisons below mean the function
    works on a single column (a pandas Series), not on a multi-column frame.

    :param df: column of numeric scores
    :return: new column with ``(df - min) / (max - min)``; when every value is the same
        the range is zero and a column of 0.0 is returned instead of NaN
    """
    shifted = df - df.min()
    span = shifted.max()

    if span == 0:
        # Constant input: dividing would produce 0/0 = NaN for every element.
        normalized = shifted.astype(float)
    else:
        normalized = shifted / span

    print('Normilized' if normalized.min() == 0.0 and normalized.max() == 1.0 else 'NormError:wrong values')

    return normalized

def _split_index(chunk):
    """Return the index at which ``chunk`` is cut in two non-empty parts near its middle."""
    half = len(chunk) // 2
    # Prefer the first sentence end after the middle, then the first space; a separator
    # is accepted only when real text remains after it, so neither part is ever the
    # whole chunk again (which would recurse forever).
    for separator in ('.', ' '):
        position = chunk.find(separator, half)
        if position != -1 and chunk[position + 1:].strip():
            return position + 1

    return half


def short_text(text, max_length=200) -> str:
    """
    Summarize ``text`` with the "bart-large-cnn" summarization pipeline.

    Texts longer than 700 whitespace-separated words are cut in two near the middle,
    each part is summarized recursively (with the default length limit of 200) and the
    concatenated part summaries are summarized once more.

    :param text: text to shorten
    :param max_length: ``max_length`` passed to the summarizer for the final summary
    :return: the ``summary_text`` of the first summarizer result
    """
    # Built once per call and shared by the recursion instead of being reloaded
    # for every chunk.
    summarizer = pipeline("summarization", model="bart-large-cnn")

    # summarizer.model.to(device)

    def _summarize(chunk, limit):
        if len(chunk.split()) > 700:
            cut = _split_index(chunk)
            # Parts use the default limit of 200, as the recursive calls always did.
            chunk = _summarize(chunk[:cut], 200) + _summarize(chunk[cut:], 200)

        summarized = summarizer(chunk, max_length=limit, min_length=80, do_sample=False)

        return summarized[0]['summary_text']

    return _summarize(text, max_length)

def moving_average(array: np.array, betta=0.9) -> np.array:
    """
    Exponentially weighted running mean of ``array`` with bias correction.

    :param array: sequence of values, read by integer index
    :param betta: smoothing factor; the running value keeps ``betta`` of its previous
        state and takes ``1 - betta`` of the new sample
    :return: float array of the same length with the bias-corrected averages
    """
    n_values = len(array)
    smoothed = np.zeros(n_values)
    running = 0

    for step in range(1, n_values + 1):
        running = betta * running + (1 - betta) * array[step - 1]
        # Dividing by (1 - betta^step) removes the bias towards the zero start value.
        smoothed[step - 1] = running / (1 - betta ** step)

    return smoothed


def mean_pooling(outputs, batch) -> torch.tensor:
    """
    Average the last hidden state over dimension 1, counting only unmasked positions.

    :param outputs: mapping whose 'hidden_states' entry is a sequence of tensors;
        the last one is pooled
    :param batch: mapping with the 'attention_mask' of the same batch
    :return: the masked sum over dimension 1 divided by the number of unmasked positions
    """
    last_hidden = outputs['hidden_states'][-1]

    # Broadcast the mask over the feature dimension so it can weight every value.
    weights = batch['attention_mask'].unsqueeze(-1).expand(last_hidden.size()).float()

    token_sum = torch.sum(last_hidden * weights, 1)
    # The clamp keeps the division finite for rows that are masked out entirely.
    token_count = torch.clamp(weights.sum(dim=1), min=1e-9)

    return token_sum / token_count

def normilize(df_column: pd.DataFrame):
    """
    Standardize a column: subtract its mean and divide by its standard deviation.

    :param df_column: numeric pandas column (or single-column frame)
    :return: new object of the same shape holding the standardized values
    """
    center = df_column.mean()
    spread = df_column.std()

    return (df_column - center) / spread

def get_cos_sim(df):
    """
    Cosine similarity between every summary and its prompt text.

    Each (text, prompt_text) pair is encoded with the 'all-mpnet-base-v2' model,
    mean-pooled over the unmasked tokens and compared.

    :param df: frame with the columns ``text`` and ``prompt_text``
    :return: list of floats, one similarity per row of ``df`` in row order
    """
    sentence_tokenizer = AutoTokenizer.from_pretrained('all-mpnet-base-v2')
    sentence_model = AutoModel.from_pretrained('all-mpnet-base-v2')
    sentence_model.to(device)

    cos_sim_list = []

    # Pair the columns positionally: looking the prompt up with the enumerate counter
    # is a label lookup and breaks for frames whose index is not 0..n-1.
    pairs = zip(df.text, df.prompt_text)

    for summarie, prompt in tqdm(pairs, total=len(df)):
        inputs = sentence_tokenizer([summarie, prompt], padding=True,
                                    return_tensors='pt', truncation=True)
        inputs = inputs.to(device)

        with torch.no_grad():
            # mean_pooling reads outputs['hidden_states'], which the model only
            # returns when it is asked for explicitly.
            outputs = sentence_model(**inputs, output_hidden_states=True)

        sentence_embeddings = mean_pooling(outputs, inputs)
        sentence_embeddings = F.normalize(sentence_embeddings)

        score = nn.CosineSimilarity(dim=0)(sentence_embeddings[0], sentence_embeddings[1])

        cos_sim_list.append(score.item())

    return cos_sim_list

def get_stat_features(df, text_col="text"):
    """
    Add the standardized hand-crafted features of every summary to ``df``.

    The columns ``num_unique_words``, ``num_words``, ``num_sentences``, ``syntax_count``,
    ``smog_index`` and ``cos_sim`` are written into ``df`` itself, each passed through
    ``normilize``.

    :param df: frame with the summaries; ``get_cos_sim`` additionally needs the columns
        ``text`` and ``prompt_text``
    :param text_col: name of the column the text statistics are computed from
    :return: the same ``df`` object with the feature columns added
    """
    texts = df[text_col]

    df["num_unique_words"] = normilize(texts.apply(lambda x: len(set(x.split()))))
    df["num_words"] = normilize(texts.apply(lambda x: len(x.split())))
    # Number of '.'-separated pieces, so a text ending with a period counts one extra.
    df["num_sentences"] = normilize(texts.apply(lambda x: len(x.split('.'))))

    df["syntax_count"] = normilize(texts.apply(lambda x: x.count(",")
                                               + x.count("-") + x.count(";") + x.count(":")))
    df['smog_index'] = normilize(texts.apply(lambda x: textstat.smog_index(x)))

    # Carry the frame's own index: a freshly built 0..n-1 index would be aligned by
    # label on assignment and leave NaN for frames with any other index.
    df['cos_sim'] = normilize(pd.Series(get_cos_sim(df), index=df.index))

    return df

In [9]:
class SentenseData(Dataset):
    """
    Summaries of one cross-validation split, tokenized on access.

    :param summaries_path: path to csv file with summaries
    :param prompts_path: path to csv file with prompts
    :param score: defines which score is used as the target: 'content' or 'wording'
    :param test: True keeps only the summaries whose prompt_id equals ``fold_test``,
        False keeps every other summary
    :param fold_test: prompt_id of the held-out fold
    """
    # Columns of the summaries frame packed, in this order, into the 'features' tensor.
    feature_columns = ('num_unique_words', 'num_words', 'num_sentences',
                       'syntax_count', 'smog_index', 'cos_sim')
    # Values of ``score`` that name a target column.
    score_types = ('content', 'wording')

    def __init__(self, summaries_path=summaries_train_path,
                 prompts_path=prompts_train_path,
                 score='wording', test=False, fold_test='ebad26'):
        # `test` and `fold_test` must be set first: the `summaries` setter reads them.
        self.test = test
        self.score_type = score
        self.fold_test = fold_test

        with open(prompts_path, encoding='utf-8') as f:
            self.prompts = pd.read_csv(f)

        with open(summaries_path, encoding='utf-8') as f:
            self.summaries = pd.read_csv(f)

    def __len__(self):
        return len(self.__summaries)

    @property
    def summaries(self):
        """Summaries of this split, indexed 0..n-1."""
        return self.__summaries

    @summaries.setter
    def summaries(self, df):
        # Keep the held-out prompt for the test split and everything else for training.
        if self.test:
            selected = df[df.prompt_id == self.fold_test]
        else:
            selected = df[df.prompt_id != self.fold_test]

        self.__summaries = selected.reset_index(drop=True)

    @property
    def prompts(self):
        """Prompts frame exactly as it was read from the csv file."""
        return self.__prompts

    @prompts.setter
    def prompts(self, file):
        self.__prompts = file

    def get_batch_text(self, index):
        """
        Tokenize the summary at ``index`` and attach its hand-crafted features.

        :return: dict with the tokenizer outputs (leading batch dimension removed) and a
            float32 'features' tensor holding the ``feature_columns`` values of the row
        """
        summary_text = self.summaries.text[index]

        res = tokenizer(summary_text, padding='max_length', return_tensors='pt', truncation=True)
        res = {k: val.squeeze() for k, val in res.items()}
        res['features'] = torch.tensor([self.summaries[column][index] for column in self.feature_columns],
                                       dtype=torch.float32)

        return res

    def get_score(self, index):
        """
        Target of the summary at ``index``.

        :raises ValueError: when the dataset was created with an unknown ``score``
        """
        if self.score_type not in self.score_types:
            raise ValueError(f"score must be 'content' or 'wording', got {self.score_type!r}")

        score = self.summaries[self.score_type][index]

        return torch.tensor(score)

    def __getitem__(self, index) -> tuple:
        """Return ``(inputs, score)`` for the summary at ``index``."""
        batch_text = self.get_batch_text(index)
        batch_score = self.get_score(index)

        return batch_text, batch_score

## Model

In [10]:
class STSBertModel(nn.Module):
    """
    Score regression model: DeBERTa encoder, masked mean pooling, self-attention
    and a linear output layer.

    :param with_features: True to append the 6 hand-crafted features from ``x['features']``
        to the pooled embedding before the head
    :param input_size: width of the pooled encoder embedding
    """
    def __init__(self, with_features=False, input_size=size):
        super().__init__()

        self.word_embedding = deberta.DeBERTa(config=config_debert(), pre_trained=model_name)
        self.cos_score = nn.CosineSimilarity(dim=0)
        self.identity = nn.Identity()
        self.input_size = input_size
        self.with_features = with_features

        if with_features:
            self.n_features = 6
            self.input_size += self.n_features

        # To train only the head, freeze the encoder:
        # for param in self.word_embedding.parameters():
        #     param.requires_grad = False

        self.attention = SelfAttention(self.input_size)
        # `block` and `FC_head` are not called in forward(); they are kept because
        # the optimizer set-up in train_model reads their parameters.
        self.block = AttentionBlock(self.input_size)

        self.FC_head = nn.Sequential(OrderedDict([
            ('fc1', nn.Linear(self.input_size, self.input_size)),
            ('relu1', nn.ReLU()),
            ]))

        self.FC_output = nn.Sequential(OrderedDict([
            ('fc1', nn.Linear(self.input_size, 1)),
            ]))

    def forward(self, x):
        """
        :param x: dict with 'input_ids' and 'attention_mask', plus 'features' when the
            model was built with ``with_features=True``
        :return: 1-D tensor with one predicted score per sample
        """
        output = self.word_embedding(input_ids=x['input_ids'], attention_mask=x['attention_mask'])
        output = mean_pooling(output, x)

        if self.with_features:
            output = torch.cat((output, x['features']), dim=1)

        # NOTE(review): the pooled vector becomes a sequence of length 1, so the softmax
        # in SelfAttention is over a single score and the layer returns value(x).
        output = output.unsqueeze(1)
        output = self.attention(output)
        output = self.FC_output(output)

        # (batch, 1, 1) -> (batch,). A bare squeeze() would also drop the batch
        # dimension when the batch holds a single sample.
        return output.reshape(-1)


class AttentionBlock(nn.Module):
    """
    Residual block: self-attention, linear layer, skip connection, layer norm, ReLU.

    :param input_dim: size of the last dimension of the input, kept by the block
    """
    def __init__(self, input_dim):
        super().__init__()

        self.input_dim = input_dim
        self.fc = nn.Linear(input_dim, input_dim)
        self.attention = SelfAttention(input_dim)
        self.relu = nn.ReLU()
        self.norm = nn.LayerNorm(input_dim)

    def forward(self, x):
        projected = self.fc(self.attention(x))

        # Skip connection, added in place to the freshly computed projection.
        projected += x

        return self.relu(self.norm(projected))


class SelfAttention(nn.Module):
    """
    Single-head scaled dot-product self-attention for batched 3-D input.

    :param input_dim: size of the last dimension of the input; queries, keys and
        values keep this size
    """
    def __init__(self, input_dim):
        super().__init__()
        self.input_dim = input_dim
        self.query = nn.Linear(input_dim, input_dim)
        self.key = nn.Linear(input_dim, input_dim)
        self.value = nn.Linear(input_dim, input_dim)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, x):
        q, k, v = self.query(x), self.key(x), self.value(x)

        # Scores are divided by sqrt(input_dim) before the softmax over the last dimension.
        scale = self.input_dim ** 0.5
        weights = self.softmax(torch.bmm(q, k.transpose(1, 2)) / scale)

        return torch.bmm(weights, v)


## Training

In [11]:
def validate_model(model, loss_func):
    """
    Evaluate ``model`` on the notebook-level ``test_loader``.

    The model is switched to eval mode and left in it; gradients are disabled and the
    forward pass runs under float16 autocast.

    :param model: model called with the dict of input tensors of a batch
    :param loss_func: loss applied to (predictions, targets); its square root is recorded
    :return: list with one float loss per batch of ``test_loader``
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for inputs, targets in test_loader:
            targets = targets.float().to(device)
            inputs = {name: tensor.squeeze().to(device) for name, tensor in inputs.items()}

            with torch.autocast(device_type='cuda', dtype=torch.float16):
                outputs = model(inputs)

            rmse = torch.sqrt(loss_func(outputs.squeeze(), targets))
            batch_losses.append(rmse.item())

    return batch_losses

def train_model(n_epochs, lr, weight_decay, n_fold):
    """
    Train a fresh STSBertModel on the notebook-level ``train_loader`` and keep the
    weights with the lowest validation loss.

    Validation runs every 20 batches and after the last batch of an epoch. Whenever the
    mean validation loss improves, the state dict is saved to
    ``models\\cv\\best__fold_<n_fold>.pt``.

    :param n_epochs: number of passes over ``train_loader``
    :param lr: learning rate of the encoder parameters (the head uses a fixed 5e-04)
    :param weight_decay: weight decay of the AdamW optimizer
    :param n_fold: fold number, used in the checkpoint file name and the log lines
    :return: tuple ``(best_score, n_iter, losses_test, losses_train)``: the lowest mean
        validation loss, the 1-based validation step it was reached at, and the mean
        validation / running mean training loss recorded at every validation step.
        ``best_score`` is ``inf`` and ``n_iter`` is 0 when no validation improved on it.
    """
    model = STSBertModel(with_features=True)

    model.float()
    model.to(device)

    lr_head = 5e-04
    params = [
        {'params': model.word_embedding.parameters()},
        {'params': model.attention.parameters(), 'lr': lr_head},
        {'params': model.block.parameters(), 'lr': lr_head},
        {'params': model.FC_head.parameters(), 'lr': lr_head},
        {'params': model.FC_output.parameters(), 'lr': lr_head}
    ]

    loss_func = nn.MSELoss()
    optimizer = torch.optim.AdamW(params, lr=lr, weight_decay=weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 2, gamma=0.5)
    scaler = torch.cuda.amp.GradScaler()

    # Make sure the checkpoint directory exists before the first save.
    save_path = r'models\cv\best__fold_{}.pt'.format(n_fold)
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    losses_train = []
    losses_test = []
    # Defined up front so the summary below cannot hit an unbound name.
    best_score = float('inf')
    n_iter = 0
    t0 = datetime.now()

    for i in range(n_epochs):

        losses_train_per_epoch = []

        for j, batch in enumerate(tqdm(train_loader, position=0, leave=True), 1):
            inputs, targets = batch
            targets = targets.float().to(device)

            # validate_model leaves the model in eval mode, so re-enable training here.
            model.train()

            optimizer.zero_grad()

            with torch.autocast(device_type='cuda', dtype=torch.float16):
                inputs = {k: val.squeeze().to(device) for k, val in inputs.items()}
                outputs = model(inputs)

                loss = torch.sqrt(loss_func(outputs.squeeze(), targets))

            scaler.scale(loss).backward()
            scaler.step(optimizer)

            scaler.update()

            losses_train_per_epoch.append(loss.item())

            if j%20 == 0 or j == len(train_loader):

                losses_test_per_epoch = validate_model(model, loss_func)
                losses_test_mean = np.mean(losses_test_per_epoch)
                losses_train_mean = np.mean(losses_train_per_epoch)

                losses_train.append(losses_train_mean)
                losses_test.append(losses_test_mean)

                # Strict comparison against the tracked best instead of float equality
                # with the minimum of the history.
                if losses_test_mean < best_score:
                    best_score = losses_test_mean
                    n_iter = len(losses_train)
                    torch.save(model.state_dict(), save_path)
                    print(f'Best saved, loss: {best_score:.5f}')

                print(f'Fold# {n_fold}, Epoch: {i+1}/{n_epochs}, L_train: {losses_train_mean:.5f},\
                L_test: {losses_test_mean:.5f}')

        # Advance the schedule once per epoch; without this call StepLR never
        # changes the learning rates.
        lr_scheduler.step()

        torch.cuda.empty_cache()

    print(f"Time elapsed:{datetime.now()-t0}, Best score:{best_score:.5f}")

    return best_score, n_iter, losses_test, losses_train

In [12]:
def loaders_init(score_type, batch_size, fold):
    """
    Build the data loaders of one cross-validation fold.

    :param score_type: target passed to SentenseData as ``score``
    :param batch_size: batch size of both loaders
    :param fold: prompt_id held out for validation
    :return: tuple ``(train_loader, test_loader)``
    """
    test_data = SentenseData(test=True, score=score_type, fold_test=fold)
    train_data = SentenseData(score=score_type, fold_test=fold)

    train_loader = DataLoader(train_data,
                              batch_size=batch_size,
                              shuffle=True, drop_last=True)

    # No shuffling for validation: together with drop_last a shuffled loader scores a
    # different subset on every validation, so the losses compared when picking the
    # best checkpoint would not refer to the same samples.
    # drop_last stays on because the training code squeezes every batch and would
    # lose the batch dimension of a trailing single-sample batch.
    test_loader = DataLoader(test_data,
                             batch_size=batch_size,
                             shuffle=False, drop_last=True)

    return train_loader, test_loader

def axis_set(ax):
    """
    Draw the loss curves of the current fold on ``ax``.

    Reads the notebook-level variables ``test_losses``, ``train_losses``, ``n_iter``,
    ``best_score``, ``i`` and ``n_epochs``, so it has to be called after they were set
    for the fold being plotted.

    :param ax: matplotlib axes to draw on
    :return: the same ``ax``
    """
    ax.plot(test_losses, label='test')
    ax.plot(train_losses, label='train')
    ax.scatter(n_iter - 1, best_score, label='min')
    ax.set_title(f'#{i}, min: {best_score:.5f}', loc='left')

    # The locator places n_epochs + 1 ticks, so the formatter needs as many labels;
    # with only n_epochs labels the last tick is drawn without text.
    ax.xaxis.set_major_locator(ticker.LinearLocator(n_epochs + 1))
    ax.xaxis.set_major_formatter(ticker.FixedFormatter([str(epoch) for epoch in range(n_epochs + 1)]))
    ax.legend()

    return ax

In [None]:
# Leave-one-prompt-out cross-validation: every prompt_id is held out once, a model is
# trained on the remaining summaries and the best validation loss of the fold is kept.
with open(prompts_train_path, encoding='utf-8') as f:
    prompts = pd.read_csv(f)

folds_list = list(prompts.prompt_id)
score_type = 'content'
batch_size = 10
lr = 5e-04
weight_decay = 1e-05

n_epochs = 3

scores = []

# One panel per fold. squeeze=False keeps `axs` indexable for a single fold as well;
# no separate plt.figure() call is made, it would only leave an empty figure behind.
fig, axs = plt.subplots(1, len(folds_list), figsize=(12,4), layout='tight', squeeze=False)
axs = axs.ravel()

# `i`, `train_loader`, `test_loader`, `best_score`, `n_iter`, `test_losses` and
# `train_losses` are read as notebook-level names by train_model, validate_model and
# axis_set, so they must keep these names.
for i, fold in enumerate(folds_list, 1):
    train_loader, test_loader = loaders_init(score_type, batch_size, fold)

    best_score, n_iter, test_losses, train_losses = train_model(n_epochs, lr, weight_decay, i)

    scores.append(best_score)

    axs[i-1] = axis_set(axs[i-1])

mean_score = np.mean(scores)

fig.suptitle(f'CV={mean_score:.5f}')
fig.savefig('res_figure.png', dpi=200)

mean_score

08/17/2023 15:38:44|INFO|logging|00| Loaded pretrained model file C:\Users\shmak\.~DeBERTa/assets/latest/deberta-base\pytorch_model.bin
  4%|▍         | 20/510 [00:34<51:13,  6.27s/it]

Best saved, loss: 1.16936
Fold# 1, Epoch: 1/3, L_train: 2.47591,                L_test: 1.16936


  8%|▊         | 40/510 [01:08<49:14,  6.29s/it]

Best saved, loss: 0.71779
Fold# 1, Epoch: 1/3, L_train: 1.79553,                L_test: 0.71779


 12%|█▏        | 60/510 [01:41<47:03,  6.28s/it]

Best saved, loss: 0.53087
Fold# 1, Epoch: 1/3, L_train: 1.43130,                L_test: 0.53087


 16%|█▌        | 80/510 [02:14<43:58,  6.14s/it]

Fold# 1, Epoch: 1/3, L_train: 1.27160,                L_test: 0.84677


 20%|█▉        | 100/510 [02:47<42:06,  6.16s/it]

Fold# 1, Epoch: 1/3, L_train: 1.13871,                L_test: 0.68065


 24%|██▎       | 120/510 [03:20<39:46,  6.12s/it]

Fold# 1, Epoch: 1/3, L_train: 1.05696,                L_test: 0.54395


 27%|██▋       | 140/510 [03:54<38:42,  6.28s/it]

Best saved, loss: 0.50052
Fold# 1, Epoch: 1/3, L_train: 0.98846,                L_test: 0.50052


 31%|███▏      | 160/510 [04:27<35:27,  6.08s/it]

Fold# 1, Epoch: 1/3, L_train: 0.93374,                L_test: 0.51274


 35%|███▌      | 180/510 [04:59<33:09,  6.03s/it]

Fold# 1, Epoch: 1/3, L_train: 0.90014,                L_test: 0.53866


 39%|███▉      | 200/510 [05:32<31:15,  6.05s/it]

Fold# 1, Epoch: 1/3, L_train: 0.87184,                L_test: 0.68851


 43%|████▎     | 220/510 [06:05<29:50,  6.18s/it]

Best saved, loss: 0.49977
Fold# 1, Epoch: 1/3, L_train: 0.84937,                L_test: 0.49977


 47%|████▋     | 240/510 [06:38<27:14,  6.05s/it]

Fold# 1, Epoch: 1/3, L_train: 0.82841,                L_test: 0.50676


 51%|█████     | 260/510 [07:11<25:39,  6.16s/it]

Fold# 1, Epoch: 1/3, L_train: 0.81124,                L_test: 0.55497


 55%|█████▍    | 280/510 [07:44<23:15,  6.07s/it]

Fold# 1, Epoch: 1/3, L_train: 0.79324,                L_test: 0.50414


 59%|█████▉    | 300/510 [08:17<21:30,  6.14s/it]

Best saved, loss: 0.48804
Fold# 1, Epoch: 1/3, L_train: 0.77844,                L_test: 0.48804


 63%|██████▎   | 319/510 [08:31<02:23,  1.33it/s]