In [None]:
!pip install --upgrade accelerate

In [None]:
!pip install --no-cache-dir -q transformers sentencepiece

In [2]:
import random

import torch

# Single seed shared by every source of randomness configured in this notebook.
seed = 22
# Seed Python's own RNG as well: apply_word_dropout draws from the `random` module.
random.seed(seed)
torch.manual_seed(seed)
# Ask cuDNN to pick deterministic algorithms.
torch.backends.cudnn.deterministic = True
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config
# Checkpoint identifier passed to every from_pretrained call in this notebook
# (tokenizer here, config and model further down).
MODEL_NAME = 'bragovo/qqq'
# Tokenizer is created once at module level and shared by summarize(),
# preprocess_function() and compute_metrics().
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

def summarize(
    text, n_words=None, compression=None,
    max_length=1000, num_beams=3, do_sample=False, repetition_penalty=10.0,
    **kwargs
):
    """
    Generate a summary of `text` with the module-level `model` and `tokenizer`.

    An optional control tag is prepended to the text before tokenization:
    - n_words (int): approximate number of words to generate, rendered as "[n] ".
    - compression (float): approximate summary/original length ratio, rendered
      with one significant digit, e.g. "[0.3] ".
    The two are meant to be mutually exclusive; if both are given, n_words wins.

    max_length, num_beams, do_sample, repetition_penalty and any extra **kwargs
    are forwarded to model.generate(). Only the first generated sequence is
    decoded and returned (special tokens stripped).
    """
    if n_words:
        control_tag = '[{}] '.format(n_words)
    elif compression:
        control_tag = '[{0:.1g}] '.format(compression)
    else:
        control_tag = None
    # Leave the input untouched when no control tag was requested.
    prompt = text if control_tag is None else control_tag + text

    encoded = tokenizer(prompt, return_tensors='pt', padding=True).to(model.device)
    generation_options = dict(
        max_length=max_length,
        num_beams=num_beams,
        do_sample=do_sample,
        repetition_penalty=repetition_penalty,
    )
    # Inference only: no autograd bookkeeping is needed while generating.
    with torch.inference_mode():
        generated = model.generate(**encoded, **generation_options, **kwargs)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

In [None]:
import nltk
# Download NLTK's 'popular' resource collection; nltk.word_tokenize is used
# by apply_word_dropout further down.
nltk.download('popular')

In [5]:
import random

def apply_word_dropout(texts, dropout_prob):
    """
    Apply word dropout by replacing words with the "<unk>" token.

    Every text is split with nltk.word_tokenize; each resulting token is
    replaced by "<unk>" independently with probability `dropout_prob`, and the
    tokens are joined back together with single spaces.

    Params:
    -------
    texts (list): list of raw text strings
    dropout_prob (float): probability of replacing a word with the "<unk>" token

    Return:
    -------
    processed_texts (list) - one space-joined string per input text
    """

    def _maybe_drop(token):
        # One random draw per token, in token order.
        return "<unk>" if random.random() < dropout_prob else token

    return [
        ' '.join(_maybe_drop(token) for token in nltk.word_tokenize(text))
        for text in texts
    ]

In [None]:
!git clone https://github.com/spbu-smart-assistant/support-chatbot.git

In [7]:
!pip install -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/114.5 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.6/149.6 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from datasets import load_dataset
# One JSON file per split; the paths point into the support-chatbot checkout
# cloned earlier (assumes the clone lives under /content).
data_files = {
    'train': '/content/support-chatbot/data/raw/processed_train.json',
    'test': '/content/support-chatbot/data/raw/processed_test.json',
}
our_dataset = load_dataset('json', data_files=data_files)

In [9]:
our_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'summa'],
        num_rows: 54
    })
    test: Dataset({
        features: ['text', 'summa'],
        num_rows: 8
    })
})

In [10]:
# Shuffle the training split with the notebook-wide seed so that the
# contiguous fold slices taken later are reproducible.
our_dataset['train'] = our_dataset['train'].shuffle(seed=seed)

In [11]:
our_dataset['train'][0]['text']

'здравствуйте это служба технической поддержки университета чем я могу вам помочь здравствуйте у меня проблема с микрофоном он не улавливает мой голос во время онлайн занятий это ю сб микрофон или микрофон с разъемом три целых пять десятых миллиметра это есьби микрофон хорошо подключите его к другому юсб порту и посмотрите работает ли он ладно я попробую нет он все еще не работает так зайдите в диспетчер устройств и найдите микрофон в разделе аудия входов и выходов щелкните на нем правой кнопкой мыши и выберите обновить драйвер окей я делаю это он говорит что лучшие драйверы уже установлены хм это странно возможно проблема с самим микрофоном у вас есть другой микрофон который вы можете попробовать да у меня есть старый котораый использует разъем три целых и пять десятых миллиметра подключите его и посмотрите работает ли он хорошо и работает может быть вы сможете вернуть новый микрофон и получить деньги назад или замену да я так и сделаю хорошего дня вам тоже до свидания'

In [12]:
# Task prefix prepended to every source text before tokenization.
prefix = "summarize: "
# Probability passed to apply_word_dropout; at 0.0 no word is ever replaced.
dropout_prob = 0.0

def preprocess_function(examples):
    """
    Turn a batch of raw examples into model inputs.

    `examples` is a batch mapping with "text" (source dialogues) and "summa"
    (reference summaries) columns. The sources go through apply_word_dropout,
    get the task prefix prepended and are tokenized (truncated to 1024 tokens);
    the summaries are tokenized as targets (truncated to 256 tokens).

    Returns the tokenizer output for the sources with three extra entries:
    "labels" (target token ids) plus the untouched "text" and "summa" columns.
    """
    source_texts = [
        prefix + doc
        for doc in apply_word_dropout(examples["text"], dropout_prob)
    ]
    model_inputs = tokenizer(source_texts, max_length=1024, truncation=True)

    # Raising max_length here would probably stop the model from cutting
    # sentences off mid-word.
    target_encoding = tokenizer(
        text_target=examples["summa"], max_length=256, truncation=True
    )

    extra_columns = {
        "labels": target_encoding["input_ids"],
        "text": examples["text"],
        "summa": examples["summa"],
    }
    for column, values in extra_columns.items():
        model_inputs[column] = values
    return model_inputs

In [13]:
# Run preprocess_function over both splits in batches; the result carries
# input_ids, attention_mask and labels alongside the text and summa columns.
tokenized_our_dataset = our_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/54 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

In [None]:
tokenized_our_dataset

In [15]:
from transformers import DataCollatorForSeq2Seq
# Pads inputs and labels per batch. The collator's optional `model` argument
# expects a model *instance* (it is only used to derive decoder_input_ids from
# the labels); the checkpoint-name string passed previously has no such method
# and was silently ignored, so the argument is omitted.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [16]:
!pip install -q rouge

In [17]:
from rouge import Rouge
import numpy as np
rouge = Rouge()
def compute_metrics(eval_pred):
    """
    Compute ROUGE F-scores and the mean generated length for one evaluation run.

    Params:
    -------
    eval_pred: pair (predictions, labels) of token-id arrays. `predictions`
        may also be a tuple whose first element holds the token ids.

    Return:
    -------
    dict with the F-scores 'rouge-1', 'rouge-2', 'rouge-l', their mean
    'avg_rouge_f', and 'gen_len' (mean number of non-pad prediction tokens),
    all rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Keep only the token ids if extra outputs were returned alongside them.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be
    # decoded. Replace it with the pad token in predictions as well as labels.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    print('True:', decoded_labels)
    print('prediction:', decoded_preds)

    # Rouge.get_scores expects hypotheses first and references second.
    scores = rouge.get_scores(decoded_preds, decoded_labels, avg=True)

    metrics = {name: values['f'] for name, values in scores.items()}
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    metrics['gen_len'] = np.mean(prediction_lens)
    metrics['avg_rouge_f'] = np.mean(
        [scores[name]['f'] for name in ('rouge-1', 'rouge-2', 'rouge-l')]
    )

    return {name: round(value, 4) for name, value in metrics.items()}

## Train: k-fold cross-validation on the training split

In [18]:
from transformers import EarlyStoppingCallback
# Early-stopping callback (patience 10, improvement threshold 1e-4); it is
# attached to the Trainer inside the cross-validation loop below.
callback = EarlyStoppingCallback(early_stopping_patience=10, 
                                 early_stopping_threshold=0.0001)

In [19]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
# The output directory is named after the last component of the checkpoint id.
model_name = MODEL_NAME.rsplit("/", 1)[-1]
batch_size = 3
training_args = Seq2SeqTrainingArguments(
    # --- output and checkpointing ---
    output_dir=f"{model_name}-finetuned-on-calls",
    save_total_limit=3,
    push_to_hub=False,
    # --- optimisation ---
    num_train_epochs=250,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=200,
    per_device_train_batch_size=batch_size,
    # --- evaluation and logging, both every 50 steps ---
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=50,
    per_device_eval_batch_size=1,
    # Evaluate with generate() so compute_metrics receives generated token ids.
    predict_with_generate=True,
    # --- model selection: keep the checkpoint with the best mean ROUGE F ---
    load_best_model_at_end=True,
    metric_for_best_model='eval_avg_rouge_f',
)

In [20]:
# Number of cross-validation folds taken from the training split (54 rows).
num_folds = 9

In [None]:
from datasets import Dataset
from tqdm.auto import trange
t5_config = T5Config.from_pretrained(MODEL_NAME)
t5_config.dropout_rate = 0.1

# K-fold cross-validation over the tokenized training split: every fold is held
# out once for evaluation while a fresh copy of the pretrained model is
# fine-tuned on the remaining examples.
cv_source = tokenized_our_dataset['train']
num_examples = len(cv_source)
# Derive the fold size from the data instead of hard-coding it
# (54 examples / 9 folds = 6, the value used previously).
fold_size = num_examples // num_folds
if fold_size == 0:
    raise ValueError(
        f"num_folds={num_folds} exceeds the number of training examples ({num_examples})"
    )

total_avg_rouge = 0
for i in trange(num_folds):
    # Every fold starts from the same pretrained weights.
    model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME,
                                                       config=t5_config)

    fold_start = i * fold_size
    # The last fold absorbs any remainder so that every example is held out once.
    fold_stop = num_examples if i == num_folds - 1 else fold_start + fold_size
    held_out_indices = range(fold_start, fold_stop)
    kept_indices = [j for j in range(num_examples) if j < fold_start or j >= fold_stop]

    test_dataset = cv_source.select(held_out_indices)
    # Train on the columns already tokenized by map(). The Trainer drops columns
    # that are not arguments of the model's forward() (remove_unused_columns
    # defaults to True), which removes the raw 'text'/'summa' columns that an
    # on-the-fly set_transform(preprocess_function) would have to read.
    # With dropout_prob == 0.0 the stored columns are exactly what the transform
    # would produce.
    # NOTE(review): re-sampling word dropout on every access would need
    # remove_unused_columns=False plus a transform that returns only model inputs.
    train_dataset = cv_source.select(kept_indices)

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    trainer.add_callback(callback)
    trainer.train()
    trainer.save_model()
    trainer.save_state()
    metrics = trainer.evaluate()
    total_avg_rouge += metrics['eval_avg_rouge_f']
    torch.cuda.empty_cache()

# Report the cross-validated score; previously the sum was accumulated but never used.
mean_avg_rouge = total_avg_rouge / num_folds
print(f'Mean eval_avg_rouge_f over {num_folds} folds: {mean_avg_rouge:.4f}')

## Test: fine-tune on the full training split and evaluate on the test split

In [22]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
# Training configuration for the final run (same values as the cross-validation
# run). The output directory is named after the last component of the checkpoint id.
model_name = MODEL_NAME.rsplit("/", 1)[-1]
batch_size = 3
training_args = Seq2SeqTrainingArguments(
    # --- output and checkpointing ---
    output_dir=f"{model_name}-finetuned-on-calls",
    save_total_limit=3,
    push_to_hub=False,
    # --- optimisation ---
    num_train_epochs=250,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=200,
    per_device_train_batch_size=batch_size,
    # --- evaluation and logging, both every 50 steps ---
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=50,
    per_device_eval_batch_size=1,
    # Evaluate with generate() so compute_metrics receives generated token ids.
    predict_with_generate=True,
    # --- model selection: keep the checkpoint with the best mean ROUGE F ---
    load_best_model_at_end=True,
    metric_for_best_model='eval_avg_rouge_f',
)

In [None]:
from tqdm.auto import trange
# Final run: fine-tune a fresh pretrained model on the whole training split and
# evaluate on the held-out test split.
t5_config = T5Config.from_pretrained(MODEL_NAME)
t5_config.dropout_rate = 0.1
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME,
                                                     config=t5_config)
test_dataset = tokenized_our_dataset['test']
# Use the columns already tokenized by map(). The former
# set_transform(preprocess_function) call is dropped for two reasons:
# it modified the shared tokenized dataset in place, and the Trainer removes the
# raw 'text'/'summa' columns (remove_unused_columns defaults to True) that the
# transform reads. With dropout_prob == 0.0 the stored columns are exactly
# what the transform would produce.
train_dataset = tokenized_our_dataset['train']
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    )
trainer.train()

In [None]:
# Persist the trained model and the trainer state.
trainer.save_model()
trainer.save_state()
# Evaluate on the trainer's eval_dataset (the test split in the cell above).
metrics = trainer.evaluate()

# Log the evaluation metrics and also save them under the "eval" name.
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)