# Обучение модели FLAN-T5

In [1]:
#%pip install --upgrade pip

In [2]:
#%pip install -q -U einops datasets matplotlib tqdm boto3 git+https://github.com/dask/s3fs

In [3]:
#%pip install wandb

In [4]:
#%pip install transformers

In [5]:
#%pip install evaluate

In [6]:
#%pip install accelerate -U

In [7]:
#%pip install rouge-score

In [8]:
#%pip install bert_score

In [9]:
import pandas as pd
import os
from utils import scripts_rework
import warnings

warnings.filterwarnings("ignore")

In [10]:
# Load the preprocessed dialogue table; the preview below shows the
# `answer`, `question` and `context` columns.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# when it comes from a trusted source.
df = pd.read_pickle("data/scripts_reworked.pkl")
df

Unnamed: 0,answer,question,context
0,"There’s no point, I just think it’s a good id...","Agreed, what’s your point?",
1,I think this is the place.,"If you have to ask, maybe you shouldn’t be here.","Hang on. One across is Aegean, eight down is..."
2,I think this is the place.,"If you have to ask, maybe you shouldn’t be here.","One across is Aegean, eight down is Nabakov, ..."
3,I think this is the place.,"If you have to ask, maybe you shouldn’t be here.","Can I help you? Yes. Um, is this the High IQ ..."
4,I think this is the place.,"If you have to ask, maybe you shouldn’t be here.","Yes. Um, is this the High IQ sperm bank?"
...,...,...,...
48859,"Well, that would raise a number of problems. ...",What if I were?,"Well, perfect. I made us sandwiches. How thou..."
48860,"Well, that would raise a number of problems. ...",What if I were?,"How thoughtful. Thank you. Mmm. No big deal, ..."
48861,"Well, that would raise a number of problems. ...",What if I were?,"Mmm. No big deal, I enjoy spending time with ..."
48862,"Well, that would raise a number of problems. ...",What if I were?,"And I with you. Question, are you seeking a r..."


In [11]:
# Build the seq2seq input text for every row in the form
#   "context: <context></s>question: <question>"
# Column-wise string concatenation produces the same text as a row-wise
# `apply`, but avoids one Python call per row and avoids a lambda parameter
# that shadows the outer `df`.
# Note: a missing (NaN) context/question propagates to NaN in `combined`.
df["combined"] = "context: " + df["context"] + "</s>" + "question: " + df["question"]
df

Unnamed: 0,answer,question,context,combined
0,"There’s no point, I just think it’s a good id...","Agreed, what’s your point?",,"context: </s>question: Agreed, what’s your po..."
1,I think this is the place.,"If you have to ask, maybe you shouldn’t be here.","Hang on. One across is Aegean, eight down is...","context: Hang on. One across is Aegean, eigh..."
2,I think this is the place.,"If you have to ask, maybe you shouldn’t be here.","One across is Aegean, eight down is Nabakov, ...","context: One across is Aegean, eight down is ..."
3,I think this is the place.,"If you have to ask, maybe you shouldn’t be here.","Can I help you? Yes. Um, is this the High IQ ...","context: Can I help you? Yes. Um, is this the..."
4,I think this is the place.,"If you have to ask, maybe you shouldn’t be here.","Yes. Um, is this the High IQ sperm bank?","context: Yes. Um, is this the High IQ sperm b..."
...,...,...,...,...
48859,"Well, that would raise a number of problems. ...",What if I were?,"Well, perfect. I made us sandwiches. How thou...","context: Well, perfect. I made us sandwiches...."
48860,"Well, that would raise a number of problems. ...",What if I were?,"How thoughtful. Thank you. Mmm. No big deal, ...",context: How thoughtful. Thank you. Mmm. No b...
48861,"Well, that would raise a number of problems. ...",What if I were?,"Mmm. No big deal, I enjoy spending time with ...","context: Mmm. No big deal, I enjoy spending t..."
48862,"Well, that would raise a number of problems. ...",What if I were?,"And I with you. Question, are you seeking a r...","context: And I with you. Question, are you se..."


In [12]:
from sklearn.model_selection import train_test_split

# Hold out 1% of the rows for validation (fixed random_state for a repeatable
# split) and renumber both parts from zero.
train_df, val_df = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.01, random_state=42)
)
train_df.shape, val_df.shape

((48375, 4), (489, 4))

In [13]:
from datasets import Dataset, DatasetDict

# Wrap both pandas splits as Hugging Face datasets under the keys
# "train" and "valid".
dataset = DatasetDict(
    {
        split_name: Dataset.from_pandas(frame.reset_index(drop=True))
        for split_name, frame in (("train", train_df), ("valid", val_df))
    }
)
dataset

DatasetDict({
    train: Dataset({
        features: ['answer', 'question', 'context', 'combined'],
        num_rows: 48375
    })
    valid: Dataset({
        features: ['answer', 'question', 'context', 'combined'],
        num_rows: 489
    })
})

In [14]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hub id of the checkpoint; the tokenizer is loaded from the same id.
model_id = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [15]:
from datasets import concatenate_datasets

def _tokenize_column(data, column):
    """Tokenize one text column of `data` and keep only the tokenizer outputs.

    Args:
        data: dataset holding the text columns
            ("context", "question", "answer", "combined").
        column: name of the text column to tokenize.

    Returns:
        The mapped dataset with the original text columns removed. Sequences
        are truncated by the tokenizer (`truncation=True`) but not padded.
    """
    return data.map(
        lambda batch: tokenizer(batch[column], truncation=True),
        batched=True,
        remove_columns=["context", "question", "answer", "combined"],
    )


# Lengths are measured over train and validation together; build the
# concatenated dataset once and reuse it for inputs and targets.
full_dataset = concatenate_datasets([dataset["train"], dataset["valid"]])

# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = _tokenize_column(full_dataset, "combined")
max_source_length = max(len(ids) for ids in tokenized_inputs["input_ids"])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_targets = _tokenize_column(full_dataset, "answer")
max_target_length = max(len(ids) for ids in tokenized_targets["input_ids"])
print(f"Max target length: {max_target_length}")



Map:   0%|          | 0/48864 [00:00<?, ? examples/s]

Max source length: 332


Map:   0%|          | 0/48864 [00:00<?, ? examples/s]

Max target length: 304


In [16]:
def preprocess_function(sample, padding="max_length"):
    """Tokenize a batch of examples into model inputs plus `labels`.

    Args:
        sample: batch mapping with a "combined" entry (input texts) and an
            "answer" entry (target texts).
        padding: padding mode forwarded to the tokenizer. With the default
            "max_length", pad positions in the labels are replaced by -100
            so they are ignored in the loss.

    Returns:
        The tokenizer output for the inputs, extended with a "labels" entry
        holding the target token ids.
    """
    features = tokenizer(
        sample["combined"],
        max_length=max_source_length,
        padding=padding,
        truncation=True,
    )

    # Targets go through the `text_target` keyword argument.
    target_encoding = tokenizer(
        text_target=sample["answer"],
        max_length=max_target_length,
        padding=padding,
        truncation=True,
    )
    target_ids = target_encoding["input_ids"]

    # Only when padding to a fixed length: swap every pad token id for -100.
    if padding == "max_length":
        target_ids = [
            [-100 if token_id == tokenizer.pad_token_id else token_id for token_id in sequence]
            for sequence in target_ids
        ]

    features["labels"] = target_ids
    return features


# Tokenize every split in batches and drop the raw text columns so only the
# tokenizer outputs and labels remain.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["context", "question", "answer", "combined"],
)
feature_names = list(tokenized_dataset["train"].features)
print(f"Keys of tokenized dataset: {feature_names}")

Map:   0%|          | 0/48375 [00:00<?, ? examples/s]

Map:   0%|          | 0/489 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [17]:
from transformers import AutoModelForSeq2SeqLM

# huggingface hub model id
# (same value as the `model_id` assigned when the tokenizer was loaded)
model_id = "google/flan-t5-base"

# load model from the hub
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

In [18]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize

# Fetch the NLTK "punkt" package (the cell output shows it being checked);
# presumably required by `sent_tokenize` — confirm.
nltk.download("punkt")

# Metric
# ROUGE and BERTScore are both loaded through the `evaluate` library.
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")


# helper function to postprocess text
def postprocess_text(preds, labels):
    """Strip each text and put every sentence on its own line.

    rougeLSum expects a newline after each sentence, so each stripped text is
    split with `sent_tokenize` and re-joined with "\\n".

    Args:
        preds: list of predicted texts.
        labels: list of reference texts.

    Returns:
        Tuple `(preds, labels)` of the reformatted lists.
    """

    def _one_sentence_per_line(text):
        return "\n".join(sent_tokenize(text.strip()))

    formatted_preds = [_one_sentence_per_line(text) for text in preds]
    formatted_labels = [_one_sentence_per_line(text) for text in labels]
    return formatted_preds, formatted_labels


def compute_metrics(eval_preds):
    """Compute ROUGE, BERTScore and average generated length for an evaluation pass.

    Args:
        eval_preds: pair `(preds, labels)` of token-id arrays. `preds` may be
            a tuple, in which case its first element is used.

    Returns:
        Dict with keys `rouge_1`, `rouge_2`, `rouge_L` (scaled by 100),
        `avg_len` (mean number of non-pad tokens per prediction) and
        `bertscore_prec`, `bertscore_rec`, `bertscore_f1` (means over the
        batch); every value is rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored positions and cannot be decoded. It can appear in the
    # predictions as well as in the labels, so replace it in both with the pad
    # token id before decoding (this also keeps it out of the length count).
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # BERTScore is computed on the plain decoded texts, before the
    # sentence-per-line reformatting applied for ROUGE.
    bertscore_result = bertscore.compute(
        predictions=decoded_preds, references=decoded_labels, lang="en"
    )

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    rouge_result = rouge.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )

    # Generated length = number of non-pad tokens in each prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]

    return {
        "rouge_1": round(rouge_result["rouge1"] * 100, 4),
        "rouge_2": round(rouge_result["rouge2"] * 100, 4),
        "rouge_L": round(rouge_result["rougeL"] * 100, 4),
        "avg_len": round(np.mean(prediction_lens), 4),
        "bertscore_prec": round(np.mean(bertscore_result["precision"]), 4),
        "bertscore_rec": round(np.mean(bertscore_result["recall"]), 4),
        "bertscore_f1": round(np.mean(bertscore_result["f1"]), 4),
    }

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [19]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
# (-100 is the same value preprocess_function writes over padded label positions)
label_pad_token_id = -100
# Data collator
# Batches examples for the seq2seq model; `pad_to_multiple_of=8` asks for
# padded lengths that are multiples of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8
)

In [20]:
from huggingface_hub import HfFolder
# Store the Hugging Face token taken from the `hugging_face_login` environment
# variable (raises KeyError if the variable is not set).
HfFolder.save_token(os.environ['hugging_face_login'])

In [21]:
import wandb
# Configure Weights & Biases through environment variables before logging in:
# the project name and the model-logging switch.
os.environ["WANDB_PROJECT"] = "generative_models_chat"
os.environ["WANDB_LOG_MODEL"] = "true"
# The API key is read from the `wandb_login` environment variable
# (raises KeyError if the variable is not set).
wandb.login(key=os.environ["wandb_login"])

[34m[1mwandb[0m: Currently logged in as: [33mkatya_shakhova[0m ([33mshakhova[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/jupyter/.netrc


True

In [22]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Hugging Face repository id
# (the part of `model_id` after the "/", plus a fixed suffix; also used as the
# local output directory below)
repository_id = f"{model_id.split('/')[1]}-sheldon-chat-v2"

# Define training args
# NOTE(review): no generation_max_length is set, so evaluation presumably uses
# the model's default generation length; the logged "Avg Len" stays below ~17
# tokens while targets reach max_target_length — confirm this cap is intended.
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Generate sequences during evaluation so compute_metrics can decode text.
    predict_with_generate=True,
    fp16=False,  # Overflows with fp16
    learning_rate=5e-5,
    num_train_epochs=5,
    warmup_ratio=0.1,
    weight_decay=0.1,
    optim="adamw_torch",
    # logging & evaluation strategies
    evaluation_strategy="steps",
    eval_steps=200,
    # NOTE(review): with save_strategy="no", save_total_limit presumably has
    # no effect — confirm.
    save_strategy="no",
    save_total_limit=1,
    report_to="wandb",
    logging_steps=200,
    # No automatic push is requested here; the hub id and token are still
    # supplied.
    push_to_hub=False,
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

# Create Trainer instance
# Train on the tokenized "train" split and evaluate on the "valid" split with
# the metrics defined in compute_metrics.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    compute_metrics=compute_metrics,
)

In [23]:
# Run fine-tuning; training_args requests logging and evaluation every 200 steps.
trainer.train()

[34m[1mwandb[0m: wandb version 0.16.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.16.3
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/home/jupyter/work/resources/chat_bot_katya_HW2/wandb/run-20240307_100606-t5nq3oct[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mcelestial-hill-37[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/shakhova/generative_models_chat[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/shakhova/generative_models_chat/runs/t5nq3oct[0m


Step,Training Loss,Validation Loss,Rouge 1,Rouge 2,Rouge L,Avg Len,Bertscore Prec,Bertscore Rec,Bertscore F1
200,3.5731,3.255625,9.2886,1.7826,8.8558,10.8814,0.8661,0.8494,0.8573
400,3.421,3.175784,9.2559,1.9064,8.8243,10.7076,0.8714,0.8538,0.8622
600,3.3457,3.119731,9.0991,2.1854,8.7076,10.1063,0.8741,0.8546,0.8639
800,3.3013,3.075485,8.5523,1.8896,8.2224,12.3681,0.8706,0.8537,0.8617
1000,3.2509,3.038346,9.2396,2.0409,8.8314,12.0511,0.8729,0.8552,0.8636
1200,3.2376,3.00055,8.7356,1.8181,8.3039,12.9796,0.8702,0.8541,0.8617
1400,3.2075,2.96021,9.4303,2.0003,8.7571,16.0736,0.8633,0.8548,0.8587
1600,3.1342,2.926318,10.8148,2.2698,9.8735,15.4826,0.866,0.8571,0.8611
1800,3.1376,2.891541,9.9381,1.8569,9.1291,15.1718,0.8635,0.8553,0.859
2000,3.0862,2.864813,9.6534,1.7925,8.9539,14.9939,0.8603,0.8538,0.8567


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TrainOutput(global_step=15120, training_loss=2.696330834444238, metrics={'train_runtime': 10620.0465, 'train_samples_per_second': 22.775, 'train_steps_per_second': 1.424, 'total_flos': 1.0869185553408e+17, 'train_loss': 2.696330834444238, 'epoch': 5.0})

In [24]:
# Save our tokenizer and create model card
# (the tokenizer goes into `repository_id`, the same directory used as the
# trainer's output_dir)
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
# Push the results to the hub
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.67k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Shakhovak/flan-t5-base-sheldon-chat-v2/commit/ae2a7ee107d9da041aa3be37333ec32b3e9d9f5a', commit_message='End of training', commit_description='', oid='ae2a7ee107d9da041aa3be37333ec32b3e9d9f5a', pr_url=None, pr_revision=None, pr_num=None)

In [25]:
# Close the current Weights & Biases run.
wandb.finish()

[34m[1mwandb[0m: [32m[41mERROR[0m Control-C detected -- Run data was not synced
