<a href="https://colab.research.google.com/github/ronalds82/Datu-kopas-tulkosana/blob/main/SQuAD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Autors: *Ronalds Turnis, rt20018*
===========================


# Globālie mainīgie

In [1]:
INSTALL = False # True if the environment needs the additional pip installations
TRAIN = True  # True if the model should be fine-tuned
FINE_TUNED = False # True when testing a model that we saved ourselves
CHAT = False # True to enable the chat loop with the model
FLATTEN = True # True if the SQuAD dataset has to be "flattened" (i.e. the flattened files do not exist yet)

weights_dir = './weights' # Model weights are stored here
logs_dir = './logs' # Event/log information is stored here
tmp_dir = './tmp_trainer' # Temporary information is stored here

# Must be set when we want to evaluate a model that we saved ourselves
model_link = 'https://www.dropbox.com/scl/fi/cguiqykr0jlp1yj6mi7rm/modelis.pth?rlkey=m7oeyaev1pt0x1c5u2kdnl0bu&dl=0'
model_weight_file = 'modelis.pth?rlkey=m7oeyaev1pt0x1c5u2kdnl0bu'

# 1.1 or 2.0
SQuAD_version = '2.0'

# Variables needed for SQuAD testing
# Both entries name the same translated train file.
data_files = {'train': f'train-v{SQuAD_version}.LV.json', 'validation': f'train-v{SQuAD_version}.LV.json'}
flattened_train_file_1 = 'flattened_train_file_1.json'
flattened_val_file_2 = 'flattened_val_file_2.json'

# These are needed as output names when saving files, but are not used afterwards
flattened_val_file_1 = 'flattened_val_file_1.json'
flattened_train_file_2 = 'flattened_train_file_2.json'

# Files that will be used during training and validation
flattened_data_files = {'train': flattened_train_file_1, 'validation': flattened_val_file_2}

test_size = 0.2 # Size of the LV validation set relative to the training set

model_name = 'bert-base-multilingual-cased' # The HuggingFace model that will be used

max_length = 384 # Maximum length passed to the tokenizer for a question + context feature
doc_stride = 128 # Overlap between segments, so that texts longer than the model can process at once are handled

# Moduļu importēšana

In [None]:
# Run the additional installations when they are required
# (the `!` lines are IPython shell commands, so this cell only runs inside a notebook)
if INSTALL:
    !pip install datasets
    !pip install transformers[torch] -U
    !pip install accelerate -U
    !pip install rouge_score
    !pip install evaluate

# Importējam nepieciešamos moduļus
import pandas as pd
import torch
import matplotlib.pyplot as plt
import numpy as np
import shutil
import os
import evaluate
import json
import collections
from tqdm import tqdm, tqdm_notebook
from transformers import Trainer, TrainingArguments, IntervalStrategy, DataCollatorForLanguageModeling, trainer_utils, EvalPrediction
from transformers import default_data_collator, AutoTokenizer, AutoModelForQuestionAnswering, tokenization_utils_base
from sklearn.model_selection import train_test_split
from google.colab import files
from torch.utils.data import Dataset, DataLoader, IterableDataset, random_split
from datasets import load_dataset, arrow_dataset
from torch.nn import CrossEntropyLoss
from pathlib import Path
from typing import Optional, Tuple

# Modeļa un tokenizera inicializācija

In [None]:
print("Ielādējam modeli un tokenizeru...")

# Instantiate the model and the tokenizer from the checkpoint named by model_name
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set a padding token if the tokenizer does not have one
# NOTE(review): falls back to eos_token, which may itself be None for some tokenizers — confirm
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Check CUDA availability and move the model to the matching device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("Modelis un tokenizeris inicializēts!")
print("Izmantotā ierīce:", device)

# Iepriekšējo datu dzēšana

In [None]:
# Remove a generated directory (if it exists) before training so stale data cannot conflict with a new run
def delete_directory(dir_path):
    """Delete the directory tree at ``dir_path`` and report the outcome on stdout.

    Best effort: a missing directory is only reported, and any exception raised
    while checking or deleting is caught and printed instead of propagated.
    """
    try:
        # Guard clause: nothing to delete.
        if not os.path.exists(dir_path):
            print(f"Direktorija neeksistē: {dir_path}")
            return

        shutil.rmtree(dir_path)
        print(f"Direktorija dzēsta: {dir_path}")
    except Exception as e:
        print(f"Kļūda dzēšot direktoriju: {e}")

# Remove the weights, logs and temporary trainer directories left by a previous run
delete_directory(weights_dir)
delete_directory(logs_dir)
delete_directory(tmp_dir)

# Clear the GPU cache
torch.cuda.empty_cache()

# Datu inicializācija

In [None]:
# Check whether the tokenizer pads on the right-hand side; the preprocessing
# functions below use this flag to decide the question/context argument order
pad_on_right = tokenizer.padding_side == "right"

# SquadDatasetFlat: flattens a SQuAD-format JSON file into train/validation JSON files
class SquadDatasetFlat():
    """Flatten a nested SQuAD-style JSON file into one record per question.

    On construction the source file is read, split article-wise into a
    training part and a validation part, and both parts are written as
    ``{"data": [...]}`` JSON files into ``checkpoint_path``.

    Args:
        path_to_json_file: Path of the SQuAD-format JSON file to read.
        checkpoint_path: Directory the two flattened files are written to.
        train_file: File name for the flattened training records.
        val_file: File name for the flattened validation records.
        val_fraction: Fraction of the articles (taken from the end of the
            file) that goes to the validation part. ``None`` (the default)
            uses the module-level ``test_size``.

    Attributes:
        data: ``{'train': [...], 'validation': [...]}`` holding the flattened
            records that were written to disk.
    """

    def __init__(self,
                 path_to_json_file: str,
                 checkpoint_path: str,
                 train_file: str,
                 val_file: str,
                 val_fraction: Optional[float] = None) -> None:
        self.path_to_json_file = path_to_json_file
        self.checkpoint_path = checkpoint_path

        self.train_file = train_file
        self.val_file = val_file

        # None means "fall back to the module-level test_size".
        self.val_fraction = val_fraction

        self.data = self.load_data()

    def load_data(self):
        """Read the source file, flatten it, write both splits and return them."""
        # Explicit UTF-8: the data contains non-ASCII (Latvian) text and must
        # not depend on the platform's default encoding.
        with open(self.path_to_json_file, 'r', encoding='utf-8') as f:
            train_data = json.load(f)
        print(f'Flattening SQUAD {train_data["version"]}')

        # Flatten the SQuAD data into the training and validation parts
        train_data_flat, val_data_flat, errors = self.load_squad_data(train_data)
        print(f'\nErroneous Datapoints: {errors}')

        # Save the flattened training and validation records
        self._write_split(self.train_file, train_data_flat)
        self._write_split(self.val_file, val_data_flat)

        return {'train': train_data_flat, 'validation': val_data_flat}

    def _write_split(self, file_name, records):
        """Write ``records`` as ``{"data": records}`` JSON into checkpoint_path/file_name."""
        with open(Path(self.checkpoint_path) / Path(file_name), 'w', encoding='utf-8') as file:
            json.dump({'data': records}, file)

    def load_squad_data(self, data):
        """Flatten ``data["data"]`` and split it article-wise.

        Returns:
            ``(train_records, val_records, errors)``. ``errors`` is a counter
            that is currently never incremented, so it is always 0.
        """
        errors = 0
        flattened_data_train = []
        flattened_data_val = []

        fraction = getattr(self, 'val_fraction', None)
        if fraction is None:
            fraction = test_size

        # Number of leading articles that go to training (may be fractional)
        articles = data["data"]
        train_range = len(articles) - (len(articles) * fraction)

        for i, article in enumerate(articles):
            # Strict comparison: with `<=` an integer boundary would put one
            # article too many into the training part.
            target = flattened_data_train if i < train_range else flattened_data_val

            title = article.get("title", "").strip()
            for paragraph in article["paragraphs"]:
                context = paragraph["context"].strip()
                for qa in paragraph["qas"]:
                    # Answer start offsets and answer texts, kept as parallel lists
                    answer_starts = [answer["answer_start"] for answer in qa["answers"]]
                    answers = [answer["text"].strip() for answer in qa["answers"]]

                    target.append({"title": title,
                                   "context": context,
                                   "question": qa["question"].strip(),
                                   "id": qa["id"],
                                   "answers": {
                                       "answer_start": answer_starts,
                                       "text": answers}
                                   })

        return flattened_data_train, flattened_data_val, errors

def prepare_squad_dataset(examples: collections.OrderedDict or dict) -> tokenization_utils_base.BatchEncoding:
    """Tokenize a batch of examples and label every resulting feature with answer token positions.

    Reads the module-level ``tokenizer``, ``pad_on_right``, ``max_length`` and
    ``doc_stride``. ``examples`` is indexed by the keys "question", "context"
    and "answers" (each answer entry having "answer_start" and "text" lists).

    Returns the tokenizer output with "overflow_to_sample_mapping" and
    "offset_mapping" removed and "start_positions" / "end_positions" added.
    A feature is labelled with the CLS token index when its example has no
    answer or when the first answer does not lie inside the feature's span.
    Only the first answer of an example is used.
    """
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example can give us several features when its context is long, we need a map from a feature to
    # its corresponding example. This key gives us exactly that
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # The offset mappings give us a map from a token to its character position in the original context. This
    # helps us compute the start positions and end positions
    offset_mapping = tokenized_examples.pop("offset_mapping")
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # Get the index of the CLS token
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence ids for this feature (to know what is the context and what is the question)
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans; this is the index of the example containing this span of text
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]

        if len(answers["answer_start"]) == 0:
            # No answer: label the feature with the CLS index
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character indices of the (first) answer in the text
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Check whether the answer is outside the span (in that case this feature is labelled with the CLS index)
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Move token_start_index and token_end_index to the two ends of the answer
                # (each loop overshoots by one token, hence the -1 / +1 corrections)
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

def prepare_validation_features(examples: collections.OrderedDict or dict) -> tokenization_utils_base.BatchEncoding:
    """Tokenize a batch of validation examples, label them, and keep what post-processing needs.

    Reads the module-level ``tokenizer``, ``pad_on_right``, ``max_length`` and
    ``doc_stride``. ``examples`` is indexed by the keys "question", "context",
    "answers" and "id".

    Returns the tokenizer output with "overflow_to_sample_mapping" removed and
    with "start_positions", "end_positions" and "example_id" added. The
    "offset_mapping" is kept, with entries of tokens that are not part of the
    context replaced by ``None``.
    """
    # Tokenize our examples with truncation and padding to max_length, but keep the overflows using a stride.
    # This results in one example possibly giving several features when its context is long, each of those
    # features having a context that overlaps a bit with the context of the previous feature
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example can give us several features when its context is long, we need a map from a feature to
    # its corresponding example. This key gives us exactly that
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # The offset mappings give us a map from a token to its character position in the original context. This
    # helps us compute the start_positions and end_positions
    offset_mapping = tokenized_examples["offset_mapping"]
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # Index of the CLS token
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence ids for this feature (to know what is the context and what is the question)
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans; this is the index of the example containing this span of text
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]

        if len(answers["answer_start"]) == 0:
            # No answer: label the feature with the CLS index
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character indices of the (first) answer in the text
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect whether the answer is outside the span (in that case this feature is labelled with the CLS index)
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move token_start_index and token_end_index to the two ends of the answer
                # Note: we could go past the last offset if the answer is the last word (edge case)
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    # We keep the example_id that gave us each feature, and we keep the offset mappings
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        # Grab the sequence ids for this feature (to know what is the context and what is the question)
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 if pad_on_right else 0

        # One example can give several spans; this is the index of the example containing this span of text
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # Set to None the offset mappings that are not part of the context, so it is easy to determine whether a
        # token position is part of the context or not
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

# Prepare the data
# Each call writes a train file and a validation file; the results are discarded (`_`)
# because only the files written to disk are used below.
if FLATTEN:
    _ = SquadDatasetFlat(data_files['train'], '', flattened_train_file_1, flattened_val_file_1)
    _ = SquadDatasetFlat(data_files['validation'], '', flattened_train_file_2, flattened_val_file_2)

# Load the flattened JSON files; the records live under the top-level 'data' field.
# Both results are indexed with the 'train' key further down.
train_data = load_dataset("json", data_files=flattened_data_files['train'], field='data')
val_data = load_dataset("json", data_files=flattened_data_files['validation'], field='data')

# Tokenize and label both sets, dropping the raw columns.
# Note: train_dataset is mapped on the whole loaded object (indexed with ['train'] later),
# whereas val_dataset is mapped directly on the 'train' split.
train_dataset = train_data.map(prepare_squad_dataset, batched=True, remove_columns=train_data['train'].column_names)
val_dataset = val_data['train'].map(prepare_validation_features, batched=True, remove_columns=val_data['train'].column_names)

In [None]:
# Print the first five tokenized training features as a quick sanity check
print(train_dataset['train'][:5])

# SQuAD datu kopas analīze

In [None]:
def get_text(answer: list) -> str:
    """Return the first element of ``answer``, or an empty string when it has no elements."""
    return answer[0] if len(answer) != 0 else ''

def get_json_data(json_path: str) -> dict:
    """Read the JSON file at ``json_path`` and return the parsed content.

    The file is opened as UTF-8 inside a ``with`` block so the handle is
    closed even when parsing fails, and so decoding does not depend on the
    platform's default encoding.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(json_path, encoding='utf-8') as f:
        return json.load(f)

# Build a DataFrame with one row per flattened training record (nested keys become dotted column names)
train_dataframe = pd.json_normalize(get_json_data(flattened_train_file_1), record_path='data')
# Keep only the first answer text per row ('' when there is no answer)
train_dataframe["answers.text"] = train_dataframe["answers.text"].apply(get_text)

# Bare expression: displayed as the cell output in a notebook
train_dataframe

In [None]:
# Figure size shared by the histograms
figsize = (10,6)
# Histogram of context lengths, measured in characters (len of the string)
train_dataframe['context'].apply(len).plot.hist(title="Konteksta garuma histogramma", bins=20, figsize=figsize, grid=True)

In [None]:
# Histogram of question lengths, measured in characters (len of the string)
train_dataframe['question'].apply(len).plot.hist(title="Jautājumu garuma histogramma", bins=20, figsize=figsize, grid=True)

# Metriku aprēķināšanas funkcijas

In [None]:
# Accumulates the result dict of every compute_metrics call (used for plotting later)
metrics_history = []

# Pick the metric implementation that matches the configured SQuAD version
metric = evaluate.load("squad_v2" if SQuAD_version == '2.0' else "squad")

def postprocess_qa_predictions(examples: arrow_dataset.Dataset,
                               features: arrow_dataset.Dataset,
                               raw_predictions: tuple,
                               n_best_size: int = 20,
                               max_answer_length: int = 50,
                               null_score_diff_threshold: float = 0.2) -> collections.OrderedDict:
    """Turn raw start/end logits into one answer string per example.

    Args:
        examples: The un-tokenized examples; rows are read through the "id"
            and "context" keys.
        features: The tokenized features; rows are read through the
            "example_id" and "offset_mapping" keys (and the optional
            "token_is_max_context" key).
        raw_predictions: ``(all_start_logits, all_end_logits)``, each indexed
            by feature index.
        n_best_size: How many top start and end logits are combined, and how
            many candidate spans are kept per example.
        max_answer_length: Maximum answer length in tokens.
        null_score_diff_threshold: Only used when ``SQuAD_version == '2.0'``.
            The empty answer is chosen when the null score exceeds the best
            non-empty span's score by more than this value.

    Returns:
        An ``OrderedDict`` mapping example id to the predicted answer text
        ("" meaning "no answer").
    """
    all_start_logits, all_end_logits = raw_predictions
    with_negative = SQuAD_version == '2.0'

    # Build a map from each example to its corresponding features. Only the
    # "example_id" column is read, so the other feature columns are not
    # decoded just for this lookup.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, example_id in enumerate(features["example_id"]):
        features_per_example[example_id_to_index[example_id]].append(i)

    all_predictions = collections.OrderedDict()

    for example_index, example in enumerate(tqdm(examples)):
        # Indices of the features associated with the current example.
        feature_indices = features_per_example[example_index]

        min_null_prediction = None
        prelim_predictions = []

        for feature_index in feature_indices:
            # Model predictions for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]

            # Fetch the feature row once; every dataset access decodes the row.
            feature = features[feature_index]
            # Maps token positions in the logits to character spans of the original context.
            offset_mapping = feature["offset_mapping"]
            # Optional: when provided, answers whose start token does not have
            # the maximum context in this feature are skipped.
            token_is_max_context = feature.get("token_is_max_context", None)

            # Update the minimum null prediction (position 0 is used as the "no answer" position).
            feature_null_score = start_logits[0] + end_logits[0]
            if min_null_prediction is None or min_null_prediction["score"] > feature_null_score:
                min_null_prediction = {
                    "offsets": (0, 0),
                    "score": feature_null_score,
                    "start_logit": start_logits[0],
                    "end_logit": end_logits[0],
                }

            # Go through all combinations of the `n_best_size` greatest start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip out-of-scope answers: indices out of bounds, or tokens
                    # that are not part of the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or len(offset_mapping[start_index]) < 2
                        or offset_mapping[end_index] is None
                        or len(offset_mapping[end_index]) < 2
                    ):
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    # Skip answers that don't have the maximum context available (if that information is provided).
                    if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False):
                        continue

                    prelim_predictions.append(
                        {
                            "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]),
                            "score": start_logits[start_index] + end_logits[end_index],
                            "start_logit": start_logits[start_index],
                            "end_logit": end_logits[end_index],
                        }
                    )

        # Reset per example so a value from a previous example can never leak
        # into one that has no features.
        null_score = None
        if with_negative and min_null_prediction is not None:
            # Add the minimum null prediction.
            prelim_predictions.append(min_null_prediction)
            null_score = min_null_prediction["score"]

        # Only keep the best `n_best_size` predictions.
        predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size]

        # Add back the minimum null prediction if it was removed because of its low score.
        if (
            with_negative
            and min_null_prediction is not None
            and not any(p["offsets"] == (0, 0) for p in predictions)
        ):
            predictions.append(min_null_prediction)

        # Use the offsets to gather the answer text from the original context.
        context = example["context"]
        for pred in predictions:
            offsets = pred.pop("offsets")
            pred["text"] = context[offsets[0] : offsets[1]]

        # In the rare edge case where there is not a single non-null prediction,
        # create a fake prediction to avoid failure.
        if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""):
            predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0})

        # Pick the best prediction. If the null answer is not possible, this is easy.
        if not with_negative:
            all_predictions[example["id"]] = predictions[0]["text"]
            continue

        # Otherwise first find the best non-empty prediction (list order = score order).
        best_non_null_pred = next((p for p in predictions if p["text"] != ""), None)
        if best_non_null_pred is None:
            # Every candidate decoded to an empty string: answer "no answer".
            all_predictions[example["id"]] = ""
            continue
        if null_score is None:
            # No features for this example, hence no null score to compare against.
            all_predictions[example["id"]] = best_non_null_pred["text"]
            continue

        # Then compare to the null prediction using the threshold.
        score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"]
        if score_diff > null_score_diff_threshold:
            all_predictions[example["id"]] = ""
        else:
            all_predictions[example["id"]] = best_non_null_pred["text"]

    return all_predictions

def compute_metrics(pred: trainer_utils.EvalPrediction) -> dict:
    """Compute F1 and exact-match for one evaluation pass and record them.

    Post-processes ``pred.predictions`` against the module-level validation
    data (``val_data['train']`` and ``val_dataset``), scores the answers with
    the module-level ``metric`` and appends the result to ``metrics_history``.

    Returns:
        ``{'f1': ..., 'exact': ...}``.
    """
    raw_examples = val_data['train']

    # Apply the post-processing function to the raw predictions to get the final answers
    final_predictions = postprocess_qa_predictions(raw_examples, val_dataset, pred.predictions)

    if SQuAD_version == '2.0':
        formatted_predictions = [{"id": str(k), "prediction_text": v, "no_answer_probability": 0.0} for k, v in final_predictions.items()]
    else:
        formatted_predictions = [{"id": str(k), "prediction_text": v} for k, v in final_predictions.items()]

    # The metric expects one reference entry per example (id + its answers)
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in raw_examples]

    # Compute the scores
    results = metric.compute(predictions=formatted_predictions, references=references)

    # The "squad_v2" metric reports exact match as 'exact', while the "squad"
    # (v1.1) metric reports it as 'exact_match'; accept either so that
    # SQuAD_version = '1.1' does not fail with a KeyError.
    exact = results['exact'] if 'exact' in results else results['exact_match']

    results_squad = {
        'f1': results['f1'],
        'exact': exact
    }

    metrics_history.append(results_squad)

    return results_squad

# Modeļa trenēšana

In [None]:
# Training configuration: evaluate and log every 2500 steps, save a checkpoint every epoch
training_args = TrainingArguments(
    output_dir=weights_dir,
    num_train_epochs=3,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    eval_strategy=IntervalStrategy.STEPS,
    eval_steps=2500,
    warmup_steps=1000,
    weight_decay=0.001,
    learning_rate=3e-5,
    logging_dir=logs_dir,
    logging_steps=2500,
    save_strategy=IntervalStrategy.EPOCH,
    report_to="none",
    fp16=True,
    label_names=["start_positions", "end_positions"]
)

# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset['train'],
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=default_data_collator
)

# Fine-tune only when requested in the global settings
if TRAIN:
    trainer.train()

# Apmācītā modeļa analīze

In [None]:
def plot_results(training_history):
    """Plot training and validation loss against the training step.

    Args:
        training_history: An iterable of log entries (dicts). Entries with a
            'loss' key contribute a training point, entries with an
            'eval_loss' key contribute a validation point. Each point is
            placed at the entry's own 'step'; when an entry has no 'step',
            its 1-based position within its own series is used instead.
    """
    # Each series keeps its own x values, so validation points are drawn at
    # the step at which they were measured, and x/y lengths always match.
    train_steps, training_losses = [], []
    eval_steps, validation_losses = [], []

    for entry in training_history:
        if 'loss' in entry:  # Training loss
            train_steps.append(entry.get('step', len(train_steps) + 1))
            training_losses.append(entry['loss'])

        if 'eval_loss' in entry:  # Validation loss
            eval_steps.append(entry.get('step', len(eval_steps) + 1))
            validation_losses.append(entry['eval_loss'])

    # Build the figure
    plt.figure(figsize=(12, 6))

    if training_losses:
        plt.plot(train_steps, training_losses, label='Apmācības zaudējums', marker='o')

    if validation_losses:
        plt.plot(eval_steps, validation_losses, label='Validācijas zaudējums', marker='x', linestyle='--')

    plt.xlabel('Soļu skaits')
    plt.ylabel('Zaudējums')
    plt.title('Apmācības un validācijas zaudējums apmācības laikā')
    plt.legend()
    plt.grid(True)
    plt.show()

def plot_metrics_history(metrics_history):
    """Plot the 'exact' and 'f1' values of every recorded evaluation.

    ``metrics_history`` is a sequence of dicts with the keys 'exact' and 'f1';
    the x axis is the 1-based position of each dict in that sequence.
    """
    iteration_numbers = [position + 1 for position in range(len(metrics_history))]
    plt.figure(figsize=(12, 8))

    # Collect both series first, then draw them in the same order
    series = {
        legend_label: [record[key] for record in metrics_history]
        for legend_label, key in (('Exact Match', 'exact'), ('F1', 'f1'))
    }
    for legend_label, values in series.items():
        plt.plot(iteration_numbers, values, label=legend_label, marker='o')

    plt.xlabel('Iterācija')
    plt.ylabel('Rezultāts')
    plt.title('Vērtēšanas metriku rezultāti atkarībā no iterācijas')
    plt.legend()
    plt.grid(True)
    plt.show()

# Run only if the model was fine-tuned in this session; otherwise the evaluation
# is done in the "Neapmācīta modeļa analīze" section
if TRAIN:
    plot_results(trainer.state.log_history)
    print()
    plot_metrics_history(metrics_history)

# Apmācītā modeļa saglabāšana

In [None]:
# Save the fine-tuned weights (state dict only) to '<model_name>-modelis.pth' in the working directory
if TRAIN:
    model_weights = f'{model_name}-modelis.pth'
    torch.save(model.state_dict(), model_weights)

# Neapmācīta modeļa analīze



In [None]:
if FINE_TUNED:
    !wget model_link

    # Ielādējam saglabātos svarus
    model.load_state_dict(torch.load(model_weight_file))

    # Pārbaudām CUDA pieejamību un uzstādām modeli uz atbilstošās ierīces
    model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

if TRAIN == False:
    trainer.evaluate(val_dataset)
    print(f'Results: {metrics_history}')

# "Čatbota" saskarne

In [None]:
def generate_text(model, tokenizer, input_text, max_length=150):
    """Sample one continuation of ``input_text`` from ``model`` and return it decoded.

    The prompt is encoded with ``tokenizer``, moved to ``model.device`` together
    with an all-ones attention mask, and passed to ``model.generate`` with
    sampling enabled. ``max_length`` is added to the prompt's token count to
    form the generation length limit.
    """
    # Encode the prompt and build a mask that attends to every prompt token
    prompt_ids = tokenizer.encode(input_text, return_tensors='pt')
    mask = torch.ones(prompt_ids.shape, dtype=torch.long)

    target_device = model.device
    prompt_ids = prompt_ids.to(target_device)
    mask = mask.to(target_device)

    # Sampling configuration handed to the model
    sampling_options = {
        'attention_mask': mask,
        'max_length': max_length + len(prompt_ids[0]),
        'num_return_sequences': 1,
        'no_repeat_ngram_size': 2,
        'temperature': 0.9,
        'top_k': 50,
        'top_p': 0.95,
        'pad_token_id': tokenizer.eos_token_id,
        'do_sample': True,
    }
    sequences = model.generate(prompt_ids, **sampling_options)

    # Decode the first (only) returned sequence
    return tokenizer.decode(sequences[0], skip_special_tokens=True)

def chat_with_model(model, tokenizer):
    """Run an interactive prompt/response loop until the user enters 'q' (case-insensitive)."""
    print("Sāciet tērzēšanu ar modeli (ierakstiet 'q', lai izietu):")

    user_message = input("You: ")
    while user_message.lower() != 'q':
        reply = generate_text(model, tokenizer, user_message)
        print("Bot:", reply)
        user_message = input("You: ")

# Start the interactive chat only when it is enabled in the global settings
if CHAT:
    chat_with_model(model, tokenizer)