## Preamble: Execute this if checking any answer!

In [None]:
# automatically re-import modules before each cell is executed, so that edits to the
# external bootcamp scripts are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [None]:
# print the current working directory of the notebook (shell command)
!pwd

In [None]:
import sys
import os
import pathlib

import numpy as np
import xarray

# location of scripts folder for bootcamp
sys.path.append("/p/home/jusers/ehlert1/juwels/notebooks/bootcamp_testing/scripts")
import normalize_text_bootcamp
import utils_bootcamp
import plotting
import dataset_bootcamp

import re

In [None]:
# may take some time ...
import copy
import functools
import logging
import os
import pathlib
import re
import string
import sys

logging.basicConfig(level=logging.INFO)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xarray

# Pytorch modules
import torch
import torch.nn.functional

# scikit-learn modules
import sklearn.metrics
import sklearn.model_selection

# "Hugging Face" modules
import datasets
import transformers

In [None]:
# path to the `.nc` file holding the tweets dataset (despite the name this is a file, not a folder)
FOLDER_TO_TWEETS = "/p/project/training2223/a2/data/tweets/tweets_2017_era5_normed_filtered.nc"
# alternative location of the dataset, kept for reference:
# FOLDER_TO_TWEETS = "../../data/tweets/tweets_2017_normalized.nc"

In [None]:
# list the files available in the tweets data folder (shell command)
!ls /p/project/training2223/a2/data/tweets

## Task 3

In [None]:
def load_tweets_dataset():
    """
    loads the tweets dataset stored at `FOLDER_TO_TWEETS`

    Returns
    -------
    dataset read with `xarray.load_dataset` after it has been passed through
    `dataset_bootcamp.reset_index_coordinate`
    """
    return dataset_bootcamp.reset_index_coordinate(xarray.load_dataset(FOLDER_TO_TWEETS))

In [None]:
# read the tweets dataset into memory; all following tasks work on `ds_tweets`
ds_tweets = load_tweets_dataset()

In [None]:
# again define labels: a tweet is labelled "raining" when its `tp_h` value exceeds 1e-8
# NOTE(review): `tp_h` is presumably the hourly total precipitation from ERA5 — confirm
key_tp = "tp_h"
ds_tweets["raining"] = (["index"], ds_tweets[key_tp].values > 1e-8)

In [None]:
# split the positional indices of all tweets into 80 % training and 20 % test indices;
# the split is shuffled with a fixed seed (reproducible) and stratified on the "raining" label
indices_train, indices_test = sklearn.model_selection.train_test_split(
    np.arange(ds_tweets["index"].shape[0]),
    random_state=42,
    shuffle=True,
    test_size=0.2,
    stratify=ds_tweets.raining.values,
)

## Task 4

In [None]:
# folder of the pretrained model; the tokenizer and the base configuration are both loaded from it
model_nm = (
    "/p/project/training2223/a2/models/deberta-v3-small/"  # model repo downloaded from Hugging Face to the cluster
)

tokenizer = transformers.AutoTokenizer.from_pretrained(model_nm)
# base model configuration with two output labels
db_config_base = transformers.AutoConfig.from_pretrained(model_nm, num_labels=2)

In [None]:
# define functions to convert the dataset to a format that is used by Hugging Face


def tok_func(x, tokenizer):
    """
    applies the tokenizer (with padding switched on) to the texts stored under 'inputs'

    Parameters:
    ----------
    x: mapping that provides the texts under the key 'inputs'
    tokenizer: callable used for tokenization

    Returns
    -------
    whatever `tokenizer` returns for the texts
    """
    texts = x["inputs"]
    return tokenizer(texts, padding=True)


def get_dataset(ds, tok_func, tokenizer, indices_train, indices_test, train=True):
    """
    builds the tokenized Hugging Face 'dataset' from the tweets dataset

    Parameters:
    ----------
    ds: dataset providing the fields 'text_normalized' and 'raining'
    tok_func: function used for tokenization, called as `tok_func(batch, tokenizer=tokenizer)`
    tokenizer: tokenizer handed over to `tok_func`
    indices_train: indices corresponding to the training set
    indices_test: indices corresponding to the test set
    train: if True the result is split into "train" and "test", otherwise both index arguments are ignored

    Returns
    -------
    `datasets.DatasetDict` with the keys "train" and "test" if `train` is True,
    else the complete tokenized Hugging Face dataset
    """
    # Hugging Face datasets can be created directly from a pandas dataframe, so go via pandas
    # and rename the columns to 'inputs' (text) and 'label' (target)
    column_names = {"text_normalized": "inputs", "raining": "label"}
    frame = ds[["text_normalized", "raining"]].to_pandas().rename(columns=column_names)
    tokenize_batch = functools.partial(tok_func, tokenizer=tokenizer)
    tokenized = datasets.Dataset.from_pandas(frame).map(tokenize_batch, batched=True)
    if not train:
        return tokenized
    return datasets.DatasetDict(
        {"train": tokenized.select(indices_train), "test": tokenized.select(indices_test)}
    )

In [None]:
# create Hugging Face 'dataset': a `DatasetDict` with the tokenized "train" and "test" splits
dataset = get_dataset(ds_tweets, tok_func, tokenizer, indices_train, indices_test)

## Task 5

In [None]:
# hyper-parameters used for fine-tuning the model
parameters = {
    "learning_rate": 8e-5,
    "batch_size": 16,
    "weight_decay": 0.01,
    "epochs": 1,
    "warmup_ratio": 0.1,
    "cls_dropout": 0.3,
    "lr_scheduler_type": "cosine",
}

# folder handed to the trainer as its output directory
FOLDER_TO_OUTPUT = "/p/project/training2223/a2/models/output_debertav3_tweets_2017/"

In [None]:
def get_model(params, db_config_base, model_nm):
    """
    function to retrieve model, format follows Hugging Face convention (parameter -> 'params')

    The base configuration is copied before it is modified, so `db_config_base`
    itself stays untouched and can be reused for further models.

    Parameters:
    ----------
    params: dictionary of hyper-parameters providing the key "cls_dropout",
        or None to keep the dropout setting of the base configuration
    db_config_base: default model configuration
    model_nm: model folder

    Returns
    -------
    sequence classification model with two labels loaded from `model_nm`
    """
    # work on a copy: updating the shared base configuration in place would leak
    # the settings of this call into every later use of `db_config_base`
    db_config = copy.deepcopy(db_config_base)
    if params is not None:
        db_config.update({"cls_dropout": params["cls_dropout"]})
    db_config.update({"num_labels": 2})
    model = transformers.AutoModelForSequenceClassification.from_pretrained(model_nm, config=db_config)
    return model


def compute_metrics(eval_pred):
    """
    f1 score of each of the two labels, format follows Hugging Face convention

    Parameters:
    ----------
    eval_pred: pair (predictions, labels); the predicted class is taken as the
        argmax of the predictions along the last axis

    Returns
    -------
    dictionary with the f1 scores under the keys "f1_not_raining" and "f1_raining"
    """
    scores, labels = eval_pred
    predicted_classes = scores.argmax(axis=-1)
    report = sklearn.metrics.classification_report(
        labels,
        predicted_classes,
        target_names=["not raining", "raining"],
        output_dict=True,
    )
    # name of each class in the report -> key used in the returned dictionary
    class_to_metric = {"not raining": "f1_not_raining", "raining": "f1_raining"}
    return {metric: report[class_name]["f1-score"] for class_name, metric in class_to_metric.items()}


def get_trainer(dataset, db_config_base, model_nm, FOLDER_TO_OUTPUT, parameters):
    """
    initializes `transformers.Trainer`, which is used to train models with Hugging Face

    Hyper parameters are here assigned to model.
    The tokenizer and `compute_metrics` are taken from the notebook's module-level names.

    Parameters:
    ----------
    dataset: dataset in format required by Hugging Face, providing "train" and "test"
    db_config_base: default model configurations
    model_nm: model folder
    FOLDER_TO_OUTPUT: folder where trained model, tokenizer,... will be saved
    parameters: dictionary of hyper-parameters

    Returns
    -------
    trainer with assigned parameters used for training
    """
    args = transformers.TrainingArguments(
        FOLDER_TO_OUTPUT,
        learning_rate=parameters["learning_rate"],
        warmup_ratio=parameters["warmup_ratio"],
        lr_scheduler_type=parameters["lr_scheduler_type"],
        disable_tqdm=False,
        fp16=True,
        evaluation_strategy="epoch",
        per_device_train_batch_size=parameters["batch_size"],
        per_device_eval_batch_size=parameters["batch_size"],
        num_train_epochs=parameters["epochs"],
        weight_decay=parameters["weight_decay"],
        report_to="none",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    def _model_init(trial=None):
        # `transformers.Trainer` calls a one-argument `model_init` with the current trial, which is
        # None outside of a hyper-parameter search. Fall back to `parameters` in that case, otherwise
        # `get_model` receives None and model hyper-parameters such as "cls_dropout" are never applied.
        params = parameters if trial is None else trial
        return get_model(params, db_config_base=db_config_base, model_nm=model_nm)

    return transformers.Trainer(
        model_init=_model_init,
        args=args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        # NOTE(review): this is the module-level `tokenizer`, not an argument of this function
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

## Task 6

In [None]:
# set up the trainer for the pretrained model with the hyper-parameters defined in Task 5
trainer = get_trainer(dataset, db_config_base, model_nm, FOLDER_TO_OUTPUT, parameters)

In [None]:
# fine-tune the model; the trainer is configured to evaluate and save once per epoch
trainer.train()

## Task 7

In [None]:
# only needed if a saved model has to be loaded
def load_saved_trained_model(ds, FOLDER_TO_OUTPUT, db_config_base, model_nm, parameters):
    """
    sets up a trainer from a model saved in `FOLDER_TO_OUTPUT`

    Tokenizer and configuration are both read from `FOLDER_TO_OUTPUT`; the values passed
    as `db_config_base` and `model_nm` are not used. The train/test split relies on the
    notebook-level `indices_train` and `indices_test`.

    Parameters:
    ----------
    ds: dataset that is converted with `get_dataset`
    FOLDER_TO_OUTPUT: folder of the saved model, also used as output folder of the trainer
    db_config_base: ignored, replaced by the configuration found in `FOLDER_TO_OUTPUT`
    model_nm: ignored, replaced by `FOLDER_TO_OUTPUT`
    parameters: dictionary of hyper-parameters

    Returns
    -------
    trainer as returned by `get_trainer`
    """
    saved_model_folder = FOLDER_TO_OUTPUT
    saved_tokenizer = transformers.AutoTokenizer.from_pretrained(saved_model_folder)
    saved_config = transformers.AutoConfig.from_pretrained(saved_model_folder, num_labels=2)
    tokenized = get_dataset(ds, tok_func, saved_tokenizer, indices_train, indices_test)
    return get_trainer(tokenized, saved_config, saved_model_folder, FOLDER_TO_OUTPUT, parameters)


# trainer built from a saved checkpoint inside the output folder
# NOTE(review): the checkpoint name is hard-coded; presumably it has to match the number of
# training steps of the run that wrote it — confirm before re-using with other settings
trainer_evaluate = load_saved_trained_model(
    ds_tweets,
    FOLDER_TO_OUTPUT + "checkpoint-4605/",
    db_config_base,
    model_nm,
    parameters,
)

In [None]:
# obtain test dataset in Hugging Face format
test_ds = get_dataset(
    ds_tweets.sel(index=indices_test),
    tok_func,
    tokenizer,
    indices_train,
    indices_test,
    train=False,  # no train/test split: the whole (already selected) dataset is returned, indices are unused
)

In [None]:
# make predictions: turn the raw model outputs for the test set into class probabilities;
# the softmax dimension is stated explicitly (last axis = classes), since relying on the
# implicit dimension is deprecated in PyTorch and triggers a warning
# NOTE(review): this uses `trainer` from the training cell, not `trainer_evaluate` loaded from the checkpoint
preds = torch.nn.functional.softmax(torch.Tensor(trainer.predict(test_ds).predictions), dim=-1).numpy()

In [None]:
# part of the tweets dataset belonging to the test split (same selection as used for `test_ds`)
ds_test = ds_tweets.sel(index=indices_test)

## Task 7.1

In [None]:
# compare the true labels of the test set with the predicted class
# (class with the highest probability for each tweet)
%matplotlib inline
truth = ds_test.raining.values
prediction = preds.argmax(-1)
report = plotting.analysis.check_prediction(truth, prediction)
print(report)

## Task 7.2

In [None]:
truth = ds_test.raining.values
# second column of `preds` = probability of class 1, i.e. "raining"
prediction_probability = preds[:, 1]

# ROC plot of the predicted "raining" probability against the true labels
plotting.analysis.plot_roc(truth, prediction_probability)

## Task 7.3

In [None]:
truth = ds_test.raining.values
# second column of `preds` = probability of class 1, i.e. "raining"
prediction_probability = preds[:, 1]

# plot the true labels against the predicted "raining" probability, using 10 bins;
# `filename=None` is passed, presumably so that the figure is not written to disk — confirm
plotting.analysis.plot_predictions_confidence(
    truth,
    prediction_probability,
    bins=10,
    x_label="raining",
    y_label="preds_raining",
    filename=None,
)