<a href="https://colab.research.google.com/github/qte77/ML/blob/main_CU-042621/HF-WnB-PoC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# HuggingFace PoC

## Links
* [Matthias Bussonnier pip_magic repository](https://github.com/Carreau/pip_magic), e.g. `!jupyter kernelspec list`
* [Installing Python Packages from a Jupyter Notebook](https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/)

## System info

In [None]:
# List every python/pip/pip3 the notebook's shell can resolve, then the
# Jupyter kernels registered on this machine.
!type -a python
!type -a pip
!type -a pip3
!jupyter kernelspec list

## Install and load modules

In [None]:
import sys

In [None]:
#!{sys.executable} -m pip3 install --upgrade -r requirements.txt

In [None]:
# Install through the kernel's own interpreter (sys.executable) so packages
# land in the environment this notebook runs in. There is no `pip3` module,
# hence `-m pip`.
!{sys.executable} -m pip install -qqq wandb
# NOTE(review): original note read "remove version from wandb" - presumably
# the preinstalled folium is swapped for 0.2.1 to resolve a dependency
# conflict; confirm this is still needed.
!{sys.executable} -m pip  uninstall -yyy -qqq folium
!{sys.executable} -m pip  install -qqq 'folium == 0.2.1'

In [None]:
# Hugging Face datasets/transformers plus bert_score, installed quietly into the kernel's environment
!{sys.executable} -m pip install -qqq datasets transformers bert_score
# Optional -> install latest version from source
# (use `-m pip`; there is no `pip3` module)
#!{sys.executable} -m pip install -qqq git+https://github.com/huggingface/transformers

In [None]:
# Print the exact package versions installed for this kernel's interpreter
!{sys.executable} -m pip freeze

In [None]:
from datasets import load_dataset, list_datasets, list_metrics
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_metric
import wandb
import bert_score
import numpy as np
import os

In [None]:
#run_glue.py
#!wget https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/text-classification/run_glue.py

## Connect to WandB
* [Get key here](https://wandb.ai/authorize) or in section `API keys` of [the settings](https://wandb.ai/settings)
* specify parameters like the dataset to use
* specify wandb-env

In [None]:
# Authenticate the W&B CLI; --relogin asks for the API key again.
!wandb login --relogin
# Python API alternatives, currently disabled:
#wandb.login()
#wandb.finish()

In [None]:
# Run configuration shared by the rest of the notebook:
#   taskname - key into the per-task lookup tables (models / dataset_param)
#   project  - W&B project the run is logged to
#   entity   - W&B entity (user or team) owning the project
taskname, project, entity = 'mrpc', 'BERT-MRPC-GPU', 'ba'

In [None]:
#https://docs.wandb.ai/guides/track/advanced/environment-variables
%env WANDB_WATCH="all"
%env WANDB_LOG_MODEL=true
%env WANDB_PROJECT=f'{project}'
%env WANDB_ENTITY=f'{entity}'
%env WANDB_SAVE_CODE=true
#avoid error:
#The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
%env TOKENIZERS_PARALLELISM=true

## Use run_glue.py as wrapper

Note: when the wrapper is used, W&B only logs the system metrics, not the model's metrics.
This needs further research; alternatively, use the API directly (see the next section).

In [None]:
models = {
    'yahoo': 'distilbert-base-uncased',
    'mrpc': 'bert-base-uncased'
}
model = models.get(taskname)

!python run_glue.py \
  --report_to wandb \
  --model_name_or_path model \
  --task_name taskname \
  --learning_rate 1e-4 \
  --do_train \
  --do_eval \
  --max_steps 300 \
  --logging_steps 30 \
  --evaluation_strategy steps \
  --output_dir f'/tmp/{taskname}' \
  --overwrite_output_dir \
  --run_name taskname
#  --remove_unused_columns False

wandb.finish()

## Use API without wrapper

### Load dataset

In [None]:
# Per-task settings, in order:
#   [dataset path, dataset config name, label column to rename, checkpoint]
# A config name of None selects the dataset's default configuration; an empty
# string would be looked up as a (non-existent) config called ''.
dataset_param = {
    'yahoo': ['yahoo_answers_topics', None, 'topic', 'distilbert-base-uncased'],
    'mrpc': ['glue', 'mrpc', 'label', 'bert-base-uncased']
}
# Fail with a readable message instead of "cannot unpack non-iterable NoneType".
if taskname not in dataset_param:
  raise KeyError("Unknown taskname '%s'. Known tasks: %s" % (taskname, sorted(dataset_param)))
ds_col, ds_name, dscol_rename, mod = dataset_param[taskname]

In [None]:
#list_datasets()
# load_dataset hands back the splits keyed by name; the label set and its
# size are derived from the 'train' split only.
dataset = load_dataset(ds_col, ds_name)
train_split = dataset['train']
label_list = train_split.unique(dscol_rename)
num_labels = len(label_list)

In [None]:
# Display the column names of the loaded dataset before any renaming
dataset.column_names

In [None]:
# Rename the task's label/topic column to 'labels' in every split and show
# the resulting columns; a split that lacks the column is reported and left
# untouched.
for split_name in dataset:
  split_columns = dataset[split_name].column_names
  if dscol_rename not in split_columns:
    print("Attribute/Feature/Column '%s' not found in '%s'. Found:" % (dscol_rename, split_name))
  else:
    dataset[split_name] = dataset[split_name].rename_column(dscol_rename, 'labels')

  print(dataset[split_name].column_names)

### Tokenize dataset

In [None]:
# Ideas to try: max_length=X, use_fast=False
# NOTE(review): truncation/padding are normally given when the tokenizer is
# called (as the tokenization cell does for truncation) - confirm they have
# any effect when passed to from_pretrained.
tokenizer = AutoTokenizer.from_pretrained(
    mod,
    use_fast=True,
    truncation=True,
    padding=True,
)

In [None]:
#sample_input = dataset['train'][0]
#sample_input
#tokenizer(sample_input)

In [None]:
# Tokenize the text column(s) of every split and drop the raw columns
# afterwards. Padding is not applied here.
# Idea to try: tokenizer(padding="max_length", max_length=X)
#
# Per-task settings: (text columns fed to the tokenizer, columns to remove).
tokenize_param = {
    'yahoo': (
        ['question_title'],
        ['question_title', 'id', 'best_answer', 'question_content'],
    ),
    'mrpc': (
        ['sentence1', 'sentence2'],
        ['sentence1', 'sentence2', 'idx'],
    ),
}
# Stop here for an unknown task: merely printing would let the cell continue
# into a NameError (fresh kernel) or silently reuse a stale tokenized_datasets.
if taskname not in tokenize_param:
  raise ValueError("No tokenization settings for taskname '%s'." % taskname)
text_cols, drop_cols = tokenize_param[taskname]

# One text column -> single-sentence input, two -> sentence-pair input.
tokenized_datasets = dataset.map(
    lambda batch: tokenizer(*[batch[col] for col in text_cols], truncation=True),
    batched=True,
).remove_columns(drop_cols)

tokenized_datasets.column_names

### Load model

In [None]:
# Load the checkpoint with a classification head sized to the task's label set.
model = AutoModelForSequenceClassification.from_pretrained(mod, num_labels=num_labels)
print(model.config)

# `model.bert` only exists on BERT checkpoints (DistilBERT exposes
# `model.distilbert`); `base_model` resolves the backbone for either.
backbone = model.base_model
print(backbone.embeddings)

# The layer stack is called `encoder` on BERT and `transformer` on DistilBERT.
layer_stack = getattr(backbone, 'encoder', None)
if layer_stack is None:
  layer_stack = getattr(backbone, 'transformer', None)
if layer_stack is not None:
  print(layer_stack.layer[0])

In [None]:
#yahoo
# Disabled helper for the yahoo task, kept inside a string literal so it is
# not executed. It relies on torch (import commented out below) and moves the
# model and inputs to the GPU with .cuda().
# NOTE(review): the signature names its parameter `tokenize`, but the body
# calls `tokenizer` - fix before re-enabling.
#import torch
'''
def get_topic(sentence, tokenize=tokenizer, model=model):
    # tokenize the input
    inputs = tokenizer(sentence, return_tensors='pt')
    # ensure model and inputs are on the same device (GPU)
    inputs = {name: tensor.cuda() for name, tensor in inputs.items()}
    model = model.cuda()
    # get prediction - 10 classes "probabilities" (not really true because they still need to be normalized)
    with torch.no_grad():
        predictions = model(**inputs)[0].cpu().numpy()
    # get the top prediction class and convert it to its associated label
    top_prediction = predictions.argmax().item()
    return dataset['train'].features['labels'].int2str(top_prediction)
'''

In [None]:
#yahoo
#get_topic('Why is cheese so much better with wine?')

### Training Arguments
[HF args documentation](https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments)

In [None]:
lr = 5e-6
max_steps = 3000

# Evaluate five times over the run and checkpoint at every second evaluation.
eval_steps = round(max_steps / 5)
save_steps = round(eval_steps * 2)

#PyTorch: setting up devices
training_kwargs = dict(
    report_to='wandb',               # enable logging to W&B
    output_dir=taskname,
    overwrite_output_dir=True,
    evaluation_strategy='steps',     # evaluate every `eval_steps` steps
    learning_rate=lr,
    max_steps=max_steps,
    logging_steps=100,               # log every 100 steps
    eval_steps=eval_steps,
    save_steps=save_steps,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    run_name=taskname,               # name of the W&B run
    # remove_unused_columns=True     # avoid warning 'The following columns in the evaluation set  don't have a corresponding argument'
)
args = TrainingArguments(**training_kwargs)
args

### Custom Metrics


In [None]:
#list_metrics()
#bertscore = load_metric('bertscore')
# Names of the metrics to evaluate; load_metric fetches one metric object per
# name, kept in the same order as `metrics_to_load`.
metrics_to_load = ['accuracy','precision','recall','f1','mae','mse']
# Holds the most recent result of each metric (filled during evaluation).
metrics_calc = []
loaded_metrics = [load_metric(metric_name) for metric_name in metrics_to_load]

In [None]:
#define custom metrics
#datasets.list_metrics()
#F1 = 2 * (precision * recall) / (precision + recall)
#Recall = TP / (TP + FN)
#bertscore: bert_score needs to be installed
#   cosine-similarity, precision, recall, F1
#glue: cfg names ["sst2", "mnli", "mnli_mismatched", "mnli_matched", "cola", "stsb", "mrpc", "qqp", "qnli", "rte", "wnli", "hans"]'
#glue mrpc: accuracy, f1

def compute_metrics(eval_pred):
  """Compute every metric in `loaded_metrics` for one evaluation pass.

  Args:
    eval_pred: pair `(predictions, labels)`. `predictions` holds one row of
      per-class scores per example; the arg-max over axis 1 is taken as the
      predicted class.

  Returns:
    The result dict of the 'accuracy' metric. All metric results are also
    printed, sent to W&B via `wandb.log`, and stored in the module-level
    `metrics_calc` list (same order as `metrics_to_load`).
  """
  predictions, labels = eval_pred
  # Number of classes, read from the score matrix before it is collapsed.
  num_classes = np.asarray(predictions).shape[1]
  predictions = np.argmax(predictions, axis=1) #predictions.argmax(-1)

  # precision/recall/f1 default to average='binary', which fails for more than
  # two classes; switch to macro averaging in that case only.
  averaged_metrics = ('precision', 'recall', 'f1')
  average = 'binary' if num_classes <= 2 else 'macro'

  results = []
  for name, met in zip(metrics_to_load, loaded_metrics):
    extra = {'average': average} if name in averaged_metrics else {}
    results.append(met.compute(predictions=predictions, references=labels, **extra))

  # Slice assignment replaces the contents of the shared list in place.
  # Assigning by index (`metrics_calc[i] = ...`) raised IndexError because the
  # list starts out empty.
  metrics_calc[:] = results

  print("*************")
  for met in metrics_calc:
    wandb.log(met)
    print(met)
  print("*************")

  return metrics_calc[metrics_to_load.index('accuracy')]

### Build trainer

In [None]:
# Bundle the model, training arguments, data splits and metric callback.
trainer_kwargs = dict(
    model=model,
    args=args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,              # for padding batched data
    compute_metrics=compute_metrics,  # for custom metrics
)
trainer = Trainer(**trainer_kwargs)

In [None]:
# Open the W&B run for the configured project/entity and save the code with it.
wandb.init(
    project=project,
    entity=entity,
    save_code=True,
)
#!wandb init

### Pre-evaluate

In [None]:
#trainer.evaluate()

### Train

In [None]:
#import torch
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Print a short run summary between two separator lines before training.
banner = "*************"
print(banner)
print("Taskname: %s, Labels#: %s, lr: %s" % (taskname, num_labels, lr))
#print(os.environ)
print(banner)

trainer.train()

# Close the W&B run once training has returned.
wandb.finish()

### Predict

In [None]:
#yahoo
#get_topic('Why is cheese so much better with wine?')
# Close the W&B run (the training cell also calls this after trainer.train()).
wandb.finish()