In [1]:
# For Colab: install FARM and the preprocessing dependencies, then fetch the NSMC data.
# torch is pinned to the CUDA 10.1 build of 1.6.0 before farm 0.5.0 is installed.
# NOTE(review): the recorded output reports that the preinstalled torchvision/torchtext
# require torch==1.8.0, so they are left incompatible after this downgrade.
!pip install torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
!pip install farm==0.5.0
# NOTE(review): emoji and soynlp are upgraded (-U) without a version pin, so the
# installed versions depend on when this cell is run.
!pip install -U -q emoji soynlp
# Clone the NSMC repository (creates the nsmc directory listed further below).
!git clone https://github.com/e9t/nsmc

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.6.0+cu101
[?25l  Downloading https://download.pytorch.org/whl/cu101/torch-1.6.0%2Bcu101-cp37-cp37m-linux_x86_64.whl (708.0MB)
[K     |████████████████████████████████| 708.0MB 26kB/s 
[31mERROR: torchvision 0.9.0+cu101 has requirement torch==1.8.0, but you'll have torch 1.6.0+cu101 which is incompatible.[0m
[31mERROR: torchtext 0.9.0 has requirement torch==1.8.0, but you'll have torch 1.6.0+cu101 which is incompatible.[0m
Installing collected packages: torch
  Found existing installation: torch 1.8.0+cu101
    Uninstalling torch-1.8.0+cu101:
      Successfully uninstalled torch-1.8.0+cu101
Successfully installed torch-1.6.0+cu101
Collecting farm==0.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/a3/e4/2f47c850732a1d729e74add867e967f058370f29a313da05dc871ff8465e/farm-0.5.0-py3-none-any.whl (207kB)
[K     |████████████████████████████████| 215kB 7.0MB/s 
Collecting flask-restpl

In [None]:
import os
import re
import emoji
import pandas as pd
from pathlib import Path
from soynlp.normalizer import repeat_normalize

def read_data(path:str, header=None):
    """Load a tab-separated file into a DataFrame.

    Args:
        path: Location of the TSV file; passed straight to ``pandas.read_csv``.
        header: Forwarded unchanged as ``read_csv``'s ``header`` argument.
            The default ``None`` treats the first line as data, not column names.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    frame = pd.read_csv(path, header=header, sep='\t')
    return frame

def _emoji_characters():
    """Return every emoji known to the installed ``emoji`` package, joined into one string."""
    # The emoji table moved between releases of the package: newer releases expose
    # EMOJI_DATA, some expose UNICODE_EMOJI nested by language code, and old ones
    # expose UNICODE_EMOJI as a flat {emoji: name} mapping. Calling .keys() on the
    # nested form would yield language codes instead of emoji characters.
    table = getattr(emoji, 'EMOJI_DATA', None)
    if table is None:
        table = emoji.UNICODE_EMOJI
        if isinstance(table.get('en'), dict):
            table = table['en']
    return ''.join(table.keys())


def _clean_patterns():
    """Build the two regexes used by ``clean`` once and reuse them on later calls."""
    cached = getattr(_clean_patterns, '_cache', None)
    if cached is None:
        emojis = _emoji_characters()
        # Matches runs of characters that are NOT in the allowed set
        # (listed punctuation, ASCII, the ㄱ-힣 range, and emojis).
        pattern = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-힣{emojis}]+')
        url_pattern = re.compile(
            r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
        cached = _clean_patterns._cache = (pattern, url_pattern)
    return cached


def clean(x):
    """Normalize one raw review string.

    Steps, in order: replace runs of disallowed characters with a single space,
    delete http(s) URLs, strip surrounding whitespace, then apply soynlp's
    ``repeat_normalize`` with ``num_repeats=2``.

    Args:
        x: The text to clean (must already be a ``str``).

    Returns:
        The cleaned string.
    """
    # Patterns are cached: rebuilding the emoji character class for every row
    # of the dataset is needless work.
    pattern, url_pattern = _clean_patterns()

    x = pattern.sub(' ', x)
    x = url_pattern.sub('', x)
    x = x.strip()
    x = repeat_normalize(x, num_repeats=2)
    return x

def preprocess_dataframe(df:pd.DataFrame):
    r"""
    Clean the ``document`` text and convert the numeric ``label`` to a string label.

    Changed the code
    source from: https://colab.research.google.com/drive/1IPkZo1Wd-DghIOK6gJpcb0Dv4_Gv2kXB

    Args:
        df: Frame with a ``document`` column (text) and a ``label`` column (0 or 1).
            The input frame is left untouched.

    Returns:
        A new DataFrame in which ``document`` has been passed through ``clean`` and
        ``label`` has been mapped 0 -> "bad", 1 -> "good". Labels other than 0/1
        become ``None``.
    """

    label_dict = {0:"bad", 1:"good"}
    # Work on a copy so re-running the cell (or reusing the raw frame) is safe.
    out = df.copy()
    # Missing documents are turned into empty strings first; otherwise str(NaN)
    # would inject the literal text "nan" into the dataset.
    out['document'] = out['document'].fillna('').apply(lambda x: clean(str(x)))
    out['label'] = out['label'].apply(label_dict.get)
    return out

# Preprocess both NSMC splits, then export only the label/document columns as TSV.
EXPORT_COLUMNS = ["label", "document"]

df_train = preprocess_dataframe(read_data("./nsmc/ratings_train.txt", header=0))
df_test = preprocess_dataframe(read_data("./nsmc/ratings_test.txt", header=0))

for split_frame, out_path in ((df_train, "./nsmc/train.tsv"), (df_test, "./nsmc/test.tsv")):
    split_frame[EXPORT_COLUMNS].to_csv(out_path, sep="\t", index=False)

In [2]:
# List the contents of the nsmc directory.
!ls nsmc

code		  ratings_train.txt  raw	synopses.json
ratings_test.txt  ratings.txt	     README.md


In [3]:
import sys
import torch
from pathlib import Path
from farm.modeling.tokenization import Tokenizer
from farm.data_handler.processor import TextClassificationProcessor
from farm.data_handler.data_silo import DataSilo
from farm.modeling.language_model import LanguageModel
from farm.modeling.prediction_head import TextClassificationHead
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.optimization import initialize_optimizer
from farm.train import Trainer
from farm.utils import MLFlowLogger

# Put the parent of the working directory on the import path.
repo_path = Path.cwd().parent
sys.path.append(str(repo_path))

# Experiment tracking through FARM's MLflow logger.
ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(experiment_name="FARM_tutorial", run_name="NSMC")

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Devices available: {}".format(device))

03/29/2021 08:08:42 - INFO - farm.modeling.prediction_head -   Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


<center><img src="https://drive.google.com/uc?id=1hbtUClFoXg45IbViZoFRLnnDGVlr9Dlb" alt="Fine-tuning" width="30%" height="30%"></center>

# FARM

> Framework for Adapting Representation Models

Fine-tuning에 최적화된 도구

## Core Features

- **Easy fine-tuning of language models** to your task and domain language
- **Speed**: AMP(Automatic Mixed Precision) optimizers (~35% faster) and parallel preprocessing (16 CPU cores => ~16x faster)
- **Modular design** of language models and prediction heads
- Switch between heads or combine them for **multitask learning**
- **Full Compatibility** with HuggingFace Transformers' models and model hub
- **Smooth upgrading** to newer language models
- Integration of **custom datasets** via Processor class
- Powerful **experiment tracking** & execution
- **Checkpointing & Caching** to resume training and reduce costs with spot instances
- Simple **deployment** and **visualization** to showcase your model

<details>
<summary> AMP </summary>

**Reference**
- https://github.com/NVIDIA/apex
- https://forums.fast.ai/t/mixed-precision-training/20720

**mixed precision training이란**
- 처리 속도를 높이기 위한 FP16(16bit floating point)연산과 정확도 유지를 위한 FP32 연산을 섞어 학습하는 방법
- Tensor Core를 활용한 FP16연산을 이용하면 FP32연산 대비 절반의 메모리 사용량과 8배의 연산 처리량 & 2배의 메모리 처리량 효과가 있다
</details>

## Fine-tuning Process

<center><img src="https://drive.google.com/uc?id=1j9pn8Lpg7sy6S8Ubvq3E7JLWf28KvRt4" alt="Fine-tuning" width="50%" height="50%" align="center"></center>

Fine-tuning은 위 그림과 같은 순서로 진행된다.

* Load Data: 데이터를 알맞은 형식(json, csv 등)으로 불러온다.
* Create Dataset: 데이터세트(Dataset) 만들기
    * Tokenization: 텍스트를 토큰으로 나누고, 단어장(vocab)을 생성한다.
    * ToTensor: vocab에 해당하는 단어를 수치화하는 과정 (`input_ids` in transformers)
    * Attention Mask: 패딩계산을 피하기 위해 Attention 해야할 토큰만 masking(`attention_mask` in transformers)
* Create Dataloader: 훈련, 평가시 배치크기 단위로 데이터를 불러오는 객체
* Create Model:
    * Pretrained Language Model: 대량의 텍스트 데이터로 사전에 훈련된 모델
    * Fine-tuning Layer: Downstream Task에 맞춰서 변화
* Train Model
* Eval Model
* Inference

# NSMC 데이터 세트로 알아보기

In [None]:
# from src import read_data

# The earlier cells clone NSMC into ./nsmc and write train.tsv / test.tsv there,
# so look in the working directory first. The previous location (a sibling of the
# working directory) is kept as a fallback for layouts that store the data there.
DATA_PATH = Path("./nsmc")
if not (DATA_PATH / "train.tsv").exists():
    DATA_PATH = repo_path / "nsmc"
df = read_data(DATA_PATH / "train.tsv", header=0)
df.head(5)

## Processor & Data Silo

<center><img src="https://drive.google.com/uc?id=1XCc0AJpPBMFcC81NW0A6w0mpswZ2KU7h" alt="Fine-tuning" width="60%" height="50%" align="center"></center>

* **Processor**는 file 혹은 request를 PyTorch Dataset으로 만들어 주는 역할
* **Data Silo**는 train, dev, test sets를 관리하고, Processor의 function들을 이용해 각 set를 DataLoader로 변환한다.
    * **Samples**, **SampleBasket**은 raw document를 관리하는 객체이며 tokenized, features등 데이터를 저장하고 있다. 이렇게 하는 이유는 하나의 소스 텍스트(raw text)에서 여러개의 샘플을 생성할 수도 있기 때문이다(e.g. QA task)

In [None]:
PRETRAINED_MODEL_NAME_OR_PATH = "beomi/kcbert-base"  # Reference: https://github.com/Beomi/KcBERT
MAX_LENGTH = 300
LABEL_LIST = ["bad", "good"]
TRAIN_FILE = "train.tsv"
TEST_FILE = "test.tsv"
TASK_TYPE = "text_classification"
# NOTE(review): 8 looks consistent with the batch counts in the recorded training
# log further below — adjust to the memory of your GPU.
BATCH_SIZE = 8

# Tokenizer matching the pretrained checkpoint; casing is preserved.
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=PRETRAINED_MODEL_NAME_OR_PATH,
    do_lower_case=False,
)

# Processor: describes how the TSV files in DATA_PATH are read (columns,
# delimiter, header row) and turned into model inputs.
processor = TextClassificationProcessor(
    tokenizer=tokenizer,
    train_filename=TRAIN_FILE,
    test_filename=TEST_FILE,
    dev_split=0.1,
    header=0,
    max_seq_len=MAX_LENGTH,
    data_dir=str(DATA_PATH),
    label_list=LABEL_LIST,
    metric="acc",
    label_column_name="label",
    text_column_name="document",
    delimiter="\t"
)

# Data Silo: later cells read `data_silo.loaders` and call
# `data_silo.calculate_class_weights`, so it has to be created here.
data_silo = DataSilo(processor=processor, batch_size=BATCH_SIZE)

<center><img src="https://drive.google.com/uc?id=1DVPT_Rjv_SI4ggJZzqfPh0MgsMa1Q9El" alt="Fine-tuning" width="100%" height="50%" align="center"></center>

```plaintext
03/28/2021 22:12:15 - INFO - farm.data_handler.processor -   

      .--.        _____                       _      
    .'_\/_'.     / ____|                     | |     
    '. /\ .'    | (___   __ _ _ __ ___  _ __ | | ___ 
      "||"       \___ \ / _` | '_ ` _ \| '_ \| |/ _ \ 
       || /\     ____) | (_| | | | | | | |_) | |  __/
    /\ ||//\)   |_____/ \__,_|_| |_| |_| .__/|_|\___|
   (/\||/                             |_|           
______\||/___________________________________________                     

ID: 437-0
Clear Text: 
 	text_classification_label: good
 	text: 이 영화를 보고 두통이 나았습니다. ㅠ ㅠ
Tokenized: 
 	tokens: ['이', '영화를', '보고', '두', '##통이', '나', '##았습니다', '.', '[UNK]', '[UNK]']
 	offsets: [0, 2, 6, 9, 10, 13, 14, 18, 20, 22]
 	start_of_word: [True, True, True, True, False, True, False, False, True, True]
Features: 
 	input_ids: [2, 2451, 25833, 8198, 917, 11765, 587, 21809, 17, 1,
      1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
 	padding_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
 	segment_ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
 	text_classification_label_ids: [1]
```

## Modeling Layers: AdaptiveModel = LanguageModel + PredictionHead

<center><img src="https://drive.google.com/uc?id=1OLWdr8rh7ucpF9t55gzVeMawMBJbRiEC" alt="Fine-tuning" width="60%" height="50%" align="center"></center>

In [None]:
# LanguageModel: Build pretrained language model
EMBEDS_DROPOUT_PROB = 0.1
# Task whose class weights are requested from the data silo. This name was
# previously referenced without being defined in the notebook.
# NOTE(review): "text_classification" is assumed to be the name under which
# TextClassificationProcessor registers its task — confirm if a custom task is added.
TASK_NAME = "text_classification"

language_model = LanguageModel.load(PRETRAINED_MODEL_NAME_OR_PATH, language="korean")
# PredictionHead: Build predictor layer
prediction_head = TextClassificationHead(
    num_labels=len(LABEL_LIST), 
    class_weights=data_silo.calculate_class_weights(
        task_name=TASK_NAME
    )
)
# AdaptiveModel: combine the language model with the prediction head(s)
model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=EMBEDS_DROPOUT_PROB,
    lm_output_types=["per_sequence"],
    device=device
)

# Show the names of the model's direct submodules and its dropout layer.
for k, v in model.named_children():
    print(k)
print(model.dropout)

In [None]:
from transformers import BertForSequenceClassification
# Load the same checkpoint with plain transformers and show its dropout and
# classifier layers. The checkpoint constant is PRETRAINED_MODEL_NAME_OR_PATH;
# the previously used MODEL_NAME_OR_PATH is not defined in this notebook.
bert = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME_OR_PATH)
print(bert.dropout)
print(bert.classifier)

<center><img src="https://drive.google.com/uc?id=1bD54igqAn7T96gDCFZ2uxzFHpZIL5GOh" alt="Fine-tuning" width="60%" height="50%" align="center"></center>

In [None]:
from transformers import BertForSequenceClassification
# Load the same checkpoint with plain transformers and show its dropout and
# classifier layers. The checkpoint constant is PRETRAINED_MODEL_NAME_OR_PATH;
# the previously used MODEL_NAME_OR_PATH is not defined in this notebook.
bert = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME_OR_PATH)
print(bert.dropout)
print(bert.classifier)

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initiali

Dropout(p=0.1, inplace=False)
Linear(in_features=768, out_features=2, bias=True)


## TASK Supported

|Task|BERT|RoBERTa*|XLNet|ALBERT|DistilBERT|XLMRoBERTa|ELECTRA|MiniLM|
|---|---|---|---|---|---|---|---|---|
|Text classification|x|x|x|x|x|x|x|x|
|NER|x|x|x|x|x|x|x|x|
|Question Answering|x|x|x|x|x|x|x|x|
|Language Model Fine-tuning|x||||||||
|Text Regression|x|x|x|x|x|x|x|x|
|Multilabel Text classif.|x|x|x|x|x|x|x|x|
|Extracting embeddings|x|x|x|x|x|x|x|x|
|LM from scratch|x||||||||
|Text Pair Classification|x|x|x|x|x|x|x|x|
|Passage Ranking|x|x|x|x|x|x|x|x|
|Document retrieval (DPR)|x|x||x|x|x|x|x|

In [None]:
# Print the name of each direct submodule of the model, then its dropout layer.
for child_name, _child in model.named_children():
    print(child_name)
print(model.dropout)

language_model
prediction_heads
dropout
Dropout(p=0.09, inplace=False)


## Train

In [None]:
# argparse is imported here because no earlier cell of the notebook imports it.
import argparse

# Command-line style configuration for the whole pipeline.
# NOTE(review): some defaults differ from the constants used in the notebook cells
# (e.g. --max_seq_len 150 vs MAX_LENGTH = 300, --n_gpu 4 vs N_GPU = 1).
parser = argparse.ArgumentParser(description="run farm")
parser.add_argument("--tracking_uri", type=str, default="https://public-mlflow.deepset.ai/",
    help="MLFlow - tracking uri ")
parser.add_argument("--experiment_name", type=str, default="FARM_tutorial",
    help="MLFlow - experiment name")
parser.add_argument("--run_name", type=str, default="NSMC",
    help="MLFlow - run name")
parser.add_argument("--pretrained_model_name_or_path", type=str, default="beomi/kcbert-base",
    help="Tokenizer, LanguageModel - pretrained model name")

parser.add_argument("--train_filename", type=str, default="train.tsv",
    help="Processor - train file name")
parser.add_argument("--test_filename", type=str, default="test.tsv",
    help="Processor - test file name")
parser.add_argument("--max_seq_len", type=int, default=150,
    help="Processor - max sequence lenght of tokens")
parser.add_argument("--data_dir",  type=str, default="./nsmc/",
    help="Processor - data directory")
parser.add_argument("--label_list", nargs="*", default=["bad", "good"],
    help="Processor - label list with string")
parser.add_argument("--metric",  type=str, default="acc",
    help="Processor - acc or f1_macro")
parser.add_argument("--label_column_name",  type=str, default="label",
    help="Processor - label column name")
parser.add_argument("--text_column_name",  type=str, default="document",
    help="Processor - text column name")
parser.add_argument("--ckpt_path",  type=str, default="./ckpt",
    help="Processor - checkpoint to save processor")
parser.add_argument("--batch_size", type=int, default=256,
    help="DataSilo - train batch size")
parser.add_argument("--eval_batch_size", type=int, default=256,
    help="DataSilo - eval batch size")
parser.add_argument("--embeds_dropout_prob", type=float, default=0.1,
    help="AdaptiveModel - The probability that a value in the embeddings returned by the language model will be zeroed.")
parser.add_argument("--learning_rate", type=float, default=2e-5,
    help="initialize_optimizer - learning rate")
parser.add_argument("--n_epochs", type=int, default=1,
    help="initialize_optimizer - number of epochs")
parser.add_argument("--n_gpu", type=int, default=4,
    help="Trainer - number of gpus")
parser.add_argument("--checkpoint_root_dir", type=str, default="./ckpt",
    help="Trainer - checkpoint root directory")
parser.add_argument("--checkpoints_to_keep", type=int, default=3,
    help="Trainer - number of checkpoint to keep")
parser.add_argument("--checkpoint_every", type=int, default=200,
    help="Trainer - checkpoint every")
parser.add_argument("--evaluate_every", type=int, default=200,
    help="Trainer - evaluate steps")
# parse_known_args() returns a (namespace, leftover_argv) pair; unpack it so that
# `args` is the namespace (args.n_epochs, ...) rather than the whole tuple.
# Unrecognised arguments (such as those the notebook kernel is launched with)
# end up in `unknown_args` instead of raising an error.
args, unknown_args = parser.parse_known_args()

In [None]:
LEARNING_RATE = 2e-5
N_EPOCHS = 1

# The optimizer setup is given the number of batches in the train loader.
n_train_batches = len(data_silo.loaders["train"])

# initialize_optimizer hands back the model together with its optimizer and
# learning-rate schedule.
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    n_batches=n_train_batches,
    n_epochs=N_EPOCHS,
    device=device,
    learning_rate=LEARNING_RATE,
)

03/26/2021 09:12:17 - INFO - farm.modeling.optimization -   Loading optimizer `TransformersAdamW`: '{'correct_bias': False, 'weight_decay': 0.01, 'lr': 2e-05}'
03/26/2021 09:12:18 - INFO - farm.modeling.optimization -   Using scheduler 'get_linear_schedule_with_warmup'
03/26/2021 09:12:18 - INFO - farm.modeling.optimization -   Loading schedule `get_linear_schedule_with_warmup`: '{'num_warmup_steps': 1674.8000000000002, 'num_training_steps': 16748}'


In [None]:
N_GPU = 1

# Collect everything the Trainer needs, then build it in one call.
trainer_kwargs = dict(
    model=model,
    optimizer=optimizer,
    lr_schedule=lr_schedule,
    data_silo=data_silo,
    epochs=N_EPOCHS,
    n_gpu=N_GPU,
    device=device,
)
trainer = Trainer(**trainer_kwargs)

In [None]:
# Show the GPU status of this runtime (device, memory usage, utilisation).
!nvidia-smi

Fri Mar 26 09:12:26 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.56       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P0    23W /  75W |   1069MiB /  7611MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Run training; `model` is rebound to whatever trainer.train() returns.
model = trainer.train()

03/26/2021 09:12:29 - INFO - farm.train -   
 

          &&& &&  & &&             _____                   _             
      && &\/&\|& ()|/ @, &&       / ____|                 (_)            
      &\/(/&/&||/& /_/)_&/_&     | |  __ _ __ _____      ___ _ __   __ _ 
   &() &\/&|()|/&\/ '%" & ()     | | |_ | '__/ _ \ \ /\ / / | '_ \ / _` |
  &_\_&&_\ |& |&&/&__%_/_& &&    | |__| | | | (_) \ V  V /| | | | | (_| |
&&   && & &| &| /& & % ()& /&&    \_____|_|  \___/ \_/\_/ |_|_| |_|\__, |
 ()&_---()&\&\|&&-&&--%---()~                                       __/ |
     &&     \|||                                                   |___/
             |||
             |||
             |||
       , -=-~  .-^- _
              `

Train epoch 0/0 (Cur. train loss: 0.5005):   1%|          | 100/16748 [00:36<1:28:20,  3.14it/s]
Evaluating:   0%|          | 0/1942 [00:00<?, ?it/s][A
Evaluating:   6%|▋         | 125/1942 [00:10<02:25, 12.49it/s][A
Evaluating:  13%|█▎        | 250/1942 [00:20<02:15, 

In [None]:
# Test your model on a sample (Inference)
from farm.infer import Inferencer
from pprint import PrettyPrinter

# Wrap the trained model and its processor for prediction on raw dicts.
infer_model = Inferencer(
    model=model,
    processor=processor,
    task_type="text_classification",
    gpu=True,
)

# Each input is a dict with the raw sentence under the "text" key.
basic_texts = [
    {"text": sentence}
    for sentence in (
        "기생충,,, 이 영화 정말 재밌네요.",
        "황정민 나오는 영화는 다 볼만한듯?",
    )
]
result = infer_model.inference_from_dicts(dicts=basic_texts)

printer = PrettyPrinter()
printer.pprint(result)