<a href="https://colab.research.google.com/github/pszemraj/ai-msgbot/blob/update-notebooks/notebooks/colab-huggingface-API/finetune_gpt_j_6B_8bit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# <center> Fine-tuning 6-Billion param GPT-J in colab with LoRA and 8-bit compression </center>

> goal: fine-tune a GPT-J 8-bit model on a custom dataset

This notebook essentially combines the workflow presented in the huggingface documentation [here](https://huggingface.co/docs/transformers/training) with the work done in fine-tuning [GPT-J-6B](https://huggingface.co/EleutherAI/gpt-j-6B) with limited memory. A detailed explanation of how it works can be found in [this model card](https://huggingface.co/hivemind/gpt-j-6B-8bit).

---


## system setups

In [None]:
#@title print GPU status
!nvidia-smi

In [None]:
#@markdown add auto-Colab formatting with `IPython.display`
from IPython.display import HTML, display
# colab formatting
def set_css():
    """Display a CSS snippet that makes `<pre>` output wrap instead of overflowing."""
    wrap_style = """
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  """
    styled_html = HTML(wrap_style)
    display(styled_html)

get_ipython().events.register("pre_run_cell", set_css)

In [None]:
#@title print out the VM's CPU stats
#@markdown - a high-RAM runtime is recommended as the model file itself is around
#@markdown 10 gb. That gets loaded into ram, so 12 gb RAM will not cut it
from psutil import virtual_memory
import os
# Total RAM of the VM in GiB (bytes / 1024**3), rounded to one decimal place.
ram_gb = round(virtual_memory().total / (1024**3), 1)
print(f'Runtime has {ram_gb} gigs of memory and {os.cpu_count()} processors')

# Warn early when RAM is below 20; the form notes above say the model file
# is loaded into RAM and that a high-RAM runtime is recommended.
if ram_gb < 20: print("WARNING - your CPU RAM allocated is less than 20.",
                      " You may experience errors loading")

# setup - params

# setup

In [None]:
#@title Key User Inputs- Parameters
#@markdown >Note that `hf_name` is the model for config/tokenizer and `model_8bit`
#@markdown is for the model itself.

SAVE_GDRIVE = False #@param {type:"boolean"}
hf_name = "EleutherAI/gpt-j-6B" #@param {type:"string"}
model_8bit = "ethzanalytics/GPT-J-6B-8bit-Convo-D3E" #@param {type:"string"}
N_EPOCHS =  1#@param {type:"number"}
model_name_header = "Converse" #@param {type:"string"}
dataset = "WoW" #@param {type:"string"}


In [None]:
#@markdown Logging
import os
import sys
import logging
# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
    level=logging.INFO,
)

# for trainer
from os.path import join
os.makedirs(join(os.getcwd(), "logs"), exist_ok=True )

In [None]:
#@title **install primary packages**

#@markdown These were all from the original notebook except for `datasets` 
!pip install -U transformers -q
!pip install -U datasets -q
!pip install bitsandbytes-cuda111==0.26.0 -q

In [None]:
#@markdown **install added packages**

#@markdown primarily utils like joblib etc
!pip install -U joblib
!sudo apt-get install git-lfs
import joblib

In [None]:
#@markdown import packages (from deepspeed notebook)
import os
from urllib import request
from os.path import join
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPTNeoForCausalLM
from tqdm.auto import tqdm 

torch.manual_seed(42)

In [None]:
#@markdown import packages (all the rest)
import transformers
import datasets
import torch
import torch.nn.functional as F
from torch import nn
from torch.cuda.amp import custom_fwd, custom_bwd

from bitsandbytes.functional import quantize_blockwise, dequantize_blockwise

from tqdm.auto import tqdm

---

In [None]:
#@title <font color="orange"> Sign in to HF </font>
#@markdown create an account on their website if you don't have one - you need somewhere to put this.

#@markdown also imports a lot of the functions from the package
from huggingface_hub import (
    # User management
    login,
    logout,
    notebook_login,
    whoami,
    # Repository creation and management
    create_repo,
    delete_repo,
    update_repo_visibility,
    # And some methods to retrieve/change information about the content
    list_models,
    list_datasets,
    list_metrics,
    list_repo_files,
    upload_file,
    delete_file,
)

notebook_login()


In [None]:
#@markdown sign in to drive
!git config --global credential.helper store
from google.colab import drive
if SAVE_GDRIVE:
    drive.mount('/content/drive')
else:
    print("not saving temp files in gdrive")
base_drive_loc = "/content/drive/MyDrive/Programming"
folder_name = "eleuther6b" #@param {type:"string"}


In [None]:
import os
from os.path import join
if SAVE_GDRIVE:
    # Keep checkpoints under the Google Drive folder configured in the
    # "sign in to drive" cell. The previous code referenced an undefined
    # `repo_name` (NameError) and built the path from the VM's working
    # directory, which is not persisted in Drive.
    outpath = join(base_drive_loc, folder_name)
    os.makedirs(outpath, exist_ok=True)
    print(f"created the folder {outpath}.. asking python if it agrees = {os.path.exists(outpath)}")

---

# functions

In [None]:
#@title basics
import os
from datetime import datetime

def get_workers():
    """
    get_workers - pick how many worker processes to use for dataset mapping

    Two cores are held back so the Colab session stays responsive; the result
    is never lower than 1.

    Returns:
        [int]: [number of worker processes to use]
    """
    # os.cpu_count() returns None when the count cannot be determined;
    # treat that as a single core instead of failing on `None > 2`.
    cores = os.cpu_count() or 1

    return cores - 2 if cores > 2 else 1

def get_timestamp():
    """get_timestamp - current local time as e.g. 'Jan-05-2022_t-14' (hour precision)"""
    stamp_format = "%b-%d-%Y_t-%H"
    current_time = datetime.now()
    return current_time.strftime(stamp_format)


def print_spacer(n=1):
    """print_spacer - print the spacer marker n times, then a newline"""
    marker = "\n   --------    "
    print("".join(marker for _ in range(n)))


def remove_trailing_punctuation(text: str):
    """
    remove_trailing_punctuation - remove trailing punctuation from a string

    Only characters at the END of the string are removed; leading punctuation
    is kept (the previous implementation used `strip`, which also removed
    punctuation from the start).

    Args:
        text (str): [string to be cleaned]

    Returns:
        [str]: [string without trailing '?', '!', '.', ',', ';' or ':']
    """
    return text.rstrip("?!.,;:")


### Converting the model to 8 bits.

We convert EleutherAI's GPT-J-6B model to 8 bits using facebook's [bitsandbytes](https://github.com/facebookresearch/bitsandbytes) library. This reduces the model's size from 20Gb down to just 6Gb.

Note that we don't convert linear layer biases to 8 bit as they take up less than 1% of the model's weight anyway.

In [None]:
#@title bitsandbytes custom classes
class FrozenBNBLinear(nn.Module):
    """Linear layer whose weight is kept as frozen, blockwise-quantized 8-bit data.

    The quantized weight and its quantization state (`absmax`, `code`) are
    registered as buffers, so they are not returned by `parameters()`. Only the
    optional `bias` and an optional `adapter` module can carry trainable
    parameters.

    Args:
        weight: quantized weight of shape (out_features, in_features).
        absmax: quantization state passed to `dequantize_blockwise`.
        code: quantization code passed to `dequantize_blockwise`.
        bias: optional `nn.Parameter` bias, or None.
    """

    def __init__(self, weight, absmax, code, bias=None):
        assert isinstance(bias, nn.Parameter) or bias is None
        super().__init__()
        self.out_features, self.in_features = weight.shape
        self.register_buffer("weight", weight.requires_grad_(False))
        self.register_buffer("absmax", absmax.requires_grad_(False))
        self.register_buffer("code", code.requires_grad_(False))
        # Set later (e.g. to an nn.Sequential) to add a trainable side path.
        self.adapter = None
        self.bias = bias
 
    def forward(self, input):
        """Apply the dequantized linear map, plus the adapter output if one is set."""
        output = DequantizeAndLinear.apply(input, self.weight, self.absmax, self.code, self.bias)
        # Compare against None explicitly: module truthiness falls back to
        # __len__ for containers such as nn.Sequential, which is not the
        # question being asked here.
        if self.adapter is not None:
            output += self.adapter(input)
        return output
 
    @classmethod
    def from_linear(cls, linear: nn.Linear) -> "FrozenBNBLinear":
        """Build a frozen 8-bit layer by quantizing `linear.weight`; the bias is reused as-is."""
        weights_int8, state = quantize_blockise_lowmemory(linear.weight)
        return cls(weights_int8, *state, linear.bias)
 
    def __repr__(self):
        return f"{self.__class__.__name__}({self.in_features}, {self.out_features})"
 
 
class DequantizeAndLinear(torch.autograd.Function): 
    """Autograd function: dequantize a blockwise-quantized weight, then apply `F.linear`.

    Only the quantized tensors are saved for the backward pass; the dequantized
    weight is recomputed there instead of being stored.
    """

    @staticmethod
    @custom_fwd
    def forward(ctx, input: torch.Tensor, weights_quantized: torch.ByteTensor,
                absmax: torch.FloatTensor, code: torch.FloatTensor, bias: torch.FloatTensor):
        """Return `F.linear(input, dequantized_weight, bias)`."""
        weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)
        # Save the quantized form (not weights_deq) for backward.
        ctx.save_for_backward(input, weights_quantized, absmax, code)
        # Remember whether a bias gradient has to be produced in backward.
        ctx._has_bias = bias is not None
        return F.linear(input, weights_deq, bias)
 
    @staticmethod
    @custom_bwd
    def backward(ctx, grad_output: torch.Tensor):
        """Return gradients for (input, weights_quantized, absmax, code, bias).

        The three quantization tensors always receive `None`; the bias gradient
        is `None` when forward was called without a bias.
        """
        # The quantized weight and its state must not require gradients.
        assert not ctx.needs_input_grad[1] and not ctx.needs_input_grad[2] and not ctx.needs_input_grad[3]
        input, weights_quantized, absmax, code = ctx.saved_tensors
        # grad_output: [*batch, out_features]
        weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)
        grad_input = grad_output @ weights_deq
        # Collapse all leading dimensions and sum them to get one value per output feature.
        grad_bias = grad_output.flatten(0, -2).sum(dim=0) if ctx._has_bias else None
        return grad_input, None, None, None, grad_bias
 
 
class FrozenBNBEmbedding(nn.Module):
    """Embedding table kept as frozen, blockwise-quantized 8-bit data.

    The quantized weight and its quantization state (`absmax`, `code`) are
    registered as buffers. An optional `adapter` module can be attached to add
    a trainable contribution to the looked-up embeddings.

    Args:
        weight: quantized table of shape (num_embeddings, embedding_dim).
        absmax: quantization state passed to `dequantize_blockwise`.
        code: quantization code passed to `dequantize_blockwise`.
    """

    def __init__(self, weight, absmax, code):
        super().__init__()
        self.num_embeddings, self.embedding_dim = weight.shape
        self.register_buffer("weight", weight.requires_grad_(False))
        self.register_buffer("absmax", absmax.requires_grad_(False))
        self.register_buffer("code", code.requires_grad_(False))
        # Set later (e.g. to an nn.Sequential) to add a trainable side path.
        self.adapter = None
 
    def forward(self, input, **kwargs):
        """Look up `input` in the dequantized table; add the adapter output if one is set."""
        with torch.no_grad():
            # note: both quantized weights and input indices are *not* differentiable
            weight_deq = dequantize_blockwise(self.weight, absmax=self.absmax, code=self.code)
            output = F.embedding(input, weight_deq, **kwargs)
        # Compare against None explicitly: module truthiness falls back to
        # __len__ for containers such as nn.Sequential.
        if self.adapter is not None:
            output += self.adapter(input)
        return output 
 
    @classmethod
    def from_embedding(cls, embedding: nn.Embedding) -> "FrozenBNBEmbedding":
        """Build a frozen 8-bit embedding by quantizing `embedding.weight`."""
        weights_int8, state = quantize_blockise_lowmemory(embedding.weight)
        return cls(weights_int8, *state)
 
    def __repr__(self):
        return f"{self.__class__.__name__}({self.num_embeddings}, {self.embedding_dim})"
 
 
def quantize_blockise_lowmemory(matrix: torch.Tensor, chunk_size: int = 2 ** 20):
    """Quantize `matrix` with `quantize_blockwise`, one slice of `chunk_size` elements at a time.

    Each slice of the flattened tensor is cloned and quantized separately. The
    `code` returned for one slice is passed on to the next call.

    Args:
        matrix: tensor to quantize; it must support `view(-1)`.
        chunk_size: elements per slice; must be a multiple of 4096.

    Returns:
        `(matrix_i8, (absmax, code))`: the quantized values reshaped like
        `matrix`, the concatenated per-slice `absmax` values, and the `code`
        from the last `quantize_blockwise` call.
    """
    assert chunk_size % 4096 == 0
    flattened = matrix.view(-1)
    n_chunks = (matrix.numel() - 1) // chunk_size + 1

    code = None
    quantized_parts, absmax_parts = [], []
    for chunk_idx in range(n_chunks):
        begin = chunk_idx * chunk_size
        piece = flattened[begin: begin + chunk_size].clone()
        quantized_piece, (absmax_piece, code) = quantize_blockwise(piece, code=code)
        quantized_parts.append(quantized_piece)
        absmax_parts.append(absmax_piece)
 
    quantized_matrix = torch.cat(quantized_parts).reshape_as(matrix)
    return quantized_matrix, (torch.cat(absmax_parts), code)
 
 
def convert_to_int8(model):
    """Replace linear and embedding children of `model` with frozen 8-bit placeholders.

    Every `nn.Linear` child becomes a `FrozenBNBLinear` and every
    `nn.Embedding` child a `FrozenBNBEmbedding`. The replacements are built
    from zero-filled tensors of the matching shape; no existing weights are
    quantized here. A linear layer's bias object is carried over. Each replaced
    linear layer is printed.
    """

    def _absmax_len(layer):
        # one absmax entry per 4096 weight elements, rounded up
        return (layer.weight.numel() - 1) // 4096 + 1

    for parent in list(model.modules()):
        for child_name, layer in parent.named_children():
            if isinstance(layer, nn.Linear):
                print(child_name, layer)
                replacement = FrozenBNBLinear(
                    weight=torch.zeros(layer.out_features, layer.in_features, dtype=torch.uint8),
                    absmax=torch.zeros(_absmax_len(layer)),
                    code=torch.zeros(256),
                    bias=layer.bias,
                )
            elif isinstance(layer, nn.Embedding):
                replacement = FrozenBNBEmbedding(
                    weight=torch.zeros(layer.num_embeddings, layer.embedding_dim, dtype=torch.uint8),
                    absmax=torch.zeros(_absmax_len(layer)),
                    code=torch.zeros(256),
                )
            else:
                continue
            setattr(parent, child_name, replacement)

### create blocking functions 

they convert anything that could be assigned to the model to 8-bit (I think)

In [None]:
class GPTJBlock(transformers.models.gptj.modeling_gptj.GPTJBlock):
    """GPT-J block whose attention and MLP sub-modules are converted with `convert_to_int8`."""

    def __init__(self, config):
        super().__init__(config)

        # Swap the linear/embedding children of both sub-modules for 8-bit placeholders.
        convert_to_int8(self.attn)
        convert_to_int8(self.mlp)


class GPTJModel(transformers.models.gptj.modeling_gptj.GPTJModel):
    """GPT-J base model with `convert_to_int8` applied to the whole module tree."""

    def __init__(self, config):
        super().__init__(config)
        convert_to_int8(self)
        

class GPTJForCausalLM(transformers.models.gptj.modeling_gptj.GPTJForCausalLM):
    """GPT-J causal LM with `convert_to_int8` applied to the whole module tree."""

    def __init__(self, config):
        super().__init__(config)
        convert_to_int8(self)


# Only GPTJBlock is replaced inside the transformers module; the other two
# subclasses are used directly by name in this notebook.
transformers.models.gptj.modeling_gptj.GPTJBlock = GPTJBlock  # monkey-patch GPT-J

# load pretrained model, config, etc files

In [None]:
%%time
gpt_eos = '<|endoftext|>'
config = transformers.GPTJConfig.from_pretrained(hf_name)
tokenizer = transformers.AutoTokenizer.from_pretrained(hf_name, 
                                                       pad_token=gpt_eos,
                                                       eos_token=gpt_eos,
                                                    )

In [None]:
%%time
gpt = GPTJForCausalLM.from_pretrained(model_8bit,
                                      use_auth_token=True,
                                      low_cpu_mem_usage=False,
                                      gradient_checkpointing=True,
                                      use_cache=False, # prevent weird hf trainer warnings popping up
                                      )

device = 'cuda' if torch.cuda.is_available() else 'cpu'
gpt.to(device)

### Text generation example

> testing the base 6B version that was loaded off of hf to validate that it works. This is done on the GPU, so if you are running into memory issues try moving `gpt.to(device)` in a prior cell to after the text has been generated.

In [None]:
%%time
import pprint as pp
std_test = "today I woke up and then" #@param {type:"string"}
prompt = tokenizer(std_test, return_tensors='pt')
prompt = {key: value.to(device) for key, value in prompt.items()}
# `min_length` / `max_length` are measured in tokens (prompt included), so
# base them on the tokenized prompt length rather than on len(std_test),
# which counts characters.
prompt_len = prompt["input_ids"].shape[-1]
ex_min = prompt_len + 64
out = gpt.generate(**prompt,
                   min_length=ex_min,
                   max_length=ex_min + 64,
                   do_sample=True,
                   top_k=50,
                   top_p=0.9,
                   repetition_penalty=1.5,
                   length_penalty=1.2,
                   no_repeat_ngram_size=2,
                   remove_invalid_values=True,
                   )

# `clean_up_tokenization_spaces` is a decoding option, so it is passed to
# `decode` instead of `generate`.
example_res = tokenizer.decode(out[0], clean_up_tokenization_spaces=True)
print("Total generated text is: \n")
pp.pprint(example_res)

# Prep for Training

In [None]:
#@title load packages for dataset prep
#@markdown imports etc are here
!pip install -U -q datasets
import re
import json
from sklearn.model_selection import train_test_split
from datasets import load_dataset
import math
import os
import sys
from dataclasses import dataclass, field
from itertools import chain
from typing import Optional

In [None]:
#@markdown **Load Train/Test files from URL**
#@markdown works for any URL that `wget` would. Otherwise try using `files.upload`
#@markdown from the `google.colab` package or click on the side panel
train_link = "https://www.dropbox.com/s/olnx438omur7j72/wow-train.txt.txt?dl=1" #@param {type:"string"}
test_link = "https://www.dropbox.com/s/t2hhawpsiocypyt/ScriptParse-wow-train-kilt_4.txt?dl=1" #@param {type:"string"}

vm_wd = os.getcwd()
train_path = join(vm_wd, "train_dataset.txt")
request.urlretrieve(train_link, train_path)

# test file
test_path = join(vm_wd, "test_dataset.txt")
request.urlretrieve(test_link, test_path)

print(f"DL complete. Path to train data file is:\n{train_path}\nand path to test data is:\n{test_path}")

In [None]:
#@markdown **initialize `datasets` object** 
import warnings
#@markdown a lot of the next couple sections are borrowed from [here](https://github.com/huggingface/transformers/blob/master/examples/pytorch/language-modeling/run_mlm.py)
#@markdown because they decided to change the APIs. Also, [this page](https://huggingface.co/docs/datasets/access.html)
#@markdown is a decent place to start for datasets / API. Additionally, [this doc page](https://huggingface.co/docs/datasets/loading.html)
#@markdown goes slightly more in depth because datasets are confusing and complicated at first.

process_style = "line_by_line" #@param ["line_by_line", "merge"]
train_keep = 92 #@param {type:"slider", min:1, max:100, step:1}
val_keep = 97 #@param {type:"slider", min:1, max:100, step:1}
data_files = {}
if train_path is not None:
    data_files["train"] = train_path
    extension = train_path.split(".")[-1]
if test_path is not None:
    data_files["validation"] = test_path
    extension = test_path.split(".")[-1]
if extension == "txt":
    extension = "text"

# original
# raw_datasets = load_dataset(extension, data_files=data_files,)
# tokenizer defined earlier
# Load each file on its own; a single data file is exposed by `load_dataset`
# under the "train" split name, which is why both calls slice "train".
# NOTE(review): `train[:-{train_keep}%]` looks like it drops the LAST
# `train_keep` percent (i.e. keeps only the first 100 - train_keep percent),
# which is the opposite of what the `*_keep` names suggest. Confirm whether
# `train[:{train_keep}%]` was intended before changing it.
rds_train = load_dataset(extension, 
                         data_files=data_files["train"],
                         split=f"train[:-{train_keep}%]")
rds_val = load_dataset(extension, 
                       data_files=data_files["validation"],
                         split=f"train[:-{val_keep}%]")


# raw_datasets = datasets.DatasetDict(rds_train, rds_val)
raw_datasets = datasets.DatasetDict({"train":rds_train,"validation":rds_val})
    # "train":rds_train,
    # "validation":rds_val,
column_names = raw_datasets["train"].column_names
text_column_name = "text" if "text" in column_names else column_names[0]

max_seq_length = tokenizer.model_max_length
if max_seq_length > 1024 and max_seq_length != 2048:
    warnings.warn(
        f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
        "Picking 1024 instead. You can change that default value by editing this cell wow!!"
    )
    max_seq_length = 1024

pp.pprint(raw_datasets)

In [None]:
#@title where the ~~magic~~ tokenization happens
if process_style == "line_by_line":
    # When using line_by_line, we just tokenize each nonempty line.
    # Every line is padded to `max_seq_length`; change this to False if you
    # do NOT want padding.
    padding = "max_length"

    def tokenize_function(examples):
        """Tokenize the non-empty lines of a batch, padded/truncated to `max_seq_length`."""
        # Remove empty lines
        examples[text_column_name] = [
            line for line in examples[text_column_name] if len(line) > 0 and not line.isspace()
        ]
        return tokenizer(
            examples[text_column_name],
            padding=padding,
            truncation=True,
            max_length=max_seq_length,
            # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
            # receives the `special_tokens_mask`.
            return_special_tokens_mask=True,
        )

    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        num_proc=get_workers(),
        remove_columns=[text_column_name],
        desc="Running tokenizer on dataset line_by_line",
    )
else:
    # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
    # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
    # efficient when it receives the `special_tokens_mask`.
    def tokenize_function(examples):
        """Tokenize a batch of texts without padding or truncation."""
        # No `padding` argument here: the token streams are concatenated and
        # re-chunked by `group_texts`, so padding would only insert pad tokens
        # into the middle of the stream. (`padding` is also only defined in
        # the line_by_line branch, so referencing it here raised NameError.)
        return tokenizer(examples[text_column_name],
                         return_special_tokens_mask=True)

    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        num_proc=get_workers(),
        remove_columns=column_names,
        desc="Running tokenizer on every text in dataset",
    )

    # Main data processing function that will concatenate all texts from our dataset and generate chunks of
    # max_seq_length.
    def group_texts(examples):
        """Concatenate every column of the batch and split it into `max_seq_length` chunks."""
        # Concatenate all texts.
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
        if total_length >= max_seq_length:
            total_length = (total_length // max_seq_length) * max_seq_length
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
            for k, t in concatenated_examples.items()
        }
        return result

    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
    # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
    # might be slower to preprocess.
    #
    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

    tokenized_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        num_proc=get_workers(),
        desc=f"Grouping texts in chunks of {max_seq_length}",
    )

train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]
print(f"created train and evaluation with {train_dataset.num_rows} and {eval_dataset.num_rows} rows respectively")

In [None]:
#@markdown **Create Data collator**
from transformers import DataCollatorForLanguageModeling
#@markdown no random masking needed because this is text gen. Docs [here](https://huggingface.co/docs/transformers/main_classes/data_collator#transformers.DataCollatorForLanguageModeling)

data_collator = DataCollatorForLanguageModeling(
                    tokenizer=tokenizer,
                    mlm=False,
                )

# LoRA fine-tuning example
Here we demonstrate how to fine-tune the proposed model using low-rank adapters [(Hu et al, 2021)](https://arxiv.org/abs/2106.09685) and [8-bit Adam](https://arxiv.org/abs/2110.02861). We also use [dataset streaming API](https://huggingface.co/docs/datasets/dataset_streaming.html) to avoid downloading the large dataset.

In [None]:
%%capture
def add_adapters(model, adapter_dim=16):
    """Attach a trainable low-rank adapter to every frozen 8-bit layer of `model`.

    Each `FrozenBNBLinear` gets a `Linear(in, adapter_dim) -> Linear(adapter_dim, out)`
    pair and each `FrozenBNBEmbedding` gets an
    `Embedding(num_embeddings, adapter_dim) -> Linear(adapter_dim, embedding_dim)`
    pair, assigned to the layer's `adapter` attribute.

    The second (output) projection of every adapter is zero-initialised so a
    freshly added adapter contributes exactly zero: the model's outputs are
    unchanged until training updates the adapter. Without this, the randomly
    initialised adapters would perturb the pretrained model from the first
    forward pass.

    Args:
        model: module tree to modify in place.
        adapter_dim: inner (bottleneck) dimension of each adapter; must be > 0.
    """
    assert adapter_dim > 0

    # Snapshot the module list first: assigning `.adapter` registers new
    # submodules on the tree being traversed.
    for module in list(model.modules()):
        if isinstance(module, FrozenBNBLinear):
            module.adapter = nn.Sequential(
                nn.Linear(module.in_features, adapter_dim, bias=False),
                nn.Linear(adapter_dim, module.out_features, bias=False),
            )
        elif isinstance(module, FrozenBNBEmbedding):
            module.adapter = nn.Sequential(
                nn.Embedding(module.num_embeddings, adapter_dim),
                nn.Linear(adapter_dim, module.embedding_dim, bias=False),
            )
        else:
            continue
        # Zero the output projection so the adapter starts as a no-op.
        nn.init.zeros_(module.adapter[1].weight)

add_adapters(gpt)
gpt.to(device)

In [None]:
#@title training args

#@markdown > here is where you enter / select parameters for variables that are used
#@markdown > throughout the rest of the notebook.

#@markdown - the docs on the [trainer](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments) are very lovely and short. You can also reference [the main training overview](https://huggingface.co/docs/transformers/training) from earlier
#@markdown - if you have issues maintaining a stable colab session, try changing the `save_strategy` to `steps` and choose a reasonable number of steps (based on the specific situation)

START_LR = 1e-4 #@param {type:'number'}
GC_STEPS =  64#@param {type:'integer'}
MAX_GRAD_NORM = 0.5 #@param {type:'number'}
USE_FP16 = True #@param {type:"boolean"}
HUB_PUSH = True #@param {type:"boolean"}
BATCH_SIZE =  2#@param {type:"integer"}

full_out_name = f"{hf_name.split('/')[-1]}-{model_name_header}_DS-{dataset}_Ep-{N_EPOCHS}_Bs-{BATCH_SIZE}"
# model.push_to_hub(full_out_name, auth)
full_out_name = full_out_name.replace(".", "pt")
print(f"\nmodel will be stored on HF as:\n {full_out_name}")



In [None]:
#@title Define Metrics

#@markdown <font color="orange"> please note that at present no evaluation is run during training
#@markdown by default - it appears to add a second "layer" of GPU compute that seems to be too much,
#@markdown so metrics will be computed separately. </font>

#@markdown Taken from [this](https://towardsdatascience.com/how-to-evaluate-text-generation-models-metrics-for-automatic-evaluation-of-nlp-models-e1c251b04ec1)
#@markdown great article on medium:

#@markdown > BLEU and Rouge are the most popular evaluation metrics that are used to compare models in the NLG domain. BLEU is a precision focused metric that calculates n-gram overlap of the reference and generated texts. This n-gram overlap means the evaluation scheme is word-position independent apart from n-grams’ term associations. One thing to note in BLEU — there is a brevity penalty i.e. a penalty applied when the generated text is too small compared to the target text.

#@markdown > Rouge: Recall Oriented Understudy for Gisting Evaluation  It is very similar to the BLEU definition, the difference being that Rouge is recall focused whereas BLEU was precision focused. There are 3 types of Rouge: n-rouge, the most common rouge type which means n-gram overlap. eg. (2-rouge, 1-rouge for 2-grams and 1-gram respectively). Second is l-rouge which checks for Longest Common Subsequence instead of n-gram overlap. The third is s-rouge which focuses on skip grams. Standard implementations of these can be found in most ML libraries, n-rouge is most commonly used.

#@markdown ---

#@markdown reference [here]( https://huggingface.co/docs/transformers/training) from hf docs
import numpy as np
from datasets import load_metric, list_metrics
MY_METRIC = "bleu" #@param {type:'string'}
EVAL_WHILE_TRAIN = False #@param {type:'boolean'}

metric = load_metric(MY_METRIC)# accuracy or check list_metrics()

def compute_metrics(eval_pred):
    """Score the argmax predictions of `eval_pred` against its labels with the loaded `metric`."""
    model_logits, reference_ids = eval_pred
    # take the highest-scoring entry along the last axis as the prediction
    predicted_ids = np.argmax(model_logits, axis=-1)
    scores = metric.compute(predictions=predicted_ids, references=reference_ids)
    return scores

if EVAL_WHILE_TRAIN: print(f"will compute the {MY_METRIC} metric while training, expect higher mem usage")

In [None]:
from transformers import Trainer, TrainingArguments


# Training configuration handed to the Trainer. Most values come from the
# form parameters defined in earlier cells (BATCH_SIZE, GC_STEPS, START_LR, ...).
training_args = TrainingArguments(

    # `outpath` only exists when SAVE_GDRIVE is set, hence the conditional.
    output_dir=join(outpath, "checkpoints") if SAVE_GDRIVE else './checkpoints', 
    save_total_limit=1,
    logging_dir='/content/logs',
    # Fixed at 1: the training cell calls `trainer.train()` once per
    # iteration of `range(N_EPOCHS)` instead of setting the epoch count here.
    num_train_epochs=1, 
    # TODO figure out and fix issue. no eval mid-train for now
    evaluation_strategy='epoch' if EVAL_WHILE_TRAIN else 'no', 
    save_strategy='steps', # switched to every N steps because it did not seem to work otherwise
    save_steps=30,
    logging_steps=10,

    overwrite_output_dir=True, #overwrite the content of the output directory
    per_device_train_batch_size=BATCH_SIZE, 
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GC_STEPS, 
    eval_accumulation_steps= 0 if GC_STEPS == 0 else 4,
    gradient_checkpointing=False if GC_STEPS == 0 else True,
    max_grad_norm=MAX_GRAD_NORM,
    # NOTE(review): the Trainer in this notebook is created with an explicit
    # (optimizer, scheduler) pair, so this scheduler type is presumably not
    # the one actually used - confirm.
    lr_scheduler_type  = "cosine_with_restarts",
    learning_rate=START_LR,
    warmup_ratio=0.05,
    weight_decay=0.01,
    # bf16=True,  # only switch to BF16 if you have A100 or better (somehow??)
    # bf16_full_eval=True,
    fp16_full_eval=USE_FP16, # if metrics are crucial set to False
    fp16=USE_FP16,
    fp16_opt_level='O1',
    # deepspeed=ds_config, # use deepspeed.
    push_to_hub=HUB_PUSH,
    hub_model_id=full_out_name,
    hub_strategy='checkpoint',
)


In [None]:
#@title define Schedule and Optimizer
#@markdown because we are using the `trainer()` api instead of boilerplate code, need to
#@markdown define these. See [here](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate)
#@markdown for docs on learning rate scheduling
from bitsandbytes.optim import Adam8bit
from torch.optim import lr_scheduler
from torch.utils.checkpoint import checkpoint

import math

gpt.resize_token_embeddings(len(tokenizer))
gpt.gradient_checkpointing_enable()

#@markdown `LR_GAMMA` is the Multiplicative factor of learning rate decay / epoch.
LR_GAMMA = 0.66 #@param {type:'number'}
optimizer = Adam8bit(gpt.parameters(), lr=START_LR)

# The HF Trainer steps the LR scheduler after every optimizer update, not once
# per epoch. Passing LR_GAMMA straight to ExponentialLR would therefore
# multiply the learning rate by LR_GAMMA on every update and drive it to ~0
# within a few dozen steps. Convert the per-epoch factor into the equivalent
# per-update factor so one full epoch decays the LR by exactly LR_GAMMA.
samples_per_update = max(1, BATCH_SIZE) * max(1, GC_STEPS)
updates_per_epoch = max(1, math.ceil(len(train_dataset) / samples_per_update))
step_gamma = LR_GAMMA ** (1.0 / updates_per_epoch)
scheduler = lr_scheduler.ExponentialLR(optimizer,
                                       gamma=step_gamma)



In [None]:
 #@title Init Trainer
 #@markdown basically, now this just puts all the pieces together.
from datasets import load_metric, list_metrics

trainer = Trainer(
    model=gpt,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    optimizers=(optimizer,scheduler),
    compute_metrics=compute_metrics,
)

---

## Run Fine-Tuning

In [None]:
import gc
# One `trainer.train()` call per requested epoch (TrainingArguments uses
# num_train_epochs=1), with a garbage-collection pass before each run.
for _ in range(N_EPOCHS):
    gc.collect()
    trainer.train()
    # Respect the HUB_PUSH switch from the training-args cell, and give the
    # tokenizer an explicit target repo - calling `push_to_hub()` with no
    # repository name fails.
    if HUB_PUSH:
        trainer.push_to_hub()
        tokenizer.push_to_hub(full_out_name,
                              use_auth_token=True,
                              use_temp_dir=True)

## save & convert

After training is done you can save the model by calling `save_model()`. This will save the trained model to our `output_dir` from our `TrainingArguments`.

In [None]:
#@markdown prep work - imports, create directories for conversion
import os
from os.path import join
from google.colab import files 
import gc
fin_zero = join(os.getcwd(), "final_zero_weights")
os.makedirs(fin_zero, exist_ok=True )

fin_loc = join(os.getcwd(), "final_model")
os.makedirs(fin_loc, exist_ok=True )

print(f"final model file will be saved to {fin_loc}")

In [None]:
trainer.save_model(output_dir=fin_loc) # save to one directory with zero weights

In [None]:
del trainer
del gpt
torch.cuda.empty_cache()
gc.collect()

---

# Push latest to hub

In [None]:
#@markdown re-load tokenizer from the original model if something happened and it 
#@markdown does not exist
from transformers import AutoTokenizer

# Only reload when the notebook-level `tokenizer` is really missing. The old
# check looked for "_tokenizer", a name that is never defined, so the
# configured tokenizer was always replaced by a fresh one.
if "tokenizer" not in globals():
    tokenizer = AutoTokenizer.from_pretrained(hf_name, use_fast=False,
                                            max_length=2048,
                                            model_max_length=2048,
                                            )

In [None]:
# Push the tokenizer to your namespace with the name full_out_name with no local clone.
tokenizer.push_to_hub(full_out_name,
                       use_auth_token=True,
                    use_temp_dir=True)

In [None]:
#@title Reload the "Converted" Model
%%capture
from os.path import join
from transformers import AutoModelForCausalLM, AutoTokenizer

finetuned_model =GPTJForCausalLM.from_pretrained(fin_loc,
                                      low_cpu_mem_usage=True,
                                      use_cache=False, # prevent weird hf trainer warnings popping up
                                      )

device = 'cuda' if torch.cuda.is_available() else 'cpu'
finetuned_model.to(device)

In [None]:
finetuned_model.push_to_hub(full_out_name,
                            use_auth_token=True,
                            use_temp_dir=True)

In [None]:
torch.cuda.empty_cache()


# Generate text with trained models

## add prompts

In [None]:
from transformers import pipeline

finetuned_model.to('cuda')

my_chatbot = pipeline('text-generation', model=finetuned_model, 
                      tokenizer=tokenizer,
                      device=0)

In [None]:
#@title define speaker and responder
#@markdown for testing the models this should not need to be changed. 
#@markdown if testing a model related to [ai-msgbot](https://github.com/pszemraj/ai-msgbot)
#@markdown trained on data that **was not** using the entries below, update as needed.
speaker = "person alpha" #@param {type:"string"}
responder = "person beta" #@param {type:"string"}

## define prompt messages

the reason `f"{responder}:\n"` is added at the end of each prompt is to force the text-gen model to actually _respond_ to the prompt as opposed to adding on to it.

In [None]:
prompts = [
           "hi! how are you doing?",
           "what should I bring to the party?",
           "do you like memes?",
           "can we go on a date together this weekend?",
           "what's up homie?",
           "do you know how can I make friends here?",
           "so what do you like to do for fun?",
           "what is your favorite brand of cereal?",
           "what is the meaning of existence?",
]

In [None]:
#@title encode prompts
#@markdown defines `encode_prompt` as a helper


def encode_prompt(prompt:str, spkr=None, rspndr=None):
    """
    encode_prompt - split a prompt into its three script-style segments

    Args:
        prompt (str): [the message text]
        spkr ([type], optional): [name of the speaker; no header is emitted when None]
        rspndr ([type], optional): [name of the responder; no header is emitted when None]

    Returns:
        [list]: [speaker header, prompt followed by a blank line, responder header]
    """

    def _header(name):
        # a missing name yields an empty segment instead of a "None:" header
        return "" if name is None else f"{name}:\n"

    segments = [_header(spkr)]
    segments.append(f"{prompt}\n\n")
    segments.append(_header(rspndr))
    return segments


encoded_prompts = [encode_prompt(p, speaker, responder) for p in prompts]

In [None]:
#@markdown set amount of text to generate (higher # = longer RT)
resp_len =  128#@param {type:"integer"}
resp_temp =  0.72#@param {type:"number"}

In [None]:
# note that responses output the prompt as part of the output (and that counts 
# for part of the max length reqs)
for i, question in enumerate(encoded_prompts):
    this_prompt = "".join(question)
    # `min_length` / `max_length` are token counts that include the prompt,
    # so measure the prompt in tokens rather than in characters.
    prompt_tokens = len(my_chatbot.tokenizer(this_prompt)["input_ids"])
    ex_min = prompt_tokens + resp_len
    result = my_chatbot(
                        this_prompt,
                        min_length=ex_min,
                        max_length=ex_min + resp_len,
                        do_sample=True,
                        top_k=35,
                        top_p=0.90,
                        temperature=resp_temp,
                        # repetition_penalty =1.5,
                        # length_penalty=1.2,
                        no_repeat_ngram_size=4,
                        clean_up_tokenization_spaces=True,
                    )

    print(f"==========Testing Prompt-ID #{i} ==========")
    print(f"PROMPT TEXT:\n{this_prompt}")
    print("----------FULL GENERATED TEXT:")
    print(result[0]['generated_text'])
    print("\n" * 3)

# Metadata

In [None]:
metadata = training_args.to_sanitized_dict()
metadata["configs_src"] = hf_name
metadata["model_src"] = model_8bit
metadata["train_tag"] = model_name_header
metadata["data_tag"] = dataset
metadata["LR_scheduler_gamma"] = LR_GAMMA

pp.pprint(metadata)

In [None]:
from google.colab import files
import json

metadata_path = f"{model_name_header}_training_metadata.json"
with open(metadata_path, "w") as write_file:
    json.dump(metadata, write_file)

files.download(metadata_path)

# ideas

- train with [story-cloze](https://huggingface.co/datasets/story_cloze) dataset