<a href="https://colab.research.google.com/github/softmurata/colab_notebooks/blob/main/nlp/gpt2_finetuning_eng.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvidia-smi

In [None]:
#@title fine-tuning gpt neo

In [None]:
# 綺麗な画像を生成するために使われたpromptとその映像のリンク等のメタデータを保存してあるcsvファイル
!wget -O prompts.csv https://raw.githubusercontent.com/krea-ai/open-prompts/main/data/1k.csv

In [2]:
import pandas as pd

In [3]:
prompts = pd.read_csv("prompts.csv")

In [None]:
prompts.head()

In [None]:
prompts_gt6 = prompts.loc[prompts.prompt.str.split(' ').str.len() > 6]
prompts_gt6.head()

In [None]:
# downgrade for aitextgen
# !pip install git+https://github.com/llimllib/aitextgen@fix_tpu_available
!pip uninstall -qqy torch torchvision torchtext torchaudio fastai
!pip install -qq torch==1.9.0 pytorch-lightning==1.7.7 aitextgen==0.6.0 

In [1]:
model_name = "EleutherAI/gpt-neo-125M"

In [8]:
# delete raw data
prompts_gt6 = prompts_gt6.drop('raw_data', axis = 1).reset_index(drop=True)

In [9]:
prompts_gt6.to_csv("input_text_cleaned.txt", columns=["prompt"], header=False, index=False)

In [None]:
import pytorch_lightning as pl
print(pl.__version__)

In [None]:
from aitextgen.TokenDataset import TokenDataset
data = TokenDataset('./input_text_cleaned.txt', line_by_line=True)

In [None]:
from aitextgen import aitextgen
ai = aitextgen(model = model_name,  to_gpu=True)

In [None]:
ai.train('input_text_cleaned.txt',
         line_by_line=True,
         from_cache=False,
         num_steps=500,
         generate_every=100,
         save_every=500,
         save_gdrive=False,
         learning_rate=1e-3,
         fp16=False,
         batch_size=1, 
         )

In [None]:
ai.save()

In [None]:
prompt_ai = aitextgen(model_folder = '.', to_gpu=True)

In [None]:
print(prompt_ai.generate(prompt = "astronaut "))

In [None]:
#@title upload hugging face hub

In [None]:
ai.save_for_upload('sd-prompt-generator-gpt-neo')

In [None]:
# Please create huggingface access token with write right by huggingface web ui
from huggingface_hub import notebook_login

notebook_login()

In [None]:
!sudo apt-get install git-lfs

In [12]:
from huggingface_hub import HfApi
api = HfApi()

In [None]:
api.upload_folder(
    folder_path="./sd-prompt-generator-gpt-neo",
    path_in_repo = ".",
    repo_id="mutatasu/sd-prompt-generator-gpt-neo",
    repo_type="model"
)

In [None]:
#@title gpt2 finetuning
# https://www.ai-shift.co.jp/techblog/3170

In [None]:
!nvidia-smi

In [None]:
# test data
!wget https://instruct-pix2pix.eecs.berkeley.edu/human-written-prompts.jsonl

In [None]:
!pip install sentencepiece transformers

In [3]:
!mkdir myoutputs

In [14]:
import json
import logging
import os
import random
import sys
from dataclasses import dataclass

import numpy as np
import torch
import torch.nn as nn
from torch.optim import AdamW
from tqdm import tqdm
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
from transformers import T5Tokenizer

seed = 42
train_file = 'human-written-prompts.jsonl'
eval_file = 'human-written-prompts.jsonl'
eval_out_file = 'myoutputs/out.txt'
output_dir = 'myoutputs'
num_train_epochs = 1
n_prompt_tokens = 200
train_batch_size = 2
learning_rate = 3e-4
max_new_tokens = 10
model_name =  "gpt2"

In [2]:
class PromptTuningLM(nn.Module):
    def __init__(
        self,
        model_name: str,
        n_prompt_tokens: int,
        config: AutoConfig,
        soft_prompt_path: str = None,
    ):
        super(PromptTuningLM, self).__init__()
        self.n_prompt_tokens = n_prompt_tokens 
        # 事前学習済みのGPTの呼び出し
        self.lm = AutoModelForCausalLM.from_pretrained(model_name, config=config)
        # Promptに対する埋め込みベクトルの作成
        self.soft_prompt = nn.Embedding(n_prompt_tokens, config.hidden_size)
        torch.nn.init.xavier_uniform_(self.soft_prompt.weight)

        # GPTの重みを固定
        for param in self.lm.parameters():
            param.requires_grad = False

        # [推論時] Promptに対する学習済みの埋め込みベクトルをロード
        if soft_prompt_path is not None: 
            logger.info(f"Set soft prompt. ({n_prompt_tokens} tokens)")
            self.soft_prompt = torch.load(soft_prompt_path)

    def _extend_inputs(self, input_ids) -> torch.Tensor:
        """
        Promptに対する埋め込みベクトルを付与する
        """
        # input_idsをベクトルに変換する（事前学習モデルが異なる場合は変更する必要あり）
        inputs_embeds = self.lm.transformer.wte(input_ids)
        if len(list(inputs_embeds.shape)) == 2:
            inputs_embeds = inputs_embeds.unsqueeze(0)
        # Promptに対する埋め込みベクトルとinputs_embedsを連結する
        batch_size = inputs_embeds.size(0)
        learned_embeds = self.soft_prompt.weight.repeat(batch_size, 1, 1)
        extended_embeds = torch.cat([learned_embeds, inputs_embeds], dim=1)
        return extended_embeds

    def _extend_labels(self, labels, ignore_index=-100) -> torch.Tensor:
        """
        inputに合わせて正解ラベルにPromptに対するラベルを付与する
        """
        if len(list(labels.shape)) == 1:
            labels = labels.unsqueeze(0)
        n_batches = labels.shape[0]
        # Promptに対してignore_indexを付与（-100に設定していれば損失が計算されない）
        prompt_labels = torch.full((n_batches, self.n_prompt_tokens), 
                                    ignore_index).to(labels.device)
        # Promptに対するラベルと元の正解ラベルを連結する
        extended_labels = torch.cat([prompt_labels, labels], dim=1)
        return extended_labels

    def save_soft_prompt(self, path: str, filename: str):
        """
        Promptに対する埋め込みベクトルの保存
        """
        torch.save(self.soft_prompt, os.path.join(path, filename))
        logger.info(f"Saved soft prompt: {os.path.join(path, filename)}")

    def forward(self, input_ids, labels=None, return_dict=None):
        # Promptを付与したベクトル
        inputs_embeds = self._extend_inputs(input_ids)
        if labels is not None:
            labels = self._extend_labels(labels)

        return self.lm(
            inputs_embeds=inputs_embeds,
            labels=labels,
            return_dict=return_dict,
        )

    def generate(self, input_text, tokenizer, max_new_tokens, eos_token_id, device):
        """
        [推論時]自己回帰で回答を生成する
        """
        input_ids = tokenizer.encode(input_text, add_special_tokens=False)
        cur_ids = torch.tensor(input_ids).unsqueeze(0).to(device)
        # 最大でmax_new_tokensだけ単語を生成する
        for _ in range(max_new_tokens):
            outputs = self.forward(cur_ids)
            softmax_logits = torch.softmax(outputs.logits[0,-1], dim=0)
            # 最大確率の単語を次の単語として一意に決定
            next_token_id = int(softmax_logits.argmax().to('cpu'))
            # もし選択された単語がeos_tokenなら生成を終了する
            if next_token_id == eos_token_id:
                break
            # 選択された単語をcur_idsに追加して次の処理を行う
            next_token_id = torch.tensor([[next_token_id]]).to(device)
            cur_ids = torch.cat([cur_ids, next_token_id], dim=1)

        # 生成した単語ID列をテキストに変換する
        output_list = list(cur_ids.squeeze().to('cpu').numpy())
        output_text = tokenizer.decode(output_list)
        return output_text

In [15]:
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
logger = logging.getLogger(__name__)

os.makedirs(output_dir, exist_ok=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if n_gpu > 0:
    torch.cuda.manual_seed_all(seed)

# tokenizer = T5Tokenizer.from_pretrained(model_name)  # for japanese
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)
model = PromptTuningLM(
    model_name,
    n_prompt_tokens=n_prompt_tokens,
    config=config,
)

model.to(device)

# LayerNorm.{weight, bias}に対してweight_decay=0.01を設定
param_optimizer = list(model.named_parameters())
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not ('ln' in n)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if 'ln' in n],
     'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

In [38]:
@dataclass
class InputExample():
    input: str
    edit: str
    output: str

def create_examples(filename):
    examples = []
    with open(filename, 'r') as f:
        for line in f:
            example = json.loads(line)
            examples.append(InputExample(
                input = example['input'],
                edit = example['edit'],
                output = example['output']))
    return examples

"""
class CustomDataset(torch.utils.data.IterableDataset):
    def __init__(self, tokenizer, input_generator, output_generator):
        super().__init__()
        self._tokenizer = tokenizer
        self._input_generator = input_generator
        self._output_generator = output_generator

    @classmethod
    def from_texts(cls, tokenizer, input_texts, output_texts):
        return cls(tokenizer=tokenizer, input_generator=lambda: input_texts, 
                   output_generator=lambda: output_texts)

    def __iter__(self):
        for text in self._generator():
            ids = self._tokenizer.encode(text)
            yield {"input_ids": ids, "labels": ids}
"""

from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, tokenizer, input_texts, output_texts):
      super().__init__()
      self._tokenizer = tokenizer
      self._input_texts = input_texts
      self._output_texts = output_texts
    def __getitem__(self, index):
        input_ids = self._tokenizer.encode(self._input_texts[index])
        output_ids = self._tokenizer.encode(self._output_texts[index])
        return {"input_ids": input_ids, "labels": output_ids}

    def __len__(self):
        return len(self._input_texts)

def collate_fn(samples):
    batch = {'input_ids': [], 'labels': []}
    for sample in samples:
        batch['input_ids'].append(torch.tensor(sample['input_ids']))
        batch['labels'].append(torch.tensor(sample['labels']))
    batch['input_ids'] = torch.nn.utils.rnn.pad_sequence(
                    batch['input_ids'], batch_first=True, padding_value=3)
    batch['labels'] = torch.nn.utils.rnn.pad_sequence(
                    batch['labels'], batch_first=True, padding_value=3)
    return batch

In [16]:
tokenizer.special_tokens_map

{'bos_token': '<|endoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|endoftext|>'}

In [None]:
import torch

In [40]:
logger.info("***** Running training *****")
train_examples = create_examples(train_file)
train_input_texts = [example.input + "+" + example.edit
                for example in train_examples]
train_output_texts = ["="+example.output for example in train_examples]
train_data = CustomDataset(tokenizer, train_input_texts, train_output_texts)
train_dataloader = torch.utils.data.DataLoader(dataset=train_data,
                    batch_size=train_batch_size, collate_fn=collate_fn)

model.train()
num_train_epochs = 2
for epoch in range(int(num_train_epochs)):
    logger.info(f'Epoch: {epoch+1}')
    for batch in tqdm(train_dataloader):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        print(input_ids.shape, labels.shape)  # need to same shape input and labels?
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

# Promptに対する埋め込みベクトルのみ保存する
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_soft_prompt(output_dir, 'soft_prompt.pt')

  0%|          | 0/350 [00:00<?, ?it/s]

torch.Size([2, 33]) torch.Size([2, 29])





ValueError: ignored

In [21]:
#@title inference
logger.info("***** Running evaluation *****")
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)
model = PromptTuningLM(
    model_name,
    n_prompt_tokens=n_prompt_tokens,
    soft_prompt_path=os.path.join(output_dir, 'soft_prompt.pt'),
    config=config,
)
model.to(device)
model.eval()
output_texts = []

with open(eval_file, 'r') as f:
    for line in f:
        data = json.loads(line)
        input_text = data['input'] + "+" + data["edit"]
        output_text = model.generate(input_text, tokenizer,
                        max_new_tokens, tokenizer.eos_token_id, device)
        
        print("input_text:", input_text, "  output_text:", output_text)
        output_texts.append(output_text+'\n')
with open(eval_out_file, 'w') as f:
    f.writelines(output_texts)

input_text: Summer Street With Blooming Flowers (PRT_1003) - Canvas Art Print - 25in X 20in+Change it to mexican muralism./   output_text: Summer Street With Blooming Flowers (PRT_1003) - Canvas Art Print - 25in X 20in+Change it to mexican muralism./$1.00$1.00$1
input_text: rainforest Jaguar by Lisa McLaughlin+Turn the Jaguar into a lizard/   output_text: rainforest Jaguar by Lisa McLaughlin+Turn the Jaguar into a lizard/dragon/dragon/dragon/dragon/dragon/
input_text: Frog in a Flower Pot - Image 0+make the frog purple/   output_text: Frog in a Flower Pot - Image 0+make the frog purple/green/blue/yellow/purple/yellow
input_text: Autumn colors framed by traditional Japanese sliding doors and tatami mats+Make it an Ukiyo-e painting/   output_text: Autumn colors framed by traditional Japanese sliding doors and tatami mats+Make it an Ukiyo-e painting/museum+Make it an Ukiyo-
input_text: Anime girl with short pink hair | Drawing 15 | Pinterest+as a medieval illustrated manuscript./   output