<a href="https://colab.research.google.com/github/sol-sun/Generative_Deep_Learning_2nd_Pytorch/blob/main/05_autoregressive/lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🥙 レシピデータ上のLSTM

本ノートブックでは，レシピデータセットを使ってLSTMを学習する．

In [1]:
!pip install datasets | tail -n 1
!pip install japanize_matplotlib | tail -n 1
!pip install kagglehub | tail -n 1
!pip install torchinfo | tail -n 1

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.[0m[31m
[0mSuccessfully installed datasets-3.1.0 dill-0.3.8 fsspec-2024.9.0 multiprocess-0.70.16 xxhash-3.5.0
Successfully installed japanize_matplotlib-1.1.3
Successfully installed torchinfo-1.8.0


In [2]:
import numpy as np
import os
import re
import string
import shutil
import matplotlib.pyplot as plt
import japanize_matplotlib
from datasets import Dataset as HFDataset
import torchvision
import torch
import torchinfo
import random
import json
import pandas as pd
import kagglehub
from torch import nn
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import AutoTokenizer, PretrainedConfig, PreTrainedModel, pipeline
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
from transformers.modeling_outputs import CausalLMOutput

import sys
print(sys.version)
print(torch.__version__)
print(torchvision.__version__)

3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]
2.5.1+cu121
0.20.1+cu121


In [3]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.backends.mps.is_available():
        # MPSを使用する場合の設定
        torch.backends.mps.deterministic = True
    elif torch.cuda.is_available():
        # CUDAが利用可能な場合
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

def save_model(model, path):
    directory = os.path.dirname(path)
    if directory != '':
        os.makedirs(directory, exist_ok=True) # ディレクトリを作成
    model.eval()
    torch.save(model.state_dict(), path)

def load_model(model, path, device):
    model.load_state_dict(torch.load(path, map_location=device, weights_only=False))
    model = model.to(device)
    model.eval()
    print(f"Model loaded from {path}")
    return model

def save_metrics(metrics_per_epoch, filename):
    directory = os.path.dirname(path)
    if directory != '':
        os.makedirs(directory, exist_ok=True) # ディレクトリを作成
    # ファイルにメトリクスを保存
    with open(filename, "w") as f:
        json.dump(metrics_per_epoch, f, indent=2)
    with open(filename, "w") as f:
        json.dump(metrics_per_epoch, f, indent=2)

# シード値を設定
set_seed(1234)

if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

print(device)

cpu


## 0. パラメータ

In [4]:
# VOCAB_SIZE = 10000
MAX_LEN = 200
EMBEDDING_DIM = 100
N_UNITS = 128
VALIDATION_SPLIT = 0.2
SEED = 42
LOAD_MODEL = False
BATCH_SIZE = 32
EPOCHS = 25

## 1. データを準備する

In [5]:
# kagglehubを使ってダウンロード -> pathに保存
path = kagglehub.dataset_download("hugodarwood/epirecipes")
print("Path to dataset files:", path)
# 属性情報を読み込む
with open(os.path.join(path, "full_format_recipes.json")) as f:
    recipe_data = json.load(f)
# [SEP]を文末に挿入（ストップトークンとして扱う）
eos = '[SEP]'
text_data = [
    'Recipe for ' + x['title'] + ' | ' + ' '.join(x['directions']) + eos
    for x in recipe_data
    if 'title' in x
    and x['title'] is not None
    and 'directions' in x
    and len(x['directions']) > 0  # directions is not None
]
# レシピ数
n_recipes = len(text_data)
print(f"{n_recipes} レシピを読み込みました．")

example = text_data[9]
print(example)

Downloading from https://www.kaggle.com/api/v1/datasets/download/hugodarwood/epirecipes?dataset_version_number=2...


100%|██████████| 11.3M/11.3M [00:00<00:00, 12.9MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/hugodarwood/epirecipes/versions/2
20098 レシピを読み込みました．
Recipe for Ham Persillade with Mustard Potato Salad and Mashed Peas  | Chop enough parsley leaves to measure 1 tablespoon; reserve. Chop remaining leaves and stems and simmer with broth and garlic in a small saucepan, covered, 5 minutes. Meanwhile, sprinkle gelatin over water in a medium bowl and let soften 1 minute. Strain broth through a fine-mesh sieve into bowl with gelatin and stir to dissolve. Season with salt and pepper. Set bowl in an ice bath and cool to room temperature, stirring. Toss ham with reserved parsley and divide among jars. Pour gelatin on top and chill until set, at least 1 hour. Whisk together mayonnaise, mustard, vinegar, 1/4 teaspoon salt, and 1/4 teaspoon pepper in a large bowl. Stir in celery, cornichons, and potatoes. Pulse peas with marjoram, oil, 1/2 teaspoon pepper, and 1/4 teaspoon salt in a food processor to a coarse mash. Layer peas, then potato s

## 2. データをトークン化する

In [6]:
# テキストをベクトル化して、xとyを作成する関数
def tokenize_and_shift(batch, tokenizer, device):
    texts = [text for text in batch["full_text"]]
    encoding = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN + 1, # [SEP]を含めて
        return_tensors="pt",
        add_special_tokens=False  # [SEP]を[EOS]として扱う（データ作成時に手動追加）
        # 確認用(BertTokenizerでは，文頭に[CLS]，文末(文間)に[SEP]を追加している．わざわざエンコーダベースのBertTokenizerを使わなくてもいいが学習のため)
        # print(tokenizer.encode(tokenizer.pad_token, add_special_tokens=True))
        # print(tokenizer.encode(tokenizer.sep_token, add_special_tokens=True))
        # print(tokenizer.encode(tokenizer.cls_token, add_special_tokens=True))
    ).to(device)

    x = encoding["input_ids"][:, :-1]
    y = encoding["input_ids"][:, 1:]

    return {"x": x, "y": y}

## 3. 学習用データの作成

In [7]:
processed_data = []
for item in text_data:
    title, recipe = item.split('|', 1)  # '|'で分割し、前後に格納
    processed_data.append({'title': title.strip(),
                            'recipe': recipe.strip(),
                            'full_text': item.strip()})

# 各項目のリストを辞書として整理
processed_data_dict = {'title': [data['title'] for data in processed_data],
                        'recipe': [data['recipe'] for data in processed_data],
                        'full_text': [data['full_text'] for data in processed_data]
                        }

# 辞書形式でデータセットを作成
trainset = HFDataset.from_dict(processed_data_dict)
print(trainset)

# Tokenizerを読み込む
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# SEPをEOS（文の終わり）として扱う
tokenizer.eos_token_id = tokenizer.sep_token_id
VOCAB_SIZE = tokenizer.vocab_size
print(VOCAB_SIZE)

# データセットをトークン化
trainset = trainset.map(tokenize_and_shift, batched=True, fn_kwargs={'device': device, 'tokenizer': tokenizer})
trainset = trainset.with_format("torch")

# データローダーの作成
trainloader = DataLoader(
    trainset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    pin_memory=True,
)

Dataset({
    features: ['title', 'recipe', 'full_text'],
    num_rows: 20098
})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

30522


Map:   0%|          | 0/20098 [00:00<?, ? examples/s]

In [8]:
# データをロードして表示
for batch in trainloader:
    x_batch = batch["x"]
    y_batch = batch["y"]
    texts = batch["full_text"][0]
    print("x_batch:", x_batch, x_batch.shape)
    print("y_batch:", y_batch, y_batch.shape)
    print(texts)
    break

x_batch: tensor([[17974,  2005, 21628,  ...,     0,     0,     0],
        [17974,  2005, 24792,  ...,  5420,  7906,  1010],
        [17974,  2005, 11968,  ...,     0,     0,     0],
        ...,
        [17974,  2005, 20548,  ...,     0,     0,     0],
        [17974,  2005, 11840,  ..., 11840,  3659,  1012],
        [17974,  2005,  3034,  ...,     0,     0,     0]]) torch.Size([32, 200])
y_batch: tensor([[ 2005, 21628,  4014,  ...,     0,     0,     0],
        [ 2005, 24792, 14684,  ...,  7906,  1010,  3139],
        [ 2005, 11968,  7834,  ...,     0,     0,     0],
        ...,
        [ 2005, 20548, 11968,  ...,     0,     0,     0],
        [ 2005, 11840,  1010,  ...,  3659,  1012,  3573],
        [ 2005,  3034,  1064,  ...,     0,     0,     0]]) torch.Size([32, 200])
Recipe for Tabil Spice Blend  | Finely grind coriander seeds, cumin seeds, caraway seeds, and crushed red pepper flakes in a spice mill. DO AHEAD: Can be made 1 month ahead. Store airtight at room temperature.[SEP]

## 4. LSTMモデルを作成する

In [9]:
class LSTMModel(PreTrainedModel):
    def __init__(self, config, vocab_size, embedding_dim=256, n_units=512):
        super().__init__(config)
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, n_units, bidirectional=False, batch_first=True)
        self.fc = nn.Linear(n_units, vocab_size)

    def forward(self, input_ids, attention_mask=None, return_dict=False, token_type_ids=None):
        x = self.embedding(input_ids)
        x, _ = self.lstm(x)
        x = self.fc(x)
        if not return_dict:
            return x
        return CausalLMOutput(logits=x)

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        return {"input_ids": input_ids}

In [10]:
config = PretrainedConfig()
config.is_decoder = True
config.eos_token_id = tokenizer.eos_token_id

model = LSTMModel(config=config, vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM, n_units=N_UNITS)
print(model)
torchinfo.summary(model.eval(), input_size=(32, MAX_LEN), dtypes=[torch.long])

LSTMModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


LSTMModel(
  (embedding): Embedding(30522, 100, padding_idx=0)
  (lstm): LSTM(100, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=30522, bias=True)
)


Layer (type:depth-idx)                   Output Shape              Param #
LSTMModel                                [32, 200, 30522]          --
├─Embedding: 1-1                         [32, 200, 100]            3,052,200
├─LSTM: 1-2                              [32, 200, 128]            117,760
├─Linear: 1-3                            [32, 200, 30522]          3,937,338
Total params: 7,107,298
Trainable params: 7,107,298
Non-trainable params: 0
Total mult-adds (M): 977.33
Input size (MB): 0.05
Forward/backward pass size (MB): 1574.40
Params size (MB): 28.43
Estimated Total Size (MB): 1602.88

## 5. LSTMモデルを学習する

In [11]:
criterion = nn.CrossEntropyLoss(reduction="mean", ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

### 学習する場合は以下(GPU推奨)

In [12]:
# model = model.to(device)
# model.train()
# metrics_per_epoch = {
#     "loss": [],
# }
# for epoch in range(EPOCHS):
#     epoch_metrics = {metric: 0 for metric in metrics_per_epoch}
#     batch_count = 0

#     for i, data in enumerate(trainloader, 0):
#         inputs, labels = data["x"], data["y"]
#         inputs, labels = inputs.to(device), labels.to(device)
#         optimizer.zero_grad()
#         outputs = model(inputs, return_dict=False)
#         loss = criterion(outputs.permute(0, 2, 1), labels)  # 元の入力と出力を比較してlossを計算
#         loss.backward()  # backpropagation
#         optimizer.step()  # モデルのパラメータ更新

#         for key in epoch_metrics:
#             epoch_metrics[key] += loss.item()
#         batch_count += 1

#     for key in epoch_metrics:
#         epoch_metrics[key] /= batch_count
#         metrics_per_epoch[key].append(epoch_metrics[key])

#     # epoch終了時点での誤算の平均値
#     print("Epoch {}:".format(epoch + 1), epoch_metrics)

In [13]:
# # modelを保存
# save_path = f"./models/lstm_model_epoch_{EPOCHS}.pth"
# save_model(model, save_path)

# # metricsを保存
# metrics_save_path = f"./metrics/lstm_metrics.json"
# save_metrics(metrics_per_epoch=metrics_per_epoch, filename=metrics_save_path)

In [14]:
import urllib.request

# モデルのURL
model_url = "https://github.com/sol-sun/Generative_Deep_Learning_2nd_Pytorch/raw/main/05_autoregressive/models/lstm_model_epoch_25.pth"
local_model_path = "lstm_model_epoch_25.pth"

# モデルをダウンロード
urllib.request.urlretrieve(model_url, local_model_path)

# モデルをデバイスに移動して読み込み
model = load_model(model, local_model_path, device)

Model loaded from lstm_model_epoch_25.pth


## 6. 学習済みLSTMモデルを用いたテキストの生成

In [15]:
from pprint import pprint

MODEL_FOR_CAUSAL_LM_MAPPING_NAMES["lstm"] = model.__class__.__name__  # "LSTMModel"

generator_sample = pipeline(
   'text-generation',
    model = model,
    temperature = 1.0,
    # repetition_penalty=2.0
    device=device,
    tokenizer=tokenizer, # トークナイザ
    max_new_tokens=80, # 生成する最大トークン数
    do_sample=True, # 生成をサンプル
    num_return_sequences=3, # 生成するシーケンスの数
    add_special_tokens=False # 入力テキストの前後に特殊トークンを追加しない
    )

generator_deterministic = pipeline(
   'text-generation',
    model = model,
    device=device,
    temperature = 0.05,
    # repetition_penalty=2.0
    tokenizer=tokenizer, # トークナイザ
    max_new_tokens=80, # 生成する最大トークン数
    do_sample=True, # 生成をサンプル
    num_return_sequences=3, # 生成するシーケンスの数
    add_special_tokens=False # 入力テキストの前後に特殊トークンを追加しない
    )

In [16]:
# Temperature = 1.0
print("Temperature = 1.0")
pprint(generator_sample("recipe for potato salad | "))

Temperature = 1.0
[{'generated_text': 'recipe for potato salad |  soak leeks in reserved garlic '
                    'and set aside. ( the stems should look cooled completely '
                    ') and cool. rinse well and pat dry. fill a large heavy '
                    'skillet with cumin, then add butter ; cook 3 to 5 seconds '
                    'to a well - seasoned 4 tablespoons scalded coconut oil or '
                    'with enough cold water to cover by 2 inches. cook, '
                    'covered with lid, until'},
 {'generated_text': 'recipe for potato salad |  whisk together oil, 1 / 4 '
                    'teaspoon sesame seeds, 1 / 2 tsp. salt, and olive oil in '
                    'a medium bowl. preheat oven to 400°f. mix 1 / 3 cup '
                    'butter, then 1 / 4 teaspoon salt in spice grinder and '
                    'grind with salt until a few lumps are well browned. '
                    'transfer to a large bowl. repeat'},
 {'generated_text': 

In [17]:
# Temperature = 0.05
print("Temperature = 0.05")
pprint(generator_deterministic("recipe for potato salad | "))

Temperature = 0.05
[{'generated_text': 'recipe for potato salad |  whisk together vinegar, '
                    'mustard, and salt in a small bowl. add oil and toss to '
                    'coat. season with salt and pepper.'},
 {'generated_text': 'recipe for potato salad |  whisk together vinegar, sugar, '
                    'and salt in a food processor until finely chopped. add '
                    'oil and pulse until finely chopped. add garlic and pulse '
                    'until finely chopped. add garlic and pulse until finely '
                    'chopped. add garlic and pulse until finely chopped. add '
                    'garlic and pulse until finely chopped. add garlic and '
                    'pulse until finely chopped. add garlic and pulse until '
                    'finely chopped. add garlic and pulse until finely'},
 {'generated_text': 'recipe for potato salad |  whisk together vinegar, sugar, '
                    'and salt in a food processor until finely 