## Import Module

In [1]:
!pip install transformers
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.1-py3-none-any.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m57.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.2-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.2 tokenizers-0.13.2 transformers-4.27.1


In [28]:
import optuna
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2LMHeadModel, GPT2Tokenizer,  AdamW

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [15]:
# pretrained된 tokneizer & model 

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)

In [16]:
# Defining Dataset

class MyDataset(Dataset):
    def __init__(self, file_path):
        # Load your data from a file or other source
        with open(file_path, 'r', encoding='utf-8') as f:
            self.data = f.read().splitlines()

        # Tokenize the data using the GPT-2 tokenizer
        from transformers import GPT2Tokenizer
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        self.tokenized_data = [self.tokenizer.encode(text) for text in self.data]

    def __len__(self):
        return len(self.tokenized_data) - 1

    def __getitem__(self, idx):
        input_ids = self.tokenized_data[idx][:-1]
        target_ids = self.tokenized_data[idx][1:]
        return torch.tensor(input_ids), torch.tensor(target_ids)

In [17]:
# Define the collate function

def collate_fn(batch):
    input_ids, target_ids = zip(*batch)
    max_length = max(len(seq) for seq in input_ids)
    padded_input_ids = torch.zeros(len(batch), max_length, dtype=torch.long)
    padded_target_ids = torch.zeros(len(batch), max_length, dtype=torch.long)
    for i, (input_seq, target_seq) in enumerate(zip(input_ids, target_ids)):
        padded_input_ids[i, :len(input_seq)] = input_seq
        padded_target_ids[i, :len(target_seq)] = target_seq
    return padded_input_ids, padded_target_ids

In [18]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
train_file_path = 'drive/MyDrive/data/txt/chatbot.txt'

In [20]:
batch_size = 16

In [21]:
# Example usage
dataset = MyDataset(train_file_path)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)

In [29]:
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

In [30]:
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size,collate_fn=collate_fn )

In [22]:
# training settings

num_epochs = 3
learning_rate = 5e-5
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
model.train()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [23]:
for epoch in range(num_epochs):
    total_loss = 0
    for batch_idx, (input_ids, target_ids) in enumerate(dataloader):
        optimizer.zero_grad()
        input_ids = input_ids.to(device)
        target_ids = target_ids.to(device)
        outputs = model(input_ids, labels=target_ids)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if (batch_idx + 1) % 100 == 0:
            avg_loss = total_loss / (batch_idx + 1)
            print(f'Epoch: {epoch + 1}, Batch: {batch_idx + 1}, Avg. Loss: {avg_loss:.4f}')

Epoch: 1, Batch: 100, Avg. Loss: 2.4022
Epoch: 2, Batch: 100, Avg. Loss: 1.9481
Epoch: 3, Batch: 100, Avg. Loss: 1.7844


In [24]:
model.eval()


prompt = "This is the start of the generated text."
generated_text = tokenizer.encode(prompt)

for i in range(100):
    input_ids = torch.tensor(generated_text[-1024:]).unsqueeze(0)
    input_ids = input_ids.to('cuda')
    output = model(input_ids)
    logits = output[0][:, -1, :]
    new_token = torch.argmax(logits, dim=-1).item()
    generated_text.append(new_token)
generated_text = tokenizer.decode(generated_text)

print(generated_text)

This is the start of the generated text. you can to to to to to to!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


In [32]:
def objective(trial):
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 1, 10)
    batch_size = trial.suggest_int("batch_size", 2, 32, log=True)
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    
    best_loss = float('inf')
    for epoch in range(num_train_epochs):
        for batch in train_dataloader:
            # Retrieve inputs and labels
            inputs, labels = batch
            # Move inputs and labels to device
            inputs = inputs.to(device)
            labels = labels.to(device)
            # Zero gradients
            optimizer.zero_grad()
            # Forward pass
            outputs = model(inputs, labels=labels)
            # Compute loss
            loss = outputs.loss
            # Backward pass
            loss.backward()
            # Update weights
            optimizer.step()
            # Update best loss
            if loss.item() < best_loss:
                best_loss = loss.item()

    return best_loss

[32m[I 2023-03-16 05:58:35,435][0m A new study created in memory with name: no-name-b55cb9dc-fd96-445a-a798-84e9fc54d082[0m
[32m[I 2023-03-16 05:59:57,392][0m Trial 0 finished with value: 0.6451632976531982 and parameters: {'learning_rate': 5.633187443248085e-05, 'num_train_epochs': 3, 'batch_size': 3}. Best is trial 0 with value: 0.6451632976531982.[0m
[32m[I 2023-03-16 06:02:08,463][0m Trial 1 finished with value: 0.3365764021873474 and parameters: {'learning_rate': 0.0003602476629430723, 'num_train_epochs': 5, 'batch_size': 31}. Best is trial 1 with value: 0.3365764021873474.[0m
[32m[I 2023-03-16 06:05:35,726][0m Trial 2 finished with value: 0.14394426345825195 and parameters: {'learning_rate': 0.0003868877642720656, 'num_train_epochs': 8, 'batch_size': 3}. Best is trial 2 with value: 0.14394426345825195.[0m
[32m[I 2023-03-16 06:09:02,834][0m Trial 3 finished with value: 0.3945985436439514 and parameters: {'learning_rate': 3.4092480519798124e-05, 'num_train_epochs': 8,

Best trial:
  Loss: 0.139
  Params: 
    learning_rate: 0.00015320261153701007
    num_train_epochs: 9
    batch_size: 14


In [None]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

print("Best trial:")
trial = study.best_trial
print(f"  Loss: {trial.value:.3f}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"   {key}: {value}")

In [36]:
trial.params.items()

dict_items([('learning_rate', 0.00015320261153701007), ('num_train_epochs', 9), ('batch_size', 14)])

In [43]:
# training settings

num_epochs = 9
learning_rate = 0.00015320261153701007
batch_size = 14
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
model.train()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [44]:
for epoch in range(num_epochs):
    total_loss = 0
    for batch_idx, (input_ids, target_ids) in enumerate(dataloader):
        optimizer.zero_grad()
        input_ids = input_ids.to(device)
        target_ids = target_ids.to(device)
        outputs = model(input_ids, labels=target_ids)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if (batch_idx + 1) % 100 == 0:
            avg_loss = total_loss / (batch_idx + 1)
            print(f'Epoch: {epoch + 1}, Batch: {batch_idx + 1}, Avg. Loss: {avg_loss:.4f}')

Epoch: 1, Batch: 100, Avg. Loss: 1.7518
Epoch: 2, Batch: 100, Avg. Loss: 1.4910
Epoch: 3, Batch: 100, Avg. Loss: 1.2830
Epoch: 4, Batch: 100, Avg. Loss: 1.0854
Epoch: 5, Batch: 100, Avg. Loss: 0.9145
Epoch: 6, Batch: 100, Avg. Loss: 0.7888
Epoch: 7, Batch: 100, Avg. Loss: 0.6835
Epoch: 8, Batch: 100, Avg. Loss: 0.5942
Epoch: 9, Batch: 100, Avg. Loss: 0.5309


In [45]:
model.eval()


prompt = "This is the start of the generated text."
generated_text = tokenizer.encode(prompt)

for i in range(100):
    input_ids = torch.tensor(generated_text[-1024:]).unsqueeze(0)
    input_ids = input_ids.to('cuda')
    output = model(input_ids)
    logits = output[0][:, -1, :]
    new_token = torch.argmax(logits, dim=-1).item()
    generated_text.append(new_token)
generated_text = tokenizer.decode(generated_text)

print(generated_text)

This is the start of the generated text.!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


- Text Generation 기본 코드 예제를 찾아서 전체적인 코드를 진행해봄.
- 추후 한글 데이터를 토대로 kogpt를 통해 만드는 것을 목표로 함.