In [1]:
!pip install transformers -U



In [12]:
!pip install --upgrade transformers
!pip install --upgrade torch




In [2]:
!pip install pydantic

Collecting pydantic
  Downloading pydantic-2.4.2-py3-none-any.whl (395 kB)
                                              0.0/395.8 kB ? eta -:--:--
     ------------------------               256.0/395.8 kB 7.9 MB/s eta 0:00:01
     -------------------------------------- 395.8/395.8 kB 6.1 MB/s eta 0:00:00
Collecting annotated-types>=0.4.0 (from pydantic)
  Downloading annotated_types-0.6.0-py3-none-any.whl (12 kB)
Collecting pydantic-core==2.10.1 (from pydantic)
  Downloading pydantic_core-2.10.1-cp311-none-win_amd64.whl (2.0 MB)
                                              0.0/2.0 MB ? eta -:--:--
     -----                                    0.3/2.0 MB 6.1 MB/s eta 0:00:01
     ------------                             0.6/2.0 MB 7.9 MB/s eta 0:00:01
     ------------------                       0.9/2.0 MB 7.1 MB/s eta 0:00:01
     -------------------------                1.3/2.0 MB 7.4 MB/s eta 0:00:01
     ---------------------------------        1.6/2.0 MB 7.5 MB/s eta 0:00:01
  

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-intel 2.13.0 requires numpy<=1.24.3,>=1.22, but you have numpy 1.26.1 which is incompatible.
tensorflow-intel 2.13.0 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.8.0 which is incompatible.
torchvision 0.15.2 requires torch==2.0.1, but you have torch 2.1.0 which is incompatible.


In [6]:
!pip install --upgrade pydantic transformers



In [19]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForPreTraining, AutoTokenizer, TrainingArguments, Trainer

# 데이터셋 클래스 정의
class CustomDataset(Dataset):
    def __init__(self, input_ids, attention_mask):
        self.input_ids = input_ids
        self.attention_mask = attention_mask

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx]
        }

# 데이터셋 생성 (라벨 제외)
train_dataset = CustomDataset(input_ids, attention_mask)

# 데이터로더 생성
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# 학습 설정
training_args = TrainingArguments(
    output_dir="./kcbert-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
)

# 파인튜닝
model_name = "beomi/kcbert-base"
model = AutoModelForPreTraining.from_pretrained(model_name)

# 모델 파라미터를 동결 (fine-tuning을 방지하기 위함)
for param in model.parameters():
    param.requires_grad = False

# 모델 학습
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset  # 데이터셋 객체를 전달
)

trainer.train()


  0%|          | 0/30 [00:00<?, ?it/s]

ValueError: The model did not return a loss from the inputs, only the following keys: prediction_logits,seq_relationship_logits. For reference, the inputs it received are input_ids,attention_mask.

In [20]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

# 데이터셋 클래스 정의
class CustomDataset(Dataset):
    def __init__(self, input_data):
        self.input_data = input_data

    def __len__(self):
        return len(self.input_data)

    def __getitem__(self, idx):
        return {
            'input_data': self.input_data[idx]
        }

# AutoEncoder 모델 정의
class AutoEncoder(nn.Module):
    def __init__(self):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, latent_size)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, input_size),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# 모델 및 데이터 준비
input_size = 64  # 입력 데이터의 차원
hidden_size = 32  # 은닉층 크기
latent_size = 16  # 잠재 변수의 크기
model = AutoEncoder()
data = torch.randn(1000, input_size)  # 라벨이 없는 데이터 예시

train_dataset = CustomDataset(data)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# 손실 함수 정의 (오토인코더의 출력과 입력 데이터 간의 차이)
criterion = nn.MSELoss()

# 옵티마이저 정의
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 학습
num_epochs = 10
for epoch in range(num_epochs):
    for batch in train_dataloader:
        inputs = batch['input_data']
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, inputs)
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')

# 학습된 모델로 데이터 재구성
sample_data = data[0]
reconstructed_data = model(sample_data)


Epoch [1/10], Loss: 1.2797846794128418
Epoch [2/10], Loss: 1.050167441368103
Epoch [3/10], Loss: 1.1313446760177612
Epoch [4/10], Loss: 0.9230650067329407
Epoch [5/10], Loss: 0.8911839723587036
Epoch [6/10], Loss: 0.9274351000785828
Epoch [7/10], Loss: 1.0243639945983887
Epoch [8/10], Loss: 0.9424793124198914
Epoch [9/10], Loss: 0.9382832050323486
Epoch [10/10], Loss: 1.0548819303512573


In [18]:
import pandas as pd

data_path = 'output_data.csv'
additional_data = pd.read_csv(data_path)
augmented_text = additional_data['augmented_text'].tolist()

In [19]:
from transformers import AutoTokenizer

model_name = "beomi/kcbert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

input_data = tokenizer(augmented_text, return_tensors="pt", padding='max_length', truncation=True, max_length=128)
input_ids = input_data['input_ids']
attention_mask = input_data['attention_mask']


In [20]:
import torch

train_dataset = torch.utils.data.TensorDataset(input_ids, attention_mask)


In [21]:
from transformers import AutoModelForSequenceClassification

model_name = "beomi/kcbert-base"  # 여기서 적절한 모델 선택
model = AutoModelForSequenceClassification.from_pretrained(model_name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at beomi/kcbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./kcbert-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()

  0%|          | 0/30 [00:00<?, ?it/s]

TypeError: vars() argument must have __dict__ attribute

In [23]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModel, AutoTokenizer


In [24]:
model_name = "beomi/kcbert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
kcbert_model = AutoModel.from_pretrained(model_name)


In [25]:
additional_data = pd.read_csv('output_data.csv')
augmented_text = additional_data['augmented_text'].tolist()
input_data = tokenizer(augmented_text, return_tensors="pt", padding='max_length', truncation=True, max_length=128)
input_ids = input_data['input_ids']
attention_mask = input_data['attention_mask']
train_dataset = TensorDataset(input_ids, attention_mask)

RuntimeError: mat1 and mat2 must have the same dtype, but got Long and Float