In [1]:
!pip install transformers

[0m

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from transformers import ElectraTokenizer, ElectraForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
import torch
from tqdm import tqdm

class CustomTrainset(Dataset):
    """Labelled text dataset that tokenizes one row per lookup.

    Args:
        data: Table indexed positionally through ``.iloc``; each row must
            expose a ``"content"`` and a ``"label"`` field.
        tokenizer: Object providing ``encode_plus``.
        max_length: Length every encoded sample is padded/truncated to.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        """Number of rows in the wrapped table."""
        return len(self.data)

    def __getitem__(self, index):
        """Encode the row at position ``index``.

        Returns:
            dict: ``"input_ids"`` and ``"attention_mask"`` flattened to 1-D,
            plus ``"label"`` as a ``torch.long`` tensor.
        """
        row = self.data.iloc[index]
        encode_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        # The content is coerced to str so non-string cells are still tokenizable.
        encoded = self.tokenizer.encode_plus(str(row["content"]), **encode_options)

        # return_tensors="pt" yields a leading batch axis; flatten() removes it.
        sample = {key: encoded[key].flatten() for key in ("input_ids", "attention_mask")}
        sample["label"] = torch.tensor(row["label"], dtype=torch.long)
        return sample

class CustomTestset(Dataset):
    """Unlabelled text dataset that tokenizes one row per lookup.

    Args:
        data: Table indexed positionally through ``.iloc``; each row must
            expose a ``"content"`` field.
        tokenizer: Object providing ``encode_plus``.
        max_length: Length every encoded sample is padded/truncated to.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        """Number of rows in the wrapped table."""
        return len(self.data)

    def __getitem__(self, index):
        """Encode the row at position ``index``.

        Returns:
            dict: ``"input_ids"`` and ``"attention_mask"`` flattened to 1-D.
        """
        text = str(self.data.iloc[index]["content"])
        encode_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        encoded = self.tokenizer.encode_plus(text, **encode_options)

        # return_tensors="pt" yields a leading batch axis; flatten() removes it.
        return {key: encoded[key].flatten() for key in ("input_ids", "attention_mask")}

    
def train_model(model, train_loader, optimizer, device):
    """Run one training epoch over ``train_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes a ``.loss`` tensor.
        train_loader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"label"`` tensors.
        optimizer: Optimizer holding ``model``'s parameters.
        device: Device the model and every batch are moved to.

    Returns:
        float: Mean of the per-batch losses seen during the epoch, or ``nan``
        when ``train_loader`` yields no batches. Callers that ignore the
        return value behave exactly as before.
    """
    model.train()
    model.to(device)

    running_loss = 0.0
    num_batches = 0

    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Track the loss so the caller can monitor training progress.
        running_loss += loss.item()
        num_batches += 1

    return running_loss / num_batches if num_batches else float("nan")

def evaluate_model(model, val_loader, device):
    """Compute loss and accuracy of ``model`` over ``val_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        val_loader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"label"`` tensors.
        device: Device the model and every batch are moved to.

    Returns:
        tuple[float, float]: ``(val_loss, accuracy)``, both averaged over the
        samples actually yielded by ``val_loader``.

    Raises:
        ValueError: If ``val_loader`` yields no samples.
    """
    model.eval()
    model.to(device)

    total_loss = 0.0
    correct_predictions = 0
    num_samples = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            batch_count = labels.size(0)

            # Weight each batch loss by its size so a smaller final batch does
            # not skew the average.
            # NOTE(review): assumes the model returns a mean-reduced loss per
            # batch (the Hugging Face default for classification heads).
            total_loss += outputs.loss.item() * batch_count

            preds = torch.argmax(outputs.logits, dim=1)
            correct_predictions += torch.sum(preds == labels).item()
            num_samples += batch_count

    if num_samples == 0:
        raise ValueError("val_loader yielded no samples; cannot compute validation metrics")

    # Divide by the samples seen rather than len(val_loader.dataset): the two
    # differ when the loader drops or subsamples items.
    val_loss = total_loss / num_samples
    accuracy = correct_predictions / num_samples

    return val_loss, accuracy


# Load the data, keeping only the columns the datasets read.
train_data = pd.read_csv("/kaggle/input/jbnu-sw-ai/train_f.csv").loc[:, ["content", "label"]]
test_data = pd.read_csv("/kaggle/input/jbnu-sw-ai/test_f.csv").loc[:, ["content"]]

# Map raw labels to integer class ids; `le` is reused later to decode predictions.
le = preprocessing.LabelEncoder()
train_data = train_data.assign(label=le.fit_transform(train_data["label"]))

# Preprocessing: tokenizer plus a 70/30 train/validation split.
tokenizer = ElectraTokenizer.from_pretrained("google/electra-base-discriminator")
max_length = 512
train_data, val_data = train_test_split(train_data, test_size=0.3, random_state=42)

train_dataset, val_dataset = (
    CustomTrainset(split, tokenizer, max_length) for split in (train_data, val_data)
)
test_dataset = CustomTestset(test_data, tokenizer, max_length)

# Build the data loaders; only the training loader shuffles.
batch_size = 18
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(dataset, batch_size=batch_size) for dataset in (val_dataset, test_dataset)
)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

In [None]:
# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed torch so the randomly initialised classification head and the shuffled
# batch order start from the same state on every run.
torch.manual_seed(42)

# Initialise the ELECTRA model.
# The head size is taken from the fitted label encoder instead of a hard-coded
# constant, so it always matches the number of classes in the training data.
num_labels = len(le.classes_)
model = ElectraForSequenceClassification.from_pretrained("google/electra-base-discriminator", num_labels=num_labels)

# Optimizer setup.
# transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and
# weight_decay are set explicitly to the values transformers.AdamW used by
# default, so the update rule is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Train the model
epochs = 5
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    train_model(model, train_loader, optimizer, device)
    val_loss, val_accuracy = evaluate_model(model, val_loader, device)
    print(f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

# Save the model weights
torch.save(model.state_dict(), "/kaggle/working/electra_base_model.pth")

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.d

Epoch 1/5


In [None]:
# Predict on the test data
model.to(device)
model.eval()

predictions = []
with torch.no_grad():
    for batch in tqdm(test_loader):
        logits = model(
            batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits

        # Highest-scoring class id per sample.
        preds = logits.argmax(dim=1)
        predictions.extend(preds.cpu().numpy())

# Decode the integer class ids back to the original label values.
preds = le.inverse_transform(predictions)

# Fill the sample submission with the decoded labels and write it out.
submit = pd.read_csv("/kaggle/input/jbnu-sw-ai/sample_submission.csv")
submit["label"] = preds
submit.to_csv("/kaggle/working/electra-base-prompt.csv", index=False)