In [1]:
import torch

In [2]:
# Sanity check: displays whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

In [7]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, random_split
import pandas as pd
from transformers import BertTokenizerFast, BertForSequenceClassification, AdamW, get_scheduler
from tqdm.auto import tqdm
import wandb

In [8]:
# Hyperparameters recorded with the Weights & Biases run.
run_hyperparameters = {
    "model_name": "bert-base-uncased",
    "batch_size": 8,
    "epochs": 3,
    "learning_rate": 2e-5,
    "max_length": 512,
    "num_workers": 4,
}

# Start the run, then keep a handle on the run's config object.
wandb.init(project="imdb_genre_classification", config=run_hyperparameters)
config = wandb.config


[34m[1mwandb[0m: Currently logged in as: [33mjammy9087[0m ([33mjammy9087-pusan-national-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [None]:
df = pd.read_csv("imdb_dataset.csv")

# Combine primaryTitle and description into a single text field.
# Missing values are replaced with "" before casting: astype(str) alone turns
# NaN into the literal string "nan", which would then be tokenized as if it
# were real text.
title_text = df['primaryTitle'].fillna("").astype(str)
description_text = df['description'].fillna("").astype(str)
# strip() removes the dangling separator space when either side is empty.
df['text'] = (title_text + " " + description_text).str.strip()

# Preprocess the genre column: turn a comma-separated string into a list.
def process_genres(genres_str):
    """Split a comma-separated genre string into a list of trimmed names.

    Args:
        genres_str: Comma-separated genre names, or a missing value
            (anything for which ``pd.isna`` is true).

    Returns:
        A list of genre names with surrounding whitespace removed and empty
        entries dropped; an empty list when the input is missing.
    """
    if pd.isna(genres_str):
        return []
    trimmed = (piece.strip() for piece in genres_str.split(','))
    return [name for name in trimmed if name != ""]

df['genre_list'] = df['genre'].apply(process_genres)

# Build the sorted vocabulary of every genre that occurs in the dataset,
# then map each genre to its position in that vocabulary.
all_genres = sorted({name for names in df['genre_list'] for name in names})
num_labels = len(all_genres)
genre2id = {name: position for position, name in enumerate(all_genres)}
print("전체 장르:", all_genres)

# Build a multi-hot encoded label vector for one sample.
def encode_labels(genres):
    """Encode a list of genre names as a multi-hot vector.

    Reads the module-level ``genre2id`` mapping and ``num_labels`` count.

    Args:
        genres: Iterable of genre names; names missing from ``genre2id``
            are ignored.

    Returns:
        A list of ``num_labels`` ints, 1 at the index of every known genre
        in ``genres`` and 0 elsewhere.
    """
    active_positions = {genre2id[name] for name in genres if name in genre2id}
    return [1 if position in active_positions else 0 for position in range(num_labels)]

# Attach the multi-hot label vector to every row.
df['labels'] = df['genre_list'].apply(encode_labels)

# Keep only the columns needed for model training.
model_columns = ['text', 'labels']
df_model = df[model_columns]

In [None]:
# Read the checkpoint name and sequence length from the wandb run config so
# the values used here cannot drift from the ones recorded with the run.
tokenizer = BertTokenizerFast.from_pretrained(config.model_name)
max_length = config.max_length

class IMDBDataset(Dataset):
    """Dataset that tokenizes one row of a text/labels DataFrame per item.

    Args:
        dataframe: DataFrame with a ``text`` column and a ``labels`` column
            (each ``labels`` entry is a multi-hot list).
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors='pt')`` that
            returns a mapping of tensors with a leading batch dimension.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Reset the index so positional lookups are independent of the
        # caller's original index.
        self.dataframe = dataframe.reset_index(drop=True)

    def __len__(self):
        return self.dataframe.shape[0]

    def __getitem__(self, idx):
        sample = self.dataframe.iloc[idx]
        # Multi-label target: multi-hot list converted to a float tensor.
        target = torch.tensor(sample['labels'], dtype=torch.float)
        tokenized = self.tokenizer(
            sample['text'],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # Drop the leading batch dimension added by return_tensors='pt'.
        item = {}
        for name, tensor in tokenized.items():
            item[name] = tensor.squeeze(0)
        item['labels'] = target
        return item

# Instantiate the dataset over the prepared text/labels frame.
dataset = IMDBDataset(dataframe=df_model, tokenizer=tokenizer, max_length=max_length)

In [None]:
# 80% train / 10% validation; the test split takes whatever remains so the
# three sizes always add up to the dataset size.
dataset_size = len(dataset)
train_size = int(0.8 * dataset_size)
val_size = int(0.1 * dataset_size)
test_size = dataset_size - (train_size + val_size)
split_sizes = [train_size, val_size, test_size]

# A seeded generator is passed to random_split so the split is reproducible.
generator = torch.Generator()
generator.manual_seed(42)
train_dataset, val_dataset, test_dataset = random_split(
    dataset, split_sizes, generator=generator
)

In [None]:
# Batch size and worker count come from the wandb run config so the values
# used for loading match the ones recorded with the run. Adjust batch_size
# there according to available GPU memory and desired training speed.
batch_size = config.batch_size
num_workers = config.num_workers

# Only the training split is shuffled; validation and test keep a fixed order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
val_dataloader   = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
test_dataloader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed the global RNG so the freshly initialised classification head and the
# training shuffle order are reproducible (same seed as the dataset split).
torch.manual_seed(42)

# Load BERT with a classification head sized to the genre vocabulary.
# With problem_type="multi_label_classification" the model computes its loss
# with BCEWithLogitsLoss (the sigmoid is applied inside the loss).
model = BertForSequenceClassification.from_pretrained(
    config.model_name,
    num_labels=num_labels,
    problem_type="multi_label_classification"
)
model.to(device)

# torch.optim.AdamW replaces the deprecated transformers.AdamW.
# weight_decay=0.0 is passed explicitly because the PyTorch default (0.01)
# differs from the default of the transformers implementation (0.0).
optimizer = torch.optim.AdamW(
    model.parameters(), lr=config.learning_rate, weight_decay=0.0
)

# Linear decay to zero over all optimizer steps, with no warmup.
num_epochs = config.epochs
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Have wandb track the model's gradients and parameters.
wandb.watch(model, log="all")


In [None]:
# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1} 시작")

    # ---- Training pass ----
    model.train()
    total_train_loss = 0.0
    for batch in train_dataloader:
        # Move every tensor of the batch to the training device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        # The batch contains 'labels', so the model returns the loss itself.
        loss = model(**batch).loss
        loss.backward()

        # Update weights, advance the LR schedule, then clear gradients.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        total_train_loss += loss.item()
        progress_bar.update(1)

    avg_train_loss = total_train_loss / len(train_dataloader)

    # ---- Validation pass (no gradient tracking) ----
    model.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for batch in val_dataloader:
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            total_val_loss += model(**batch).loss.item()

    avg_val_loss = total_val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1} 완료: Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

    # Per-epoch metrics; 'lr' is the current rate of the first param group.
    epoch_metrics = {
        "epoch": epoch+1,
        "train_loss": avg_train_loss,
        "val_loss": avg_val_loss,
        "lr": optimizer.param_groups[0]['lr'],
    }
    wandb.log(epoch_metrics)


In [None]:
# Evaluate on the held-out test split: mean per-batch loss, no gradients.
model.eval()
total_test_loss = 0.0
with torch.no_grad():
    for batch in test_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        total_test_loss += model(**batch).loss.item()

avg_test_loss = total_test_loss / len(test_dataloader)
print(f"\nTest Loss: {avg_test_loss:.4f}")
test_metrics = {"test_loss": avg_test_loss}
wandb.log(test_metrics)

In [None]:
# Single source of truth for the output directory, so the save calls and the
# upload glob cannot drift apart.
output_dir = "bert_imdb_finetuned"

# Persist the fine-tuned model and its tokenizer side by side.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Upload everything in the output directory to the wandb run, then close it.
wandb.save(f"{output_dir}/*")
wandb.finish()