In [1]:
import torch

In [2]:
torch.cuda.is_available()

True

In [3]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, random_split
import pandas as pd
from transformers import BertTokenizerFast, BertForSequenceClassification, AdamW, get_scheduler
from tqdm.auto import tqdm
import wandb


import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, XLNetModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, f1_score
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm
2025-03-02 06:56:26.531545: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-02 06:56:26.549516: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740898586.571544 2850187 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740898586.578122 2850187 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-02 06:56:26.601119: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorF

In [6]:
wandb.init(project="multi-label-classification", name="LLM Finetune")

config = wandb.config
# wandb.finish()

In [7]:
df = pd.read_csv("cleaned_imdb_genre.csv")
df

Unnamed: 0.1,Unnamed: 0,id,title,desc,genre
0,0,tt0000005,Blacksmith Scene,Three men hammer on an anvil and pass a bottle...,Short
1,1,tt0000004,Un bon bock,Lost 1892 French short animated film directed ...,"Animation,Short"
2,2,tt0000002,Le clown et ses chiens,Lost short film consisting of 300 painted imag...,"Animation,Short"
3,3,tt0000003,Poor Pierrot,"One night, Arlequin come to see his lover Colo...","Animation,Comedy,Romance"
4,4,tt0000001,Carmencita,Performing on what looks like a small wooden s...,"Documentary,Short"
...,...,...,...,...,...
207356,390752,tt0407808,Frog and Toad Are Friends,Claymation version of Arnold Lobel's story of ...,"Animation,Comedy,Family"
207357,390753,tt0407810,From Ardoyne to the Áras: Inside the McAleese ...,Documentary on the private and public life of ...,Documentary
207358,390754,tt0407811,Frontstadt,A young filmmaker tries to gain a very persona...,Drama
207359,390755,tt0407815,Possible Changes,"Two friends, Moon-ho and Jong-kyu, in their mi...",Drama


In [8]:
# primaryTitle과 description을 하나의 텍스트로 합치기
df['text'] = df['title'].astype(str) + " " + df['desc'].astype(str)

# genre 컬럼 전처리: 쉼표로 구분된 문자열을 리스트로 변환
def process_genres(genres_str):
    if pd.isna(genres_str):
        return []
    return [g.strip() for g in genres_str.split(',') if g.strip() != ""]

df['genre_list'] = df['genre'].apply(process_genres)


In [9]:
df['genre'].unique()

array(['Short', 'Animation,Short', 'Animation,Comedy,Romance', ...,
       'Comedy,Drama,Reality-TV', 'Mystery,Reality-TV',
       'Documentary,Family,Western'], dtype=object)

In [None]:
# 전체 genre 목록 생성
all_genres = set()
for genres in df['genre_list']:
    for genre in genres:
        all_genres.add(genre)
all_genres = sorted(list(all_genres))
genre2id = {genre: idx for idx, genre in enumerate(all_genres)}
num_labels = len(all_genres)
print("전체 장르:", all_genres)

# 각 샘플에 대해 멀티핫 인코딩된 레이블 생성
def encode_labels(genres):
    label = [0] * num_labels
    for g in genres:
        if g in genre2id:
            label[genre2id[g]] = 1
    return label

df['labels'] = df['genre_list'].apply(encode_labels)



전체 장르: ['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'Game-Show', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance', 'Sci-Fi', 'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western']


In [19]:
all_genres

['Action',
 'Adult',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'Game-Show',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Short',
 'Sport',
 'Talk-Show',
 'Thriller',
 'War',
 'Western']

In [13]:
# 모델 학습에 필요한 열만 선택
df_model = df[['text', 'genre_list', 'labels']]
df_model

Unnamed: 0,text,genre_list,labels
0,Blacksmith Scene Three men hammer on an anvil ...,[Short],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Un bon bock Lost 1892 French short animated fi...,"[Animation, Short]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Le clown et ses chiens Lost short film consist...,"[Animation, Short]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"Poor Pierrot One night, Arlequin come to see h...","[Animation, Comedy, Romance]","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Carmencita Performing on what looks like a sma...,"[Documentary, Short]","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
207356,Frog and Toad Are Friends Claymation version o...,"[Animation, Comedy, Family]","[0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
207357,From Ardoyne to the Áras: Inside the McAleese ...,[Documentary],"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
207358,Frontstadt A young filmmaker tries to gain a v...,[Drama],"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
207359,"Possible Changes Two friends, Moon-ho and Jong...",[Drama],"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."


In [20]:
from transformers import pipeline

# Hugging Face의 LLM 모델 (예: GPT-3, T5 등)
classifier = pipeline("zero-shot-classification", model="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")

text = df_model.iloc[207358]['text']

# LLM을 사용한 Zero-shot 분류
result = classifier(text, all_genres)

# 예측된 장르 출력
print(result)


Downloading shards: 100%|██████████| 2/2 [01:31<00:00, 45.76s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.51s/it]
Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at deepseek-ai/DeepSeek-R1-Distill-Qwen-7B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


OutOfMemoryError: CUDA out of memory. Tried to allocate 50.00 MiB. GPU 0 has a total capacity of 15.73 GiB of which 33.56 MiB is free. Process 2726051 has 158.00 MiB memory in use. Process 2393 has 158.00 MiB memory in use. Process 4912 has 158.00 MiB memory in use. Including non-PyTorch memory, this process has 15.21 GiB memory in use. Of the allocated memory 14.97 GiB is allocated by PyTorch, and 60.50 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [23]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
XLNET_MODEL = "xlnet-base-cased"
BATCH_SIZE = 16
EPOCHS = 5
LEARNING_RATE = 2e-5

tokenizer = AutoTokenizer.from_pretrained(XLNET_MODEL)
xlnet_model = XLNetModel.from_pretrained(XLNET_MODEL).to(DEVICE)

In [29]:
@torch.no_grad()
def get_xlnet_embedding(text):
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=32).to(DEVICE)
    output = xlnet_model(**tokens)
    return output.last_hidden_state[:, -1, :].squeeze()  # XLNet의 마지막 토큰 벡터 사용

# 각 장르를 XLNet으로 임베딩 (고정)
genre_embeddings = {genre: get_xlnet_embedding(f"This is a {genre} movie.") for genre in all_genres}
genre_embeddings_tensor = torch.stack([genre_embeddings[genre] for genre in all_genres]).to(DEVICE)


In [51]:
class MovieDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = list(texts)  # 리스트 변환 (안전성 향상)
        self.labels = list(labels)  # 리스트 변환 (안전성 향상)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], torch.tensor(self.labels[idx], dtype=torch.float)


In [72]:
df_model

Unnamed: 0,text,labels
0,Blacksmith Scene Three men hammer on an anvil ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Un bon bock Lost 1892 French short animated fi...,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Le clown et ses chiens Lost short film consist...,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"Poor Pierrot One night, Arlequin come to see h...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Carmencita Performing on what looks like a sma...,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
207356,Frog and Toad Are Friends Claymation version o...,"[0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
207357,From Ardoyne to the Áras: Inside the McAleese ...,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
207358,Frontstadt A young filmmaker tries to gain a v...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
207359,"Possible Changes Two friends, Moon-ho and Jong...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."


In [52]:
df_model = df_model.reset_index(drop=True)  # 기존 인덱스 제거하고 새로운 인덱스 할당

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_model["text"].tolist(),  # 리스트로 변환
    df_model["labels"].tolist(),  # 리스트로 변환
    test_size=0.2,
    random_state=42
)

# ✅ Dataset 생성
train_dataset = MovieDataset(train_texts, train_labels)
val_dataset = MovieDataset(val_texts, val_labels)

# ✅ DataLoader 생성
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [53]:
class GenreClassifier(nn.Module):
    def __init__(self, xlnet_model, genre_embeddings):
        super(GenreClassifier, self).__init__()
        self.xlnet = xlnet_model
        self.genre_embeddings = genre_embeddings  # 장르 벡터 (고정)
        self.dropout = nn.Dropout(0.1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        output = self.xlnet(input_ids=input_ids, attention_mask=attention_mask)
        text_embedding = output.last_hidden_state[:, -1, :]  # XLNet의 마지막 토큰 벡터 사용
        text_embedding = self.dropout(text_embedding)

        # ✅ Cosine Similarity 기반 예측
        cosine_sim = F.cosine_similarity(text_embedding.unsqueeze(1), self.genre_embeddings.unsqueeze(0), dim=-1)
        return self.sigmoid(cosine_sim)  # Sigmoid로 확률값 변환

In [54]:
model = GenreClassifier(xlnet_model, genre_embeddings_tensor).to(DEVICE)
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
loss_fn = nn.BCELoss()  # Binary Cross-Entropy Loss

In [59]:
def train_model(model, train_dataloader, val_dataloader, epochs):
    for epoch in range(epochs):
        model.train()
        total_loss = 0

        train_loop = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs} Training")

        for texts, labels in train_loop:
            tokens = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=32)
            input_ids = tokens["input_ids"].to(DEVICE)
            attention_mask = tokens["attention_mask"].to(DEVICE)
            labels = labels.to(DEVICE)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            train_loop.set_postfix(loss=loss.item())

        avg_train_loss = total_loss / len(train_dataloader)

        model.eval()
        all_labels = []
        all_probs = []

        val_loop = tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{epochs} Validation")
        with torch.no_grad():
            for texts, labels in val_loop:
                tokens = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=32)
                input_ids = tokens["input_ids"].to(DEVICE)
                attention_mask = tokens["attention_mask"].to(DEVICE)
                labels = labels.to(DEVICE)

                outputs = model(input_ids, attention_mask)
                all_labels.append(labels.cpu().numpy())
                all_probs.append(outputs.cpu().numpy())

        all_labels = np.vstack(all_labels)
        all_probs = np.vstack(all_probs)

        # 최적 Threshold 찾기
        best_thresholds = []
        for i in range(num_labels):
            precision, recall, thresholds = precision_recall_curve(all_labels[:, i], all_probs[:, i])
            f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
            best_thresholds.append(thresholds[f1_scores.argmax()])

        # 최적 Threshold 적용
        preds = (all_probs > np.array(best_thresholds)).astype(int)

        f1 = f1_score(all_labels, preds, average='macro')

        print(f"Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, F1-score = {f1:.4f}")

        wandb.log({
            "epoch": epoch+1,
            "train_loss": avg_train_loss,
            "f1_score": f1
        })

    wandb.finish()

In [71]:
# 데이터셋이 정상적으로 로드되는지 확인
for batch in train_dataloader:
    print(batch[1][0])
    break  # 한 개의 배치만 출력하고 종료


tensor([0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 0.])


In [61]:
wandb.watch(model, log="all")
train_model(model, train_dataloader, val_dataloader, EPOCHS)

Epoch 1/5 Training:   0%|          | 0/10368 [00:00<?, ?it/s]

Epoch 1/5 Training: 100%|██████████| 10368/10368 [17:29<00:00,  9.88it/s, loss=0.426]
Epoch 1/5 Validation: 100%|██████████| 2593/2593 [00:55<00:00, 46.55it/s]


Epoch 1: Train Loss = 0.4241, F1-score = 0.1504


Epoch 2/5 Training: 100%|██████████| 10368/10368 [17:31<00:00,  9.86it/s, loss=0.413]
Epoch 2/5 Validation: 100%|██████████| 2593/2593 [00:55<00:00, 46.70it/s]


Epoch 2: Train Loss = 0.4220, F1-score = 0.2054


Epoch 3/5 Training: 100%|██████████| 10368/10368 [17:29<00:00,  9.88it/s, loss=0.436]
Epoch 3/5 Validation: 100%|██████████| 2593/2593 [00:55<00:00, 46.62it/s]


Epoch 3: Train Loss = 0.4218, F1-score = 0.2262


Epoch 4/5 Training: 100%|██████████| 10368/10368 [17:36<00:00,  9.82it/s, loss=0.425]
Epoch 4/5 Validation: 100%|██████████| 2593/2593 [00:55<00:00, 46.67it/s]


Epoch 4: Train Loss = 0.4216, F1-score = 0.2206


Epoch 5/5 Training: 100%|██████████| 10368/10368 [17:31<00:00,  9.86it/s, loss=0.417]
Epoch 5/5 Validation: 100%|██████████| 2593/2593 [00:55<00:00, 46.64it/s]


Epoch 5: Train Loss = 0.4215, F1-score = 0.2264


0,1
epoch,▁▃▅▆█
f1_score,▁▆█▇█
train_loss,█▂▂▁▁

0,1
epoch,5.0
f1_score,0.22636
train_loss,0.4215


In [None]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
max_length = 512

class IMDBDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.dataframe = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        text = row['text']
        # label은 멀티레이블 멀티핫 인코딩 (리스트 형태)
        label = torch.tensor(row['labels'], dtype=torch.float)
        # 토큰화 (출력은 dict로, input_ids, attention_mask 등이 포함)
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        # 토크나이저 결과의 차원 제거 (batch dimension 제거)
        encoding = {key: val.squeeze(0) for key, val in encoding.items()}
        encoding['labels'] = label
        return encoding

# Dataset 객체 생성
dataset = IMDBDataset(df_model, tokenizer, max_length=max_length)

In [None]:
dataset_size = len(dataset)
train_size = int(0.8 * dataset_size)
val_size = int(0.1 * dataset_size)
test_size = dataset_size - train_size - val_size

# random_split은 내부적으로 torch.Generator()를 사용해 seed 지정 가능 (재현성 위해)
generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = random_split(
    dataset, [train_size, val_size, test_size], generator=generator
)

In [None]:
batch_size = 8  # 사용 가능한 GPU 메모리 및 학습 속도에 따라 조정

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
val_dataloader   = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)
test_dataloader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# BERT 모델 로드; 문제 유형을 multi_label_classification으로 설정하면,
# 내부적으로 Sigmoid 활성화와 BCEWithLogitsLoss가 사용됩니다.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
    problem_type="multi_label_classification"
)
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

wandb.watch(model, log="all")


In [None]:
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1} 시작")
    
    # Training
    model.train()
    total_train_loss = 0.0
    for batch in train_dataloader:
        batch = {key: value.to(device) for key, value in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        
        total_train_loss += loss.item()
        progress_bar.update(1)
    
    avg_train_loss = total_train_loss / len(train_dataloader)
    
    # Validation
    model.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for batch in val_dataloader:
            batch = {key: value.to(device) for key, value in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_val_loss += loss.item()
    
    avg_val_loss = total_val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1} 완료: Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")
    
    wandb.log({
        "epoch": epoch+1,
        "train_loss": avg_train_loss,
        "val_loss": avg_val_loss,
        "lr": optimizer.param_groups[0]['lr'],
    })


In [None]:
model.eval()
total_test_loss = 0.0
with torch.no_grad():
    for batch in test_dataloader:
        batch = {key: value.to(device) for key, value in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_test_loss += loss.item()

avg_test_loss = total_test_loss / len(test_dataloader)
print(f"\nTest Loss: {avg_test_loss:.4f}")
wandb.log({"test_loss": avg_test_loss})

In [None]:
model.save_pretrained("bert_imdb_finetuned")
tokenizer.save_pretrained("bert_imdb_finetuned")
wandb.save("bert_imdb_finetuned/*")
wandb.finish()