In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torchvision import transforms
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BeitFeatureExtractor, BeitForImageClassification, AdamW
from torch.nn import CrossEntropyLoss
from tqdm import tqdm

In [2]:
# Load the train, test and sample-submission tables.
# NOTE(review): train_csv, test_csv and submission_csv are not assigned in any
# cell visible here -- confirm they are defined before this cell runs.
train_df, test_df, submission_df = (
    pd.read_csv(csv_path) for csv_path in (train_csv, test_csv, submission_csv)
)

# Label encoding: replace the label values with integer class ids for training.
# The fitted encoder is kept so predictions can be decoded back later.
label_encoder = LabelEncoder()
train_df['label'] = label_encoder.fit_transform(train_df['label'])

# Dataset definition
class CustomDataset(Dataset):
    """Dataset that loads the images listed in a DataFrame and preprocesses them.

    Args:
        dataframe: Table with an ``img_path`` column, plus a ``label`` column
            (read in ``'train'`` mode) or an ``id`` column (read in any other
            mode).
        feature_extractor: Callable invoked as
            ``feature_extractor(images=image, return_tensors="pt")``; its
            result must provide a ``'pixel_values'`` entry.
        mode: ``'train'`` yields ``(pixel_values, label)``; any other value
            is treated as test mode and yields ``(pixel_values, id)``.
    """

    def __init__(self, dataframe, feature_extractor, mode='train'):
        self.dataframe = dataframe
        self.feature_extractor = feature_extractor
        self.mode = mode

    def __len__(self):
        """Return the number of rows (samples) in the backing DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load and preprocess the sample at positional index ``idx``.

        Returns:
            ``(pixel_values, label)`` in ``'train'`` mode, where ``label`` is a
            ``torch.long`` tensor; otherwise ``(pixel_values, id)`` with the
            row's ``id`` value passed through unchanged.
        """
        # Fetch the row once instead of repeating the positional lookup per field.
        row = self.dataframe.iloc[idx]

        # Image.open keeps the underlying file open; convert() returns a new
        # image, so the file can be closed as soon as the conversion is done.
        with Image.open(row['img_path']) as img:
            image = img.convert("RGB")

        inputs = self.feature_extractor(images=image, return_tensors="pt")
        # squeeze(0) drops the leading size-1 dimension (presumably the batch
        # axis the extractor adds for a single image).
        pixel_values = inputs['pixel_values'].squeeze(0)

        if self.mode == 'train':
            return pixel_values, torch.tensor(row['label'], dtype=torch.long)
        # Test rows carry no label; return the sample id so predictions can be
        # matched back to their rows.
        return pixel_values, row['id']

# Feature extractor loaded from the pretrained BEiT checkpoint
feature_extractor = BeitFeatureExtractor.from_pretrained(
    'microsoft/beit-base-patch16-224-pt22k'
)

# Wrap both tables in datasets
train_dataset = CustomDataset(train_df, feature_extractor, mode='train')
test_dataset = CustomDataset(test_df, feature_extractor, mode='test')

# Both loaders use batches of 16.
# Training batches are shuffled for better, more even learning; test batches
# keep their original order so each prediction is easy to trace to its sample.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=16)



In [3]:
# Model setup
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell does not crash on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = BeitForImageClassification.from_pretrained(
    'microsoft/beit-base-patch16-224-pt22k',
    num_labels=len(label_encoder.classes_),  # one class per label seen by the encoder
)
model.to(device)

# Optimizer and loss
# NOTE(review): the AdamW exported by transformers is deprecated upstream in
# favour of torch.optim.AdamW (whose default weight_decay differs) -- consider
# migrating it together with the import line.
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = CrossEntropyLoss()

# Training loop
epochs = 5
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in tqdm(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(pixel_values=inputs)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the mean per-batch loss for the epoch.
    print(f"Epoch {epoch+1}, Loss: {running_loss / len(train_loader)}")

Some weights of BeitForImageClassification were not initialized from the model checkpoint at microsoft/beit-base-patch16-224-pt22k and are newly initialized: ['beit.pooler.layernorm.bias', 'beit.pooler.layernorm.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|████████████████████████████████████████████████████████████████████████████████| 990/990 [06:08<00:00,  2.69it/s]


Epoch 1, Loss: 1.5247640064494177


100%|████████████████████████████████████████████████████████████████████████████████| 990/990 [05:12<00:00,  3.17it/s]


Epoch 2, Loss: 0.43669206224231405


100%|████████████████████████████████████████████████████████████████████████████████| 990/990 [05:03<00:00,  3.26it/s]


Epoch 3, Loss: 0.24004388450353284


100%|████████████████████████████████████████████████████████████████████████████████| 990/990 [05:14<00:00,  3.14it/s]


Epoch 4, Loss: 0.1505419069736481


100%|████████████████████████████████████████████████████████████████████████████████| 990/990 [05:14<00:00,  3.15it/s]

Epoch 5, Loss: 0.10930087733731576





In [4]:
# Evaluation / prediction on the test set
model.eval()

# Send inputs to whichever device the model's parameters live on instead of
# assuming CUDA is present.
device = next(model.parameters()).device

predictions = []  # predicted integer class ids, in loader order
ids = []          # sample ids, collected in the same order as predictions

with torch.no_grad():  # inference only; no gradients needed
    for inputs, id_batch in tqdm(test_loader):
        outputs = model(pixel_values=inputs.to(device))
        # Index of the largest logit per sample is the predicted class id.
        preds = outputs.logits.argmax(dim=1)

        predictions.extend(preds.cpu().numpy())
        ids.extend(id_batch)



100%|████████████████████████████████████████████████████████████████████████████████| 425/425 [00:53<00:00,  8.02it/s]


In [5]:
# Save the results
# inverse_transform maps the predicted integer ids back to the original label values.
# NOTE(review): this assumes submission_df rows are in the same order as the
# (unshuffled) test loader; the `ids` list collected during inference is not
# used to verify that -- confirm.
decoded_labels = label_encoder.inverse_transform(predictions)
submission_df['label'] = decoded_labels
submission_df.to_csv('submission.csv', index=False)

In [4]:
# Evaluate on the training data
# NOTE(review): this is accuracy on the same data the model was trained on, so
# it does not measure generalization to unseen samples.
model.eval()

# Send inputs to whichever device the model's parameters live on instead of
# assuming CUDA is present.
device = next(model.parameters()).device

train_predictions = []
train_labels = []

with torch.no_grad():  # inference only; no gradients needed
    for inputs, labels in tqdm(train_loader):
        outputs = model(pixel_values=inputs.to(device))
        # Index of the largest logit per sample is the predicted class id.
        preds = outputs.logits.argmax(dim=1)

        train_predictions.extend(preds.cpu().numpy())
        # Labels are only compared on the CPU, so they are not moved to the
        # model's device and back.
        train_labels.extend(labels.cpu().numpy())

# Accuracy: fraction of samples whose prediction equals the true label
correct = sum(p == t for p, t in zip(train_predictions, train_labels))
accuracy = correct / len(train_labels)
print(f'Train Accuracy: {accuracy * 100:.2f}%')

100%|████████████████████████████████████████████████████████████████████████████████| 990/990 [02:03<00:00,  8.05it/s]

Train Accuracy: 98.60%





In [5]:
# Print the unrounded training accuracy as a fraction (the evaluation cell
# prints it as a percentage rounded to two decimals).
print(accuracy)

0.9860426929392446
