In [119]:
import random
import pandas as pd
import numpy as np
import os
import cv2

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer 
from sklearn.feature_extraction import DictVectorizer
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.optim as optim 
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, accuracy_score
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Training hyperparameters shared by the cells below.
BATCH_SIZE = 32  # batch size passed to both DataLoaders
EPOCHS = 10  # number of training epochs run by train()
LEARNING_RATE = 3e-4  # initial learning rate given to the Adam optimizer
SCHEDULER = 'Lambda'  # only used as a label inside the checkpoint directory name

In [3]:
# Load the pre-processed train/validation splits.
# Later cells read the 'cat3' column (label) and the 'overview' column (text) from both frames.
train_df = pd.read_csv('./processed_train_samples.csv')
val_df = pd.read_csv("./processed_val_samples.csv")

In [4]:
# Encode the 'cat3' labels as integer class ids.
# The encoder is fitted on the training labels only; the validation labels are
# mapped with the same fitted encoder. Both columns are overwritten in place.
le = preprocessing.LabelEncoder()
train_df['cat3'] = le.fit_transform(train_df['cat3'].values)
val_df['cat3'] = le.transform(val_df['cat3'].values)

In [114]:
# Text featurizer: TF-IDF capped at max_features=4096 terms.
# CustomModel's first Linear layer is declared with 4096 inputs, so the two must stay in sync.
# The commented-out lines are alternative vectorizers kept for reference.
# count_vectorizer = CountVectorizer(max_features=4096)
tfid_vectorizer = TfidfVectorizer(max_features=4096)
# hash_vectorizer = HashingVectorizer(n_features=4096)

In [115]:
# Fit the TF-IDF vocabulary/weights on the training text and vectorize it in one step.
# NOTE(review): assumes 'overview' has no missing values — confirm against the preprocessing step.
tfid_train_vectors = tfid_vectorizer.fit_transform(train_df['overview'])

In [18]:
# Vectorize the validation text with the vectorizer fitted on the training split (transform only, no refit).
tfid_val_vectors = tfid_vectorizer.transform(val_df['overview'])

In [117]:
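# Debug check (disabled): print the row index of every training sample whose
# TF-IDF vector sums to zero, i.e. none of its terms are in the fitted vocabulary.
# for n, i in tqdm(enumerate(tfid_train_vectors.todense())):
#     if np.sum(i) == 0.0:
#         print(n)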

13588it [00:00, 15723.30it/s]

13076





In [None]:
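# Inspect the TF-IDF matrix as a DataFrame (get_feature_names() was removed from scikit-learn; use get_feature_names_out()).
# df = pd.DataFrame(tfid_train_vectors.toarray(), columns=tfid_vectorizer.get_feature_names_out())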

In [None]:
# tfid_vectorizer.vocabulary_
# tfid_vectorizer.get_feature_names_out()

In [118]:
class CustomDataset(Dataset):
    """Dataset that serves one text feature vector (and optionally its label) per index.

    Args:
        text_vectors: 2-D collection of feature rows. May be a scipy sparse
            matrix (as returned by ``TfidfVectorizer``), a NumPy array, or any
            row-indexable sequence of numeric rows.
        label_list: Per-row labels, indexable by the same integer index.
            Not read when ``infer`` is True.
        infer: When True, ``__getitem__`` returns only the feature vector
            (inference on unlabeled data). When False, it returns
            ``(feature_vector, label)`` for training/validation.
    """

    def __init__(self, text_vectors, label_list, infer=False):
        self.text_vectors = text_vectors
        self.label_list = label_list
        self.infer = infer

    def __getitem__(self, index):
        """Return the flattened float32 feature vector at ``index`` (plus its label unless ``infer``)."""
        row = self.text_vectors[index]
        # scipy sparse rows cannot be handed to torch directly; densify them first.
        if hasattr(row, "toarray"):
            row = row.toarray()
        # torch.tensor always copies, so the returned tensor never aliases the source matrix.
        text_vector = torch.tensor(np.asarray(row), dtype=torch.float32).reshape(-1)

        if self.infer:
            # Inference mode: no label available, return features only.
            return text_vector
        # Training/validation mode: pair the features with the stored label.
        label = self.label_list[index]
        return text_vector, label

    def __len__(self):
        """Return the number of rows in ``text_vectors``."""
        # len() raises TypeError on scipy sparse matrices ("length is ambiguous"),
        # so prefer the first dimension of .shape whenever the container exposes one.
        shape = getattr(self.text_vectors, "shape", None)
        if shape is not None:
            return shape[0]
        return len(self.text_vectors)

In [None]:
# Wrap the TF-IDF matrices and the encoded 'cat3' labels in Datasets/DataLoaders.
# Only the training loader shuffles; the validation loader keeps the original row order.
train_dataset = CustomDataset(tfid_train_vectors, train_df['cat3'].values)
train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle=True, num_workers=0) # NOTE(review): original note was "6" — presumably an earlier num_workers value; confirm

val_dataset = CustomDataset(tfid_val_vectors, val_df['cat3'].values)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0) # NOTE(review): original note was "6" — presumably an earlier num_workers value; confirm

In [None]:
class CustomModel(nn.Module):
    """Feed-forward classifier over a fixed-width text feature vector.

    Architecture: Linear(input_dim, 2048) -> ReLU -> Linear(2048, 1024)
    followed by a single Linear(1024, num_classes) classification head.

    Args:
        num_classes: Number of output classes. When None (default), it is
            taken from the notebook-level label encoder as ``len(le.classes_)``
            at construction time.
        input_dim: Width of the input feature vector (default 4096, matching
            the ``max_features`` used for the TF-IDF vectorizer).
    """

    def __init__(self, num_classes=None, input_dim=4096):
        super(CustomModel, self).__init__()
        if num_classes is None:
            # Resolved at construction rather than at class-definition time so a
            # re-fitted encoder is picked up without re-running this cell.
            num_classes = len(le.classes_)
        # Text feature extractor: input_dim -> 2048 -> 1024.
        self.count_extract = nn.Sequential(
            nn.Linear(input_dim, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
        )
        # Classifier head: 1024 -> num_classes logits.
        self.classifier = nn.Sequential(
            nn.Linear(1024, num_classes))

    def forward(self, text1, text2=None, text3=None):
        """Return class logits for ``text1``.

        Args:
            text1: Batch of feature vectors whose last dimension is ``input_dim``.
            text2: Unused; accepted only for backward compatibility.
            text3: Unused; accepted only for backward compatibility.

        Returns:
            Tensor of logits whose last dimension is ``num_classes``.
        """
        # text2/text3 were required but never read, which made the single-argument
        # call model(x) used by train()/validation() raise TypeError.
        text_feature1 = self.count_extract(text1)
        output = self.classifier(text_feature1)
        return output

In [None]:
def score_function(real, pred):
    """Return the weighted-average F1 score of predictions ``pred`` against ground truth ``real``."""
    weighted_f1 = f1_score(real, pred, average="weighted")
    return weighted_f1

def validation(model, criterion, val_loader, device):
    """Evaluate ``model`` on ``val_loader`` without tracking gradients.

    Args:
        model: Module called as ``model(features)`` and expected to return per-class scores.
        criterion: Loss callable taking ``(predictions, labels)``.
        val_loader: Iterable yielding ``(features, label)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple ``(mean_batch_loss, weighted_f1)`` computed over the whole loader.
    """
    # Switch layers that behave differently at train/eval time into eval mode.
    model.eval()

    predictions, targets, batch_losses = [], [], []

    with torch.no_grad():
        for features, target in tqdm(iter(val_loader)):
            features = features.to(device)
            # CrossEntropyLoss-style criteria need integer (long) class indices.
            target = target.type(torch.LongTensor).to(device)

            logits = model(features)
            batch_losses.append(criterion(logits, target).item())

            # Collect predicted class ids and true labels as plain Python lists.
            predictions.extend(logits.argmax(1).detach().cpu().numpy().tolist())
            targets.extend(target.detach().cpu().numpy().tolist())

    weighted_f1 = score_function(targets, predictions)
    return np.mean(batch_losses), weighted_f1

In [None]:
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    """Train ``model`` for ``EPOCHS`` epochs and return it with its best weights loaded.

    After every epoch the model is validated; whenever the validation score
    improves, a checkpoint (epoch, model state, optimizer state) is written to a
    run-specific directory under ``./model_weights/`` and the weights are
    snapshotted in memory.

    Args:
        model: Module to train; moved to ``device`` in place.
        optimizer: Optimizer already bound to ``model``'s parameters.
        train_loader: Iterable yielding ``(features, label)`` training batches.
        val_loader: Iterable yielding ``(features, label)`` validation batches.
        scheduler: Epoch-level LR scheduler stepped once per epoch, or None.
        device: Device used for training and validation.

    Returns:
        ``model`` with the weights of the best-scoring epoch restored, or None
        if no epoch produced a validation score above 0.
    """
    model.to(device)

    # Multi-class classification loss.
    criterion = nn.CrossEntropyLoss().to(device)
    best_score = 0
    best_model = None
    best_state = None
    dir_name = os.path.join("./model_weights/", f"ONE_TextVector_4096_scheduler_{SCHEDULER}_batch_{BATCH_SIZE}_lr_{LEARNING_RATE}_{random.randrange(10000,99000)}")
    # makedirs instead of mkdir: the parent ./model_weights/ directory may not exist yet.
    os.makedirs(dir_name, exist_ok=True)
    for epoch in range(1, EPOCHS+1):
        model.train()
        train_loss = []
        # Pass the loader itself (not iter(loader)) so tqdm can show the total batch count.
        for text, label in tqdm(train_loader):
            tfid = text.to(device)
            # The loss expects integer (long) class indices.
            label = label.type(torch.LongTensor)
            label = label.to(device)

            # Clear gradients left over from the previous step.
            optimizer.zero_grad()

            model_pred = model(tfid)
            loss = criterion(model_pred, label)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        # Mean training loss over this epoch's batches.
        tr_loss = np.mean(train_loss)

        val_loss, val_score = validation(model, criterion, val_loader, device)
        if scheduler is not None:
            # Report the LR that was actually used during this epoch, then advance
            # the schedule exactly once (it used to be stepped twice per epoch).
            current_lr = scheduler.get_last_lr()[0]
            print(f'Epoch [{epoch}], LR: [{current_lr}]Train Loss : [{tr_loss:.5f}] Val Loss : [{val_loss:.5f}] Val Score : [{val_score:.5f}]')
            scheduler.step()
        else:
            print(f'Epoch [{epoch}], Train Loss : [{tr_loss:.5f}] Val Loss : [{val_loss:.5f}] Val Score : [{val_score:.5f}]')

        if best_score < val_score:
            best_score = val_score
            best_model = model
            # Snapshot the weights: `best_model` is the same object as `model`,
            # so later epochs would otherwise overwrite the best parameters.
            best_state = {name: tensor.detach().clone()
                          for name, tensor in model.state_dict().items()}
            torch.save({'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            },
            os.path.join(dir_name, f"ONE_TextVector_epoch_{epoch}_val_acc_{best_score}.pth"))

    if best_model is not None:
        # Restore the best-scoring weights before handing the model back.
        best_model.load_state_dict(best_state)
    return best_model

In [None]:

# Build the model, optimizer and LR schedule, then launch training.
# (No model.eval() here: train() switches the model to train mode every epoch.)
model = CustomModel()
optimizer = torch.optim.Adam(params = model.parameters(), lr = LEARNING_RATE)
# Exponential decay: the base LR is multiplied by 0.95 ** epoch.
# The deprecated `verbose` argument is intentionally not passed.
scheduler = optim.lr_scheduler.LambdaLR(optimizer=optimizer,
                                        lr_lambda=lambda epoch: 0.95 ** epoch,
                                        last_epoch=-1)

# Use the GPU when one is available, otherwise fall back to CPU instead of crashing.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
infer_model = train(model, optimizer, train_loader, val_loader, scheduler, device)