In [1]:
!nvidia-smi

Tue Nov 22 08:28:34 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.27.04    Driver Version: 460.27.04    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 208...  Off  | 00000000:3B:00.0 Off |                  N/A |
| 30%   27C    P8     4W / 250W |   5950MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:5E:00.0 Off |                  N/A |
| 30%   24C    P8     1W / 250W |   7480MiB / 11019MiB |      0%      Default |
|       

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import os
import pandas as pd
import numpy as np
# from tqdm.auto import tqdm
from tqdm import tqdm
import random

import warnings
warnings.filterwarnings(action='ignore') 

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
print(torch.cuda.device_count())

cuda
1


In [4]:
CONFIG = {
    'EPOCHS': 100,
    'LEARNING_RATE':1e-3,
    'BATCH_SIZE':512,
    'SEED':101
}

In [5]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CONFIG['SEED'])

In [6]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Preprocessing

In [7]:
categorical_features = ['COMPONENT_ARBITRARY', 'YEAR']
# Inference(실제 진단 환경)에 사용하는 컬럼
test_stage_features = ['COMPONENT_ARBITRARY', 'ANONYMOUS_1', 'YEAR' , 'ANONYMOUS_2', 'AG', 'CO', 'CR', 'CU', 'FE', 'H2O', 'MN', 'MO', 'NI', 'PQINDEX', 'TI', 'V', 'V40', 'ZN']

In [8]:
train = train.fillna(0)
test = test.fillna(0)

In [9]:
all_X = train.drop(['ID', 'Y_LABEL'], axis = 1)
all_y = train['Y_LABEL']

test = test.drop(['ID'], axis = 1)

train_X, val_X, train_y, val_y = train_test_split(all_X, all_y, test_size=0.2, random_state=CONFIG['SEED']) #, stratify=all_y)

In [10]:
# ---------------------------
# Handling Numerical
# ---------------------------
def get_values(value):
    return value.values.reshape(-1, 1)

for col in train_X.columns:
    if col not in categorical_features:
        scaler = StandardScaler()
        train_X[col] = scaler.fit_transform(get_values(train_X[col]))
        val_X[col] = scaler.transform(get_values(val_X[col]))
        if col in test.columns:
            test[col] = scaler.transform(get_values(test[col]))
            
            
# ---------------------------
# Handling Categorical
# ---------------------------
le = LabelEncoder()
for col in categorical_features:    
    train_X[col] = le.fit_transform(train_X[col])
    val_X[col] = le.transform(val_X[col])
    if col in test.columns:
        test[col] = le.transform(test[col])

# Large Model (teacher)

In [11]:
# ---------------------------
# Custom Dataset
# ---------------------------
class CustomDataset(Dataset):
    def __init__(self, data_X, data_y, distillation=False):
        super(CustomDataset, self).__init__()
        self.data_X = data_X
        self.data_y = data_y
        self.distillation = distillation
        
    def __len__(self):
        return len(self.data_X)
    
    def __getitem__(self, index):
        if self.distillation:
            #------------------------------------
            # When Learing with distillation
            #------------------------------------
            teacher_X = torch.Tensor(self.data_X.iloc[index])
            student_X = torch.Tensor(self.data_X[test_stage_features].iloc[index])
            label_y   = torch.Tensor([self.data_y.iloc[index]])
            return teacher_X.to(device), \
                   student_X.to(device), \
                   label_y.to(device)
        else:
            #------------------------------------
            # When Learing with Normal Data
            #------------------------------------
            if self.data_y is None: # for submission dataset
                test_X = torch.Tensor(self.data_X.iloc[index])
                return test_X.to(device)
            
            else:
                teacher_X = torch.Tensor(self.data_X.iloc[index])
                label_y   = torch.Tensor([self.data_y.iloc[index]])
                return teacher_X.to(device), \
                       label_y.to(device)

In [12]:
# ---------------------------
# Dataset for Teacher Model
# ---------------------------
train_dataset = CustomDataset(train_X, train_y, False)
val_dataset = CustomDataset(val_X, val_y, False)

# ---------------------------
# Dataloader for Teacher Model
# ---------------------------
train_loader = DataLoader(train_dataset, batch_size = CONFIG['BATCH_SIZE'], shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size = CONFIG['BATCH_SIZE'], shuffle=False)

In [13]:
val_dataset[0]

(tensor([ 0.0000e+00,  2.7493e-04,  8.0000e+00, -3.9161e-01, -3.4112e-01,
         -1.4855e-01, -1.0477e-01,  2.4425e+00, -3.0059e-01, -3.7119e-02,
          8.3046e-01, -6.7052e-02, -1.2607e-01, -1.1260e-01, -2.6032e-01,
          4.9347e-01,  1.1905e+00, -3.6082e-01,  7.6335e-01,  1.1156e+00,
          1.3627e+00, -3.6135e-01, -1.0803e-01, -3.9834e-02, -7.1962e-02,
         -1.0763e-01, -1.9431e-01, -3.3823e-01, -4.0253e-01, -4.1215e-04,
         -1.9460e-01,  8.8570e-02, -1.6250e-01, -2.6765e-01, -8.3927e-01,
         -1.9846e-01, -1.6062e-01, -2.4240e-01,  5.1207e-01, -1.0670e-01,
         -7.8102e-02, -1.0621e-01, -7.1571e-02, -1.1962e-01, -1.2277e-01,
         -1.3725e-01, -1.5107e-01, -1.9664e-01, -1.0860e-01,  1.6066e+00,
         -1.4155e-01,  9.7597e-01], device='cuda:0'),
 tensor([1.], device='cuda:0'))

In [14]:
# ---------------------------
# Teacher Model
# ---------------------------
class Teacher(nn.Module):
    def __init__(self):
        super(Teacher, self).__init__()
        self.classifier = nn.Sequential(
            nn.Linear(in_features=52, out_features=256),
            nn.BatchNorm1d(256),
            nn.GELU(),
            nn.Linear(in_features=256, out_features=512),
            nn.BatchNorm1d(512),
            nn.GELU(),
            nn.Linear(in_features=512, out_features=1024),
            nn.BatchNorm1d(1024),
            nn.GELU(),
            nn.Linear(in_features=1024, out_features=1024),
            nn.BatchNorm1d(1024),
            nn.GELU(),
            nn.Linear(in_features=1024, out_features=512),
            nn.BatchNorm1d(512),
            nn.GELU(),
            nn.Linear(in_features=512, out_features=256),
            nn.BatchNorm1d(256),
            nn.GELU(),
            nn.Linear(in_features=256, out_features=1),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        output = self.classifier(x)
        return output

In [15]:
#--------------------------------------------------------------------------------
# Metrics
#--------------------------------------------------------------------------------
def competition_metric(true, pred):
    return f1_score(true, pred, average="macro")


#--------------------------------------------------------------------------------
# Training Teacher
#--------------------------------------------------------------------------------
def teacher_train(model, optimizer, train_loader, val_loader, scheduler):
    model.to(device)

    best_score = 0
    best_model = None
    criterion = nn.BCELoss().to(device)

    for epoch in range(CONFIG["EPOCHS"]):
        train_loss = []
        
        # ACTIVATE TRAINING MODE
        # --- normalisation layers1 use per-batch statistics
        # --- activates Dropout layers2
        model.train()
    
        for i,(X, y) in tqdm(enumerate(train_loader)):
            
            # ZERO GRADIENT
            optimizer.zero_grad()
            
            # FORWARD
            y_pred = model(X)
            loss = criterion(y_pred, y)
            
            # BACKWARD
            loss.backward()
            
            # UPDATE
            optimizer.step()

            train_loss.append(loss.item())

        val_loss, val_score = validation_teacher(model, val_loader, criterion)
        print(f'Epoch [{epoch}], Train Loss : [{np.mean(train_loss) :.5f}] Val Loss : [{np.mean(val_loss) :.5f}] Val F1 Score : [{val_score:.5f}]')

        if scheduler is not None:
            scheduler.step(val_score)
            
        if best_score < val_score:
            best_model = model
            best_score = val_score
        
    return best_model 


#--------------------------------------------------------------------------------
# Validation Teacher
#--------------------------------------------------------------------------------
def validation_teacher(model, val_loader, criterion):
    # ACTIVATE EVALUATION MODE
    # --- normalisation layers use running statistics
    # --- de-activates Dropout layers
    model.eval()

    val_loss = []
    pred_labels = []
    true_labels = []
    threshold = 0.4
    
    with torch.no_grad():
        for i,(X, y) in tqdm(enumerate(val_loader)):
            
            model_pred = model(X)
            
            loss = criterion(model_pred,y)
            val_loss.append(loss.item())
            
            model_pred = model_pred.squeeze(1)#.to('cpu')  
            pred_labels += model_pred.tolist()
            true_labels += y.tolist()
        
        pred_labels = np.where(np.array(pred_labels) > threshold, 1, 0)
        val_f1 = competition_metric(true_labels, pred_labels)
    return val_loss, val_f1

In [16]:
#--------------------------------------------------------------------------------
# Training Process
#--------------------------------------------------------------------------------
model = Teacher()
optimizer = torch.optim.Adam(model.parameters(), lr=CONFIG['LEARNING_RATE'])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=5, threshold_mode='abs',min_lr=1e-6, verbose=True)

teacher_model = teacher_train(model, optimizer, train_loader, val_loader, scheduler)

23it [00:03,  6.37it/s]
6it [00:00,  7.42it/s]


Epoch [0], Train Loss : [0.35824] Val Loss : [0.21550] Val F1 Score : [0.73916]


23it [00:03,  6.76it/s]
6it [00:00,  7.41it/s]


Epoch [1], Train Loss : [0.22545] Val Loss : [0.21751] Val F1 Score : [0.75310]


23it [00:03,  6.76it/s]
6it [00:00,  7.43it/s]


Epoch [2], Train Loss : [0.21008] Val Loss : [0.19335] Val F1 Score : [0.77013]


23it [00:03,  6.79it/s]
6it [00:00,  7.30it/s]


Epoch [3], Train Loss : [0.23614] Val Loss : [0.18490] Val F1 Score : [0.77995]


23it [00:03,  6.77it/s]
6it [00:00,  7.39it/s]


Epoch [4], Train Loss : [0.19685] Val Loss : [0.17712] Val F1 Score : [0.79189]


23it [00:03,  6.79it/s]
6it [00:00,  7.34it/s]


Epoch [5], Train Loss : [0.18623] Val Loss : [0.17154] Val F1 Score : [0.79823]


23it [00:03,  6.79it/s]
6it [00:00,  7.27it/s]


Epoch [6], Train Loss : [0.18636] Val Loss : [0.17272] Val F1 Score : [0.79468]


23it [00:03,  6.60it/s]
6it [00:00,  7.34it/s]


Epoch [7], Train Loss : [0.18700] Val Loss : [0.16798] Val F1 Score : [0.81109]


23it [00:03,  6.77it/s]
6it [00:00,  7.30it/s]


Epoch [8], Train Loss : [0.17807] Val Loss : [0.17103] Val F1 Score : [0.81306]


23it [00:03,  6.73it/s]
6it [00:00,  7.35it/s]


Epoch [9], Train Loss : [0.18309] Val Loss : [0.16718] Val F1 Score : [0.80911]


23it [00:03,  6.82it/s]
6it [00:00,  7.54it/s]


Epoch [10], Train Loss : [0.18637] Val Loss : [0.16638] Val F1 Score : [0.81586]


23it [00:03,  6.86it/s]
6it [00:00,  7.46it/s]


Epoch [11], Train Loss : [0.18906] Val Loss : [0.16736] Val F1 Score : [0.80812]


23it [00:03,  6.83it/s]
6it [00:00,  7.32it/s]


Epoch [12], Train Loss : [0.16518] Val Loss : [0.17097] Val F1 Score : [0.80717]


23it [00:03,  6.75it/s]
6it [00:00,  7.41it/s]


Epoch [13], Train Loss : [0.17267] Val Loss : [0.16507] Val F1 Score : [0.81209]


23it [00:03,  6.58it/s]
6it [00:00,  7.37it/s]


Epoch [14], Train Loss : [0.17428] Val Loss : [0.16655] Val F1 Score : [0.81386]


23it [00:03,  6.79it/s]
6it [00:00,  7.45it/s]


Epoch [15], Train Loss : [0.16131] Val Loss : [0.16267] Val F1 Score : [0.81209]


23it [00:03,  6.79it/s]
6it [00:00,  7.45it/s]


Epoch [16], Train Loss : [0.17180] Val Loss : [0.17150] Val F1 Score : [0.80029]
Epoch    17: reducing learning rate of group 0 to 5.0000e-04.


23it [00:03,  6.83it/s]
6it [00:00,  7.37it/s]


Epoch [17], Train Loss : [0.15907] Val Loss : [0.16491] Val F1 Score : [0.81410]


23it [00:03,  6.82it/s]
6it [00:00,  7.52it/s]


Epoch [18], Train Loss : [0.14956] Val Loss : [0.16141] Val F1 Score : [0.81685]


23it [00:03,  6.80it/s]
6it [00:00,  7.44it/s]


Epoch [19], Train Loss : [0.14808] Val Loss : [0.15904] Val F1 Score : [0.81685]


23it [00:03,  6.78it/s]
6it [00:00,  7.39it/s]


Epoch [20], Train Loss : [0.14201] Val Loss : [0.15917] Val F1 Score : [0.82156]


23it [00:03,  6.62it/s]
6it [00:00,  7.35it/s]


Epoch [21], Train Loss : [0.14210] Val Loss : [0.16135] Val F1 Score : [0.81285]


23it [00:03,  6.79it/s]
6it [00:00,  7.34it/s]


Epoch [22], Train Loss : [0.14390] Val Loss : [0.15677] Val F1 Score : [0.82085]


23it [00:03,  6.75it/s]
6it [00:00,  7.45it/s]


Epoch [23], Train Loss : [0.13677] Val Loss : [0.16336] Val F1 Score : [0.81770]


23it [00:03,  6.75it/s]
6it [00:00,  7.37it/s]


Epoch [24], Train Loss : [0.13909] Val Loss : [0.15983] Val F1 Score : [0.82245]


23it [00:03,  6.76it/s]
6it [00:00,  7.47it/s]


Epoch [25], Train Loss : [0.13280] Val Loss : [0.16343] Val F1 Score : [0.81967]


23it [00:03,  6.74it/s]
6it [00:00,  7.48it/s]


Epoch [26], Train Loss : [0.13136] Val Loss : [0.16026] Val F1 Score : [0.82518]


23it [00:03,  6.76it/s]
6it [00:00,  7.48it/s]


Epoch [27], Train Loss : [0.15668] Val Loss : [0.17049] Val F1 Score : [0.81544]


23it [00:03,  6.67it/s]
6it [00:00,  7.45it/s]


Epoch [28], Train Loss : [0.14121] Val Loss : [0.16646] Val F1 Score : [0.82047]


23it [00:03,  6.85it/s]
6it [00:00,  7.36it/s]


Epoch [29], Train Loss : [0.12938] Val Loss : [0.15757] Val F1 Score : [0.82234]


23it [00:03,  6.87it/s]
6it [00:00,  7.39it/s]


Epoch [30], Train Loss : [0.12448] Val Loss : [0.16092] Val F1 Score : [0.83205]


23it [00:03,  6.88it/s]
6it [00:00,  7.51it/s]


Epoch [31], Train Loss : [0.12249] Val Loss : [0.16435] Val F1 Score : [0.82201]


23it [00:03,  6.79it/s]
6it [00:00,  7.55it/s]


Epoch [32], Train Loss : [0.11967] Val Loss : [0.16435] Val F1 Score : [0.81860]


23it [00:03,  6.80it/s]
6it [00:00,  7.45it/s]


Epoch [33], Train Loss : [0.13349] Val Loss : [0.18046] Val F1 Score : [0.81652]


23it [00:03,  6.76it/s]
6it [00:00,  7.37it/s]


Epoch [34], Train Loss : [0.14954] Val Loss : [0.16802] Val F1 Score : [0.81941]


23it [00:03,  6.64it/s]
6it [00:00,  7.48it/s]


Epoch [35], Train Loss : [0.12913] Val Loss : [0.16617] Val F1 Score : [0.81794]


23it [00:03,  6.70it/s]
6it [00:00,  7.47it/s]


Epoch [36], Train Loss : [0.12664] Val Loss : [0.17854] Val F1 Score : [0.80363]
Epoch    37: reducing learning rate of group 0 to 2.5000e-04.


23it [00:03,  6.75it/s]
6it [00:00,  7.37it/s]


Epoch [37], Train Loss : [0.11804] Val Loss : [0.16693] Val F1 Score : [0.81869]


23it [00:03,  6.77it/s]
6it [00:00,  7.16it/s]


Epoch [38], Train Loss : [0.11892] Val Loss : [0.16545] Val F1 Score : [0.82646]


23it [00:03,  6.72it/s]
6it [00:00,  7.34it/s]


Epoch [39], Train Loss : [0.13470] Val Loss : [0.17097] Val F1 Score : [0.82536]


23it [00:03,  6.72it/s]
6it [00:00,  7.45it/s]


Epoch [40], Train Loss : [0.13203] Val Loss : [0.17147] Val F1 Score : [0.82358]


23it [00:03,  6.81it/s]
6it [00:00,  7.36it/s]


Epoch [41], Train Loss : [0.12253] Val Loss : [0.16785] Val F1 Score : [0.83251]


23it [00:03,  6.52it/s]
6it [00:00,  7.17it/s]


Epoch [42], Train Loss : [0.12455] Val Loss : [0.16961] Val F1 Score : [0.83215]


23it [00:03,  6.64it/s]
6it [00:00,  7.36it/s]


Epoch [43], Train Loss : [0.12288] Val Loss : [0.16732] Val F1 Score : [0.82824]


23it [00:03,  6.63it/s]
6it [00:00,  7.43it/s]


Epoch [44], Train Loss : [0.11413] Val Loss : [0.16948] Val F1 Score : [0.83134]


23it [00:03,  6.78it/s]
6it [00:00,  7.42it/s]


Epoch [45], Train Loss : [0.11117] Val Loss : [0.16843] Val F1 Score : [0.82728]


23it [00:03,  6.78it/s]
6it [00:00,  7.33it/s]


Epoch [46], Train Loss : [0.10759] Val Loss : [0.18026] Val F1 Score : [0.81215]


23it [00:03,  6.72it/s]
6it [00:00,  7.30it/s]


Epoch [47], Train Loss : [0.10686] Val Loss : [0.17569] Val F1 Score : [0.82064]
Epoch    48: reducing learning rate of group 0 to 1.2500e-04.


23it [00:03,  6.69it/s]
6it [00:00,  7.19it/s]


Epoch [48], Train Loss : [0.10799] Val Loss : [0.18700] Val F1 Score : [0.82064]


23it [00:03,  6.06it/s]
6it [00:00,  7.36it/s]


Epoch [49], Train Loss : [0.10143] Val Loss : [0.17743] Val F1 Score : [0.82428]


23it [00:03,  6.68it/s]
6it [00:00,  7.24it/s]


Epoch [50], Train Loss : [0.12512] Val Loss : [0.17576] Val F1 Score : [0.82840]


23it [00:03,  6.67it/s]
6it [00:00,  7.24it/s]


Epoch [51], Train Loss : [0.10562] Val Loss : [0.18108] Val F1 Score : [0.82617]


23it [00:03,  6.70it/s]
6it [00:00,  7.18it/s]


Epoch [52], Train Loss : [0.12334] Val Loss : [0.18351] Val F1 Score : [0.82114]


23it [00:03,  6.61it/s]
6it [00:00,  7.18it/s]


Epoch [53], Train Loss : [0.10494] Val Loss : [0.17651] Val F1 Score : [0.82717]
Epoch    54: reducing learning rate of group 0 to 6.2500e-05.


23it [00:03,  6.68it/s]
6it [00:00,  7.09it/s]


Epoch [54], Train Loss : [0.13231] Val Loss : [0.17789] Val F1 Score : [0.82000]


23it [00:03,  6.59it/s]
6it [00:00,  7.43it/s]


Epoch [55], Train Loss : [0.10248] Val Loss : [0.17558] Val F1 Score : [0.82589]


23it [00:03,  6.60it/s]
6it [00:00,  7.53it/s]


Epoch [56], Train Loss : [0.09827] Val Loss : [0.17882] Val F1 Score : [0.82240]


23it [00:03,  6.75it/s]
6it [00:00,  7.51it/s]


Epoch [57], Train Loss : [0.09900] Val Loss : [0.18138] Val F1 Score : [0.82228]


23it [00:03,  6.79it/s]
6it [00:00,  7.41it/s]


Epoch [58], Train Loss : [0.09643] Val Loss : [0.18344] Val F1 Score : [0.81842]


23it [00:03,  6.69it/s]
6it [00:00,  7.40it/s]


Epoch [59], Train Loss : [0.10366] Val Loss : [0.18019] Val F1 Score : [0.82589]
Epoch    60: reducing learning rate of group 0 to 3.1250e-05.


23it [00:03,  6.60it/s]
6it [00:00,  7.28it/s]


Epoch [60], Train Loss : [0.09907] Val Loss : [0.18019] Val F1 Score : [0.82415]


23it [00:03,  6.77it/s]
6it [00:00,  7.37it/s]


Epoch [61], Train Loss : [0.11718] Val Loss : [0.17958] Val F1 Score : [0.82441]


23it [00:03,  6.83it/s]
6it [00:00,  7.32it/s]


Epoch [62], Train Loss : [0.09518] Val Loss : [0.17938] Val F1 Score : [0.82603]


23it [00:03,  6.59it/s]
6it [00:00,  7.41it/s]


Epoch [63], Train Loss : [0.10176] Val Loss : [0.18391] Val F1 Score : [0.82205]


23it [00:03,  6.76it/s]
6it [00:00,  7.29it/s]


Epoch [64], Train Loss : [0.10260] Val Loss : [0.17699] Val F1 Score : [0.82536]


23it [00:03,  6.80it/s]
6it [00:00,  7.47it/s]


Epoch [65], Train Loss : [0.10459] Val Loss : [0.18334] Val F1 Score : [0.82575]
Epoch    66: reducing learning rate of group 0 to 1.5625e-05.


23it [00:03,  6.78it/s]
6it [00:00,  7.36it/s]


Epoch [66], Train Loss : [0.10929] Val Loss : [0.18138] Val F1 Score : [0.82575]


23it [00:03,  6.73it/s]
6it [00:00,  6.87it/s]


Epoch [67], Train Loss : [0.09426] Val Loss : [0.17732] Val F1 Score : [0.82522]


23it [00:03,  6.77it/s]
6it [00:00,  7.10it/s]


Epoch [68], Train Loss : [0.09746] Val Loss : [0.18126] Val F1 Score : [0.82217]


23it [00:03,  6.76it/s]
6it [00:00,  7.44it/s]


Epoch [69], Train Loss : [0.11180] Val Loss : [0.17974] Val F1 Score : [0.82402]


23it [00:03,  6.58it/s]
6it [00:00,  7.45it/s]


Epoch [70], Train Loss : [0.09474] Val Loss : [0.17705] Val F1 Score : [0.83295]


23it [00:03,  6.77it/s]
6it [00:00,  7.43it/s]


Epoch [71], Train Loss : [0.09858] Val Loss : [0.18388] Val F1 Score : [0.82023]


23it [00:03,  6.83it/s]
6it [00:00,  7.37it/s]


Epoch [72], Train Loss : [0.09242] Val Loss : [0.17937] Val F1 Score : [0.82589]


23it [00:03,  6.83it/s]
6it [00:00,  7.33it/s]


Epoch [73], Train Loss : [0.09110] Val Loss : [0.18252] Val F1 Score : [0.82390]


23it [00:03,  6.75it/s]
6it [00:00,  7.35it/s]


Epoch [74], Train Loss : [0.09277] Val Loss : [0.18312] Val F1 Score : [0.82285]


23it [00:03,  6.77it/s]
6it [00:00,  7.30it/s]


Epoch [75], Train Loss : [0.09079] Val Loss : [0.18545] Val F1 Score : [0.82217]


23it [00:03,  6.77it/s]
6it [00:00,  7.41it/s]


Epoch [76], Train Loss : [0.12879] Val Loss : [0.18400] Val F1 Score : [0.82322]
Epoch    77: reducing learning rate of group 0 to 7.8125e-06.


23it [00:03,  6.54it/s]
6it [00:00,  7.36it/s]


Epoch [77], Train Loss : [0.09153] Val Loss : [0.18175] Val F1 Score : [0.82654]


23it [00:03,  6.71it/s]
6it [00:00,  7.52it/s]


Epoch [78], Train Loss : [0.09363] Val Loss : [0.18318] Val F1 Score : [0.81842]


23it [00:03,  6.70it/s]
6it [00:00,  7.52it/s]


Epoch [79], Train Loss : [0.09126] Val Loss : [0.18752] Val F1 Score : [0.82103]


23it [00:03,  6.70it/s]
6it [00:00,  7.40it/s]


Epoch [80], Train Loss : [0.09181] Val Loss : [0.18561] Val F1 Score : [0.81662]


23it [00:03,  6.70it/s]
6it [00:00,  7.45it/s]


Epoch [81], Train Loss : [0.11142] Val Loss : [0.18402] Val F1 Score : [0.82136]


23it [00:03,  6.65it/s]
6it [00:00,  7.30it/s]


Epoch [82], Train Loss : [0.09517] Val Loss : [0.18270] Val F1 Score : [0.82842]
Epoch    83: reducing learning rate of group 0 to 3.9063e-06.


23it [00:03,  6.72it/s]
6it [00:00,  7.27it/s]


Epoch [83], Train Loss : [0.10968] Val Loss : [0.18039] Val F1 Score : [0.82496]


23it [00:03,  6.49it/s]
6it [00:00,  7.28it/s]


Epoch [84], Train Loss : [0.09189] Val Loss : [0.18514] Val F1 Score : [0.82684]


23it [00:03,  6.71it/s]
6it [00:00,  7.29it/s]


Epoch [85], Train Loss : [0.10166] Val Loss : [0.18474] Val F1 Score : [0.82297]


23it [00:03,  6.65it/s]
6it [00:00,  7.20it/s]


Epoch [86], Train Loss : [0.09059] Val Loss : [0.18321] Val F1 Score : [0.82194]


23it [00:03,  6.73it/s]
6it [00:00,  7.26it/s]


Epoch [87], Train Loss : [0.09224] Val Loss : [0.18558] Val F1 Score : [0.81932]


23it [00:03,  6.59it/s]
6it [00:00,  6.81it/s]


Epoch [88], Train Loss : [0.10510] Val Loss : [0.18401] Val F1 Score : [0.82033]
Epoch    89: reducing learning rate of group 0 to 1.9531e-06.


23it [00:03,  6.74it/s]
6it [00:00,  7.26it/s]


Epoch [89], Train Loss : [0.09167] Val Loss : [0.18288] Val F1 Score : [0.82125]


23it [00:03,  6.76it/s]
6it [00:00,  7.34it/s]


Epoch [90], Train Loss : [0.09101] Val Loss : [0.18533] Val F1 Score : [0.81842]


23it [00:03,  6.56it/s]
6it [00:00,  7.27it/s]


Epoch [91], Train Loss : [0.09220] Val Loss : [0.18335] Val F1 Score : [0.82158]


23it [00:03,  6.73it/s]
6it [00:00,  7.42it/s]


Epoch [92], Train Loss : [0.09133] Val Loss : [0.18189] Val F1 Score : [0.82334]


23it [00:03,  6.69it/s]
6it [00:00,  7.38it/s]


Epoch [93], Train Loss : [0.09154] Val Loss : [0.18338] Val F1 Score : [0.82575]


23it [00:03,  6.74it/s]
6it [00:00,  7.43it/s]


Epoch [94], Train Loss : [0.09244] Val Loss : [0.18383] Val F1 Score : [0.82640]
Epoch    95: reducing learning rate of group 0 to 1.0000e-06.


23it [00:03,  6.82it/s]
6it [00:00,  7.39it/s]


Epoch [95], Train Loss : [0.09372] Val Loss : [0.18495] Val F1 Score : [0.82364]


23it [00:03,  6.80it/s]
6it [00:00,  7.27it/s]


Epoch [96], Train Loss : [0.10361] Val Loss : [0.18130] Val F1 Score : [0.82509]


23it [00:03,  6.77it/s]
6it [00:00,  7.38it/s]


Epoch [97], Train Loss : [0.09228] Val Loss : [0.18784] Val F1 Score : [0.81137]


23it [00:03,  6.62it/s]
6it [00:00,  7.48it/s]


Epoch [98], Train Loss : [0.09908] Val Loss : [0.18625] Val F1 Score : [0.82285]


23it [00:03,  6.83it/s]
6it [00:00,  7.46it/s]

Epoch [99], Train Loss : [0.09574] Val Loss : [0.18246] Val F1 Score : [0.82390]





# Small Model (student)

In [17]:
# ---------------------------
# Student Model
# ---------------------------
class Student(nn.Module):
    def __init__(self):
        super(Student, self).__init__()
        self.classifier = nn.Sequential(
            nn.Linear(in_features=18, out_features=128),
            nn.BatchNorm1d(128),
            nn.GELU(),
            nn.Linear(in_features=128, out_features=256),
            nn.BatchNorm1d(256),
            nn.GELU(),
            nn.Linear(in_features=256, out_features=256),
            nn.BatchNorm1d(256),
            nn.GELU(),
            nn.Linear(in_features=256, out_features=128),
            nn.BatchNorm1d(128),
            nn.GELU(),
            nn.Linear(in_features=128, out_features=1),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        output = self.classifier(x)
        return output

In [18]:
# # ---------------------------
# # Distillation Loss with Temperature
# # ---------------------------

# def distillation(student_logits, labels, teacher_logits, T,alpha):
#     distillation_loss = nn.BCELoss()(student_logits/T, teacher_logits/T)
#     student_loss = nn.BCELoss()(student_logits, labels.reshape(-1, 1))
#     return (1.-alpha) * student_loss +  distillation_loss * (T*T * 2.0 * alpha)

# def distill_loss(output, target, teacher_output, loss_fn=distillation, opt=optimizer):
#     loss_b = loss_fn(output, target, teacher_output, T=20.0, alpha=0.7)

#     if opt is not None:
#         # BACKWARD
#         loss_b.backward()
        
#         # UPDATE
#         opt.step()

#     return loss_b.item()

In [19]:
# ---------------------------
# Distillation Loss without Temperature
# ---------------------------

def distillation(student_logits, labels, teacher_logits, alpha):
    distillation_loss = nn.BCELoss()(student_logits, teacher_logits)
    student_loss = nn.BCELoss()(student_logits, labels.reshape(-1, 1))
    return (1.-alpha) * student_loss +  distillation_loss * alpha

def distill_loss(output, target, teacher_output, loss_fn=distillation, opt=optimizer):
    loss_b = loss_fn(output, target, teacher_output, alpha=0.7)

    if opt is not None:
        # BACKWARD
        loss_b.backward()
        
        # UPDATE
        opt.step()

    return loss_b.item()

In [20]:
#--------------------------------------------------------------------------------
# Training Student
#--------------------------------------------------------------------------------

def student_train(s_model, t_model, optimizer, train_loader, val_loader, scheduler, device):
    
    s_model.to(device)
    t_model.to(device)
    
    best_score = 0
    best_model = None

    for epoch in range(CONFIG["EPOCHS"]):
        train_loss = []
        s_model.train()
        t_model.eval()
        
        for X_t, X_s, y in tqdm(train_loader):
            
            # ZERO GRADIENT
            optimizer.zero_grad()
            
            # FORWARD
            y_pred_student = s_model(X_s)
            with torch.no_grad():
                y_pred_teacher = t_model(X_t)
                
            loss_b = distill_loss(y_pred_student, y, y_pred_teacher, loss_fn=distillation, opt=optimizer)
            
                
            train_loss.append(loss_b)

        val_loss, val_score = validation_student(s_model, t_model, val_loader, distill_loss, device)
        print(f'Epoch [{epoch}], Train Loss : [{np.mean(train_loss) :.5f}] Val Loss : [{np.mean(val_loss) :.5f}] Val F1 Score : [{val_score:.5f}]')
        
        if scheduler is not None:
            scheduler.step(val_score)
            
        if best_score < val_score:
            best_model = s_model
            best_score = val_score
        
    return best_model


#--------------------------------------------------------------------------------
# Validation Student
#--------------------------------------------------------------------------------

def validation_student(s_model, t_model, val_loader, criterion, device):
    s_model.eval()
    t_model.eval()

    val_loss = []
    pred_labels = []
    true_labels = []
    threshold = 0.35
    
    with torch.no_grad():
        for X_t, X_s, y in tqdm(val_loader):
            
            model_pred     = s_model(X_s)
            teacher_output = t_model(X_t)
            
            loss_b = distill_loss(model_pred, y, teacher_output, loss_fn=distillation, opt=None)
            val_loss.append(loss_b)
            
            model_pred = model_pred.squeeze(1)#.to('cpu')
            pred_labels += model_pred.tolist()
            true_labels += y.tolist()
        
        pred_labels = np.where(np.array(pred_labels) > threshold, 1, 0)
        val_f1 = competition_metric(true_labels, pred_labels)
    return val_loss, val_f1    

In [21]:
# ---------------------------
# Dataset for Student Model (Distillation True)
# ---------------------------
train_dataset = CustomDataset(train_X, train_y, True)
val_dataset = CustomDataset(val_X, val_y, True)

# ---------------------------
# Dataloader for Student Model (Distillation True)
# ---------------------------
train_loader = DataLoader(train_dataset, batch_size = CONFIG['BATCH_SIZE'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size = CONFIG['BATCH_SIZE'], shuffle=False)

In [22]:
#--------------------------------------------------------------------------------
# Training Process
#--------------------------------------------------------------------------------

student_model = Student()
# student_model.eval()
optimizer = torch.optim.Adam(student_model.parameters(), lr=CONFIG['LEARNING_RATE'])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=5, threshold_mode='abs',min_lr=1e-6, verbose=True)

best_student_model = student_train(student_model, teacher_model, optimizer, train_loader, val_loader, scheduler, device)

100%|██████████| 23/23 [00:12<00:00,  1.78it/s]
100%|██████████| 6/6 [00:02<00:00,  2.05it/s]


Epoch [0], Train Loss : [0.46715] Val Loss : [0.45206] Val F1 Score : [0.49463]


100%|██████████| 23/23 [00:12<00:00,  1.78it/s]
100%|██████████| 6/6 [00:02<00:00,  2.06it/s]


Epoch [1], Train Loss : [0.33465] Val Loss : [0.31013] Val F1 Score : [0.48907]


100%|██████████| 23/23 [00:13<00:00,  1.77it/s]
100%|██████████| 6/6 [00:02<00:00,  2.07it/s]


Epoch [2], Train Loss : [0.30403] Val Loss : [0.30269] Val F1 Score : [0.48031]


100%|██████████| 23/23 [00:12<00:00,  1.79it/s]
100%|██████████| 6/6 [00:02<00:00,  2.05it/s]


Epoch [3], Train Loss : [0.29855] Val Loss : [0.30246] Val F1 Score : [0.48133]


100%|██████████| 23/23 [00:12<00:00,  1.78it/s]
100%|██████████| 6/6 [00:02<00:00,  2.04it/s]


Epoch [4], Train Loss : [0.29033] Val Loss : [0.29761] Val F1 Score : [0.49038]


100%|██████████| 23/23 [00:13<00:00,  1.74it/s]
100%|██████████| 6/6 [00:02<00:00,  2.02it/s]


Epoch [5], Train Loss : [0.29011] Val Loss : [0.30101] Val F1 Score : [0.49354]


100%|██████████| 23/23 [00:13<00:00,  1.74it/s]
100%|██████████| 6/6 [00:02<00:00,  2.03it/s]


Epoch [6], Train Loss : [0.28817] Val Loss : [0.29825] Val F1 Score : [0.48491]
Epoch     7: reducing learning rate of group 0 to 5.0000e-04.


100%|██████████| 23/23 [00:13<00:00,  1.74it/s]
100%|██████████| 6/6 [00:02<00:00,  2.04it/s]


Epoch [7], Train Loss : [0.28798] Val Loss : [0.29856] Val F1 Score : [0.48921]


100%|██████████| 23/23 [00:13<00:00,  1.74it/s]
100%|██████████| 6/6 [00:02<00:00,  2.01it/s]


Epoch [8], Train Loss : [0.27810] Val Loss : [0.30051] Val F1 Score : [0.49315]


100%|██████████| 23/23 [00:13<00:00,  1.74it/s]
100%|██████████| 6/6 [00:02<00:00,  2.01it/s]


Epoch [9], Train Loss : [0.28828] Val Loss : [0.29783] Val F1 Score : [0.48979]


100%|██████████| 23/23 [00:13<00:00,  1.72it/s]
100%|██████████| 6/6 [00:02<00:00,  2.02it/s]


Epoch [10], Train Loss : [0.27674] Val Loss : [0.29945] Val F1 Score : [0.48849]


100%|██████████| 23/23 [00:13<00:00,  1.74it/s]
100%|██████████| 6/6 [00:02<00:00,  2.01it/s]


Epoch [11], Train Loss : [0.28055] Val Loss : [0.29916] Val F1 Score : [0.49259]


100%|██████████| 23/23 [00:13<00:00,  1.74it/s]
100%|██████████| 6/6 [00:02<00:00,  2.02it/s]


Epoch [12], Train Loss : [0.28218] Val Loss : [0.30177] Val F1 Score : [0.49801]


100%|██████████| 23/23 [00:13<00:00,  1.74it/s]
100%|██████████| 6/6 [00:02<00:00,  2.01it/s]


Epoch [13], Train Loss : [0.28434] Val Loss : [0.30244] Val F1 Score : [0.49259]


100%|██████████| 23/23 [00:13<00:00,  1.70it/s]
100%|██████████| 6/6 [00:03<00:00,  1.96it/s]


Epoch [14], Train Loss : [0.28030] Val Loss : [0.30232] Val F1 Score : [0.49565]


100%|██████████| 23/23 [00:13<00:00,  1.73it/s]
100%|██████████| 6/6 [00:02<00:00,  2.02it/s]


Epoch [15], Train Loss : [0.30272] Val Loss : [0.30226] Val F1 Score : [0.51056]


100%|██████████| 23/23 [00:13<00:00,  1.74it/s]
100%|██████████| 6/6 [00:02<00:00,  2.02it/s]


Epoch [16], Train Loss : [0.28827] Val Loss : [0.29888] Val F1 Score : [0.48821]


100%|██████████| 23/23 [00:12<00:00,  1.77it/s]
100%|██████████| 6/6 [00:02<00:00,  2.06it/s]


Epoch [17], Train Loss : [0.27536] Val Loss : [0.30006] Val F1 Score : [0.48807]


100%|██████████| 23/23 [00:13<00:00,  1.75it/s]
100%|██████████| 6/6 [00:02<00:00,  2.04it/s]


Epoch [18], Train Loss : [0.27168] Val Loss : [0.29920] Val F1 Score : [0.49531]


100%|██████████| 23/23 [00:13<00:00,  1.76it/s]
100%|██████████| 6/6 [00:02<00:00,  2.03it/s]


Epoch [19], Train Loss : [0.27775] Val Loss : [0.30010] Val F1 Score : [0.49565]


100%|██████████| 23/23 [00:13<00:00,  1.76it/s]
100%|██████████| 6/6 [00:02<00:00,  2.04it/s]


Epoch [20], Train Loss : [0.28378] Val Loss : [0.30194] Val F1 Score : [0.50597]


100%|██████████| 23/23 [00:13<00:00,  1.75it/s]
100%|██████████| 6/6 [00:03<00:00,  1.94it/s]


Epoch [21], Train Loss : [0.27351] Val Loss : [0.30360] Val F1 Score : [0.50540]
Epoch    22: reducing learning rate of group 0 to 2.5000e-04.


100%|██████████| 23/23 [00:13<00:00,  1.75it/s]
100%|██████████| 6/6 [00:03<00:00,  1.95it/s]


Epoch [22], Train Loss : [0.28718] Val Loss : [0.29922] Val F1 Score : [0.49599]


100%|██████████| 23/23 [00:13<00:00,  1.71it/s]
100%|██████████| 6/6 [00:03<00:00,  2.00it/s]


Epoch [23], Train Loss : [0.28023] Val Loss : [0.30178] Val F1 Score : [0.49819]


100%|██████████| 23/23 [00:13<00:00,  1.72it/s]
100%|██████████| 6/6 [00:03<00:00,  1.99it/s]


Epoch [24], Train Loss : [0.28101] Val Loss : [0.30251] Val F1 Score : [0.50989]


100%|██████████| 23/23 [00:13<00:00,  1.72it/s]
100%|██████████| 6/6 [00:03<00:00,  1.99it/s]


Epoch [25], Train Loss : [0.26944] Val Loss : [0.30319] Val F1 Score : [0.49748]


100%|██████████| 23/23 [00:13<00:00,  1.71it/s]
100%|██████████| 6/6 [00:03<00:00,  1.99it/s]


Epoch [26], Train Loss : [0.27027] Val Loss : [0.30081] Val F1 Score : [0.49910]


100%|██████████| 23/23 [00:13<00:00,  1.72it/s]
100%|██████████| 6/6 [00:03<00:00,  1.98it/s]


Epoch [27], Train Loss : [0.26947] Val Loss : [0.30116] Val F1 Score : [0.50494]
Epoch    28: reducing learning rate of group 0 to 1.2500e-04.


100%|██████████| 23/23 [00:13<00:00,  1.71it/s]
100%|██████████| 6/6 [00:03<00:00,  1.98it/s]


Epoch [28], Train Loss : [0.26789] Val Loss : [0.30229] Val F1 Score : [0.50414]


100%|██████████| 23/23 [00:13<00:00,  1.71it/s]
100%|██████████| 6/6 [00:03<00:00,  1.99it/s]


Epoch [29], Train Loss : [0.29126] Val Loss : [0.30348] Val F1 Score : [0.50275]


100%|██████████| 23/23 [00:13<00:00,  1.70it/s]
100%|██████████| 6/6 [00:03<00:00,  1.99it/s]


Epoch [30], Train Loss : [0.27335] Val Loss : [0.30401] Val F1 Score : [0.50354]


100%|██████████| 23/23 [00:13<00:00,  1.71it/s]
100%|██████████| 6/6 [00:03<00:00,  1.97it/s]


Epoch [31], Train Loss : [0.27187] Val Loss : [0.30459] Val F1 Score : [0.50036]


100%|██████████| 23/23 [00:13<00:00,  1.73it/s]
100%|██████████| 6/6 [00:02<00:00,  2.02it/s]


Epoch [32], Train Loss : [0.27128] Val Loss : [0.30271] Val F1 Score : [0.50374]


100%|██████████| 23/23 [00:13<00:00,  1.74it/s]
100%|██████████| 6/6 [00:02<00:00,  2.02it/s]


Epoch [33], Train Loss : [0.27120] Val Loss : [0.30241] Val F1 Score : [0.49730]
Epoch    34: reducing learning rate of group 0 to 6.2500e-05.


100%|██████████| 23/23 [00:13<00:00,  1.73it/s]
100%|██████████| 6/6 [00:02<00:00,  2.00it/s]


Epoch [34], Train Loss : [0.26875] Val Loss : [0.30268] Val F1 Score : [0.50855]


100%|██████████| 23/23 [00:12<00:00,  1.78it/s]
100%|██████████| 6/6 [00:02<00:00,  2.03it/s]


Epoch [35], Train Loss : [0.26625] Val Loss : [0.30243] Val F1 Score : [0.50576]


100%|██████████| 23/23 [00:13<00:00,  1.77it/s]
100%|██████████| 6/6 [00:02<00:00,  2.06it/s]


Epoch [36], Train Loss : [0.26830] Val Loss : [0.30394] Val F1 Score : [0.50791]


100%|██████████| 23/23 [00:12<00:00,  1.78it/s]
100%|██████████| 6/6 [00:02<00:00,  2.07it/s]


Epoch [37], Train Loss : [0.26932] Val Loss : [0.30392] Val F1 Score : [0.50314]


100%|██████████| 23/23 [00:13<00:00,  1.76it/s]
100%|██████████| 6/6 [00:03<00:00,  1.99it/s]


Epoch [38], Train Loss : [0.26325] Val Loss : [0.30460] Val F1 Score : [0.50434]


100%|██████████| 23/23 [00:12<00:00,  1.77it/s]
100%|██████████| 6/6 [00:02<00:00,  2.06it/s]


Epoch [39], Train Loss : [0.26652] Val Loss : [0.30303] Val F1 Score : [0.50515]
Epoch    40: reducing learning rate of group 0 to 3.1250e-05.


100%|██████████| 23/23 [00:13<00:00,  1.73it/s]
100%|██████████| 6/6 [00:02<00:00,  2.01it/s]


Epoch [40], Train Loss : [0.26572] Val Loss : [0.30308] Val F1 Score : [0.50112]


100%|██████████| 23/23 [00:13<00:00,  1.74it/s]
100%|██████████| 6/6 [00:02<00:00,  2.02it/s]


Epoch [41], Train Loss : [0.26255] Val Loss : [0.30305] Val F1 Score : [0.50576]


100%|██████████| 23/23 [00:13<00:00,  1.74it/s]
100%|██████████| 6/6 [00:03<00:00,  1.96it/s]


Epoch [42], Train Loss : [0.26244] Val Loss : [0.30402] Val F1 Score : [0.50855]


100%|██████████| 23/23 [00:13<00:00,  1.74it/s]
100%|██████████| 6/6 [00:02<00:00,  2.02it/s]


Epoch [43], Train Loss : [0.27790] Val Loss : [0.30384] Val F1 Score : [0.50454]


100%|██████████| 23/23 [00:13<00:00,  1.75it/s]
100%|██████████| 6/6 [00:02<00:00,  2.02it/s]


Epoch [44], Train Loss : [0.26057] Val Loss : [0.30371] Val F1 Score : [0.51545]


100%|██████████| 23/23 [00:13<00:00,  1.74it/s]
100%|██████████| 6/6 [00:02<00:00,  2.03it/s]


Epoch [45], Train Loss : [0.26277] Val Loss : [0.30498] Val F1 Score : [0.51947]


100%|██████████| 23/23 [00:13<00:00,  1.73it/s]
100%|██████████| 6/6 [00:03<00:00,  1.96it/s]


Epoch [46], Train Loss : [0.27439] Val Loss : [0.30417] Val F1 Score : [0.51474]


100%|██████████| 23/23 [00:13<00:00,  1.75it/s]
100%|██████████| 6/6 [00:02<00:00,  2.05it/s]


Epoch [47], Train Loss : [0.26277] Val Loss : [0.30543] Val F1 Score : [0.51897]


100%|██████████| 23/23 [00:13<00:00,  1.77it/s]
100%|██████████| 6/6 [00:02<00:00,  2.06it/s]


Epoch [48], Train Loss : [0.27355] Val Loss : [0.30492] Val F1 Score : [0.51699]


100%|██████████| 23/23 [00:13<00:00,  1.77it/s]
100%|██████████| 6/6 [00:02<00:00,  2.06it/s]


Epoch [49], Train Loss : [0.26229] Val Loss : [0.30341] Val F1 Score : [0.51333]


100%|██████████| 23/23 [00:13<00:00,  1.76it/s]
100%|██████████| 6/6 [00:02<00:00,  2.05it/s]


Epoch [50], Train Loss : [0.27065] Val Loss : [0.30414] Val F1 Score : [0.51822]


100%|██████████| 23/23 [00:12<00:00,  1.77it/s]
100%|██████████| 6/6 [00:02<00:00,  2.04it/s]


Epoch [51], Train Loss : [0.26780] Val Loss : [0.30520] Val F1 Score : [0.51641]
Epoch    52: reducing learning rate of group 0 to 1.5625e-05.


100%|██████████| 23/23 [00:12<00:00,  1.77it/s]
100%|██████████| 6/6 [00:02<00:00,  2.05it/s]


Epoch [52], Train Loss : [0.26154] Val Loss : [0.30365] Val F1 Score : [0.50986]


100%|██████████| 23/23 [00:13<00:00,  1.77it/s]
100%|██████████| 6/6 [00:02<00:00,  2.04it/s]


Epoch [53], Train Loss : [0.27835] Val Loss : [0.30455] Val F1 Score : [0.51569]


100%|██████████| 23/23 [00:13<00:00,  1.72it/s]
100%|██████████| 6/6 [00:02<00:00,  2.04it/s]


Epoch [54], Train Loss : [0.26663] Val Loss : [0.30192] Val F1 Score : [0.50556]


100%|██████████| 23/23 [00:13<00:00,  1.75it/s]
100%|██████████| 6/6 [00:02<00:00,  2.05it/s]


Epoch [55], Train Loss : [0.26638] Val Loss : [0.30503] Val F1 Score : [0.51666]


100%|██████████| 23/23 [00:13<00:00,  1.76it/s]
100%|██████████| 6/6 [00:02<00:00,  2.04it/s]


Epoch [56], Train Loss : [0.26482] Val Loss : [0.30413] Val F1 Score : [0.50920]


100%|██████████| 23/23 [00:13<00:00,  1.77it/s]
100%|██████████| 6/6 [00:02<00:00,  2.04it/s]


Epoch [57], Train Loss : [0.26230] Val Loss : [0.30499] Val F1 Score : [0.52431]


100%|██████████| 23/23 [00:13<00:00,  1.74it/s]
100%|██████████| 6/6 [00:02<00:00,  2.02it/s]


Epoch [58], Train Loss : [0.29159] Val Loss : [0.30499] Val F1 Score : [0.50920]


100%|██████████| 23/23 [00:13<00:00,  1.74it/s]
100%|██████████| 6/6 [00:02<00:00,  2.02it/s]


Epoch [59], Train Loss : [0.26364] Val Loss : [0.30492] Val F1 Score : [0.52246]


100%|██████████| 23/23 [00:13<00:00,  1.74it/s]
100%|██████████| 6/6 [00:02<00:00,  2.02it/s]


Epoch [60], Train Loss : [0.26268] Val Loss : [0.30559] Val F1 Score : [0.52714]


100%|██████████| 23/23 [00:13<00:00,  1.74it/s]
100%|██████████| 6/6 [00:02<00:00,  2.02it/s]


Epoch [61], Train Loss : [0.27199] Val Loss : [0.30510] Val F1 Score : [0.51675]


100%|██████████| 23/23 [00:13<00:00,  1.75it/s]
100%|██████████| 6/6 [00:02<00:00,  2.05it/s]


Epoch [62], Train Loss : [0.27656] Val Loss : [0.30479] Val F1 Score : [0.52485]


100%|██████████| 23/23 [00:12<00:00,  1.78it/s]
100%|██████████| 6/6 [00:02<00:00,  2.05it/s]


Epoch [63], Train Loss : [0.27700] Val Loss : [0.30485] Val F1 Score : [0.52040]


100%|██████████| 23/23 [00:12<00:00,  1.78it/s]
100%|██████████| 6/6 [00:02<00:00,  2.04it/s]


Epoch [64], Train Loss : [0.26361] Val Loss : [0.30624] Val F1 Score : [0.51989]


100%|██████████| 23/23 [00:12<00:00,  1.78it/s]
100%|██████████| 6/6 [00:02<00:00,  2.00it/s]


Epoch [65], Train Loss : [0.27012] Val Loss : [0.30681] Val F1 Score : [0.52405]


100%|██████████| 23/23 [00:12<00:00,  1.78it/s]
100%|██████████| 6/6 [00:02<00:00,  2.06it/s]


Epoch [66], Train Loss : [0.27292] Val Loss : [0.30407] Val F1 Score : [0.51403]
Epoch    67: reducing learning rate of group 0 to 7.8125e-06.


100%|██████████| 23/23 [00:12<00:00,  1.78it/s]
100%|██████████| 6/6 [00:02<00:00,  2.06it/s]


Epoch [67], Train Loss : [0.26795] Val Loss : [0.30560] Val F1 Score : [0.52539]


100%|██████████| 23/23 [00:12<00:00,  1.77it/s]
100%|██████████| 6/6 [00:02<00:00,  2.05it/s]


Epoch [68], Train Loss : [0.26502] Val Loss : [0.30487] Val F1 Score : [0.52220]


100%|██████████| 23/23 [00:12<00:00,  1.79it/s]
100%|██████████| 6/6 [00:02<00:00,  2.01it/s]


Epoch [69], Train Loss : [0.26795] Val Loss : [0.30525] Val F1 Score : [0.52116]


100%|██████████| 23/23 [00:12<00:00,  1.79it/s]
100%|██████████| 6/6 [00:02<00:00,  2.05it/s]


Epoch [70], Train Loss : [0.26711] Val Loss : [0.30446] Val F1 Score : [0.51748]


100%|██████████| 23/23 [00:12<00:00,  1.80it/s]
100%|██████████| 6/6 [00:02<00:00,  2.07it/s]


Epoch [71], Train Loss : [0.26126] Val Loss : [0.30483] Val F1 Score : [0.52539]


100%|██████████| 23/23 [00:12<00:00,  1.79it/s]
100%|██████████| 6/6 [00:02<00:00,  2.05it/s]


Epoch [72], Train Loss : [0.26019] Val Loss : [0.30525] Val F1 Score : [0.52325]
Epoch    73: reducing learning rate of group 0 to 3.9063e-06.


100%|██████████| 23/23 [00:13<00:00,  1.77it/s]
100%|██████████| 6/6 [00:02<00:00,  2.06it/s]


Epoch [73], Train Loss : [0.26176] Val Loss : [0.30464] Val F1 Score : [0.52142]


100%|██████████| 23/23 [00:12<00:00,  1.79it/s]
100%|██████████| 6/6 [00:02<00:00,  2.06it/s]


Epoch [74], Train Loss : [0.26457] Val Loss : [0.30444] Val F1 Score : [0.52485]


100%|██████████| 23/23 [00:12<00:00,  1.79it/s]
100%|██████████| 6/6 [00:02<00:00,  2.08it/s]


Epoch [75], Train Loss : [0.26194] Val Loss : [0.30459] Val F1 Score : [0.51748]


100%|██████████| 23/23 [00:12<00:00,  1.79it/s]
100%|██████████| 6/6 [00:02<00:00,  2.06it/s]


Epoch [76], Train Loss : [0.26259] Val Loss : [0.30556] Val F1 Score : [0.52769]


100%|██████████| 23/23 [00:13<00:00,  1.77it/s]
100%|██████████| 6/6 [00:02<00:00,  2.04it/s]


Epoch [77], Train Loss : [0.26806] Val Loss : [0.30608] Val F1 Score : [0.52405]


100%|██████████| 23/23 [00:13<00:00,  1.75it/s]
100%|██████████| 6/6 [00:02<00:00,  2.02it/s]


Epoch [78], Train Loss : [0.28169] Val Loss : [0.30352] Val F1 Score : [0.51259]


100%|██████████| 23/23 [00:13<00:00,  1.76it/s]
100%|██████████| 6/6 [00:02<00:00,  2.05it/s]


Epoch [79], Train Loss : [0.25993] Val Loss : [0.30493] Val F1 Score : [0.52431]


100%|██████████| 23/23 [00:13<00:00,  1.77it/s]
100%|██████████| 6/6 [00:02<00:00,  2.05it/s]


Epoch [80], Train Loss : [0.26705] Val Loss : [0.30440] Val F1 Score : [0.52142]


100%|██████████| 23/23 [00:13<00:00,  1.76it/s]
100%|██████████| 6/6 [00:02<00:00,  2.06it/s]


Epoch [81], Train Loss : [0.26780] Val Loss : [0.30537] Val F1 Score : [0.52273]


100%|██████████| 23/23 [00:13<00:00,  1.74it/s]
100%|██████████| 6/6 [00:02<00:00,  2.01it/s]


Epoch [82], Train Loss : [0.26171] Val Loss : [0.30543] Val F1 Score : [0.52769]
Epoch    83: reducing learning rate of group 0 to 1.9531e-06.


100%|██████████| 23/23 [00:13<00:00,  1.75it/s]
100%|██████████| 6/6 [00:02<00:00,  2.01it/s]


Epoch [83], Train Loss : [0.26347] Val Loss : [0.30417] Val F1 Score : [0.51168]


100%|██████████| 23/23 [00:13<00:00,  1.76it/s]
100%|██████████| 6/6 [00:03<00:00,  1.98it/s]


Epoch [84], Train Loss : [0.28144] Val Loss : [0.30458] Val F1 Score : [0.51403]


100%|██████████| 23/23 [00:13<00:00,  1.75it/s]
100%|██████████| 6/6 [00:02<00:00,  2.05it/s]


Epoch [85], Train Loss : [0.26732] Val Loss : [0.30387] Val F1 Score : [0.52246]


100%|██████████| 23/23 [00:13<00:00,  1.76it/s]
100%|██████████| 6/6 [00:02<00:00,  2.01it/s]


Epoch [86], Train Loss : [0.26326] Val Loss : [0.30364] Val F1 Score : [0.50036]


100%|██████████| 23/23 [00:12<00:00,  1.77it/s]
100%|██████████| 6/6 [00:02<00:00,  2.04it/s]


Epoch [87], Train Loss : [0.26000] Val Loss : [0.30492] Val F1 Score : [0.52378]


100%|██████████| 23/23 [00:13<00:00,  1.74it/s]
100%|██████████| 6/6 [00:03<00:00,  1.97it/s]


Epoch [88], Train Loss : [0.27462] Val Loss : [0.30490] Val F1 Score : [0.52512]
Epoch    89: reducing learning rate of group 0 to 1.0000e-06.


100%|██████████| 23/23 [00:13<00:00,  1.75it/s]
100%|██████████| 6/6 [00:02<00:00,  2.04it/s]


Epoch [89], Train Loss : [0.26212] Val Loss : [0.30567] Val F1 Score : [0.52769]


100%|██████████| 23/23 [00:13<00:00,  1.76it/s]
100%|██████████| 6/6 [00:02<00:00,  2.05it/s]


Epoch [90], Train Loss : [0.27248] Val Loss : [0.30510] Val F1 Score : [0.51847]


100%|██████████| 23/23 [00:13<00:00,  1.76it/s]
100%|██████████| 6/6 [00:02<00:00,  2.03it/s]


Epoch [91], Train Loss : [0.26445] Val Loss : [0.30544] Val F1 Score : [0.52605]


100%|██████████| 23/23 [00:13<00:00,  1.73it/s]
100%|██████████| 6/6 [00:02<00:00,  2.03it/s]


Epoch [92], Train Loss : [0.27334] Val Loss : [0.30552] Val F1 Score : [0.52714]


100%|██████████| 23/23 [00:13<00:00,  1.73it/s]
100%|██████████| 6/6 [00:02<00:00,  2.02it/s]


Epoch [93], Train Loss : [0.26903] Val Loss : [0.30397] Val F1 Score : [0.51773]


100%|██████████| 23/23 [00:13<00:00,  1.75it/s]
100%|██████████| 6/6 [00:02<00:00,  2.02it/s]


Epoch [94], Train Loss : [0.26353] Val Loss : [0.30556] Val F1 Score : [0.52797]


100%|██████████| 23/23 [00:13<00:00,  1.76it/s]
100%|██████████| 6/6 [00:02<00:00,  2.03it/s]


Epoch [95], Train Loss : [0.26382] Val Loss : [0.30491] Val F1 Score : [0.52352]


100%|██████████| 23/23 [00:13<00:00,  1.74it/s]
100%|██████████| 6/6 [00:02<00:00,  2.02it/s]


Epoch [96], Train Loss : [0.26243] Val Loss : [0.30582] Val F1 Score : [0.52687]


100%|██████████| 23/23 [00:13<00:00,  1.75it/s]
100%|██████████| 6/6 [00:03<00:00,  1.99it/s]


Epoch [97], Train Loss : [0.26489] Val Loss : [0.30545] Val F1 Score : [0.52065]


100%|██████████| 23/23 [00:13<00:00,  1.76it/s]
100%|██████████| 6/6 [00:02<00:00,  2.03it/s]


Epoch [98], Train Loss : [0.26773] Val Loss : [0.30473] Val F1 Score : [0.52853]


100%|██████████| 23/23 [00:13<00:00,  1.75it/s]
100%|██████████| 6/6 [00:02<00:00,  2.02it/s]

Epoch [99], Train Loss : [0.26394] Val Loss : [0.30514] Val F1 Score : [0.51056]





# Choose Inference Threshold

In [23]:
def choose_threshold(model, val_loader, device):
    model.to(device)
    model.eval()
    
    thresholds = [0.1, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
    pred_labels = []
    true_labels = []
    
    best_score = 0
    best_thr = None
    with torch.no_grad():
        for _, x_s, y in tqdm(iter(val_loader)):
            
            model_pred = model(x_s)
            
            model_pred = model_pred.squeeze(1).to('cpu')
            pred_labels += model_pred.tolist()
            true_labels += y.tolist()
        
        for threshold in thresholds:
            pred_labels_thr = np.where(np.array(pred_labels) > threshold, 1, 0)
            score_thr = competition_metric(true_labels, pred_labels_thr)
            if best_score < score_thr:
                best_score = score_thr
                best_thr = threshold
    return best_thr, best_score

In [24]:
best_threshold, best_score = choose_threshold(best_student_model, val_loader, device)
print(f'Best Threshold : [{best_threshold}], Score : [{best_score:.5f}]')

100%|██████████| 6/6 [00:02<00:00,  2.02it/s]

Best Threshold : [0.2], Score : [0.55478]





# Inference

In [None]:
test_datasets = CustomDataset(test, None, False)
test_loaders = DataLoader(test_datasets, batch_size = CONFIG['BATCH_SIZE'], shuffle=False)

In [None]:
def inference(model, test_loader, threshold, device):
    model.to(device)
    model.eval()
    
    test_predict = []
    with torch.no_grad():
        for x in tqdm(test_loader):
            x = x.float().to(device)
            model_pred = model(x)

            model_pred = model_pred.squeeze(1).to('cpu')
            test_predict += model_pred
        
    test_predict = np.where(np.array(test_predict) > threshold, 1, 0)
    print('Done.')
    return test_predict

In [None]:
preds = inference(best_student_model, test_loaders, best_threshold, device)

# Submit

In [None]:
submit = pd.read_csv('data/sample_submission.csv')
submit['Y_LABEL'] = preds
submit.head()

In [None]:
submit.to_csv('./submit_GELU_epoch100_alpha0.7_0.4_0.35.csv', index=False)