## Import

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import random

import warnings
warnings.filterwarnings(action='ignore') 

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Hyperparameter setting

In [2]:
CFG = {
    'EPOCHS': 30,
    'LEARNING_RATE':1e-2,
    'BATCH_SIZE':256,
    'SEED':41
}

## Fixed RandomSeed

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED'])

## Data Load

In [4]:
train = pd.read_csv('train_oil.csv')
test = pd.read_csv('test_oil.csv')

## Data Preprocessing
#### 1. 결측치 처리
#### 2. Train / Validation 분할
#### 3. Data label-encoding, scaling

In [5]:
# 결측치 많은 값을 오버샘플링하면 너무 비정상 값이 많을 것 같아서 K와 CD만 추가!
train.drop(['FH2O', 'FNOX', 'FOPTIMETHGLY', 'FOXID', 'FSO4', 'FTBN', 'FUEL','SOOTPERCENTAGE', 'U100', 'U75', 'U50', 'U25', 'U20', 'U14', 'U6', 'U4', 'V100'],axis=1,inplace=True)

In [6]:
categorical_features = ['COMPONENT_ARBITRARY', 'YEAR']
# Inference(실제 진단 환경)에 사용하는 컬럼
test_stage_features = ['COMPONENT_ARBITRARY', 'ANONYMOUS_1', 'YEAR' , 'ANONYMOUS_2', 'AG', 'CO', 'CR', 'CU', 'FE', 'H2O', 'MN', 'MO', 'NI', 'PQINDEX', 'TI', 'V', 'V40', 'ZN']

In [7]:
train = train.fillna(0)
test = test.fillna(0)

In [8]:
pd.set_option('display.max_columns',None)
train.describe()

Unnamed: 0,ANONYMOUS_1,YEAR,SAMPLE_TRANSFER_DAY,ANONYMOUS_2,AG,AL,B,BA,BE,CA,CD,CO,CR,CU,FE,H2O,K,LI,MG,MN,MO,NA,NI,P,PB,PQINDEX,S,SB,SI,SN,TI,V,V40,ZN,Y_LABEL
count,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0,14095.0
mean,3146.082937,2013.652501,7.600568,387.416885,0.025825,12.707698,64.026179,0.692799,0.006314,1366.757574,0.014048,0.028166,3.299468,34.560624,178.16928,0.03365,3.022206,0.242568,33.477545,2.787939,23.992409,3.946222,0.730117,909.662788,1.659383,415.159631,12029.318624,0.476978,35.058248,0.909826,0.707911,0.050656,109.355815,588.646825,0.085349
std,4216.089809,3.964758,11.681628,550.016073,0.171926,86.968,102.876871,2.905491,0.152189,1481.924727,0.198836,0.314249,28.59551,128.958953,533.208976,0.809162,14.360998,2.190055,116.526762,11.131836,59.833922,17.914325,3.806716,564.388965,10.31913,1528.191012,9325.610196,2.729936,195.329029,3.604276,6.897579,0.475438,49.612379,531.743393,0.279411
min,1000.0,2007.0,0.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,386.0,0.0,0.0,0.0,0.0,0.0,2.9,0.0,0.0
25%,1655.0,2010.0,3.0,200.0,0.0,1.0,3.0,0.0,0.0,48.0,0.0,0.0,0.0,1.0,14.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,448.0,0.0,12.0,4440.5,0.0,3.0,0.0,0.0,0.0,71.8,37.0,0.0
50%,2227.0,2014.0,5.0,200.0,0.0,2.0,11.0,0.0,0.0,198.0,0.0,0.0,1.0,4.0,41.0,0.0,1.0,0.0,6.0,1.0,1.0,2.0,0.0,916.0,0.0,29.0,8034.0,0.0,6.0,0.0,0.0,0.0,111.3,520.0,0.0
75%,3797.0,2017.0,8.0,410.0,0.0,4.0,110.0,0.0,0.0,2975.0,0.0,0.0,3.0,17.0,139.0,0.0,3.0,0.0,13.0,2.0,7.0,4.0,0.0,1101.0,2.0,181.0,19750.0,0.0,12.0,1.0,0.0,0.0,137.2,1119.0,0.0
max,294451.0,2022.0,368.0,9650.0,3.0,4630.0,2051.0,216.0,9.0,6609.0,18.0,27.0,2398.0,5701.0,31706.0,52.7,705.0,117.0,1509.0,877.0,419.0,1212.0,176.0,3102.0,736.0,56761.0,64160.0,103.0,5459.0,289.0,403.0,17.0,2840.5,2132.0,1.0


In [9]:
pd.set_option('display.max_columns',None)
test.describe()

Unnamed: 0,ANONYMOUS_1,YEAR,ANONYMOUS_2,AG,CO,CR,CU,FE,H2O,MN,MO,NI,PQINDEX,TI,V,V40,ZN
count,6041.0,6041.0,6041.0,6041.0,6041.0,6041.0,6041.0,6041.0,6041.0,6041.0,6041.0,6041.0,6041.0,6041.0,6041.0,6041.0,6041.0
mean,3199.140871,2013.656514,392.669922,0.026817,0.03079,3.155438,37.397285,198.615295,0.024599,3.127462,25.042212,0.883463,475.09121,0.797219,0.044529,110.495135,581.186393
std,4834.84993,3.976759,563.70529,0.179981,0.268141,21.120731,153.922102,690.985612,0.335461,11.315785,62.245538,5.708857,1812.652841,7.366433,0.359595,45.381679,530.79904
min,1000.0,2007.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.9,1.0
25%,1667.0,2010.0,200.0,0.0,0.0,0.0,1.0,15.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,73.0,38.0
50%,2271.0,2014.0,200.0,0.0,0.0,1.0,4.0,41.0,0.0,1.0,1.0,0.0,30.0,0.0,0.0,112.1,489.0
75%,3842.0,2017.0,417.0,0.0,0.0,2.0,17.0,143.0,0.0,2.0,8.0,0.0,173.0,0.0,0.0,138.2,1115.0
max,239570.0,2022.0,9650.0,2.0,13.0,1415.0,4113.0,30407.0,15.2,472.0,765.0,155.0,48976.0,280.0,13.0,388.4,1932.0


In [15]:
train[train.ANONYMOUS_1 > train.ANONYMOUS_1.mean()+(3*train.ANONYMOUS_1.std())]

Unnamed: 0,ID,COMPONENT_ARBITRARY,ANONYMOUS_1,YEAR,SAMPLE_TRANSFER_DAY,ANONYMOUS_2,AG,AL,B,BA,BE,CA,CD,CO,CR,CU,FE,H2O,K,LI,MG,MN,MO,NA,NI,P,PB,PQINDEX,S,SB,SI,SN,TI,V,V40,ZN,Y_LABEL
75,TRAIN_00075,COMPONENT3,21986,2015,14,200,0,0,163,0,0,6,0.0,0,0,0,11,0.0,2.0,0,0,0,0,1,0,932,0,32,19180,0,0,0,0,0,132.8,7,0
172,TRAIN_00172,COMPONENT4,63454,2017,6,900,0,2,2,1,0,2690,0.0,0,0,10,24,0.0,8.0,0,12,8,7,38,0,1360,0,10,7425,0,7,0,0,0,64.7,1628,0
212,TRAIN_00212,COMPONENT3,78418,2017,26,200,0,0,0,0,0,48,0.0,0,0,1,17,0.0,0.0,0,11,0,0,0,0,850,0,16,19870,0,0,0,0,0,134.0,35,0
589,TRAIN_00589,COMPONENT3,39411,2016,3,200,0,3,450,0,0,2987,0.0,0,0,0,64,0.0,0.0,0,0,0,0,0,0,211,0,111,19960,0,6,0,0,0,179.7,39,0
632,TRAIN_00632,COMPONENT3,22327,2012,5,200,0,3,368,0,0,2558,0.0,0,0,0,38,0.0,0.0,0,0,0,0,2,0,861,0,368,16360,0,1,0,0,0,130.8,6,0
860,TRAIN_00860,COMPONENT3,22087,2013,7,200,0,0,2,0,0,25,0.0,0,2,1,217,0.0,0.0,0,6,2,0,6,0,686,0,7079,24900,0,5,0,0,0,138.5,56,0
1109,TRAIN_01109,COMPONENT1,17832,2010,5,200,0,0,2,0,0,228,0.0,0,0,0,6,0.0,7.0,0,253,0,0,9,0,1121,2,12,6332,1,4,0,0,0,95.6,1259,0
1225,TRAIN_01225,COMPONENT3,56788,2016,3,200,0,3,468,0,0,3091,0.0,0,3,8,228,0.0,2.0,0,0,3,0,0,2,317,0,115,8453,0,10,0,0,0,194.5,72,0
1590,TRAIN_01590,COMPONENT2,39468,2013,6,200,0,0,0,0,0,36,0.0,0,0,5,6,0.0,0.0,18,4,0,0,2,0,411,0,6,1866,0,5,0,0,0,51.2,614,0
1753,TRAIN_01753,COMPONENT2,20098,2013,3,200,0,1,149,0,0,54,0.0,0,0,153,6,0.0,0.0,0,2,0,0,0,0,455,1,20,1217,0,0,0,0,0,44.2,653,0


In [42]:
all_X = train.drop(['ID', 'Y_LABEL'], axis = 1)
all_y = train['Y_LABEL']

test = test.drop(['ID'], axis = 1)

train_X, val_X, train_y, val_y = train_test_split(all_X, all_y, test_size=0.2, random_state=CFG['SEED'], stratify=all_y)

In [43]:
def get_values(value):
    return value.values.reshape(-1, 1)

for col in train_X.columns:
    if col not in categorical_features:
        scaler = StandardScaler()
        train_X[col] = scaler.fit_transform(get_values(train_X[col]))
        val_X[col] = scaler.transform(get_values(val_X[col]))
        if col in test.columns:
            test[col] = scaler.transform(get_values(test[col]))
            
le = LabelEncoder()
for col in categorical_features:    
    train_X[col] = le.fit_transform(train_X[col])
    val_X[col] = le.transform(val_X[col])
    if col in test.columns:
        test[col] = le.transform(test[col])

## CustomDataset

In [44]:
class CustomDataset(Dataset):
    def __init__(self, data_X, data_y, distillation=False):
        super(CustomDataset, self).__init__()
        self.data_X = data_X
        self.data_y = data_y
        self.distillation = distillation
        
    def __len__(self):
        return len(self.data_X)
    
    def __getitem__(self, index):
        if self.distillation:
            # 지식 증류 학습 시
            teacher_X = torch.Tensor(self.data_X.iloc[index])
            student_X = torch.Tensor(self.data_X[test_stage_features].iloc[index])
            y = self.data_y.values[index]
            return teacher_X, student_X, y
        else:
            if self.data_y is None:
                test_X = torch.Tensor(self.data_X.iloc[index])
                return test_X
            else:
                teacher_X = torch.Tensor(self.data_X.iloc[index])
                y = self.data_y.values[index]
                return teacher_X, y

In [45]:
train_dataset = CustomDataset(train_X, train_y, False)
val_dataset = CustomDataset(val_X, val_y, False)

In [46]:
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False)

## Define Teacher Model

In [47]:
class Teacher(nn.Module):
    def __init__(self):
        super(Teacher, self).__init__()
        self.classifier = nn.Sequential(
            nn.Linear(in_features=35, out_features=128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(),
            nn.Linear(in_features=128, out_features=512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(),
            nn.Linear(in_features=512, out_features=1024),
            nn.BatchNorm1d(1024),
            nn.LeakyReLU(),
            nn.Linear(in_features=1024, out_features=256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(),
            nn.Linear(in_features=256, out_features=1),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        output = self.classifier(x)
        return output

## Teacher Train / Validation

In [48]:
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    model.to(device)

    best_score = 0
    best_model = None
    criterion = nn.BCELoss().to(device)

    for epoch in range(CFG["EPOCHS"]):
        train_loss = []
  
        model.train()
        for X, y in tqdm(train_loader):
            X = X.float().to(device)
            y = y.float().to(device)
            
            optimizer.zero_grad()
            
            y_pred = model(X)
            
            loss = criterion(y_pred, y.reshape(-1, 1))
            loss.backward()
            
            optimizer.step()

            train_loss.append(loss.item())

        val_loss, val_score = validation_teacher(model, val_loader, criterion, device)
        print(f'Epoch [{epoch}], Train Loss : [{np.mean(train_loss) :.5f}] Val Loss : [{np.mean(val_loss) :.5f}] Val F1 Score : [{val_score:.5f}]')

        if scheduler is not None:
            scheduler.step(val_score)
            
        if best_score < val_score:
            best_model = model
            best_score = val_score
        
    return best_model 

In [49]:
def competition_metric(true, pred):
    return f1_score(true, pred, average="macro")

def validation_teacher(model, val_loader, criterion, device):
    model.eval()

    val_loss = []
    pred_labels = []
    true_labels = []
    threshold = 0.35
    
    with torch.no_grad():
        for X, y in tqdm(val_loader):
            X = X.float().to(device)
            y = y.float().to(device)
            
            model_pred = model(X.to(device))
            
            loss = criterion(model_pred, y.reshape(-1, 1))
            val_loss.append(loss.item())      
            
            model_pred = model_pred.squeeze(1).to('cpu')  
            pred_labels += model_pred.tolist()
            true_labels += y.tolist()
        
        pred_labels = np.where(np.array(pred_labels) > threshold, 1, 0)
        val_f1 = competition_metric(true_labels, pred_labels)
    return val_loss, val_f1   

## Run (Teacher Model)

In [50]:
model = Teacher()
model.eval()
optimizer = torch.optim.Adam(model.parameters(), lr=CFG['LEARNING_RATE'])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=1, threshold_mode='abs',min_lr=1e-8, verbose=True)

teacher_model = train(model, optimizer, train_loader, val_loader, scheduler, device)

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [0], Train Loss : [0.26782] Val Loss : [0.27186] Val F1 Score : [0.75879]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [1], Train Loss : [0.19260] Val Loss : [0.31101] Val F1 Score : [0.79285]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [2], Train Loss : [0.18785] Val Loss : [0.31456] Val F1 Score : [0.80016]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [3], Train Loss : [0.16776] Val Loss : [0.27876] Val F1 Score : [0.78776]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [4], Train Loss : [0.16931] Val Loss : [0.26094] Val F1 Score : [0.79641]
Epoch 00005: reducing learning rate of group 0 to 5.0000e-03.


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [5], Train Loss : [0.16309] Val Loss : [0.27255] Val F1 Score : [0.79705]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [6], Train Loss : [0.15244] Val Loss : [0.23913] Val F1 Score : [0.79501]
Epoch 00007: reducing learning rate of group 0 to 2.5000e-03.


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [7], Train Loss : [0.14008] Val Loss : [0.23036] Val F1 Score : [0.79375]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [8], Train Loss : [0.14011] Val Loss : [0.27070] Val F1 Score : [0.78978]
Epoch 00009: reducing learning rate of group 0 to 1.2500e-03.


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [9], Train Loss : [0.14085] Val Loss : [0.24555] Val F1 Score : [0.79885]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [10], Train Loss : [0.13377] Val Loss : [0.27241] Val F1 Score : [0.77660]
Epoch 00011: reducing learning rate of group 0 to 6.2500e-04.


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [11], Train Loss : [0.14025] Val Loss : [0.26832] Val F1 Score : [0.79655]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [12], Train Loss : [0.13750] Val Loss : [0.25764] Val F1 Score : [0.79297]
Epoch 00013: reducing learning rate of group 0 to 3.1250e-04.


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [13], Train Loss : [0.13176] Val Loss : [0.26263] Val F1 Score : [0.79289]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [14], Train Loss : [0.12561] Val Loss : [0.25045] Val F1 Score : [0.79237]
Epoch 00015: reducing learning rate of group 0 to 1.5625e-04.


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [15], Train Loss : [0.12468] Val Loss : [0.25774] Val F1 Score : [0.78087]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [16], Train Loss : [0.12480] Val Loss : [0.24378] Val F1 Score : [0.79193]
Epoch 00017: reducing learning rate of group 0 to 7.8125e-05.


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [17], Train Loss : [0.12656] Val Loss : [0.26085] Val F1 Score : [0.78282]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [18], Train Loss : [0.12335] Val Loss : [0.24400] Val F1 Score : [0.79691]
Epoch 00019: reducing learning rate of group 0 to 3.9063e-05.


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [19], Train Loss : [0.12885] Val Loss : [0.26075] Val F1 Score : [0.78229]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [20], Train Loss : [0.12539] Val Loss : [0.25835] Val F1 Score : [0.78342]
Epoch 00021: reducing learning rate of group 0 to 1.9531e-05.


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [21], Train Loss : [0.12790] Val Loss : [0.25635] Val F1 Score : [0.78589]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [22], Train Loss : [0.12327] Val Loss : [0.25283] Val F1 Score : [0.78812]
Epoch 00023: reducing learning rate of group 0 to 9.7656e-06.


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [23], Train Loss : [0.12271] Val Loss : [0.24614] Val F1 Score : [0.79398]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [24], Train Loss : [0.12760] Val Loss : [0.24548] Val F1 Score : [0.78647]
Epoch 00025: reducing learning rate of group 0 to 4.8828e-06.


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [25], Train Loss : [0.12384] Val Loss : [0.25981] Val F1 Score : [0.78691]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [26], Train Loss : [0.12525] Val Loss : [0.26248] Val F1 Score : [0.78632]
Epoch 00027: reducing learning rate of group 0 to 2.4414e-06.


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [27], Train Loss : [0.12650] Val Loss : [0.25077] Val F1 Score : [0.79999]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [28], Train Loss : [0.12448] Val Loss : [0.25828] Val F1 Score : [0.78895]
Epoch 00029: reducing learning rate of group 0 to 1.2207e-06.


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [29], Train Loss : [0.12529] Val Loss : [0.26102] Val F1 Score : [0.80080]


## Define Student Model

In [62]:
class Student(nn.Module):
    def __init__(self):
        super(Student, self).__init__()
        self.classifier = nn.Sequential(
            nn.Linear(in_features=18, out_features=128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(),
            nn.Linear(in_features=128, out_features=256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(),
            nn.Linear(in_features=256, out_features=512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(),
            nn.Linear(in_features=512, out_features=128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(),
            nn.Linear(in_features=128, out_features=1),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        output = self.classifier(x)
        return output

## Define Knowledge distillation Loss

In [63]:
def distillation(student_logits, labels, teacher_logits, alpha):
    distillation_loss = nn.BCELoss()(student_logits, teacher_logits)
    student_loss = nn.BCELoss()(student_logits, labels.reshape(-1, 1))
    return alpha * student_loss + (1-alpha) * distillation_loss

In [64]:
def distill_loss(output, target, teacher_output, loss_fn=distillation, opt=optimizer):
    loss_b = loss_fn(output, target, teacher_output, alpha=0.1)

    if opt is not None:
        opt.zero_grad()
        loss_b.backward()
        opt.step()

    return loss_b.item()

## Student Train / Validation

In [65]:
def student_train(s_model, t_model, optimizer, train_loader, val_loader, scheduler, device):
    s_model.to(device)
    t_model.to(device)
    
    best_score = 0
    best_model = None

    for epoch in range(CFG["EPOCHS"]):
        train_loss = []
        s_model.train()
        t_model.eval()
        
        for X_t, X_s, y in tqdm(train_loader):
            X_t = X_t.float().to(device)
            X_s = X_s.float().to(device)
            y = y.float().to(device)
            
            optimizer.zero_grad()

            output = s_model(X_s)
            with torch.no_grad():
                teacher_output = t_model(X_t)
                
            loss_b = distill_loss(output, y, teacher_output, loss_fn=distillation, opt=optimizer)

            train_loss.append(loss_b)

        val_loss, val_score = validation_student(s_model, t_model, val_loader, distill_loss, device)
        print(f'Epoch [{epoch}], Train Loss : [{np.mean(train_loss) :.5f}] Val Loss : [{np.mean(val_loss) :.5f}] Val F1 Score : [{val_score:.5f}]')
        
        if scheduler is not None:
            scheduler.step(val_score)
            
        if best_score < val_score:
            best_model = s_model
            best_score = val_score
        
    return best_model

In [66]:
def validation_student(s_model, t_model, val_loader, criterion, device):
    s_model.eval()
    t_model.eval()

    val_loss = []
    pred_labels = []
    true_labels = []
    threshold = 0.35
    
    with torch.no_grad():
        for X_t, X_s, y in tqdm(val_loader):
            X_t = X_t.float().to(device)
            X_s = X_s.float().to(device)
            y = y.float().to(device)
            
            model_pred = s_model(X_s)
            teacher_output = t_model(X_t)
            
            loss_b = distill_loss(model_pred, y, teacher_output, loss_fn=distillation, opt=None)
            val_loss.append(loss_b)
            
            model_pred = model_pred.squeeze(1).to('cpu')
            pred_labels += model_pred.tolist()
            true_labels += y.tolist()
        
        pred_labels = np.where(np.array(pred_labels) > threshold, 1, 0)
        val_f1 = competition_metric(true_labels, pred_labels)
    return val_loss, val_f1    

## Run (Student Model)

In [67]:
train_dataset = CustomDataset(train_X, train_y, True)
val_dataset = CustomDataset(val_X, val_y, True)

train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False)

In [68]:
student_model = Student()
student_model.eval()
optimizer = torch.optim.Adam(student_model.parameters(), lr=CFG['LEARNING_RATE'])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=1, threshold_mode='abs',min_lr=1e-8, verbose=True)

best_student_model = student_train(student_model, teacher_model, optimizer, train_loader, val_loader, scheduler, device)

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [0], Train Loss : [0.30228] Val Loss : [0.28550] Val F1 Score : [0.47767]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [1], Train Loss : [0.27785] Val Loss : [0.28447] Val F1 Score : [0.48849]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [2], Train Loss : [0.28122] Val Loss : [0.28934] Val F1 Score : [0.48167]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [3], Train Loss : [0.27365] Val Loss : [0.28648] Val F1 Score : [0.48994]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [4], Train Loss : [0.26896] Val Loss : [0.27899] Val F1 Score : [0.48491]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [5], Train Loss : [0.26961] Val Loss : [0.28166] Val F1 Score : [0.49243]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [6], Train Loss : [0.27131] Val Loss : [0.28163] Val F1 Score : [0.48121]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [7], Train Loss : [0.27504] Val Loss : [0.28278] Val F1 Score : [0.48792]
Epoch 00008: reducing learning rate of group 0 to 5.0000e-03.


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [8], Train Loss : [0.26729] Val Loss : [0.27862] Val F1 Score : [0.48994]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [9], Train Loss : [0.26742] Val Loss : [0.27910] Val F1 Score : [0.50189]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [10], Train Loss : [0.27031] Val Loss : [0.27656] Val F1 Score : [0.51030]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [11], Train Loss : [0.26475] Val Loss : [0.27931] Val F1 Score : [0.50170]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [12], Train Loss : [0.26450] Val Loss : [0.27897] Val F1 Score : [0.51508]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [13], Train Loss : [0.26949] Val Loss : [0.28197] Val F1 Score : [0.51865]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [14], Train Loss : [0.27175] Val Loss : [0.28629] Val F1 Score : [0.50021]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [15], Train Loss : [0.27109] Val Loss : [0.27840] Val F1 Score : [0.50507]
Epoch 00016: reducing learning rate of group 0 to 2.5000e-03.


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [16], Train Loss : [0.26276] Val Loss : [0.27638] Val F1 Score : [0.50964]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [17], Train Loss : [0.26026] Val Loss : [0.27413] Val F1 Score : [0.50639]
Epoch 00018: reducing learning rate of group 0 to 1.2500e-03.


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [18], Train Loss : [0.26257] Val Loss : [0.27536] Val F1 Score : [0.50791]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [19], Train Loss : [0.26243] Val Loss : [0.27491] Val F1 Score : [0.51123]
Epoch 00020: reducing learning rate of group 0 to 6.2500e-04.


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [20], Train Loss : [0.25774] Val Loss : [0.27643] Val F1 Score : [0.51168]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [21], Train Loss : [0.26306] Val Loss : [0.27531] Val F1 Score : [0.52471]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [22], Train Loss : [0.26205] Val Loss : [0.27608] Val F1 Score : [0.51748]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [23], Train Loss : [0.26145] Val Loss : [0.27619] Val F1 Score : [0.51964]
Epoch 00024: reducing learning rate of group 0 to 3.1250e-04.


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [24], Train Loss : [0.25923] Val Loss : [0.27554] Val F1 Score : [0.51168]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [25], Train Loss : [0.25869] Val Loss : [0.27591] Val F1 Score : [0.51190]
Epoch 00026: reducing learning rate of group 0 to 1.5625e-04.


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [26], Train Loss : [0.25721] Val Loss : [0.27474] Val F1 Score : [0.51797]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [27], Train Loss : [0.25590] Val Loss : [0.27575] Val F1 Score : [0.52325]
Epoch 00028: reducing learning rate of group 0 to 7.8125e-05.


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [28], Train Loss : [0.25553] Val Loss : [0.27642] Val F1 Score : [0.52091]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [29], Train Loss : [0.26423] Val Loss : [0.27578] Val F1 Score : [0.51426]
Epoch 00030: reducing learning rate of group 0 to 3.9063e-05.


## Choose Inference Threshold

In [69]:
def choose_threshold(model, val_loader, device):
    model.to(device)
    model.eval()
    
    thresholds = [0.1, 0.175, 0.2, 0.225, 0.25, 0.3, 0.4 ,0.5]
    pred_labels = []
    true_labels = []
    
    best_score = 0
    best_thr = None
    with torch.no_grad():
        for _, x_s, y in tqdm(iter(val_loader)):
            x_s = x_s.float().to(device)
            y = y.float().to(device)
            
            model_pred = model(x_s)
            
            model_pred = model_pred.squeeze(1).to('cpu')
            pred_labels += model_pred.tolist()
            true_labels += y.tolist()
        
        for threshold in thresholds:
            pred_labels_thr = np.where(np.array(pred_labels) > threshold, 1, 0)
            score_thr = competition_metric(true_labels, pred_labels_thr)
            if best_score < score_thr:
                best_score = score_thr
                best_thr = threshold
    return best_thr, best_score

In [71]:
best_threshold, best_score = choose_threshold(best_student_model, val_loader, device)
print(f'Best Threshold : [{best_threshold}], Score : [{best_score:.5f}]')

  0%|          | 0/12 [00:00<?, ?it/s]

Best Threshold : [0.175], Score : [0.54571]


## Inference

In [72]:
test_datasets = CustomDataset(test, None, False)
test_loaders = DataLoader(test_datasets, batch_size = CFG['BATCH_SIZE'], shuffle=False)

In [73]:
def inference(model, test_loader, threshold, device):
    model.to(device)
    model.eval()
    
    test_predict = []
    with torch.no_grad():
        for x in tqdm(test_loader):
            x = x.float().to(device)
            model_pred = model(x)

            model_pred = model_pred.squeeze(1).to('cpu')
            test_predict += model_pred
        
    test_predict = np.where(np.array(test_predict) > threshold, 1, 0)
    print('Done.')
    return test_predict

In [74]:
preds = inference(best_student_model, test_loaders, best_threshold, device)

  0%|          | 0/24 [00:00<?, ?it/s]

Done.


## Submit

In [75]:
submit = pd.read_csv('submission_oil.csv')
submit['Y_LABEL'] = preds
submit.head()

Unnamed: 0,ID,Y_LABEL
0,TEST_0000,0
1,TEST_0001,0
2,TEST_0002,0
3,TEST_0003,0
4,TEST_0004,0
