In [1]:
import os
# Restrict this process to the GPU with physical index 3. This cell runs before
# torch is imported in the next cell; the recorded output there reports a single
# visible device.
# NOTE(review): the index is machine-specific — confirm it before running elsewhere.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import os
import pandas as pd
import numpy as np
# from tqdm.auto import tqdm
from tqdm import tqdm
import random

import warnings
warnings.filterwarnings(action='ignore') 

# Use the GPU when one is usable, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
print(torch.cuda.device_count())

cuda
1


In [3]:
# Hyper-parameters read by the cells below (training length, optimiser step
# size, mini-batch size and the RNG seed).
CONFIG = dict(
    EPOCHS=20,
    LEARNING_RATE=1e-3,
    BATCH_SIZE=512,
    SEED=101,
)

In [4]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and configures cuDNN for reproducible runs.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed every other visible GPU; silently ignored when CUDA is absent.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick algorithms that may differ
    # between runs, which defeats the deterministic flag above. Keep it off.
    torch.backends.cudnn.benchmark = False

# Seed all generators once, using the seed stored in CONFIG.
seed_everything(CONFIG['SEED'])

In [5]:
# Read the training and test tables from the current working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))

In [6]:
# Columns that are label-encoded instead of standardised.
categorical_features = ['COMPONENT_ARBITRARY', 'YEAR']

# Columns used at inference time (the real diagnosis environment).
test_stage_features = [
    'COMPONENT_ARBITRARY', 'ANONYMOUS_1', 'YEAR', 'ANONYMOUS_2',
    'AG', 'CO', 'CR', 'CU', 'FE', 'H2O', 'MN', 'MO', 'NI',
    'PQINDEX', 'TI', 'V', 'V40', 'ZN',
]

In [7]:
# Replace every missing value with 0 in both tables.
train, test = train.fillna(0), test.fillna(0)

In [8]:
# Separate the target from the features; 'ID' is removed from both tables.
all_y = train['Y_LABEL']
all_X = train.drop(columns=['ID', 'Y_LABEL'])

test = test.drop(columns=['ID'])

# Hold out 20% of the rows for validation, stratified on the label and seeded
# from CONFIG.
train_X, val_X, train_y, val_y = train_test_split(
    all_X,
    all_y,
    test_size=0.2,
    stratify=all_y,
    random_state=CONFIG['SEED'],
)

In [9]:
# Integer-encode the categorical columns. The encoder is fitted on the training
# split only and the fitted mapping is then applied to the validation split and,
# when the column exists there, to the test table.
# NOTE(review): the same LabelEncoder instance is re-fitted for every column, so
# after the loop `le` only holds the mapping of the last column.
# NOTE(review): presumably val/test contain no category missing from the
# training split; LabelEncoder.transform is expected to raise otherwise — confirm.
le = LabelEncoder()
for col in categorical_features:    
    train_X[col] = le.fit_transform(train_X[col])
    val_X[col] = le.transform(val_X[col])
    if col in test.columns:
        test[col] = le.transform(test[col])

In [10]:
# Standardise every non-categorical column. The scaler is fitted on the training
# split only and reused unchanged for the validation split.
scaler = StandardScaler()
scaled_features = train_X.drop(columns=categorical_features).columns

# ---- TRAIN DATA
trainX_scaled = pd.DataFrame(
    scaler.fit_transform(train_X.drop(columns=categorical_features)),
    columns=scaled_features,
)
# Re-attach the (already encoded) categorical columns on a fresh 0..n-1 index,
# then restore the original column order.
trainX_scaled[categorical_features] = train_X[categorical_features].reset_index(drop=True)
trainX_scaled = trainX_scaled[train_X.columns]

# ---- VAL DATA
valX_scaled = pd.DataFrame(
    scaler.transform(val_X.drop(columns=categorical_features)),
    columns=scaled_features,
)
valX_scaled[categorical_features] = val_X[categorical_features].reset_index(drop=True)
valX_scaled = valX_scaled[val_X.columns]

# ---- LABEL TRAIN and VAL
# Labels become one-column frames whose index lines up with the scaled features.
train_y, val_y = (
    pd.DataFrame(labels).reset_index(drop=True) for labels in (train_y, val_y)
)

In [11]:
class CustomDataset(Dataset):
    """Tabular dataset serving float32 feature vectors (and labels) as tensors.

    Three modes, selected by the constructor arguments:

    * ``distillation=True``  -> ``(teacher_X, student_X, y)``
    * ``data_y`` given       -> ``(teacher_X, y)``
    * ``data_y is None``     -> ``teacher_X`` only (inference)

    ``teacher_X`` holds every column of ``data_X``; ``student_X`` holds only the
    student columns. Every returned tensor is placed on the target device.

    Args:
        data_X: pandas DataFrame of numeric features.
        data_y: pandas DataFrame or Series of labels, or ``None``.
        distillation: Also return the student view of the features.
        target_device: Device for the returned tensors. Defaults to the
            notebook-level ``device``.
        student_features: Column names of the student view. Defaults to the
            notebook-level ``test_stage_features``.

    Raises:
        ValueError: If ``distillation`` is true but no labels were given.
    """

    def __init__(self, data_X, data_y, distillation=False, target_device=None, student_features=None):
        super(CustomDataset, self).__init__()
        if distillation and data_y is None:
            raise ValueError("distillation=True requires labels (data_y must not be None)")

        self.data_X = data_X
        self.data_y = data_y
        self.distillation = distillation
        # Fall back to the notebook globals only when no override is supplied.
        self.device = device if target_device is None else target_device

        # Convert the frames once here instead of doing a pandas row lookup and
        # a tensor conversion for every single sample.
        self._teacher_X = torch.tensor(data_X.to_numpy(dtype=np.float32))

        self._student_X = None
        if distillation:
            columns = test_stage_features if student_features is None else student_features
            self._student_X = torch.tensor(data_X[list(columns)].to_numpy(dtype=np.float32))

        self._y = None
        if data_y is not None:
            labels = np.asarray(data_y.values, dtype=np.float32)
            if labels.ndim == 1:
                # A Series yields scalars per row; keep the (1,) label shape
                # that a one-column DataFrame produces.
                labels = labels.reshape(-1, 1)
            self._y = torch.tensor(labels)

    def __len__(self):
        return len(self._teacher_X)

    def _on_device(self, tensor):
        """Return an independent copy of ``tensor`` on the target device."""
        return tensor.to(self.device, copy=True)

    def __getitem__(self, index):
        teacher_X = self._on_device(self._teacher_X[index])

        if self.distillation:
            #------------------------------------
            # When learning with distillation
            #------------------------------------
            return  teacher_X, \
                    self._on_device(self._student_X[index]), \
                    self._on_device(self._y[index])

        #------------------------------------
        # When learning with normal data
        #------------------------------------
        if self._y is None:
            return teacher_X
        return teacher_X, self._on_device(self._y[index])

In [12]:
# Teacher-only datasets: distillation is switched off, so no student view is built.
train_dataset = CustomDataset(trainX_scaled, train_y, distillation=False)
val_dataset = CustomDataset(valX_scaled, val_y, distillation=False)

In [13]:
# Only the training loader shuffles; validation batches keep a fixed order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=CONFIG['BATCH_SIZE'])
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=CONFIG['BATCH_SIZE'])

In [14]:
class Teacher(nn.Module):
    """Teacher MLP: in_features -> 256 -> 1024 -> 256 -> 1 with a sigmoid output.

    Each hidden linear layer is followed by batch normalisation and LeakyReLU.
    The output is a probability in [0, 1] per row, as required by ``BCELoss``.

    Args:
        in_features: Width of the input vectors. The default of 52 matches the
            feature vector shown for ``train_dataset[0]`` in this notebook.
    """

    def __init__(self, in_features=52):
        super(Teacher, self).__init__()
        # Layer order is unchanged so existing state_dict keys stay valid.
        self.classifier = nn.Sequential(
            nn.Linear(in_features=in_features, out_features=256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(),
            nn.Linear(in_features=256, out_features=1024),
            nn.BatchNorm1d(1024),
            nn.LeakyReLU(),
            nn.Linear(in_features=1024, out_features=256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(),
            nn.Linear(in_features=256, out_features=1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Map a ``(batch, in_features)`` tensor to ``(batch, 1)`` probabilities."""
        output = self.classifier(x)
        return output

In [15]:
#--------------------------------------------------------------------------------
# METRICS
#--------------------------------------------------------------------------------
def competition_metric(true, pred):
    """Return the macro-averaged F1 score of `pred` against `true`.

    Thin wrapper around `f1_score` called with `average="macro"`.
    """
    return f1_score(true, pred, average="macro")


#--------------------------------------------------------------------------------
# TRAINING
#--------------------------------------------------------------------------------
def teacher_train(model, optimizer, train_loader, val_loader, scheduler):
    """Train `model` with BCE loss and return it with its best-epoch weights.

    Runs ``CONFIG["EPOCHS"]`` epochs. After each epoch the model is evaluated
    with ``validation_teacher``; the weights of the epoch with the highest
    validation F1 are snapshotted and loaded back into `model` at the end.

    Batches are used exactly as the loaders yield them, so they must already be
    float tensors on `device`.

    Args:
        model: Network whose output is a probability per row.
        optimizer: Optimiser already bound to ``model.parameters()``.
        train_loader: Loader yielding ``(X, y)`` training batches.
        val_loader: Loader yielding ``(X, y)`` validation batches.
        scheduler: Optional scheduler stepped with the validation F1 score
            after every epoch; pass ``None`` to disable.

    Returns:
        `model` carrying the best-scoring weights, or ``None`` when no epoch
        was run.
    """
    import copy  # local import keeps this block independent of the imports cell

    model.to(device)

    # Start below any possible F1 so that the first epoch is always recorded.
    best_score = float('-inf')
    best_state = None
    criterion = nn.BCELoss().to(device)

    for epoch in range(CONFIG["EPOCHS"]):
        train_loss = []

        # ACTIVATE TRAINING MODE
        # --- normalisation layers use per-batch statistics
        # --- activates Dropout layers
        model.train()

        for X, y in tqdm(train_loader):
            # ZERO GRADIENT
            optimizer.zero_grad()

            # FORWARD
            y_pred = model(X)
            loss = criterion(y_pred, y)

            # BACKWARD
            loss.backward()

            # UPDATE
            optimizer.step()

            train_loss.append(loss.item())

        val_loss, val_score = validation_teacher(model, val_loader, criterion)
        print(f'Epoch [{epoch}], Train Loss : [{np.mean(train_loss) :.5f}] Val Loss : [{np.mean(val_loss) :.5f}] Val F1 Score : [{val_score:.5f}]')

        if scheduler is not None:
            scheduler.step(val_score)

        if best_score < val_score:
            best_score = val_score
            # state_dict() returns references to the live parameters, which
            # later optimiser steps overwrite; deep-copy to freeze this epoch.
            best_state = copy.deepcopy(model.state_dict())

    if best_state is None:
        return None

    model.load_state_dict(best_state)
    return model


#--------------------------------------------------------------------------------
# VALIDATION
#--------------------------------------------------------------------------------
def validation_teacher(model, val_loader, criterion):
    """Evaluate `model` on `val_loader`.

    Args:
        model: Network returning one probability per row, shaped ``(batch, 1)``.
        val_loader: Loader yielding ``(X, y)`` batches.
        criterion: Loss called as ``criterion(prediction, y)``.

    Returns:
        Tuple ``(val_loss, val_f1)``: the list of per-batch loss values and the
        macro F1 of the predictions thresholded at 0.5.
    """
    threshold = 0.5
    batch_losses, probabilities, targets = [], [], []

    # ACTIVATE EVALUATION MODE
    # --- normalisation layers use running statistics
    # --- de-activates Dropout layers
    model.eval()

    with torch.no_grad():
        for _, (features, labels) in tqdm(enumerate(val_loader)):
            outputs = model(features)
            batch_losses.append(criterion(outputs, labels).item())

            # Drop the trailing singleton dimension before collecting.
            probabilities.extend(outputs.squeeze(1).tolist())
            targets.extend(labels.tolist())

        # Strictly greater than the threshold counts as the positive class.
        hard_predictions = np.where(np.array(probabilities) > threshold, 1, 0)
        score = competition_metric(targets, hard_predictions)

    return batch_losses, score

In [16]:
# Sanity check: display the first training sample as a (features, label) pair.
train_dataset[0]

(tensor([ 1.0000,  0.7868, 11.0000, -0.3945, -0.3407, -0.1505, -0.1192, -0.5691,
         -0.3078, -0.0426, -0.1569, -0.0651, -0.0850, -0.1077, -0.2462, -0.3642,
         -0.5897, -0.3646, -0.5866, -0.6048, -0.6063, -0.3066, -0.1057, -0.0399,
         -0.2196, -0.1080, -0.2771, -0.2369, -0.3839, -0.2065, -0.1846, -1.1619,
         -0.1500, -0.2537, -0.4782, -0.1723, -0.1712, -0.2364, -0.3834, -0.1004,
         -0.0764, -0.0997,  0.2821,  0.1438,  0.0344,  0.0875,  0.7378,  1.4414,
         -0.1103, -0.5949, -0.8780, -0.5356], device='cuda:0'),
 tensor([0.], device='cuda:0'))

In [17]:
# Fresh teacher network optimised with Adam. The scheduler watches a metric that
# should increase (mode='max') and halves the learning rate when it stalls.
model = Teacher()
optimizer = optim.Adam(model.parameters(), lr=CONFIG['LEARNING_RATE'])
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='max',
    factor=0.5,
    patience=1,
    threshold_mode='abs',
    min_lr=1e-8,
    verbose=True,
)

teacher_model = teacher_train(model, optimizer, train_loader, val_loader, scheduler)

23it [00:03,  6.54it/s]
6it [00:00,  7.17it/s]


Epoch [0], Train Loss : [0.30647] Val Loss : [0.22883] Val F1 Score : [0.71587]


23it [00:03,  6.68it/s]
6it [00:00,  7.09it/s]


Epoch [1], Train Loss : [0.20408] Val Loss : [0.19778] Val F1 Score : [0.77058]


23it [00:03,  6.47it/s]
6it [00:00,  7.10it/s]


Epoch [2], Train Loss : [0.20864] Val Loss : [0.19336] Val F1 Score : [0.77245]


23it [00:03,  6.49it/s]
6it [00:00,  7.14it/s]


Epoch [3], Train Loss : [0.18006] Val Loss : [0.18930] Val F1 Score : [0.77830]


23it [00:03,  6.61it/s]
6it [00:00,  7.13it/s]


Epoch [4], Train Loss : [0.16672] Val Loss : [0.18968] Val F1 Score : [0.79069]


23it [00:03,  6.62it/s]
6it [00:00,  7.05it/s]


Epoch [5], Train Loss : [0.17931] Val Loss : [0.17594] Val F1 Score : [0.79831]


23it [00:03,  6.62it/s]
6it [00:00,  6.78it/s]


Epoch [6], Train Loss : [0.18818] Val Loss : [0.17980] Val F1 Score : [0.77899]


23it [00:03,  6.66it/s]
6it [00:00,  7.15it/s]


Epoch [7], Train Loss : [0.16228] Val Loss : [0.18103] Val F1 Score : [0.78525]
Epoch     8: reducing learning rate of group 0 to 5.0000e-04.


23it [00:03,  6.47it/s]
6it [00:00,  7.07it/s]


Epoch [8], Train Loss : [0.14524] Val Loss : [0.17049] Val F1 Score : [0.78789]


23it [00:03,  6.59it/s]
6it [00:00,  7.36it/s]


Epoch [9], Train Loss : [0.14809] Val Loss : [0.17003] Val F1 Score : [0.80042]


23it [00:03,  6.82it/s]
6it [00:00,  7.33it/s]


Epoch [10], Train Loss : [0.14572] Val Loss : [0.17567] Val F1 Score : [0.79041]


23it [00:03,  6.86it/s]
6it [00:00,  7.33it/s]


Epoch [11], Train Loss : [0.13512] Val Loss : [0.17004] Val F1 Score : [0.79348]
Epoch    12: reducing learning rate of group 0 to 2.5000e-04.


23it [00:03,  6.86it/s]
6it [00:00,  7.43it/s]


Epoch [12], Train Loss : [0.12787] Val Loss : [0.16791] Val F1 Score : [0.79941]


23it [00:03,  6.90it/s]
6it [00:00,  7.40it/s]


Epoch [13], Train Loss : [0.13378] Val Loss : [0.16873] Val F1 Score : [0.80813]


23it [00:03,  6.85it/s]
6it [00:00,  7.32it/s]


Epoch [14], Train Loss : [0.13808] Val Loss : [0.17148] Val F1 Score : [0.79556]


23it [00:03,  6.56it/s]
6it [00:00,  7.41it/s]


Epoch [15], Train Loss : [0.13234] Val Loss : [0.16895] Val F1 Score : [0.79927]
Epoch    16: reducing learning rate of group 0 to 1.2500e-04.


23it [00:03,  6.78it/s]
6it [00:00,  7.35it/s]


Epoch [16], Train Loss : [0.12936] Val Loss : [0.16955] Val F1 Score : [0.80909]


23it [00:03,  6.78it/s]
6it [00:00,  7.40it/s]


Epoch [17], Train Loss : [0.12617] Val Loss : [0.17084] Val F1 Score : [0.79631]


23it [00:03,  6.82it/s]
6it [00:00,  7.40it/s]


Epoch [18], Train Loss : [0.12315] Val Loss : [0.17085] Val F1 Score : [0.81099]


23it [00:03,  6.78it/s]
6it [00:00,  7.06it/s]

Epoch [19], Train Loss : [0.11937] Val Loss : [0.17173] Val F1 Score : [0.81005]





In [18]:
class Student(nn.Module):
    """Student MLP: in_features -> 128 -> 512 -> 128 -> 1 with a sigmoid output.

    Each hidden linear layer is followed by batch normalisation and LeakyReLU.
    The output is a probability in [0, 1] per row.

    Args:
        in_features: Width of the input vectors. Defaults to 18.
            NOTE(review): presumably the length of ``test_stage_features``
            defined elsewhere in the notebook — confirm.
    """

    def __init__(self, in_features=18):
        super(Student, self).__init__()
        # Layer order is unchanged so existing state_dict keys stay valid.
        self.classifier = nn.Sequential(
            nn.Linear(in_features=in_features, out_features=128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(),
            nn.Linear(in_features=128, out_features=512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(),
            nn.Linear(in_features=512, out_features=128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(),
            nn.Linear(in_features=128, out_features=1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Map a ``(batch, in_features)`` tensor to ``(batch, 1)`` probabilities."""
        output = self.classifier(x)
        return output

In [19]:
def distillation_loss(student_logits, labels, teacher_logits, alpha):
    """Blend the hard-label loss with the teacher-matching loss.

    Despite the parameter names, both model outputs must be probabilities in
    [0, 1], because binary cross-entropy is applied to them directly.

    Args:
        student_logits: Student probabilities, shaped ``(batch, 1)``.
        labels: Ground-truth labels; reshaped to ``(batch, 1)`` internally.
        teacher_logits: Teacher probabilities with the same shape as
            `student_logits`, used as soft targets.
        alpha: Weight of the hard-label term; ``1 - alpha`` weights the
            teacher term.

    Returns:
        Scalar tensor ``alpha * student_loss + (1 - alpha) * teacher_loss``.
    """
    # The teacher only supplies targets: detach so that no gradient flows back
    # into its parameters.
    soft_targets = teacher_logits.detach()
    soft_loss = F.binary_cross_entropy(student_logits, soft_targets)
    hard_loss = F.binary_cross_entropy(student_logits, labels.reshape(-1, 1))
    return alpha * hard_loss + (1 - alpha) * soft_loss