In [1]:
import random
from datetime import datetime

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import StratifiedKFold
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from tqdm import tqdm

random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
train_data = pd.read_csv('train.csv',index_col=0)
test_data = pd.read_csv('test.csv',index_col=0)

In [21]:
# 명목형 변수 변환
replace_dict = {'education':str, 'engnat':str, 'married':str, 'urban':str}

In [22]:
#familysize >50 제거
train_data = train_data.drop(train_data[train_data.familysize > 50].index)

#불필요하다고 판단되는 변수 제거
train_data = train_data.drop(columns=['country','introelapse','testelapse','surveyelapse','hand'])
test_data = test_data.drop(columns=['country','introelapse','testelapse','surveyelapse','hand'])

In [23]:
X_train = train_data.drop(columns=['nerdiness'])
y_train = train_data['nerdiness']
X_test = test_data

In [24]:
# 명목형 변수로 변환
X_train = X_train.astype(replace_dict)
X_test = X_test.astype(replace_dict)

# 더미 변수 변한
X_train = pd.get_dummies(X_train).to_numpy()
X_test = pd.get_dummies(X_test).to_numpy()

In [25]:
X_train_t = torch.tensor(X_train, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.float32)
x_test_t = torch.tensor(X_test, dtype=torch.float32)

In [31]:
print(X_train_t.shape, y_train_t.shape)

torch.Size([14999, 75]) torch.Size([14999])


In [34]:
print(x_test_t.shape)

torch.Size([35452, 75])


In [26]:
test_len = len(x_test_t)

In [35]:
N_REPEAT = 5
N_SKFOLD = 7
N_EPOCH = 48
BATCH_SIZE = 75
LOADER_PARAM = {
    'batch_size': BATCH_SIZE,
    'num_workers': 4,
    'pin_memory': True
}
prediction = np.zeros((test_len, 1), dtype=np.float32)
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [37]:
for repeat in range(N_REPEAT):

    skf, tot = StratifiedKFold(n_splits=N_SKFOLD, random_state=repeat, shuffle=True), 0.
    for skfold, (train_idx, valid_idx) in enumerate(skf.split(X_train, y_train)):
        train_idx, valid_idx = list(train_idx), list(valid_idx)
        train_loader = DataLoader(TensorDataset(X_train_t[train_idx, :], y_train_t[train_idx]),
                                  shuffle=True, drop_last=True, **LOADER_PARAM)
        valid_loader = DataLoader(TensorDataset(X_train_t[valid_idx, :], y_train_t[valid_idx]),
                                  shuffle=False, drop_last=False, **LOADER_PARAM)
        test_loader = DataLoader(TensorDataset(x_test_t, torch.zeros((test_len,), dtype=torch.float32)),
                                 shuffle=False, drop_last=False, **LOADER_PARAM)
        model = nn.Sequential(
            nn.Dropout(0.05),
            nn.Linear(75, 180, bias=False),
            nn.LeakyReLU(0.05, inplace=True),
            nn.Dropout(0.5),
            nn.Linear(180, 32, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(32, 1)
        ).to(DEVICE)
        criterion = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.20665], device=DEVICE))
        optimizer = optim.AdamW(model.parameters(), lr=5e-3, weight_decay=7.8e-2)
        scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer, T_0=N_EPOCH // 6, eta_min=4e-4)
        prediction_t, loss_t = np.zeros((test_len, 1), dtype=np.float32), 1.

        # for epoch in range(N_EPOCH):
        for epoch in tqdm(range(N_EPOCH), desc='{:02d}/{:02d}'.format(skfold + 1, N_SKFOLD)):
            model.train()
            for idx, (xx, yy) in enumerate(train_loader):
                optimizer.zero_grad()
                xx, yy = xx.to(DEVICE), yy.to(DEVICE)
                pred = model(xx).squeeze()
                loss = criterion(pred, yy)
                loss.backward()
                optimizer.step()
                scheduler.step(epoch + idx / len(train_loader))

            with torch.no_grad():
                model.eval()
                running_acc, running_loss, running_count = 0, 0., 0
                for xx, yy in valid_loader:
                    xx, yy = xx.to(DEVICE), yy.to(DEVICE)
                    pred = model(xx).squeeze()
                    loss = criterion(pred, yy)
                    running_loss += loss.item() * len(yy)
                    running_count += len(yy)
                    running_acc += ((torch.sigmoid(pred) > 0.5).float() == yy).sum().item()
                # print('R{:02d} S{:02d} E{:02d} | {:6.4f}, {:5.2f}%'
                #       .format(repeat + 1, skfold + 1, epoch + 1, running_loss / running_count,
                #               running_acc / running_count * 100))

                if running_loss / running_count < loss_t:
                    loss_t = running_loss / running_count
                    for idx, (xx, _) in enumerate(test_loader):
                        xx = xx.to(DEVICE)
                        pred = (2. - torch.sigmoid(model(xx).detach().to('cpu'))).numpy()
                        prediction_t[BATCH_SIZE * idx:min(BATCH_SIZE * (idx + 1), len(prediction)), :] \
                            = pred[:, :].copy()
        prediction[:, :] += prediction_t[:, :].copy() / (N_REPEAT * N_SKFOLD)
        tot += loss_t
    print('R{} -> {:6.4f}'.format(repeat + 1, tot / N_SKFOLD))

01/07: 100%|██████████| 48/48 [04:22<00:00,  5.47s/it]
02/07: 100%|██████████| 48/48 [04:50<00:00,  6.06s/it]
03/07: 100%|██████████| 48/48 [04:51<00:00,  6.08s/it]
04/07: 100%|██████████| 48/48 [04:47<00:00,  6.00s/it]
05/07: 100%|██████████| 48/48 [04:38<00:00,  5.81s/it]
06/07: 100%|██████████| 48/48 [04:29<00:00,  5.61s/it]
07/07: 100%|██████████| 48/48 [04:28<00:00,  5.60s/it]


R1 -> 1.0000


01/07: 100%|██████████| 48/48 [04:54<00:00,  6.14s/it]
02/07: 100%|██████████| 48/48 [04:50<00:00,  6.06s/it]
03/07: 100%|██████████| 48/48 [04:59<00:00,  6.23s/it]
04/07: 100%|██████████| 48/48 [04:50<00:00,  6.05s/it]
05/07: 100%|██████████| 48/48 [04:48<00:00,  6.02s/it]
06/07: 100%|██████████| 48/48 [04:55<00:00,  6.16s/it]
07/07: 100%|██████████| 48/48 [04:43<00:00,  5.90s/it]


R2 -> 1.0000


01/07: 100%|██████████| 48/48 [05:00<00:00,  6.26s/it]
02/07: 100%|██████████| 48/48 [05:03<00:00,  6.32s/it]
03/07: 100%|██████████| 48/48 [04:47<00:00,  6.00s/it]
04/07: 100%|██████████| 48/48 [04:19<00:00,  5.40s/it]
05/07: 100%|██████████| 48/48 [04:45<00:00,  5.96s/it]
06/07: 100%|██████████| 48/48 [04:41<00:00,  5.86s/it]
07/07: 100%|██████████| 48/48 [04:41<00:00,  5.87s/it]


R3 -> 1.0000


01/07: 100%|██████████| 48/48 [04:42<00:00,  5.89s/it]
02/07: 100%|██████████| 48/48 [04:45<00:00,  5.94s/it]
03/07: 100%|██████████| 48/48 [04:57<00:00,  6.20s/it]
04/07: 100%|██████████| 48/48 [04:53<00:00,  6.10s/it]
05/07: 100%|██████████| 48/48 [04:45<00:00,  5.95s/it]
06/07: 100%|██████████| 48/48 [04:51<00:00,  6.08s/it]
07/07: 100%|██████████| 48/48 [05:01<00:00,  6.29s/it]


R4 -> 1.0000


01/07: 100%|██████████| 48/48 [05:11<00:00,  6.49s/it]
02/07: 100%|██████████| 48/48 [05:01<00:00,  6.28s/it]
03/07: 100%|██████████| 48/48 [05:01<00:00,  6.28s/it]
04/07: 100%|██████████| 48/48 [04:51<00:00,  6.07s/it]
05/07: 100%|██████████| 48/48 [04:34<00:00,  5.72s/it]
06/07: 100%|██████████| 48/48 [04:28<00:00,  5.60s/it]
07/07: 100%|██████████| 48/48 [04:28<00:00,  5.59s/it]

R5 -> 1.0000





In [39]:
df = pd.read_csv('sample_submission.csv')
df.iloc[:, 1:] = prediction
df.to_csv('{}.csv'.format(datetime.now().strftime('%m%d-%H%M')), index=False)