In [None]:
import gc
import os
import random
from typing import List, Tuple, Optional, Union

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm


from sklearn.model_selection import KFold
from joblib import Parallel, delayed
from sklearn.decomposition import PCA
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

In [None]:
# Load the train/test feature tables and the training labels from the Kaggle input dataset.
X_train = pd.read_csv('../input/xgb-fraud-with-magic-0-9600/X_train.csv')
X_test = pd.read_csv('../input/xgb-fraud-with-magic-0-9600/X_test.csv')
# y_train.csv is parsed without a header row; the column labelled 1 is kept as the label Series
# (presumably column 0 is a saved index -- TODO confirm against the file).
y_train = pd.read_csv('../input/xgb-fraud-with-magic-0-9600/y_train.csv',header=None)[1]

In [None]:
# Notebook-wide hyper-parameters. Values are unchanged; only the annotation of
# `dropout_emb` was corrected (it is a dropout probability, not a flag).

# Embedding width and DataLoader batch size used by the training cell.
emb_dim: int = 50
batch_size: int = 1024

# MLP-related settings (not referenced by the CNN training cell in this notebook).
model_type: str = 'mlp'
mlp_dropout: float = 0.0
mlp_hidden: int = 64
mlp_bn: bool = False

# CNN architecture settings, forwarded to CNN(...) in the training and inference cells.
cnn_hidden: int = 128
cnn_channel1: int = 32
cnn_channel2: int = 32
cnn_channel3: int = 32
cnn_kernel1: int = 5
cnn_celu: bool = False
cnn_weight_norm: bool = False
# Dropout probability applied to the concatenated embeddings (passed as `dropout_cat`).
dropout_emb: float = 0.0

# Optimiser / scheduler settings.
lr: float = 1e-3
weight_decay: float = 0.0
model_path: str = 'fold_{}.pth'
scaler_type: str = 'standard'
output_dir: str = 'artifacts'
scheduler_type: str = 'onecycle'   # 'onecycle' or 'reduce' are handled by the training cell
optimizer_type: str = 'adam'       # 'adam' or 'adamw' are handled by the training cell
max_lr: float = 0.01               # peak learning rate for OneCycleLR
epochs: int = 30
seed: int = 42
n_pca: int = -1
# Every `batch_double_freq` epochs the training batch size is doubled.
batch_double_freq: int = 50
# Single dropout rate reused for the top/mid/bottom CNN dropout layers.
cnn_dropout: float = 0.1
na_cols: bool = True
cnn_leaky_relu: bool = False
# ReduceLROnPlateau settings (only used when scheduler_type == 'reduce').
patience: int = 8
factor: float = 0.5

In [None]:
# Ensemble-related constants. None of them is referenced by the cells visible in this
# notebook; they look like settings carried over from a multi-model ensembling
# pipeline -- NOTE(review): confirm whether they are still needed.
NN_VALID_TH = 0.185
NN_MODEL_TOP_N = 3
TAB_MODEL_TOP_N = 3
ENSEMBLE_METHOD = 'mean'
NN_NUM_MODELS = 10
TABNET_NUM_MODELS = 5

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects subprocesses started later; the current interpreter's hash
    # seed is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different kernels between runs, so disable it.
    torch.backends.cudnn.benchmark = False

In [None]:
class AverageMeter:
    """Tracks the latest value and the running weighted mean of a stream of values."""

    def __init__(self):
        # All state lives in reset() so construction and reset stay in sync.
        self.reset()

    def reset(self):
        """Zero the last value, running mean, weighted sum and sample count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` observed over `n` samples and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


class TabularDataset(Dataset):
    """Map-style dataset pairing numeric features, categorical codes and optional labels.

    Each item is ``(x_num[idx], LongTensor(x_cat[idx]))`` when ``y`` is None,
    otherwise the same tuple with ``y[idx]`` appended.
    """

    def __init__(self, x_num: np.ndarray, x_cat: np.ndarray, y: Optional[np.ndarray]):
        super().__init__()
        self.x_num, self.x_cat, self.y = x_num, x_cat, y

    def __len__(self):
        # The numeric block defines the number of rows.
        return len(self.x_num)

    def __getitem__(self, idx):
        sample = (self.x_num[idx], torch.LongTensor(self.x_cat[idx]))
        if self.y is not None:
            sample = sample + (self.y[idx],)
        return sample


class MLP(nn.Module):
    """Two-hidden-layer perceptron over numeric features plus embedded categoricals.

    Args:
        src_num_dim: Number of numeric input features.
        n_categories: Vocabulary size of each categorical column (one embedding each).
        dropout: Dropout probability after each hidden linear layer.
        hidden: Width of both hidden layers.
        emb_dim: Embedding width used for every categorical column.
        dropout_cat: Dropout probability applied to the concatenated embeddings.
        bn: If True, insert BatchNorm1d before each hidden ReLU.
    """

    def __init__(self,
                 src_num_dim: int,
                 n_categories: List[int],
                 dropout: float = 0.0,
                 hidden: int = 50,
                 emb_dim: int = 10,
                 dropout_cat: float = 0.2,
                 bn: bool = False):
        super().__init__()

        self.embs = nn.ModuleList([
            nn.Embedding(x, emb_dim) for x in n_categories])
        self.cat_dim = emb_dim * len(n_categories)
        self.dropout_cat = nn.Dropout(dropout_cat)

        def _hidden_block(in_dim: int) -> list:
            # Linear -> Dropout -> [BatchNorm] -> ReLU; the layer order (and hence the
            # nn.Sequential indices / state_dict keys) matches the original two variants.
            layers = [nn.Linear(in_dim, hidden), nn.Dropout(dropout)]
            if bn:
                layers.append(nn.BatchNorm1d(hidden))
            layers.append(nn.ReLU())
            return layers

        self.sequence = nn.Sequential(
            *_hidden_block(src_num_dim + self.cat_dim),
            *_hidden_block(hidden),
            nn.Linear(hidden, 1)
        )

    def forward(self, x_num, x_cat):
        """Return one logit per row, shape ``(batch,)``.

        ``x_cat[:, i]`` holds the integer codes of the i-th categorical column.
        """
        embs = [embedding(x_cat[:, i]) for i, embedding in enumerate(self.embs)]
        x_cat_emb = self.dropout_cat(torch.cat(embs, 1))
        x_all = torch.cat([x_num, x_cat_emb], 1)
        x = self.sequence(x_all)
        # Drop only the trailing size-1 dim; a bare squeeze() would also drop the
        # batch dim for a single-row batch and yield a 0-d tensor.
        return x.squeeze(-1)


class CNN(nn.Module):
    """1-D CNN for tabular data with embedded categorical columns.

    The concatenated numeric + embedded input is expanded by a dense layer to
    ``hidden_size`` units, reshaped to ``(batch, channel_1, hidden_size // channel_1)``
    and passed through one or two convolutional stages, a max-pool and a final
    dense layer producing a single logit per row.

    Args:
        num_features: Number of numeric input features.
        hidden_size: Width of the expansion layer; must be divisible by ``channel_1``.
        n_categories: Vocabulary size of each categorical column.
        emb_dim: Embedding width used for every categorical column.
        dropout_cat: Dropout probability on the concatenated embeddings.
        channel_1, channel_2, channel_3: Channel counts of the conv stages. With
            ``two_stage=True`` the second stage output is multiplied element-wise with
            its input, so ``channel_3`` must be broadcast-compatible with ``channel_2``.
        dropout_top, dropout_mid, dropout_bottom: Dropout probabilities.
        weight_norm: Wrap conv layers and the final dense layer in weight normalisation.
        two_stage: Enable the second (gated) convolutional stage.
        celu: Use CELU(0.06) instead of ReLU after the expansion layer.
        kernel1: Kernel size of the first convolution.
        leaky_relu: Append a LeakyReLU after the final dense layer.

    Raises:
        ValueError: If ``hidden_size`` is not divisible by ``channel_1`` (the reshape
            in ``forward`` could never succeed).
    """

    def __init__(self,
                 num_features: int,
                 hidden_size: int,
                 n_categories: List[int],
                 emb_dim: int = 10,
                 dropout_cat: float = 0.2,
                 channel_1: int = 256,
                 channel_2: int = 512,
                 channel_3: int = 512,
                 dropout_top: float = 0.1,
                 dropout_mid: float = 0.3,
                 dropout_bottom: float = 0.2,
                 weight_norm: bool = True,
                 two_stage: bool = True,
                 celu: bool = True,
                 kernel1: int = 5,
                 leaky_relu: bool = False):
        super().__init__()

        if hidden_size % channel_1 != 0:
            raise ValueError(
                f'hidden_size ({hidden_size}) must be divisible by channel_1 ({channel_1})')

        num_targets = 1

        # Sequence length after the reshape, after the adaptive average pool (halved),
        # and the flattened size after the final max-pool (halved again).
        cha_1_reshape = hidden_size // channel_1
        cha_po_1 = cha_1_reshape // 2
        cha_po_2 = (cha_1_reshape // 4) * channel_3

        self.cat_dim = emb_dim * len(n_categories)
        self.cha_1 = channel_1
        self.cha_2 = channel_2
        self.cha_3 = channel_3
        self.cha_1_reshape = cha_1_reshape
        self.cha_po_1 = cha_po_1
        self.cha_po_2 = cha_po_2
        self.two_stage = two_stage

        # NOTE: the expansion layer is always weight-normalised, regardless of the
        # `weight_norm` flag. Kept as-is so parameter names in saved checkpoints
        # do not change.
        self.expand = nn.Sequential(
            nn.BatchNorm1d(num_features + self.cat_dim),
            nn.Dropout(dropout_top),
            nn.utils.weight_norm(nn.Linear(num_features + self.cat_dim, hidden_size), dim=None),
            nn.CELU(0.06) if celu else nn.ReLU()
        )

        def _norm(layer, dim=None):
            # Optionally wrap a layer in weight normalisation.
            return nn.utils.weight_norm(layer, dim=dim) if weight_norm else layer

        self.conv1 = nn.Sequential(
            nn.BatchNorm1d(channel_1),
            nn.Dropout(dropout_top),
            _norm(nn.Conv1d(channel_1, channel_2, kernel_size=kernel1, stride=1, padding=kernel1 // 2, bias=False)),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(output_size=cha_po_1),
            nn.BatchNorm1d(channel_2),
            nn.Dropout(dropout_top),
            _norm(nn.Conv1d(channel_2, channel_2, kernel_size=3, stride=1, padding=1, bias=True)),
            nn.ReLU()
        )

        if self.two_stage:
            self.conv2 = nn.Sequential(
                nn.BatchNorm1d(channel_2),
                nn.Dropout(dropout_mid),
                _norm(nn.Conv1d(channel_2, channel_2, kernel_size=3, stride=1, padding=1, bias=True)),
                nn.ReLU(),
                nn.BatchNorm1d(channel_2),
                nn.Dropout(dropout_bottom),
                _norm(nn.Conv1d(channel_2, channel_3, kernel_size=5, stride=1, padding=2, bias=True)),
                nn.ReLU()
            )

        self.max_po_c2 = nn.MaxPool1d(kernel_size=4, stride=2, padding=1)

        self.flt = nn.Flatten()

        # Same layer order as before (BN, Dropout, Linear[, LeakyReLU]) so the
        # nn.Sequential indices are unchanged.
        dense_layers = [
            nn.BatchNorm1d(cha_po_2),
            nn.Dropout(dropout_bottom),
            _norm(nn.Linear(cha_po_2, num_targets), dim=0),
        ]
        if leaky_relu:
            dense_layers.append(nn.LeakyReLU())
        self.dense = nn.Sequential(*dense_layers)

        self.embs = nn.ModuleList([nn.Embedding(x, emb_dim) for x in n_categories])
        self.dropout_cat = nn.Dropout(dropout_cat)

    def forward(self, x_num, x_cat):
        """Return one logit per row, shape ``(batch,)``.

        ``x_cat[:, i]`` holds the integer codes of the i-th categorical column.
        """
        embs = [embedding(x_cat[:, i]) for i, embedding in enumerate(self.embs)]
        x_cat_emb = self.dropout_cat(torch.cat(embs, 1))
        x = torch.cat([x_num, x_cat_emb], 1)

        x = self.expand(x)

        # Treat the expanded vector as a (channels, length) signal for Conv1d.
        x = x.reshape(x.shape[0], self.cha_1, self.cha_1_reshape)

        x = self.conv1(x)
        if self.two_stage:
            # Multiplicative gate: second-stage features scale the first-stage ones.
            x = self.conv2(x) * x
        x = self.max_po_c2(x)
        x = self.flt(x)
        x = self.dense(x)

        # Drop only the trailing target dim; a bare squeeze() would also drop the
        # batch dim for a single-row batch and yield a 0-d tensor.
        return x.squeeze(-1)



def train_epoch(data_loader: DataLoader,
                model: nn.Module,
                optimizer,
                scheduler,
                device,
                clip_grad: float = 1.5):
    """Run one training pass over ``data_loader`` and return the mean loss.

    Args:
        data_loader: Yields ``(x_num, x_cat, y)`` batches.
        model: Network called as ``model(x_num, x_cat)`` and returning logits.
        optimizer: Optimiser updating ``model``'s parameters.
        scheduler: Optional per-batch LR scheduler; stepped after every optimiser
            step when not None.
        device: Device the batches are moved to.
        clip_grad: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sample-weighted mean binary cross-entropy (with logits) over the epoch,
        as a Python float.
    """
    model.train()
    losses = AverageMeter()
    # Stateless loss module; build it once instead of once per batch.
    criterion = nn.BCEWithLogitsLoss()

    for x_num, x_cat, y in tqdm(data_loader, position=0, leave=True, desc='Training'):
        n_samples = x_num.size(0)
        x_num = x_num.to(device, dtype=torch.float)
        x_cat = x_cat.to(device)
        y = y.to(device, dtype=torch.float)

        # reshape(-1) keeps logits 1-D even if the model squeezed a single-row
        # batch down to a 0-d tensor, which would otherwise mismatch y's shape.
        logits = model(x_num, x_cat).float().reshape(-1)
        loss = criterion(logits, y)
        losses.update(loss.item(), n_samples)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
        optimizer.step()
        optimizer.zero_grad()

        if scheduler is not None:
            scheduler.step()

    return losses.avg


def evaluate(data_loader: DataLoader, model, device):
    """Score ``model`` on a labelled loader without updating it.

    Args:
        data_loader: Yields ``(x_num, x_cat, y)`` batches.
        model: Network called as ``model(x_num, x_cat)`` and returning logits.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(probabilities, targets, mean_loss)`` where ``probabilities`` and
        ``targets`` are 1-D NumPy arrays concatenated over all batches and
        ``mean_loss`` is the sample-weighted mean BCE-with-logits loss.
    """
    model.eval()

    losses = AverageMeter()
    criterion = nn.BCEWithLogitsLoss()

    final_targets = []
    final_outputs = []

    with torch.no_grad():
        for x_num, x_cat, y in data_loader:
            n_samples = x_num.size(0)
            x_num = x_num.to(device, dtype=torch.float)
            x_cat = x_cat.to(device)
            y = y.to(device, dtype=torch.float)

            # reshape(-1) keeps logits 1-D even for a single-row batch.
            logits = model(x_num, x_cat).float().reshape(-1)
            loss = criterion(logits, y)
            losses.update(loss.item(), n_samples)

            final_targets.append(y.cpu().numpy())
            # Binary classifier with one logit per row: the probability is the
            # element-wise sigmoid. A softmax over dim 0 would normalise across
            # the rows of the batch, making scores depend on batch composition.
            final_outputs.append(torch.sigmoid(logits).cpu().numpy())

    final_targets = np.concatenate(final_targets)
    final_outputs = np.concatenate(final_outputs)

    return final_outputs, final_targets, losses.avg



In [None]:
def fast_auc(y_true, y_prob):
    """Compute ROC AUC for binary labels from predicted scores.

    For every positive sample, counts the negatives ranked at or below it (by
    ascending score) and normalises by ``n_pos * n_neg``. Ties in ``y_prob``
    are resolved by ``np.argsort`` order, as in the original loop.

    Args:
        y_true: Array-like of 0/1 labels.
        y_prob: Array-like of scores, same length as ``y_true``.

    Returns:
        AUC as a Python float, or ``nan`` when the input is empty or contains
        only one class (AUC is undefined there).
    """
    # Work in float64: accumulating counts in the labels' own dtype (e.g. float32
    # coming back from a tensor) loses integer precision beyond 2**24.
    y_true = np.asarray(y_true, dtype=np.float64).ravel()
    y_prob = np.asarray(y_prob).ravel()

    n = y_true.size
    if n == 0:
        return float('nan')

    y_sorted = y_true[np.argsort(y_prob)]
    # negatives_seen[i] = number of negatives among the i+1 lowest-scored samples.
    negatives_seen = np.cumsum(1.0 - y_sorted)
    n_neg = negatives_seen[-1]
    n_pos = n - n_neg
    if n_neg == 0 or n_pos == 0:
        return float('nan')

    return float(np.dot(y_sorted, negatives_seen) / (n_neg * n_pos))

In [None]:
# Constant placeholder categorical column so the embedding branch of the model has an
# input. (The previous `np.random.randint(1)` always returned 0 as well, but looked
# random and advanced NumPy's global RNG.)
X_train['dummy_emb'] = 0

# Every column except the placeholder is treated as numeric and standardised.
X_num = X_train.drop(columns=['dummy_emb'])
# NOTE(review): the scaler is fitted on all training rows before the K-fold split, so
# validation folds contribute to the mean/std estimates.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(X_num.values)
X_num = pd.DataFrame(scaled_features, index=X_num.index, columns=X_num.columns)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Seed all RNGs so weight initialisation and batch shuffling are repeatable.
seed_everything(seed)

gc.collect()


fold_score = []

kf = KFold(n_splits=10)

for fold, (train_idx, test_idx) in enumerate(kf.split(X_train)):

    # KFold yields positional indices, so select rows by position (.iloc);
    # label-based .loc only works by accident on a default RangeIndex.
    X_tr = np.array(X_num.iloc[train_idx])
    X_tr_cat = np.array(X_train[['dummy_emb']].iloc[train_idx])
    y_tr = np.array(y_train.iloc[train_idx]).flatten()

    X_va = np.array(X_num.iloc[test_idx])
    X_va_cat = np.array(X_train[['dummy_emb']].iloc[test_idx])
    y_va = np.array(y_train.iloc[test_idx]).flatten()

    train_dataset = TabularDataset(X_tr, X_tr_cat, y_tr)
    valid_dataset = TabularDataset(X_va, X_va_cat, y_va)

    # Per-fold working batch size: doubling it below must not leak into the next
    # fold or overwrite the configured `batch_size`.
    cur_batch = batch_size

    train_loader = DataLoader(train_dataset, batch_size=cur_batch, shuffle=True,
                              num_workers=0)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False,
                              num_workers=0)

    model = CNN(X_tr.shape[1],
                hidden_size=cnn_hidden,
                n_categories=[200],
                emb_dim=emb_dim,
                dropout_cat=dropout_emb,
                channel_1=cnn_channel1,
                channel_2=cnn_channel2,
                channel_3=cnn_channel3,
                two_stage=False,
                kernel1=cnn_kernel1,
                celu=cnn_celu,
                dropout_top=cnn_dropout,
                dropout_mid=cnn_dropout,
                dropout_bottom=cnn_dropout,
                weight_norm=cnn_weight_norm,
                leaky_relu=cnn_leaky_relu)

    model = model.to(device)

    if optimizer_type == 'adamw':
        opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    elif optimizer_type == 'adam':
        opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    else:
        raise NotImplementedError()

    # `scheduler` is stepped once per batch inside train_epoch;
    # `epoch_scheduler` is stepped once per epoch on the validation loss.
    scheduler = epoch_scheduler = None
    if scheduler_type == 'onecycle':
        scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer=opt, pct_start=0.1, div_factor=1e3,
                                                        max_lr=max_lr, epochs=epochs,
                                                        steps_per_epoch=len(train_loader))
    elif scheduler_type == 'reduce':
        epoch_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=opt,
                                                                     mode='min',
                                                                     min_lr=1e-7,
                                                                     patience=patience,
                                                                     verbose=True,
                                                                     factor=factor)

    for epoch in range(epochs):
        if epoch > 0 and epoch % batch_double_freq == 0:
            cur_batch = cur_batch * 2
            print(f'batch: {cur_batch}')
            # The loader only needs rebuilding when the batch size changed; a
            # shuffling DataLoader already reshuffles on every new iteration.
            train_loader = DataLoader(train_dataset,
                                      batch_size=cur_batch,
                                      shuffle=True,
                                      num_workers=0)
        train_loss = train_epoch(train_loader, model, opt, scheduler, device)
        predictions, valid_targets, valid_loss = evaluate(valid_loader, model, device=device)
        if epoch_scheduler is not None:
            # ReduceLROnPlateau has no effect unless it is fed the monitored metric.
            epoch_scheduler.step(valid_loss)
        print(f"epoch {epoch}, train loss: {train_loss:.3f}, valid loss: {valid_loss:.3f}")

    # Fold score and checkpoint both come from the final epoch.
    fold_score.append(fast_auc(valid_targets, predictions))
    torch.save(model.state_dict(), 'model_fold' + str(fold) + '.pth')

In [None]:
# Mean of the per-fold AUCs collected by the training loop.
cv_score = np.mean(fold_score)
print(f'Cross Validation score = {cv_score:1.6f}')

In [None]:
# Same constant placeholder categorical column as used for training.
X_test['dummy_emb'] = 0

# Standardise the test features with the scaler fitted on the training data.
X_num = X_test.drop(columns=['dummy_emb'])
scaled_features = scaler.transform(X_num.values)
X_num = pd.DataFrame(scaled_features, index=X_num.index, columns=X_num.columns)
X_num = np.array(X_num)
# Take the categorical codes from X_test (not X_train) so they have exactly one
# row per test sample and stay aligned with X_num.
X_cat = np.array(X_test[['dummy_emb']])

# No labels for the test set, so the dataset yields (x_num, x_cat) pairs only.
valid_dataset = TabularDataset(X_num, X_cat, None)
valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                           batch_size=512,
                                           shuffle=False,
                                           num_workers=4)

In [None]:


# Predict the test set with each fold's checkpoint; one probability array per fold.
final_outputs = []
for fold in range(10):  # must match the number of folds trained above

    model = CNN(num_features=X_num.shape[1],
                hidden_size=cnn_hidden,
                n_categories=[200],
                emb_dim=emb_dim,
                dropout_cat=dropout_emb,
                channel_1=cnn_channel1,
                channel_2=cnn_channel2,
                channel_3=cnn_channel3,
                two_stage=False,
                kernel1=cnn_kernel1,
                celu=cnn_celu,
                dropout_top=cnn_dropout,
                dropout_mid=cnn_dropout,
                dropout_bottom=cnn_dropout,
                weight_norm=cnn_weight_norm,
                leaky_relu=cnn_leaky_relu)
    # map_location lets checkpoints saved on GPU load on a CPU-only machine.
    model.load_state_dict(torch.load('model_fold' + str(fold) + '.pth', map_location=device))
    # The batches are moved to `device`, so the model must live there too.
    model = model.to(device)
    model.eval()

    temp_outputs = []
    with torch.no_grad():
        for x_num, x_cat in tqdm(valid_loader, position=0, leave=True, desc='Evaluating'):
            x_num = x_num.to(device, dtype=torch.float)
            x_cat = x_cat.to(device)

            # reshape(-1) keeps logits 1-D even for a single-row final batch.
            logits = model(x_num, x_cat).reshape(-1)
            # One logit per row -> element-wise sigmoid gives the fraud probability.
            # (Softmax over dim 0 would normalise across the rows of each batch.)
            temp_outputs.append(torch.sigmoid(logits).cpu().numpy())
    final_outputs.append(np.concatenate(temp_outputs))




In [None]:
# Load the competition's sample submission file; its 'isFraud' column is overwritten below.
submission_df = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')

In [None]:
# Average the per-fold prediction arrays element-wise (axis 0 runs over folds).
# Assumes the prediction rows are in the same order as the sample submission -- TODO confirm.
submission_df['isFraud'] = np.mean(final_outputs,axis=0)

In [None]:
# Write the submission file to the working directory, without the DataFrame index column.
submission_df.to_csv('submission.csv', index=False)

In [None]:
# result = []

# for fold in range(10):

#     clf = xgb.XGBClassifier()
#     clf.load_model('model_fold_' +str(fold)+ '.json')
#     result.append(clf.predict_proba(X_test)[:,1])

In [None]:
# submission_df = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')

In [None]:
# submission_df['isFraud'] = np.mean(result,axis=0)

In [None]:
# submission_df.to_csv('submission.csv', index=False)