In [None]:
import os
import gc
import pickle

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch

import torch.nn as nn
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold

GRU baseline reference:
https://www.kaggle.com/code/cdeotte/tensorflow-gru-starter-0-790/notebook

# Config

In [None]:
class CFG:
    """Training hyperparameters shared by the notebook cells."""

    # Number of passes over the training fold.
    N_EPOCHS = 20
    # Mini-batch size used by the training and validation loaders.
    BATCH_SIZE = 512

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

# Loading the dataset

In [None]:
%%time
# Collect the training feature/label chunks from disk and merge them into
# two arrays, `allX` and `ally`.
allX, ally = [], []

for i in range(100):
    xpath = "../input/amex-train-series/Xchunk_{}.npy".format(i)
    ypath = "../input/amex-train-series/ychunk_{}.npy".format(i)

    # Stop at the first chunk index whose feature file is missing.
    if os.path.exists(xpath):
        allX.append(np.load(xpath))
        ally.append(np.load(ypath))
        gc.collect()
    else:
        break

allX, ally = np.concatenate(allX), np.concatenate(ally)
print(allX.shape, ally.shape)

# Dataset

In [None]:
class AmexDataset(torch.utils.data.Dataset):
    """Index-based view over per-sample feature arrays.

    Every sample ``X[i]`` is split along its column axis: the trailing 11
    columns are returned as integer (``torch.long``) codes and all columns
    before them as ``torch.float32`` numeric features.

    Args:
        X: Indexable collection of samples; each sample must support
            ``sample[:, a:b]`` slicing (presumably time steps x features --
            TODO confirm against the chunk files).
        y: Targets aligned with ``X``; not read unless ``phase`` is 'train'.
        idxs: Positions into ``X``/``y`` that make up this dataset.
        phase: 'train' yields ``(Xnumeric, Xcat, y)``; any other value yields
            ``(Xnumeric, Xcat)``.
    """

    def __init__(self, X, y, idxs, phase='train'):
        self.X, self.y = X, y
        self.idxs = idxs
        self.phase = phase

    def __len__(self):
        return len(self.idxs)

    def __getitem__(self, idx):
        # Translate the dataset position into a row of the backing arrays.
        row = self.idxs[idx]
        sample = self.X[row]

        Xnumeric = torch.tensor(sample[:, :-11], dtype=torch.float32)
        Xcat = torch.tensor(sample[:, -11:], dtype=torch.long)

        if self.phase == 'train':
            target = torch.tensor(self.y[row], dtype=torch.float32)
            return (Xnumeric, Xcat, target)
        return Xnumeric, Xcat

# Model

In [None]:
class MLP(nn.Module):
    """Feed-forward head that maps ``sz`` input features to a single output.

    Two BatchNorm1d -> Dropout(0.1) -> Linear -> LeakyReLU stages shrink the
    width from ``sz`` to ``sz // 2`` and then ``sz // 4``; a final Linear layer
    projects to one value per row.
    """

    def __init__(self, sz):
        super().__init__()
        half, quarter = sz // 2, sz // 4
        stages = [
            # Stage 1: sz -> sz // 2
            nn.BatchNorm1d(sz),
            nn.Dropout(0.1),
            nn.Linear(sz, half),
            nn.LeakyReLU(),
            # Stage 2: sz // 2 -> sz // 4
            nn.BatchNorm1d(half),
            nn.Dropout(0.1),
            nn.Linear(half, quarter),
            nn.LeakyReLU(),
            # Output projection
            nn.Linear(quarter, 1),
        ]
        self.mlp = nn.Sequential(*stages)

    def forward(self, x):
        out = self.mlp(x)
        return out
    
class AmexGruModel(nn.Module):
    """Bidirectional GRU over numeric features plus embedded categorical codes.

    Each of the 11 categorical columns has its own 4-dimensional embedding
    (10 codes, index 0 reserved as padding). The embeddings are concatenated
    with the numeric features, so the GRU's 213 inputs correspond to 169
    numeric columns plus 11 x 4 embedding dimensions.

    ``forward`` returns three flattened outputs: one from the concatenated
    final hidden states of both directions and one from each direction alone.
    """

    def __init__(self):
        super().__init__()
        per_column_embeddings = [nn.Embedding(10, 4, padding_idx=0) for _ in range(11)]
        self.embeddings = nn.ModuleList(per_column_embeddings)
        self.gru = nn.GRU(213, 128, bidirectional=True, batch_first=True)
        # One head per GRU direction, plus one over both directions combined.
        self.out1 = MLP(128)
        self.out2 = MLP(128)
        self.out = MLP(2 * 128)

    def forward(self, x, xcat):
        # Embed every categorical column with its own table.
        embedded = [emb(xcat[:, :, col]) for col, emb in enumerate(self.embeddings)]
        features = torch.cat([x, torch.cat(embedded, dim=-1)], dim=-1)

        # Only the final hidden state is used; index 0 / 1 select the two directions.
        _, hidden = self.gru(features)
        h_first, h_second = hidden[0], hidden[1]
        h_joint = torch.cat([h_first, h_second], dim=-1)

        y1 = self.out1(h_first).view(-1)
        y2 = self.out2(h_second).view(-1)
        y = self.out(h_joint).view(-1)
        return y, y1, y2

# Metrics

In [None]:
def top_4percent(pred_df):
    """Fraction of the total target captured in the top 4% of predictions.

    Rows are ranked by ``pred`` in descending order. Rows whose target equals
    0 carry weight 20 and all other rows weight 1; the cutoff is the set of
    ranked rows whose cumulative weight stays within 4% of the total weight.

    Args:
        pred_df: DataFrame with ``target`` and ``pred`` columns (left unmodified).

    Returns:
        Sum of ``target`` inside the cutoff divided by the sum over all rows.
    """
    ranked = pred_df.copy().sort_values('pred', ascending=False)
    ranked['weight'] = ranked['target'].eq(0).astype(int) * 19 + 1
    ranked['weight_cumsum'] = ranked['weight'].cumsum()

    four_percent_cutoff = 0.04 * sum(ranked['weight'])
    inside_cutoff = ranked['weight_cumsum'] <= four_percent_cutoff

    captured = ranked.loc[inside_cutoff, 'target'].sum()
    return captured / ranked['target'].sum()

def weighted_gini(pred_df):
    """Weighted (un-normalized) Gini of ``pred`` against ``target``.

    Rows are ranked by ``pred`` descending; rows with target 0 weigh 20 and
    all others 1. For each ranked row the gap between the cumulative share of
    weighted positives and the cumulative share of weight is multiplied by
    the row weight, and these terms are summed.

    Args:
        pred_df: DataFrame with ``target`` and ``pred`` columns (left unmodified).

    Returns:
        The summed, weight-scaled gap as a scalar.
    """
    ranked = pred_df.copy().sort_values('pred', ascending=False)
    weight = ranked['target'].eq(0).astype(int) * 19 + 1

    # Cumulative share of total weight seen so far.
    random_share = (weight / weight.sum()).cumsum()

    # Cumulative share of weighted positives found so far.
    weighted_pos = ranked['target'] * weight
    total_pos = weighted_pos.sum()
    lorentz = weighted_pos.cumsum() / total_pos

    return ((lorentz - random_share) * weight).sum()


def normalized_gini(df):
    """Weighted Gini of the predictions divided by that of a perfect ranking.

    Args:
        df: DataFrame with ``target`` and ``pred`` columns.

    Returns:
        ``weighted_gini(df)`` divided by the weighted Gini obtained when the
        target itself is used as the prediction.
    """
    # Reference frame in which the prediction is the target itself.
    perfect = df[['target']].copy()
    perfect['pred'] = perfect['target'].copy()

    return weighted_gini(df) / weighted_gini(perfect)

In [None]:
def label_smoothing(y, yhat):
    """Binary cross-entropy on logits with targets smoothed into [0.01, 0.99].

    Args:
        y: Target tensor; values are clamped to the range [0.01, 0.99].
        yhat: Raw (pre-sigmoid) logits, broadcastable against ``y``.

    Returns:
        Scalar tensor holding the mean loss over all elements.
    """
    y = torch.clamp(y, 0.01, 0.99)
    # log(sigmoid(x)) and log(1 - sigmoid(x)) == log(sigmoid(-x)) are computed
    # with logsigmoid: the naive form hits log(0) = -inf once the sigmoid
    # saturates in floating point, which turns the whole mean into inf.
    log_p = nn.functional.logsigmoid(yhat)
    log_not_p = nn.functional.logsigmoid(-yhat)
    loss = -y * log_p - (1 - y) * log_not_p
    return loss.mean()

# Train model

In [None]:
def get_lr(epoch_num):
    """Return the learning rate for a zero-based epoch from a step schedule.

    Args:
        epoch_num: Zero-based epoch index.

    Returns:
        The scheduled rate for the first eight epochs (three at 1e-3, three
        at 1e-4, two at 1e-5) and 1e-5 for every later epoch.
    """
    lrs = [1e-3, 1e-3, 1e-3, 1e-4, 1e-4, 1e-4, 1e-5, 1e-5]
    if epoch_num < len(lrs):
        # Index with the epoch argument; the previous code read the unrelated
        # global `foldnum`, so the result did not depend on the epoch at all.
        return lrs[epoch_num]
    return 1e-5

In [None]:
def evaluate(model, val_dataloader):
    """Score ``model`` on a validation loader.

    The model is switched to eval mode and run without gradient tracking;
    only its first output is used, passed through a sigmoid.

    Args:
        model: Module called as ``model(Xnumeric, Xcat)`` and returning three
            logit tensors.
        val_dataloader: Iterable yielding ``(Xnumeric, Xcat, y)`` batches.

    Returns:
        Tuple ``(G, D, M)``: normalized Gini, top-4% capture rate, and the
        mean of the two.
    """
    model.eval()
    targets, scores = [], []

    with torch.no_grad():
        for Xnumeric, Xcat, y in val_dataloader:
            Xnumeric, Xcat, y = Xnumeric.to(device), Xcat.to(device), y.to(device)

            logits, _, _ = model(Xnumeric, Xcat)
            probs = logits.sigmoid()

            targets.extend(y.cpu().tolist())
            scores.extend(probs.cpu().tolist())

    results = pd.DataFrame({'target': targets, 'pred': scores})

    G = normalized_gini(results[['target', 'pred']])
    D = top_4percent(results[['target', 'pred']])
    return (G, D, (G + D) / 2)

In [None]:
def train_model(foldnum, train_dataloader, val_dataloader):
    """Train one AmexGruModel on a fold and checkpoint its best epoch.

    The loss is the mean of three BCE-with-logits terms, one per model output.
    AdamW is paired with a cosine-annealing schedule that is stepped once per
    batch over ``CFG.N_EPOCHS * len(train_dataloader)`` steps, and the
    gradient norm is clipped to 1. After every epoch the model is evaluated
    on ``val_dataloader``; whenever the score M improves, the whole model
    object is saved to ``model{foldnum}.pt``. The per-batch loss of each
    epoch is plotted.

    Args:
        foldnum: Fold identifier, used only in the checkpoint filename.
        train_dataloader: Yields ``(Xnumeric, Xcat, y)`` training batches.
        val_dataloader: Yields ``(Xnumeric, Xcat, y)`` validation batches.

    Returns:
        The best validation score M seen, or None when no epoch was run.
    """
    best_eval = None

    model = AmexGruModel().to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=0.001)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer,
        T_max=CFG.N_EPOCHS * len(train_dataloader),
        eta_min=1e-5,
    )

    for e in range(CFG.N_EPOCHS):
        epoch_loss = []
        model.train()
        for Xnumeric, Xcat, y in train_dataloader:
            Xnumeric = Xnumeric.to(device)
            Xcat = Xcat.to(device)
            y = y.to(device)

            (yhat, yhat1, yhat2) = model(Xnumeric, Xcat)
            # Every output is trained against the same target.
            loss1 = criterion(yhat, y)
            loss2 = criterion(yhat1, y)
            loss3 = criterion(yhat2, y)
            loss = (loss1 + loss2 + loss3) / 3

            optimizer.zero_grad(set_to_none=True)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
            # The scheduler advances per batch, matching the T_max above.
            scheduler.step()

            epoch_loss.append(loss.item())

        # Evaluating
        (G, D, M) = evaluate(model, val_dataloader)
        if best_eval is None or best_eval < M:
            best_eval = M
            # The full module is pickled (not just a state_dict) because the
            # inference cell restores it with a bare torch.load().
            torch.save(model, "model{}.pt".format(foldnum))

        print("epoch:{} | loss:{:.4f}".format(e, np.mean(epoch_loss)))
        print("current Eval:{:.4f} | best Eval:{:.4f}".format(M, best_eval))
        # Was "{:4f}" (minimum width 4, default 6 decimals); "{:.4f}" prints
        # four decimals like the other metrics.
        print("Gini:{:.4f} | Default Rate:{:.4f}".format(G, D))
        print()
        print()

        plt.title("train epoch loss.")
        plt.plot(epoch_loss)
        plt.show()

    return best_eval

In [None]:
# Stratified 5-way split on the labels with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=33)

for foldnum, (train_index, val_index) in enumerate(skf.split(ally, ally)):
    # Only folds 0 and 1 are trained; the loop exits on reaching fold 2.
    if foldnum >= 2:
        break

    train_dataset = AmexDataset(allX, ally, train_index)
    val_dataset = AmexDataset(allX, ally, val_index)

    # Training batches are shuffled and the incomplete last batch is dropped;
    # validation keeps its order and every sample.
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=CFG.BATCH_SIZE,
        shuffle=True,
        drop_last=True,
    )
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=CFG.BATCH_SIZE,
        shuffle=False,
        drop_last=False,
    )

    print("Foldnumber:", foldnum)
    print("number of train iterations:", len(train_dataloader))
    print("number of val iterations:", len(val_dataloader))

    train_model(foldnum, train_dataloader, val_dataloader)

# Inference

In [None]:
def load_pickle_obj(filename):
    """Read and return the object pickled in ``filename``.

    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(filename, 'rb') as file:
        return pickle.load(file)

# Lookup used in the submission step to replace the ids stored in the test
# chunks with the values this object holds for them.
test_id2customer = load_pickle_obj(
    "../input/amex-datasetcategorical-encoders/test_id2customer.pkl"
)
print(len(test_id2customer))

In [None]:
# Reload the checkpoints written by train_model for folds 0 and 1.
models = []
for i in range(2):
    # map_location moves tensors saved on a GPU onto the current `device`,
    # so the checkpoints also load in a CPU-only session.
    # NOTE(review): newer PyTorch releases default torch.load to
    # weights_only=True, which rejects fully pickled modules -- if the
    # runtime is upgraded, pass weights_only=False here (trusted files only).
    model = torch.load("model{}.pt".format(i), map_location=device)
    models.append(model)

In [None]:
# Sanity check: number of fold models held in `models`.
len(models)

In [None]:
# Score every available test chunk with all loaded models and collect one
# prediction frame per chunk in `sub_df`.
sub_df = []
for fileid in range(200):
    xpath = "../input/amex-test-time-series-dataset/Xchunk_{}.npy".format(fileid)
    customer_path = "../input/amex-test-time-series-dataset/customerIds_chunk_{}.npy".format(fileid)

    # Chunk indices without a feature file are skipped; the scan continues.
    if not os.path.exists(xpath):
        continue

    Xtest = np.load(xpath)
    customerids = np.load(customer_path)
    test_ids = np.arange(len(Xtest))
    test_dataset = AmexDataset(Xtest, None, test_ids, phase="infer")
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=512,
        shuffle=False,
        drop_last=False,
    )

    all_preds = []
    for Xnumeric, Xcat in test_loader:
        Xnumeric, Xcat = Xnumeric.to(device), Xcat.to(device)

        # Sum the sigmoid outputs of every model, then average them.
        preds = np.zeros(len(Xnumeric))
        for model in models:
            model.eval()
            with torch.no_grad():
                yhat, _, _ = model(Xnumeric, Xcat)
                yhat = yhat.sigmoid()
                preds += yhat.cpu().numpy()
        preds = preds / len(models)
        all_preds.extend(preds)

    df = pd.DataFrame({'customer_ID': customerids, 'prediction': all_preds})
    # Missing values (e.g. NaN predictions) are written as 0.0.
    df = df.fillna(0.0)
    sub_df.append(df)

In [None]:
# Stack the per-chunk prediction frames into one table.
# NOTE(review): this rebinds `sub_df` from a list to a DataFrame, so the cell
# is not re-runnable on its own -- re-run the inference cell first.
sub_df=pd.concat(sub_df)
# Replace each stored id with the value `test_id2customer` holds for it
# (presumably the original customer_ID string -- verify against the pickle).
sub_df['customer_ID'] = sub_df['customer_ID'].apply(lambda k: test_id2customer[k])
sub_df.head()


In [None]:
# Row/column count of the assembled submission table.
sub_df.shape

In [None]:
# Write the submission file; the DataFrame index is not written.
sub_df.to_csv("submission.csv", index=False)
