# PyTorch Logistic Regression x 300
This notebook merges the predictions of 300 logistic regression models (5 folds x 3 learning rates x 5 weight decays x 4 repeated models = 300 models).

In [None]:
import numpy as np
import pandas as pd
import os
import copy
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import MultiStepLR

In [None]:
def amex_metric(y_true, y_pred, return_components=False) -> float:
    """Amex metric for ndarrays.

    The score is the mean of two components, both computed on the samples
    sorted by descending prediction:

    * G -- the weighted Gini of the predictions divided by the weighted Gini
      obtained when the targets themselves are used as predictions;
    * D -- the fraction of positives (``target == 1``) found among the rows
      whose cumulative weight stays within 4 % of the total weight.

    Rows whose target equals 0 get weight 20, every other row weight 1.

    Args:
        y_true: array of targets; flattened with ``ravel()``.
        y_pred: array of scores; flattened with ``ravel()``.
        return_components: when true, return ``(G, D, 0.5 * (G + D))``
            instead of only the combined score.

    Returns:
        ``0.5 * (G + D)``, or the 3-tuple described above.

    Side effects:
        Prints G, D and the combined score on every call.
    """
    def sample_weight(target):
        # Vectorised equivalent of mapping every row through
        # ``20 if x == 0 else 1``; keeps the index so later arithmetic aligns.
        return pd.Series(np.where(target == 0, 20, 1), index=target.index)

    def top_four_percent_captured(df) -> float:
        """Corresponds to the recall for a threshold of 4 %"""
        weight = sample_weight(df['target'])
        four_pct_cutoff = int(0.04 * weight.sum())
        within_cutoff = weight.cumsum() <= four_pct_cutoff
        is_positive = df['target'] == 1
        return (within_cutoff & is_positive).sum() / is_positive.sum()

    def weighted_gini(df) -> float:
        """Weighted Gini of ``df`` in its current row order."""
        weight = sample_weight(df['target'])
        random = (weight / weight.sum()).cumsum()
        weighted_pos = df['target'] * weight
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random) * weight).sum()

    def normalized_weighted_gini(df) -> float:
        """Gini of the predictions relative to the Gini of a perfect ranking."""
        # Perfect ranking: use the target itself as the prediction.
        ideal = pd.DataFrame({'target': df['target'], 'prediction': df['target']})
        ideal = ideal.sort_values('prediction', ascending=False)
        return weighted_gini(df) / weighted_gini(ideal)

    df = pd.DataFrame({'target': y_true.ravel(), 'prediction': y_pred.ravel()})
    df = df.sort_values('prediction', ascending=False)
    g = normalized_weighted_gini(df)
    d = top_four_percent_captured(df)
    score = 0.5 * (g + d)
    print("G: {:.6f}, D: {:.6f}, ALL: {:.6f}".format(g, d, score))
    if return_components:
        return g, d, score
    return score

In [None]:
class LogisticRegression(nn.Module):
    """Bank of ``repeat`` independent logistic regressions over the same input.

    A single linear layer maps ``in_feats`` features to ``repeat`` scores and
    a sigmoid turns each score into a value in (0, 1).
    """

    def __init__(self, in_feats, repeat=1):
        super().__init__()
        # One output column per repeated model.
        self.encode = nn.Linear(in_features=in_feats, out_features=repeat, bias=True)
        self.output = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid of the linear scores, one column per repeat."""
        scores = self.encode(x)
        probabilities = self.output(scores)
        return probabilities

In [None]:
class early_stopper(object):
    """Patience-based early stopping on a higher-is-better validation value.

    Keeps the best value seen so far together with a CPU deep copy of the
    model that produced it, and raises ``is_earlystop`` once ``patience``
    consecutive evaluations failed to reach ``best_value + delta``.

    Args:
        patience: number of consecutive non-improving evaluations tolerated.
        verbose: print the counter on every non-improving evaluation.
        delta: margin added to the best value before comparing.
    """

    def __init__(self, patience=12, verbose=False, delta=0):
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.best_value = None
        self.best_cv = None
        self.is_earlystop = False
        self.count = 0
        self.best_model = None

    def _record_best(self, value, model):
        """Store ``value`` as the new best and snapshot ``model`` on the CPU."""
        self.best_value = value
        self.best_cv = value
        # ``model`` is optional; deep-copying None and calling ``.to`` on it
        # would raise AttributeError, so only snapshot a real model.
        self.best_model = None if model is None else copy.deepcopy(model).to('cpu')
        self.count = 0

    def earlystop(self, loss, value, model=None):
        """Update the stopper with one evaluation result.

        Args:
            loss: accepted for call compatibility; not used.
            value: evaluation value on the validation dataset (higher is
                better).
            model: model that produced ``value``; a deep copy moved to the
                CPU is kept when ``value`` becomes the new best.
        """
        no_improvement = (
            self.best_value is not None
            and value < self.best_value + self.delta
        )
        if not no_improvement:
            self._record_best(value, model)
            return

        self.count += 1
        if self.verbose:
            print('EarlyStoper count: {:02d}'.format(self.count))
        if self.count >= self.patience:
            self.is_earlystop = True

In [None]:
# Run configuration for the whole notebook.
params = dict(
    model='LogisticRegression',      # name of the model class to instantiate
    batch_size=2048,
    lr=[0.03, 0.01, 0.003],          # learning-rate grid
    wd=[3e-4, 1e-4, 3e-5, 1e-5, 0.],  # weight-decay grid
    repeat=4,                        # repeated models per grid point
    device='cuda:0',                 # set to 'cpu' to run without a GPU
    early_stopping=12,               # early-stopping patience
    n_fold=5,
    seed=2021,
    max_epochs=200,
)

def binary_cross_entropy(pos_weight=None):
    """Build a binary cross-entropy loss with optional class weighting.

    Args:
        pos_weight: controls the class weights.
            * ``None`` -- plain, unweighted cross-entropy.
            * list / tensor / ndarray -- ``pos_weight[0]`` weights the
              negative term and ``pos_weight[1]`` the positive term; the
              result is divided by ``sum(pos_weight)``.
            * int / float -- weights the positive term; the result is
              divided by ``pos_weight + 1``.

    Returns:
        A function ``weighted_bce(y_pred, y_true)`` returning the mean loss
        as a scalar tensor. ``y_pred`` holds probabilities, ``y_true`` the
        targets; both are flattened before use.

    Raises:
        TypeError: when the returned function is called and ``pos_weight``
            has an unsupported type.
    """
    def weighted_bce(y_pred, y_true):
        a = y_pred.reshape(-1)
        b = y_true.reshape(-1)
        # Clamp the logs like torch's own BCE does, so a probability of
        # exactly 0 or 1 yields a finite loss instead of 0 * -inf = nan.
        log_pos = torch.clamp(torch.log(a), min=-100.0)
        log_neg = torch.clamp(torch.log(1 - a), min=-100.0)
        if pos_weight is None:
            log_likelihood = (1 - b) * log_neg + b * log_pos
        elif isinstance(pos_weight, (list, torch.Tensor, np.ndarray)):
            log_likelihood = (
                pos_weight[0] * (1 - b) * log_neg + pos_weight[1] * b * log_pos
            ) / sum(pos_weight)
        elif isinstance(pos_weight, (int, float)):
            log_likelihood = ((1 - b) * log_neg + pos_weight * b * log_pos) / (pos_weight + 1)
        else:
            raise TypeError(
                "pos_weight must be None, a number, or a 2-element "
                "list/tensor/ndarray, got {}".format(type(pos_weight).__name__)
            )
        # Cross-entropy is the NEGATIVE mean log-likelihood; without the sign
        # flip a minimiser would push the predictions away from the targets.
        return -torch.mean(log_likelihood)
    return weighted_bce

In [None]:
# Per fold: train one model for every (lr, wd) pair on the fold's training
# split, early-stop each of them on the Amex metric of the validation split,
# then average the best checkpoints' predictions on the fold's test file.
# The per-fold test predictions are finally averaged into `test_preds`.
device = params['device']
# NOTE(review): eval() resolves the class named in the hard-coded params dict.
# Never let params['model'] come from external input.
model_class = eval(params['model'])
test_preds = list()
for fold in range(params['n_fold']):
    print(f'Training fold {fold + 1}')
    x_train = pd.read_pickle("../input/amex-5-fold-agg-data/train_fold_{}.pkl".format(fold))
    x_val = pd.read_pickle("../input/amex-5-fold-agg-data/val_fold_{}.pkl".format(fold))
    print("Point 1!")
    features = [col for col in x_val.columns if col not in ['customer_ID', 'target']]
    x_train = torch.from_numpy(x_train[features].values).float().to(device)
    x_val = torch.from_numpy(x_val[features].values).float().to(device)
    print("Point 2!")
    y_train = pd.read_csv("../input/amex-5-fold-agg-data/train_fold_{}_target.csv".format(fold))
    # Per-sample loss weight: target * 1.862 + 1 (2.862 where target == 1,
    # 1 where target == 0).
    pos_weight = torch.from_numpy((y_train.target * 1.862 + 1).values).float().to(device)
    y_val = pd.read_csv("../input/amex-5-fold-agg-data/val_fold_{}_target.csv".format(fold))
    y_train = torch.from_numpy(y_train.target.values).float().to(device)
    y_val = torch.from_numpy(y_val.target.values).float().to(device)
    # The metric runs on NumPy; convert the validation targets once per fold.
    y_val_np = y_val.float().cpu().numpy()

    # The loaders iterate over row indices; the feature tensors are indexed
    # directly. Uniform weights without replacement give a random permutation.
    train_sample_strategy = torch.utils.data.sampler.WeightedRandomSampler(np.ones(len(y_train)),
                                                                           num_samples=len(y_train), replacement=False)
    train_dataloader = torch.utils.data.DataLoader(np.array(range(len(y_train))), batch_size=params['batch_size'], num_workers=0,
                                                   sampler=train_sample_strategy, drop_last=False)
    val_sample_strategy = torch.utils.data.sampler.WeightedRandomSampler(np.ones(len(y_val)),
                                                                         num_samples=len(y_val), replacement=False)
    val_dataloader = torch.utils.data.DataLoader(np.array(range(len(y_val))), batch_size=params['batch_size'], num_workers=0,
                                                 sampler=val_sample_strategy, drop_last=False)

    # One model / optimizer / scheduler / stopper per (lr, wd) grid point.
    model_list = [model_class(x_train.shape[1], params['repeat']).to(device) for lr in params['lr'] for wd in params['wd']]
    lr_list = [lr * np.sqrt(params['batch_size']/2048) for lr in params['lr'] for wd in params['wd']]
    wd_list = [wd for lr in params['lr'] for wd in params['wd']]
    optimizer_list = [optim.Adam(model.parameters(), lr=lr_list[i], weight_decay=wd_list[i]) for i, model in enumerate(model_list)]
    # The schedulers are stepped once per batch, so the milestones count
    # optimizer steps, not epochs.
    lr_scheduler_list = [MultiStepLR(optimizer=optimizer, milestones=[800, 1600, 2400, 3200, 4000, 4800, 5600], gamma=0.6) for optimizer in optimizer_list]
    earlystoper_list = [early_stopper(patience=params['early_stopping'], verbose=False) for lr in lr_list]
    val_prediction_list = [torch.zeros(x_val.shape[0]).float().to(device) for lr in lr_list]
    print("Point 3!")
    for epoch in range(params['max_epochs']):
        print("In epoch:{:03d}".format(epoch))
        for model in model_list:
            model.train()
        for input_seeds in train_dataloader:
            batch_inputs = x_train[input_seeds]
            batch_labels = y_train[input_seeds]
            weight = pos_weight[input_seeds]
            for i, model in enumerate(model_list):
                # The model outputs one probability per repeat; the loss is
                # taken on their mean.
                train_batch_logits = model(batch_inputs)
                train_loss = F.binary_cross_entropy(train_batch_logits.mean(1), batch_labels, weight=weight)
                optimizer_list[i].zero_grad()
                train_loss.backward()
                optimizer_list[i].step()
                lr_scheduler_list[i].step()

        # Mini-batch validation. No loss is computed here; the placeholder
        # below is only passed through to the early stopper.
        val_loss_list = 0
        for model in model_list:
            model.eval()
        with torch.no_grad():
            for input_seeds in val_dataloader:
                batch_inputs = x_val[input_seeds]
                for i, model in enumerate(model_list):
                    val_prediction_list[i][input_seeds] = model(batch_inputs).mean(1)

        # NOTE(review): a stopper that already fired is still updated, and its
        # model keeps training until every stopper has fired.
        semaphore = len(model_list)
        for i, model in enumerate(model_list):
            # LogisticRegression already ends in a sigmoid, so these values are
            # probabilities; no second sigmoid is applied before scoring.
            val_predictions = val_prediction_list[i].cpu().numpy()
            earlystoper_list[i].earlystop(val_loss_list, amex_metric(y_val_np, val_predictions), model)
            if earlystoper_list[i].is_earlystop:
                semaphore -= 1
        if semaphore < 1:
            print("Early Stopping")
            break

    print("Best val_metric is:\t", "\t".join(["{:.4f}".format(earlystoper.best_cv) for earlystoper in earlystoper_list]))
    x_test = pd.read_pickle("../input/amex-5-fold-agg-data/test_fold_{}.pkl".format(fold))
    test_sample_strategy = torch.utils.data.sampler.WeightedRandomSampler(np.ones(len(x_test)),
                                                                          num_samples=len(x_test), replacement=False)
    test_dataloader = torch.utils.data.DataLoader(np.array(range(len(x_test))), batch_size=params['batch_size'], num_workers=0,
                                                  sampler=test_sample_strategy, drop_last=False)
    x_test = torch.from_numpy(x_test[features].values).float().to(device)
    test_predictions = torch.zeros(x_test.shape[0]).float()
    # Predict with the best checkpoint of every grid point and average them.
    b_model_list = [earlystoper.best_model.to(device) for earlystoper in earlystoper_list]
    for b_model in b_model_list:
        b_model.eval()
    with torch.no_grad():
        for step, input_seeds in enumerate(test_dataloader):
            batch_inputs = x_test[input_seeds]
            for b_model in b_model_list:
                test_batch_logits = b_model(batch_inputs)
                test_predictions[input_seeds] = test_predictions[input_seeds] + test_batch_logits.mean(1).detach().cpu()/len(b_model_list)
            if step % 50 == 0:
                print('In test batch:{:04d}'.format(step))
    test_preds.append(test_predictions)
    del x_train, x_val, x_test
# Average the per-fold test predictions.
test_preds = torch.stack(test_preds).mean(0).numpy()

#my_acc = acc(y, oof_predictions)
#my_ap = average_precision_score(y_target, torch.softmax(oof_predictions, dim=1).cpu()[:, 1])
#print("NN out of fold AP is:", my_ap)


In [None]:
import gc

# Take the customer IDs from the fold-0 test file, release the frame, and
# write the averaged predictions next to them as the submission CSV.
x_test = pd.read_pickle("../input/amex-5-fold-agg-data/test_fold_{}.pkl".format(0))
ids = x_test['customer_ID'].copy()
del x_test
gc.collect()
sub = pd.DataFrame({'customer_ID': ids, 'prediction': test_preds})
sub.to_csv("submission_LR_x_300.csv", index=False)