## Baseline

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import random_split
#from math import ceil
from sklearn import metrics
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.nn.functional as F

In [None]:
# Read the competition's train and test tables from the Kaggle input folder,
# in that order, with pandas' default CSV parsing.
train_data, test_data = map(
    pd.read_csv,
    (
        "../input/santander-customer-transaction-prediction/train.csv",
        "../input/santander-customer-transaction-prediction/test.csv",
    ),
)

In [None]:
def get_data(train, test):
    """Turn the raw train/test frames into PyTorch datasets.

    Args:
        train: DataFrame holding an "ID_code" column, a "target" column and
            the feature columns.
        test: DataFrame holding an "ID_code" column and the feature columns.

    Returns:
        A tuple ``(train_ds, val_ds, test_ds, test_ids)``:
        ``train_ds`` / ``val_ds`` are a random 80/20 split of the training
        rows, ``test_ds`` pairs the test features with all-zero placeholder
        labels (so it yields ``(x, y)`` batches like the other datasets), and
        ``test_ids`` is the "ID_code" column of ``test``.
    """
    # train/validation set
    y = train["target"]
    X = train.drop(["ID_code", "target"], axis=1)
    ds = TensorDataset(
        torch.tensor(X.values, dtype=torch.float32),
        torch.tensor(y.values, dtype=torch.float32),
    )
    n_train = int(0.8 * len(ds))
    train_ds, val_ds = random_split(ds, [n_train, len(ds) - n_train])

    # test set
    test_ids = test["ID_code"]
    X_test = torch.tensor(
        test.drop(["ID_code"], axis=1).values, dtype=torch.float32
    )
    # The test set has no labels. Use a zero placeholder sized to the test
    # rows rather than reusing the training labels: TensorDataset requires
    # every tensor to have the same first dimension, so borrowing the training
    # labels only worked when both frames had the same number of rows.
    y_placeholder = torch.zeros(len(X_test), dtype=torch.float32)
    test_ds = TensorDataset(X_test, y_placeholder)

    return train_ds, val_ds, test_ds, test_ids
    

# Create a simple NN as a baseline

In [None]:
class NN(nn.Module):
    """Baseline binary classifier.

    Batch-normalises the inputs, passes them through one hidden layer of 50
    ReLU units and maps the result to a single sigmoid probability per row.
    """

    def __init__(self, input_size):
        """Build the layer stack for rows with ``input_size`` features."""
        super().__init__()
        hidden_units = 50
        layers = [
            nn.BatchNorm1d(input_size),
            nn.Linear(input_size, hidden_units),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_units, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return a flat tensor holding one probability per input row."""
        logits = self.net(x)
        probabilities = torch.sigmoid(logits)
        # Collapse the trailing singleton dimension: (batch, 1) -> (batch,).
        return probabilities.view(-1)

In [None]:
def get_predicitons(loader, model, device):
    """Collect the model's outputs and the true labels over a whole loader.

    The model is put in eval mode for the pass, gradients are disabled, and
    the model is switched back to train mode before returning.

    Args:
        loader: iterable yielding ``(features, labels)`` tensor pairs.
        model: module called on each feature batch.
        device: device the batches are moved to before use.

    Returns:
        ``(saved_preds, true_labels)`` as two Python lists, in loader order.
    """
    model.eval()
    saved_preds, true_labels = [], []

    with torch.no_grad():
        for features, labels in loader:
            batch_scores = model(features.to(device))
            saved_preds.extend(batch_scores.tolist())
            true_labels.extend(labels.to(device).tolist())

    model.train()
    return saved_preds, true_labels

In [None]:
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Baseline network for 200 input features, moved to the selected device.
model = NN(input_size=200)
model = model.to(DEVICE)
optimizer = optim.Adam(params=model.parameters(), lr=2e-3, weight_decay=1e-4)
# BCELoss expects probabilities, so the model applies the sigmoid itself.
# With nn.BCEWithLogitsLoss the sigmoid would be part of the loss and would
# have to be removed from the model.
loss_fn = nn.BCELoss()

train_ds, val_ds, test_ds, test_ids = get_data(train_data, test_data)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_ds, batch_size=1024, shuffle=True)
val_loader, test_loader = (
    DataLoader(split, batch_size=1024) for split in (val_ds, test_ds)
)

In [None]:
EPOCHS = 20

for epoch in range(EPOCHS):
    # Validation AUC is measured *before* this epoch's updates, so the first
    # value printed belongs to the untrained model. get_predicitons leaves
    # the model in train mode when it returns.
    probabilities, true = get_predicitons(val_loader, model, device=DEVICE)
    print(f"VALID ROC:{metrics.roc_auc_score(true, probabilities)}")

    # One pass over the shuffled training batches.
    for data, targets in train_loader:
        data, targets = data.to(DEVICE), targets.to(DEVICE)

        # Clear stale gradients, then forward / backward / update.
        optimizer.zero_grad()
        scores = model(data)
        loss = loss_fn(scores, targets)
        loss.backward()
        optimizer.step()

# Model improvement

In [None]:
# Check whether the data is correlated: absolute pairwise correlations.
# Only numeric columns are passed to corr(): the string "ID_code" column
# cannot be correlated, and pandas >= 2.0 raises on it instead of silently
# dropping it as older versions did.

train_data.select_dtypes(include="number").corr().abs()

The features seem to be uncorrelated.

In [None]:
col_names = [f"var_{i}" for i in range(200)]
for col in tqdm(col_names):
    # How often each value of this feature occurs within the test set.
    count = test_data[col].value_counts()
    # The values that occur exactly once in this column.
    uniques = count[count == 1].index
    # Per-row flag: True where the row holds one of those one-off values.
    test_data[col + "_u"] = test_data[col].isin(uniques)

# A row "has a unique" when at least one of its 200 per-feature flags is set.
test_data["has_unique"] = test_data.loc[
    :, [col + "_u" for col in col_names]
].any(axis="columns")

In [None]:
# Split the test rows by the "has_unique" flag, keeping only the ID and the
# original feature columns. Rows with at least one value that is unique within
# the test set are treated as real; the remaining rows are treated as fake.
# Explicit copies make both frames independent of test_data, so adding columns
# to them later does not raise SettingWithCopyWarning.
real_test = test_data.loc[test_data["has_unique"], ["ID_code"] + col_names].copy()
fake_test = test_data.loc[~test_data["has_unique"], ["ID_code"] + col_names].copy()

In [None]:
# Stack the real test rows underneath the training rows (row-wise concat) so
# that value frequencies can be counted over both sets together.
train_and_test = pd.concat((train_data, real_test), axis="index")

In [None]:
for col in tqdm(col_names):
    # Frequency of every value of this feature across train + real test rows.
    count = train_and_test[col].value_counts()
    # New 0/1 feature: 1 when the row's value occurs exactly once in `count`.
    # Series.map does the lookup in one vectorised pass instead of a Python
    # call per element. A value missing from `count` maps to NaN, which
    # compares unequal to 1 and yields 0 rather than raising KeyError.
    # `.values` assigns by position, so the result does not depend on index
    # alignment (the concatenated frame may carry repeated index labels).
    train_and_test[col + "_unique"] = (
        train_and_test[col].map(count).eq(1).astype("int64").values
    )
    fake_test[col + "_unique"] = (
        fake_test[col].map(count).eq(1).astype("int64").values
    )

In [None]:
# Separate the combined frame again using the ID prefix.
# Rows whose ID_code contains "test" are the real test rows; they carry no
# label, so the "target" column is removed from them.
real_test = (
    train_and_test.loc[train_and_test["ID_code"].str.contains("test")]
    .drop(columns=["target"])
)
# Rows whose ID_code contains "train" become the enriched training set.
train_data_2 = train_and_test.loc[
    train_and_test["ID_code"].str.contains("train")
].copy()


In [None]:
# Rebuild the full test set with the new columns: real rows first, then the
# fake rows. The row order therefore differs from the original test_data.
test_data_2 = pd.concat((real_test, fake_test), axis="index")

In [None]:
# new nn

class NN_new(nn.Module):
    """Classifier that embeds each (feature, extra-feature) pair separately.

    The input row is expected to hold ``input_size // 2`` original features
    followed by the same number of companion features. Feature ``i`` is paired
    with companion ``i``, every pair goes through one shared
    ``Linear(2, hidden_dim)`` layer, and the flattened result is mapped to a
    single sigmoid probability.

    Args:
        input_size: total number of input columns; must be even.
        hidden_dim: width of the per-pair hidden representation.

    Raises:
        ValueError: if ``input_size`` is odd, since the columns could not be
            split into two equal halves.
    """

    def __init__(self, input_size, hidden_dim):
        super().__init__()
        if input_size % 2 != 0:
            raise ValueError(
                f"input_size must be even to form feature pairs, got {input_size}"
            )
        # Number of (original, companion) pairs. It replaces the previously
        # hard-coded 200, so the model works for any even input_size.
        self.num_features = input_size // 2
        self.bn = nn.BatchNorm1d(input_size)
        self.fc1 = nn.Linear(2, hidden_dim)
        self.fc2 = nn.Linear(self.num_features * hidden_dim, 1)

    def forward(self, x):
        """Return one probability per row of ``x`` (shape ``(batch,)``)."""
        batch_size = x.shape[0]
        n = self.num_features
        x = self.bn(x)
        orig_features = x[:, :n].unsqueeze(2)  # (batch, n, 1)
        new_features = x[:, n:].unsqueeze(2)  # (batch, n, 1)
        x = torch.cat([orig_features, new_features], dim=2)  # (batch, n, 2)
        x = self.fc1(x)  # (batch, n, hidden_dim)
        x = F.relu(x).reshape(batch_size, -1)  # (batch, n * hidden_dim)
        x = self.fc2(x)  # (batch, 1)
        return torch.sigmoid(x).view(-1)

In [None]:
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Pair-wise network: 400 inputs (200 original features plus their 200
# uniqueness flags) with a 16-wide hidden representation per pair.
model = NN_new(input_size=400, hidden_dim=16)
model = model.to(DEVICE)
optimizer = optim.Adam(params=model.parameters(), lr=2e-3, weight_decay=1e-4)
# BCELoss expects probabilities, so the model applies the sigmoid itself.
# With nn.BCEWithLogitsLoss the sigmoid would be part of the loss and would
# have to be removed from the model.
loss_fn = nn.BCELoss()

train_ds, val_ds, test_ds, test_ids = get_data(train_data_2, test_data_2)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_ds, batch_size=1024, shuffle=True)
val_loader, test_loader = (
    DataLoader(split, batch_size=1024) for split in (val_ds, test_ds)
)

In [None]:
# Sanity check: pull the first test batch and display the feature tensor's
# shape.
data, targets = next(iter(test_loader))
data.shape

In [None]:
EPOCHS = 30

for epoch in range(EPOCHS):
    # Validation AUC is measured *before* this epoch's updates, so the first
    # value printed belongs to the untrained model. get_predicitons leaves
    # the model in train mode when it returns.
    probabilities, true = get_predicitons(val_loader, model, device=DEVICE)
    print(f"VALID ROC:{metrics.roc_auc_score(true, probabilities):.4f}")

    # One pass over the shuffled training batches.
    for data, targets in train_loader:
        data, targets = data.to(DEVICE), targets.to(DEVICE)

        # Clear stale gradients, then forward / backward / update.
        optimizer.zero_grad()
        scores = model(data)
        loss = loss_fn(scores, targets)
        loss.backward()
        optimizer.step()

In [None]:
def get_submission(model, loader, test_ids, device, output_path="submission.csv"):
    """Predict over ``loader`` and write a two-column submission CSV.

    The model runs in eval mode with gradients disabled and is switched back
    to train mode afterwards.

    Args:
        model: module mapping a feature batch to one score per row.
        loader: iterable yielding ``(features, labels)`` pairs; the labels
            are ignored.
        test_ids: IDs for the "ID_code" column, in the same order as the rows
            produced by ``loader``.
        device: device the feature batches are moved to.
        output_path: destination of the CSV file. Defaults to
            "submission.csv", the filename that was previously hard-coded.

    Returns:
        None. The result is the file written to ``output_path``.
    """
    all_preds = []
    model.eval()
    with torch.no_grad():
        # Labels are placeholders for the test set, so they are not moved to
        # the device or otherwise used.
        for x, _ in loader:
            scores = model(x.to(device))
            all_preds.extend(scores.tolist())

    model.train()
    df = pd.DataFrame({
        "ID_code": np.asarray(test_ids),
        "target": np.array(all_preds),
    })

    df.to_csv(output_path, index=False)

In [None]:
# Score the test loader with the trained model and write the submission CSV.
get_submission(model=model, loader=test_loader, test_ids=test_ids, device=DEVICE)