In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition training set and print a summary of its columns (dtypes, non-null counts).
train_df = pd.read_csv("/kaggle/input/tabular-playground-series-aug-2021/train.csv")
train_df.info()

In [None]:
# Load the competition test set; it is scored later in the notebook to build the submission file.
sub_test_df = pd.read_csv("/kaggle/input/tabular-playground-series-aug-2021/test.csv")
sub_test_df.info()

In [None]:
# Preview the first rows to sanity-check the loaded columns.
train_df.head()

In [None]:
# Per-column summary statistics of the training frame.
train_df.describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap of the pairwise column correlations of train_df, drawn on an explicitly
# created (very large) axes; the colour scale is capped at 0.04.
fig, ax = plt.subplots(figsize=(36, 36))
ax = sns.heatmap(train_df.corr(), vmax=0.04, ax=ax)

In [None]:
# Flatten the correlation matrix into one Series (one entry per column pair), sorted from highest to lowest.
top_corrs = train_df.corr().unstack().sort_values(ascending=False)

In [None]:
# Plot the sorted correlations, keeping only values below 1 -- presumably to leave out
# each column's correlation with itself.
top_corrs[top_corrs < 1].plot()

In [None]:
# Reproducible 80/10/10 split of the labelled rows into train / validation / test.
# NOTE: `train_df` is rebound below, so re-running this cell without reloading the CSV
# splits the already-reduced training frame again.
rstate = 3495743
df = train_df.copy()

# 80% of the rows, sampled with a fixed seed, are used for fitting
train_df = df.sample(frac=.8, random_state=rstate)

# half of the rows whose id is not in the training split become the validation split ...
valid_df = df.loc[~df.id.isin(train_df.id)].sample(frac=.5, random_state=rstate)

# ... and every row whose id is in neither split is held out as the test split
test_df = df.loc[~(df.id.isin(train_df.id) | df.id.isin(valid_df.id))]

train_df.shape, valid_df.shape, test_df.shape

In [None]:
# Leakage check: number of training ids that also occur in the test / validation splits (both should be 0).
train_df.id.isin(test_df.id).sum(),train_df.id.isin(valid_df.id).sum()

In [None]:
import torch
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU; the bare name displays the choice.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device 

In [None]:
class KaggleTabularDataset(Dataset):
    """Torch dataset over a frame laid out as ``[id, feature columns..., label]``.

    The first column is kept as ``self.id``, the last column as ``self.label`` and
    everything in between is scaled and exposed as ``self.data``.

    Args:
        df: DataFrame with the column layout described above.
        scaler: optional, already fitted object with a ``transform`` method. When it is
            ``None`` a ``MinMaxScaler`` with range (-1, 1) is fitted on this frame's
            feature columns; the scaler that was used is available as ``self.scaler``.
    """
    def __init__(self,df,scaler=None):
        self.df = df
        features = self.df.iloc[:,1:-1].values
        # Compare against None explicitly: a supplied scaler must be reused even if the
        # object happens to evaluate as falsy (e.g. it defines __len__ or __bool__).
        if scaler is not None:
            self.scaler = scaler
            self.data_label = self.scaler.transform(features)
        else:
            self.scaler = MinMaxScaler(feature_range=(-1,1))
            self.data_label = self.scaler.fit_transform(features)

        self.id = self.df.iloc[:,0].values
        self.data = self.data_label[:,:]
        # labels are taken unscaled; the slice keeps them 2-D with a single column
        self.label = self.df.iloc[:,-1:].values

    def __len__(self):
        """Number of rows in the dataset."""
        return  self.data.shape[0]

    def __getitem__(self, idx):
        """Return ``(X, y)`` as float tensors placed on the notebook-level ``device``.

        ``idx`` is passed straight to NumPy indexing, so integers and slices both work.
        """
        X = torch.tensor(self.data[idx]).float().to(device)
        y = torch.tensor(self.label[idx]).float().to(device)
        return X, y

In [None]:
# The scaler is fitted on the training split only and then reused for the validation and
# test splits, so all three share the same feature scaling.
train_data = KaggleTabularDataset(train_df)
valid_data = KaggleTabularDataset(valid_df, scaler=train_data.scaler)
test_data = KaggleTabularDataset(test_df, scaler=train_data.scaler)


In [None]:
# KaggleTabularDataset treats the last column as the label and the columns between the
# first and the last as features, so the unlabelled competition test frame gets a
# placeholder `loss` column to match that layout.
sub_test_df['loss'] = np.nan
sub_test_data = KaggleTabularDataset(sub_test_df, scaler=train_data.scaler)

In [None]:
# this is one way to define a network
class Net(torch.nn.Module):
    """Feed-forward regressor.

    ``hidden1`` stacks two Linear -> LayerNorm -> ReLU stages
    (n_feature -> n_shrink -> n_hidden); it is followed by two plain linear layers
    (``combined`` and ``predict``). The final output is multiplied by 40.
    """
    def __init__(self, n_feature, n_shrink, n_hidden,  n_output):
        super().__init__()

        # Built in the same order as the layer indices 0..5 of the Sequential.
        stages = []
        for fan_in, fan_out in ((n_feature, n_shrink), (n_shrink, n_hidden)):
            stages += [nn.Linear(fan_in, fan_out), nn.LayerNorm(fan_out), nn.ReLU()]
        self.hidden1 = nn.Sequential(*stages)

        self.combined = nn.Linear(n_hidden, n_hidden)
        self.predict = nn.Linear(n_hidden, n_output)   # output layer

    def forward(self, x):
        """Map a batch of feature rows to ``n_output`` values per row."""
        hidden = self.combined(self.hidden1(x))
        # linear output, rescaled by a fixed factor of 40
        return self.predict(hidden) * 40

In [None]:
class FixedAttention(torch.nn.Module):
    """Single attention-style head with a learned projection ``Q`` instead of a computed query.

    ``x @ Q`` (scaled by sqrt(n_feature)) is soft-maxed over its 50 columns to give the
    weights ``alpha``; the head returns the alpha-weighted sum of ``x @ wV`` as one value
    per input row. The most recent weights are kept in ``self.alpha``.
    """
    def __init__(self, n_feature):
        super().__init__()
        n_slot = 50  # width of both learned projections
        self.Q = nn.Parameter(torch.rand(n_feature, n_slot))
        self.wV = nn.Parameter(torch.rand(n_feature, n_slot))
        self.alpha = None
        self.softmax = nn.Softmax(dim=1)
        # plain tensor attribute (not a parameter or buffer) holding sqrt(n_feature)
        self.d_k = torch.tensor(n_feature).float().sqrt()

    def forward(self, x):
        """Return a tensor with one trailing column holding the weighted sum for each row of ``x``."""
        scores = (x @ self.Q) / self.d_k
        self.alpha = self.softmax(scores)
        values = x @ self.wV
        return (self.alpha * values).sum(dim=1).unsqueeze(-1)

class AutoQueryNetV1(torch.nn.Module):
    """Regressor built from ``n_head`` independent FixedAttention heads.

    Each head yields one value per row; the ``n_head`` values are concatenated and passed
    through two linear layers. The final output is multiplied by 10.
    """
    def __init__(self, n_feature, n_head, n_hidden, n_output):
        super().__init__()

        # every head is moved to the notebook-level `device` as it is created
        heads = []
        for _ in range(n_head):
            heads.append(FixedAttention(n_feature).to(device))
        self.multi_heads = nn.ModuleList(heads)
        self.combined = nn.Linear(n_head, n_hidden)
        self.predict = nn.Linear(n_hidden, n_output)   # output layer

    def forward(self, x):
        """Concatenate the head outputs along the last axis and regress from them."""
        head_outputs = [head(x) for head in self.multi_heads]
        stacked = torch.cat(head_outputs, dim=-1)
        hidden = self.combined(stacked)
        return self.predict(hidden) * 10

In [None]:
class Norm(nn.Module):
    """Normalisation over the last axis with a learnable scale (``alpha``) and shift (``bias``).

    Each vector along the last axis is centred on its mean and divided by its standard
    deviation plus ``eps``.
    """
    def __init__(self, d_model, eps = 1e-6):
        super().__init__()

        self.size = d_model
        # learnable calibration of the normalised values: scale starts at 1, shift at 0
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` computed along the last axis."""
        centered = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.alpha * centered / spread + self.bias
    
class MultiHeadAttention(torch.nn.Module):
    """Multi-head variant of the learned-query attention used in this notebook.

    Parameters held by the module:
        Q:  (n_head, n_attn // n_head, n_feature) learned queries
        wK: (n_feature, n_feature) key projection
        wV: (n_feature, n_attn) value projection, split evenly across the heads

    Args:
        n_feature: width of each input row.
        n_head: number of heads; the output has one value per head.
        n_attn: total attention width. It must be divisible by ``n_head`` and the
            per-head width ``n_attn / n_head`` must be even and at least 20.

    Raises:
        AssertionError: if ``n_head`` / ``n_attn`` violate the constraints above.

    Note: ``d_k`` is placed on the notebook-level ``device`` at construction time.
    """
    def __init__(self, n_feature,n_head, n_attn):
        super(MultiHeadAttention, self).__init__()
        condition = (n_attn / n_head) % 2 == 0 and n_attn % n_head == 0 and n_attn / n_head >= 20
        if not condition:
            # same exception type as before, but now it says what was wrong
            raise AssertionError(
                f"invalid head configuration: n_attn={n_attn}, n_head={n_head}; "
                "n_attn must be divisible by n_head and n_attn / n_head must be an even number >= 20"
            )
        self.n_feature = n_feature
        self.n_head = n_head
        self.n_attn = n_attn
        # exact integer width of one head (divisibility was checked above)
        self.head_dim = n_attn // n_head
        self.Q = nn.parameter.Parameter(torch.rand((n_head, self.head_dim, n_feature)))
        self.wV = nn.parameter.Parameter(torch.rand((n_feature,n_attn)))
        self.wK = nn.parameter.Parameter(torch.rand((n_feature,n_feature)))

        self.alpha = None
        self.softmax = nn.Softmax(1)
        self.d_k = torch.sqrt(torch.tensor(n_feature/n_head).float()).to(device)

    def forward(self, x):
        """Return one attended value per head for every row of ``x``.

        ``x`` is expected to be 2-D (batch, n_feature): ``K.T`` and the matmul shapes
        below rely on it.
        """
        K = torch.matmul(x, self.wK)                     # (batch, n_feature)
        QK_T = torch.matmul(self.Q,K.T)                  # (n_head, head_dim, batch)
        # softmax over the head_dim axis, then move the batch axis to the front
        self.alpha = self.softmax(QK_T/self.d_k).permute(2,0,1)
        V = torch.matmul(x,self.wV).reshape(-1,self.n_head,self.head_dim)
        # weighted sum over head_dim -> (batch, n_head)
        return torch.sum(self.alpha*V,2)

class AutoQueryNetV2(torch.nn.Module):
    """Regressor: MultiHeadAttention -> Norm -> Linear -> LayerNorm -> Linear.

    The attention block emits one value per head, so the first linear layer takes
    ``n_head`` inputs. The final output is multiplied by 10.
    """
    def __init__(self, n_feature, n_head, n_attn, n_hidden, n_output):
        super().__init__()

        self.multi_heads = MultiHeadAttention(n_feature, n_head, n_attn)
        self.norm1 = Norm(n_head)
        self.combined = nn.Linear(n_head, n_hidden)
        self.norm2 = nn.LayerNorm(n_hidden)
        self.predict = nn.Linear(n_hidden, n_output)   # output layer

    def forward(self, x):
        """Run the attention block, normalise, and regress to ``n_output`` values per row."""
        attended = self.norm1(self.multi_heads(x))
        hidden = self.norm2(self.combined(attended))
        return self.predict(hidden) * 10

In [None]:
class Attention(torch.nn.Module):
    """Single head that derives query, key and value from the same input row.

    The weights are ``softmax(Q * K / sqrt(n_feature))`` with an element-wise product of
    the query and key projections; the head returns the weighted sum of the value
    projection as one number per row. The latest weights are kept in ``self.alpha``.

    Args:
        n_feature: width of each input row.
        n_latent: width of the query/key/value projections (defaults to 16).
    """
    def __init__(self, n_feature, n_latent=16):
        super(Attention, self).__init__()
        self.wQ = nn.parameter.Parameter(torch.rand((n_feature,n_latent)))
        self.wK = nn.parameter.Parameter(torch.rand((n_feature,n_latent)))
        self.wV = nn.parameter.Parameter(torch.rand((n_feature,n_latent)))

        self.alpha = None
        self.softmax = nn.Softmax(1)
        self.d_k = torch.sqrt(torch.tensor(n_feature).float())

    def forward(self, x):
        """Return a (b_sz, 1) tensor: the attention-weighted sum of the value projection."""
        Q = torch.matmul(x, self.wQ) # (b_sz,n_fea)x(n_fea,n_latent) -> (b_sz,n_latent)
        K = torch.matmul(x, self.wK) # (b_sz,n_fea)x(n_fea,n_latent) -> (b_sz,n_latent)
        self.alpha = self.softmax(Q*K/self.d_k) # (b_sz,n_latent)
        V = torch.matmul(x,self.wV) # (b_sz,n_fea)x(n_fea,n_latent) -> (b_sz,n_latent)
        return torch.sum(self.alpha*V,1).unsqueeze(-1)

class AutoQueryNetV3(torch.nn.Module):
    """Regressor built from ``n_head`` independent Attention heads.

    Each head yields one value per row; the ``n_head`` values are concatenated and passed
    through two linear layers. The final output is multiplied by 10.
    """
    def __init__(self, n_feature, n_head, n_hidden, n_output):
        super().__init__()

        # every head is moved to the notebook-level `device` as it is created
        heads = []
        for _ in range(n_head):
            heads.append(Attention(n_feature).to(device))
        self.multi_heads = nn.ModuleList(heads)
        self.combined = nn.Linear(n_head, n_hidden)
        self.predict = nn.Linear(n_hidden, n_output)   # output layer

    def forward(self, x):
        """Concatenate the head outputs along the last axis and regress from them."""
        head_outputs = [head(x) for head in self.multi_heads]
        stacked = torch.cat(head_outputs, dim=-1)
        hidden = self.combined(stacked)
        return self.predict(hidden) * 10

In [None]:
def RMSELoss(yhat,y):
    """Root-mean-square error between predictions ``yhat`` and targets ``y`` (a scalar tensor)."""
    squared_error = (yhat - y).pow(2)
    return squared_error.mean().sqrt()

In [None]:
def training_loop(data, model, val_data=None, t_data=None, verbose=2, num_epoch=25, patient=7):
    """Train ``model`` on ``data``, with optional early stopping on the validation loss.

    Uses the notebook-level globals ``loss_func`` and ``optimizer`` and calls ``model_eval``
    for the validation / test passes.

    Args:
        data: DataLoader yielding ``(X, y)`` training batches.
        model: the module to train (updated in place).
        val_data: optional DataLoader; when given, the validation loss is computed after
            every epoch and drives checkpointing and early stopping.
        t_data: optional DataLoader whose loss is only reported (after each validation pass).
        verbose: values above 1 also print the running batch loss about five times per epoch.
        num_epoch: maximum number of epochs.
        patient: number of consecutive epochs without a new best validation loss after
            which training stops and the best weights are restored.

    Returns:
        ``model``. If training stops early it carries the best weights seen; if all
        epochs run, it carries the weights of the last epoch.

    Side effects:
        Every new best model is also written to the file ``best_model`` via ``torch.save``.
    """
    import copy  # local import: keeps this cell independent of the notebook's import cells

    min_loss = float("inf")
    pat_ct = 0
    best_state = None  # in-memory copy of the best weights seen so far

    n_batch = len(data)
    # Report roughly five times per epoch; never 0, so loaders with fewer than
    # five batches do not trigger a modulo-by-zero.
    log_every = max(1, n_batch // 5)

    for epoch in range(num_epoch):
        # model_eval() switches the model to eval mode, so training mode has to be
        # re-enabled at the start of every epoch, not just once.
        model.train()
        pat_ct += 1
        batch_loss = []
        for batch, (X, y) in enumerate(data):
            output = model(X)
            loss = loss_func(output, y)

            optimizer.zero_grad()   # clear gradients for next train
            loss.backward()         # backpropagation, compute gradients
            optimizer.step()        # apply gradients
            batch_loss.append(loss.item())

            if verbose > 1 and batch % log_every == 0:
                print(f"batch {batch+1}/{n_batch} loss: {sum(batch_loss)/len(batch_loss)}")

        # mean loss of this epoch's batches only
        if batch_loss:
            print(f"epoch {epoch+1}/{num_epoch} loss: {sum(batch_loss)/len(batch_loss)}")

        if not val_data:
            # Without validation data there is nothing to compare against, so the
            # patience counter is ignored and all epochs are run.
            continue

        eval_loss = model_eval(val_data, model)
        if t_data:
            model_eval(t_data, model, test=True)
        if min_loss > eval_loss:
            print('saving best model')
            min_loss = eval_loss
            best_state = copy.deepcopy(model.state_dict())
            torch.save(model, 'best_model')
            pat_ct = 0

        if pat_ct == patient: #each patient is an epoch that no best model according to val data loss
            print('ran out of patient, loading best model')
            # Restore from the in-memory snapshot instead of unpickling the file.
            if best_state is not None:
                model.load_state_dict(best_state)
            return model
        print(f'patient {pat_ct}/{patient}')

    return model
            

                
def model_eval(data, model, test=False):
    """Return the mean per-batch loss of ``model`` over ``data`` and print it.

    Uses the notebook-level global ``loss_func``. The model is switched to eval mode and
    left in that mode; gradients are disabled during the pass.

    Args:
        data: iterable of ``(X, y)`` batches (e.g. a DataLoader).
        model: the module to evaluate.
        test: only changes the printed label ('test' instead of 'validation').
    """
    with torch.no_grad():
        model.eval()
        batch_loss = [loss_func(model(X), y).item() for X, y in data]

    # unweighted average of the per-batch losses
    mean_loss = sum(batch_loss) / len(batch_loss)
    print(f"{'test' if test else 'validation'} loss: {mean_loss}")
    return mean_loss

In [None]:
def flatten(t):
    """Concatenate the items of every sub-iterable of ``t`` into one flat list (one level deep)."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

In [None]:
# Earlier experiments are kept commented out for reference.
#model = Net(n_feature=100, n_shrink=50, n_hidden=20, n_output=1)     # define the network

# print(net)  # net architecture
# model = Net(n_feature=100, n_shrink=50, n_hidden=20, n_output=1).to(device)
loss_func = RMSELoss  # root-mean-square error; torch.nn.MSELoss() was the earlier choice
#model = AutoQueryNetV1(100, 10, 64, 1).to(device)
#model = AutoQueryNetV2(100, 10, 200, 64, 1).to(device)
# Active model: AutoQueryNetV3(n_feature=100, n_head=32, n_hidden=64, n_output=1)
model = AutoQueryNetV3(100, 32, 64, 1).to(device)
# `loss_func` and `optimizer` are read as globals by training_loop / model_eval.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
print(model)
# shapes of all trainable parameters, shown as the cell output
[p.shape for p in model.parameters()]

In [None]:
# Training uses shuffled mini-batches of `batch_sz` rows; validation and test are
# evaluated in large, unshuffled batches.
batch_sz = 100
dataset = DataLoader(train_data, batch_size=batch_sz, shuffle=True)
valid_dataset = DataLoader(valid_data, batch_size=10000, shuffle=False)
test_dataset = DataLoader(test_data, batch_size=10000, shuffle=False)

# The validation loss drives checkpointing / early stopping; the test loss is only reported.
model = training_loop(dataset, model, val_data=valid_dataset,t_data=test_dataset, verbose=2)

In [None]:
# Validation and test loss of the model returned by training_loop.
model_eval(valid_dataset, model),model_eval(test_dataset, model, test=True)

In [None]:
# Two bar charts for a visual comparison: actual targets vs. model predictions
# for the first 100 rows of the held-out test split.
with torch.no_grad():
    model.eval()
    # actual `loss` values
    test_df.loss.iloc[:100].plot.bar(figsize=(36,6))
    # the whole test split is passed through the model; only the first 100 predictions are plotted
    pd.DataFrame(model(test_data[:][0]).cpu().detach().numpy()).iloc[:100].plot.bar(figsize=(36,6))

In [None]:
# Score the competition test set in order (no shuffling) so the predictions line up
# with the rows of the submission file.
sLoader = DataLoader(sub_test_data, batch_size=1000, shuffle=False)

outputs = []
# switching to eval mode is loop-invariant, so it is done once up front
model.eval()
with torch.no_grad():
    # sy is the placeholder label of the unlabelled test rows and is not used
    for sX,sy in sLoader:
        batch_out = model(sX).cpu().detach().numpy()
        outputs.append(batch_out)

In [None]:
# Stack the per-batch prediction arrays row-wise and keep the first column as a flat vector.
sub_list = np.vstack(outputs)[:,0]

In [None]:
# Fill the sample submission with the model's predictions.
sub = pd.read_csv('/kaggle/input/tabular-playground-series-aug-2021/sample_submission.csv')
# Bracket assignment always writes a DataFrame column; attribute assignment (sub.loss = ...)
# would only set a plain Python attribute if the column were ever missing.
sub['loss'] = sub_list
sub

In [None]:
# Write the submission file to the working directory, without the index column.
sub.to_csv('submission_07.csv',index=False)