<a href="https://colab.research.google.com/github/sugangnb/ai-research/blob/main/colab_notebooks/alpha_effect_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install einops

Collecting einops
  Downloading https://files.pythonhosted.org/packages/5d/a0/9935e030634bf60ecd572c775f64ace82ceddf2f504a5fd3902438f07090/einops-0.3.0-py2.py3-none-any.whl
Installing collected packages: einops
Successfully installed einops-0.3.0


In [2]:
!pip install entmax

Collecting entmax
  Downloading https://files.pythonhosted.org/packages/05/da/27fc966a4786e933778161644a1a1a228148b296d2059682799c4a8ecff8/entmax-1.0.tar.gz
Building wheels for collected packages: entmax
  Building wheel for entmax (setup.py) ... [?25l[?25hdone
  Created wheel for entmax: filename=entmax-1.0-cp36-none-any.whl size=11018 sha256=b0a86d403462bb7030c5a61368b274f1c623cfbe43b95be5648843c3a873f26f
  Stored in directory: /root/.cache/pip/wheels/4c/2c/4e/687c0abbeb16f906bd5fb8a9763e1cdd2b0d118ad55a4332f2
Successfully built entmax
Installing collected packages: entmax
Successfully installed entmax-1.0


In [3]:
!rm -rf kicked_dataset/
!git clone https://github.com/calibertytz/kicked_dataset.git

Cloning into 'kicked_dataset'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 25 (delta 0), reused 0 (delta 0), pack-reused 22[K
Unpacking objects: 100% (25/25), done.


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df_train = pd.read_csv('kicked_dataset/df_train_final.csv')
df_test = pd.read_csv('kicked_dataset/df_test_final.csv')

train_X = df_train.drop(columns=['label'])
train_Y = df_train['label']

X_test = df_test.drop(columns=['label'])
Y_test = df_test['label']

test_X, val_X, test_Y, val_Y = train_test_split(X_test, Y_test, test_size=0.5, random_state=1) # x_val for updating alpha

In [5]:
def cate_count(df):
  count_res = df.nunique()
  return count_res.values

In [6]:
num_tokens = cate_count(train_X)

In [7]:
train_X = train_X.values
test_X = test_X.values
val_X = val_X.values

train_Y = train_Y.values
test_Y = test_Y.values
val_Y = val_Y.values

#train_Y = pd.get_dummies(train_Y).values
#test_Y = pd.get_dummies(test_Y).values
#val_Y = pd.get_dummies(val_Y).values

In [8]:
train_X.shape, train_Y.shape

((50000, 93), (50000,))

In [9]:
import torch
import torch.nn.functional as F
from torch import nn, einsum
from einops import rearrange
from entmax import sparsemax, entmax15, entmax_bisect


# helpers

def exists(val):
    return val is not None


def default(val, d):
    return val if exists(val) else d


# classes

class Residual(nn.Module):
    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x, **kwargs):
        return self.fn(x, **kwargs) + x


class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        return self.fn(self.norm(x), **kwargs)


class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."

    def __init__(self, features, eps=1e-12):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2


# attention

class GEGLU(nn.Module):
    def forward(self, x):
        x, gates = x.chunk(2, dim=-1)
        return x * F.gelu(gates)


class FeedForward(nn.Module):
    def __init__(self, dim, mult=4, dropout=0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, dim * mult * 2),
            GEGLU(),
            nn.Dropout(dropout),
            nn.Linear(dim * mult, dim)
        )

    def forward(self, x, **kwargs):
        return self.net(x)


class Attention(nn.Module):
    def __init__(
            self,
            dim,
            heads=8,
            dim_head=16,
            dropout=0.1,
            mask_type='entmax15'
    ):
        super().__init__()
        inner_dim = dim_head * heads
        self.heads = heads
        self.mask_type = mask_type
        self.scale = dim_head ** -0.5

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
        self.to_out = nn.Linear(inner_dim, dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        h = self.heads
        q, k, v = self.to_qkv(x).chunk(3, dim=-1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), (q, k, v))
        sim = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale

        if self.mask_type == 'entmax15':
            attn = entmax15(sim, dim=-1)
        elif self.mask_type == 'sparsemax':
            attn = sparsemax(sim, dim=-1)
        elif self.mask_type == 'entmax':
            alpha = 1.5
            attn = sim * entmax_bisect(sim, alpha=alpha, dim=-1)
        else:
            raise NotImplemented()
        attn = self.dropout(attn)
        out = einsum('b h i j, b h j d -> b h i d', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)', h=h)
        return self.to_out(out)


# transformer

class Transformer(nn.Module):
    def __init__(self, dim, heads=8, dim_head=16, attn_dropout=0.1, ff_dropout=0.1, depth=1):
        super().__init__()
        self.layers = nn.ModuleList([])
        self.depth = depth

        for _ in range(self.depth):
            self.layers.append(nn.ModuleList([
                Residual(PreNorm(dim, Attention(dim, heads=heads, dim_head=dim_head, dropout=attn_dropout,mask_type="entmax"))),
                Residual(PreNorm(dim, FeedForward(dim, dropout=ff_dropout))),
            ]))

    def forward(self, x):
        for i, (attn, ff) in enumerate(self.layers):
            x = attn(x)
            x = ff(x)
        return x


class DeepTabnet(nn.Module):
    def __init__(
            self,
            categories,
            dim,
            depth_info,
            heads=8,
            dim_head=16,
            dim_out=2,
            attn_dropout=0.1,
            ff_dropout=0.1
    ):
        super().__init__()
        assert all(map(lambda n: n > 0, categories)), 'number of each category must be positive'

        # categories related calculations

        self.num_categories = len(categories)
        self.num_unique_categories = sum(categories)
        total_tokens = self.num_unique_categories

        # embedding
        self.embeds = nn.Embedding(total_tokens, dim)

        # transformer
        self.transformer_base = Transformer(
            dim=dim,
            depth=depth_info[0],
            heads=heads,
            dim_head=dim_head,
            attn_dropout=attn_dropout,
            ff_dropout=ff_dropout
        )
        self.no_shared_transformer = nn.ModuleList()
        for i in range(1, len(depth_info)):
            trans = Transformer(
                dim=dim,
                depth=depth_info[i],
                heads=heads,
                dim_head=dim_head,
                attn_dropout=attn_dropout,
                ff_dropout=ff_dropout
            )
            self.no_shared_transformer.append(trans)
        # final layer
        self.mlp = nn.Sequential(nn.ReLU(),
                                 nn.Linear(dim * 93, dim_out))

    def forward(self, x_categ):
        assert x_categ.shape[
                   -1] == self.num_categories, f'you must pass in {self.num_categories} values for your categories input'
        x_categ_embed = self.embeds(x_categ)

        for t in self.no_shared_transformer:
            x = self.transformer_base(x_categ_embed)
            x = t(x)
        flat_categ = x.flatten(1)
        return self.mlp(flat_categ)

In [10]:
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np


class TorchDataset(Dataset):
    """
    Format for numpy array
    Parameters
    ----------
    X : 2D array
        The input matrix
    y : 1D array
        Target
    """

    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        x, y = self.x[index], self.y[index]
        return x, y

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


categories = num_tokens
dim = 64
depth_info = (1, 2, 2, 2, 2)
batch_size = 256
learning_rate = 1e-3
num_epochs = 2

train_loader = DataLoader(TorchDataset(train_X.astype(np.float32), train_Y),
        batch_size=batch_size)

val_loader = DataLoader(TorchDataset(test_X.astype(np.float32), test_Y),
        batch_size=batch_size)
        
test_loader = DataLoader(TorchDataset(val_X.astype(np.float32), val_Y),
        batch_size=batch_size)

model = DeepTabnet(categories, dim, depth_info).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()

# add radam and lookahead (TODO)
opt_1 = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate)


def train():
  # Train the model
  total_step = len(train_loader)
  for epoch in range(num_epochs):
      print('train model param \n')
      for i, (inputs, labels) in enumerate(train_loader):  
          # Move tensors to the configured device
          inputs = inputs.to(device=device, dtype=torch.long)
          labels = labels.to(device)
          # Forward pass
          outputs = model(inputs)
          loss1 = criterion(outputs, labels)
          
          # Backward and optimize
          opt_1.zero_grad()
          #loss1.backward(retain_graph=True)
          loss1.backward() 
          opt_1.step()
          if (i+1) % 5 == 0:
              print ('train mode, Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                    .format(epoch+1, num_epochs, i+1, total_step, loss1.item()))
              val_train()
              val()
              
def val_train():
    with torch.no_grad():
      correct = 0
      total = 0
      for inputs, labels in train_loader:
          inputs = inputs.to(device=device, dtype=torch.long)
          labels = labels.to(device)
          outputs = model(inputs)
          _, predicted = torch.max(outputs.data, 1)
          total += labels.size(0)
          correct += (predicted == labels).sum().item()

      print('train_accuracy: {} %'.format(100 * correct / total))


def val():
  # Test the model
  # In test phase, we don't need to compute gradients (for memory efficiency)
  with torch.no_grad():
      correct = 0
      total = 0
      for inputs, labels in test_loader:
          inputs = inputs.to(device=device, dtype=torch.long)
          labels = labels.to(device)
          outputs = model(inputs)
          _, predicted = torch.max(outputs.data, 1)
          total += labels.size(0)
          correct += (predicted == labels).sum().item()

      print('Accuracy: {} %'.format(100 * correct / total))

In [None]:
train()

train model param 

train mode, Epoch [1/2], Step [5/196], Loss: 0.3893
train_accuracy: 87.27 %
Accuracy: 85.99895579533589 %
train mode, Epoch [1/2], Step [10/196], Loss: 0.3310


In [None]:
class Attention(nn.Module):
    def __init__(
            self,
            dim,
            heads=8,
            dim_head=16,
            dropout=0.1,
            mask_type='entmax15'
    ):
        super().__init__()
        inner_dim = dim_head * heads
        self.heads = heads
        self.mask_type = mask_type
        self.scale = dim_head ** -0.5

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
        self.to_out = nn.Linear(inner_dim, dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        h = self.heads
        q, k, v = self.to_qkv(x).chunk(3, dim=-1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), (q, k, v))
        sim = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale

        if self.mask_type == 'entmax15':
            attn = entmax15(sim, dim=-1)
        elif self.mask_type == 'sparsemax':
            attn = sparsemax(sim, dim=-1)
        elif self.mask_type == 'entmax':
            alpha = 1
            attn = sim * entmax_bisect(sim, alpha=alpha, dim=-1)
        else:
            raise NotImplemented()
        attn = self.dropout(attn)
        out = einsum('b h i j, b h j d -> b h i d', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)', h=h)
        return self.to_out(out)


In [None]:
train()

In [None]:
class Attention(nn.Module):
    def __init__(
            self,
            dim,
            heads=8,
            dim_head=16,
            dropout=0.1,
            mask_type='entmax15'
    ):
        super().__init__()
        inner_dim = dim_head * heads
        self.heads = heads
        self.mask_type = mask_type
        self.scale = dim_head ** -0.5

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
        self.to_out = nn.Linear(inner_dim, dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        h = self.heads
        q, k, v = self.to_qkv(x).chunk(3, dim=-1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), (q, k, v))
        sim = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale

        if self.mask_type == 'entmax15':
            attn = entmax15(sim, dim=-1)
        elif self.mask_type == 'sparsemax':
            attn = sparsemax(sim, dim=-1)
        elif self.mask_type == 'entmax':
            alpha = 2
            attn = sim * entmax_bisect(sim, alpha=alpha, dim=-1)
        else:
            raise NotImplemented()
        attn = self.dropout(attn)
        out = einsum('b h i j, b h j d -> b h i d', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)', h=h)
        return self.to_out(out)

In [None]:
train()