In [5]:
!pip install transformers
!pip install SentencePiece
!pip install vncorenlp



In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
import pandas as pd
import numpy as np

from transformers import AutoModel, AutoTokenizer

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm

from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, classification_report

import warnings
warnings.filterwarnings("ignore")

In [8]:
import os
os.listdir('/content/drive/MyDrive/test/vihsd')

['dev.csv', 'train.csv', 'test.csv']

In [9]:
train_df = pd.read_csv('/content/drive/MyDrive/test/vihsd/train.csv')
dev_df = pd.read_csv('/content/drive/MyDrive/test/vihsd/dev.csv')
test_df = pd.read_csv('/content/drive/MyDrive/test/vihsd/test.csv')

In [10]:
train_df.shape, dev_df.shape, test_df.shape

((24048, 2), (2672, 2), (6680, 2))

# P

In [11]:
#pre-process
import re
import numpy as np

STOPWORDS = '/content/drive/MyDrive/test/vietnamese-stopwords.txt'
with open(STOPWORDS, "r") as ins:
    stopwords = []
    for line in ins:
        dd = line.strip('\n')
        stopwords.append(dd)
    stopwords = set(stopwords)

def filter_stop_words(train_sentences, stop_words):
    new_sent = [word for word in train_sentences.split() if word not in stop_words]
    train_sentences = ' '.join(new_sent)

    return train_sentences

def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'', text)

def preprocess(text, tokenizer):
    # text = filter_stop_words(text, stopwords)
    text = deEmojify(text)
    text = text.lower()
    text = tokenizer(text, return_tensors='pt', max_length=200, truncation=True, padding='max_length')
    ids = text['input_ids']
    attn_mask = text['attention_mask']
    return ids, attn_mask


def full_preprocess(X):
    X_p = [preprocess(text, tokenizer) for text in X]
    X_ids = [x[0] for x in X_p]
    X_attn_mask = [x[1] for x in X_p]
    X_ids, X_attn_mask = torch.cat(X_ids), torch.cat(X_attn_mask)
    return X_ids, X_attn_mask

In [12]:
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

X_train = train_df['free_text'].fillna('')
y_train = train_df['label_id'].values

X_dev = dev_df['free_text'].fillna('')
y_dev = dev_df['label_id'].values

X_test = test_df['free_text'].fillna('')
y_test = test_df['label_id'].values

X_train_ids, X_train_attn_masks = full_preprocess(X_train)
X_dev_ids, X_dev_attn_masks = full_preprocess(X_dev)
X_test_ids, X_test_attn_masks = full_preprocess(X_test)

config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

In [203]:
# model(X_train_ids[:2].to('cuda'), X_train_attn_masks[:2].to('cuda')).last_hidden_state

tensor([[[-0.2048, -0.1393, -0.2151,  ..., -0.5427, -0.0655, -0.0854],
         [-0.0637,  0.0886,  0.4915,  ..., -0.0878, -0.4867,  1.0695],
         [ 0.1235,  0.2365, -0.0434,  ...,  0.1860, -0.4038,  0.8755],
         ...,
         [-0.4209,  0.0350,  0.4603,  ..., -0.2518, -0.3851,  0.6667],
         [-0.4209,  0.0350,  0.4603,  ..., -0.2518, -0.3851,  0.6667],
         [-0.4209,  0.0350,  0.4603,  ..., -0.2518, -0.3851,  0.6667]],

        [[-0.1598, -0.0177, -0.2965,  ..., -0.2090,  0.0162,  0.7991],
         [-0.2434,  0.0890, -0.3365,  ..., -0.2292,  0.0827,  0.6587],
         [-0.2648, -0.4205, -0.1349,  ..., -0.0636,  0.2531,  0.5265],
         ...,
         [-0.6814,  0.3689, -0.2794,  ..., -0.1386, -0.1029,  0.3956],
         [-0.6814,  0.3689, -0.2794,  ..., -0.1386, -0.1029,  0.3956],
         [-0.6814,  0.3689, -0.2794,  ..., -0.1386, -0.1029,  0.3956]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>)

In [13]:
# Define a custom Dataset class
class TextDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx]
        }

# Create datasets
train_dataset = TextDataset(X_train_ids, X_train_attn_masks, y_train)
dev_dataset = TextDataset(X_dev_ids, X_dev_attn_masks, y_dev)
test_dataset = TextDataset(X_test_ids, X_test_attn_masks, y_test)

# Create dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=32, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Model

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torchvision import datasets, transforms
import torch.nn.functional as F

In [125]:
class ConvLayer(nn.Module):

    def __init__(self, in_channels=1, out_channels=256, kernel_size=9):
        super(ConvLayer, self).__init__()
        self.conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=1)

    def forward(self, x):
        return F.relu(self.conv(x))

class PrimaryCaps(nn.Module):

    def __init__(self, num_capsules=8, in_channels=256, out_channels=32, kernel_size=9):
        super(PrimaryCaps, self).__init__()
        self.num_capsules = num_capsules
        self.capsules = nn.ModuleList([
            nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=2, padding=0)
            for _ in range(num_capsules)
        ])

    def forward(self, x):
        # (batch_size, in_channels, height, width)
        u = [capsule(x) for capsule in self.capsules]
        u = torch.stack(u, dim=1)
        u = u.view(x.size(0), -1, self.num_capsules)
        # (batch_size, out_dims, num_caps)
        return self.squash(u)

    def squash(self, input_tensor):
        squared_norm = (input_tensor ** 2).sum(-1, keepdim=True)
        output_tensor = squared_norm *  input_tensor / ((1. + squared_norm) * torch.sqrt(squared_norm))
        return output_tensor

In [105]:
class DigitCaps(nn.Module):
    def __init__(self, num_capsules=10, num_routes=32*6*6, in_channels=8, out_channels=16):
        super(DigitCaps, self).__init__()

        self.in_channels = in_channels
        self.num_routes = num_routes
        self.num_capsules = num_capsules

        self.W = nn.Parameter(torch.randn(1, num_routes, num_capsules, out_channels, in_channels))

    def forward(self, x):
        batch_size = x.size(0)
        x = torch.stack([x] * self.num_capsules, dim=2).unsqueeze(4)

        W = torch.cat([self.W] * batch_size, dim=0)
        u_hat = torch.matmul(W, x)

        b_ij = Variable(torch.zeros(1, self.num_routes, self.num_capsules, 1))
        if USE_CUDA:
            b_ij = b_ij.cuda()

        num_iterations = 3
        for iteration in range(num_iterations):
            c_ij = F.softmax(b_ij)
            c_ij = torch.cat([c_ij] * batch_size, dim=0).unsqueeze(4)

            s_j = (c_ij * u_hat).sum(dim=1, keepdim=True)
            v_j = self.squash(s_j)

            if iteration < num_iterations - 1:
                a_ij = torch.matmul(u_hat.transpose(3, 4), torch.cat([v_j] * self.num_routes, dim=1))
                b_ij = b_ij + a_ij.squeeze(4).mean(dim=0, keepdim=True)

        return v_j.squeeze(1)

    def squash(self, input_tensor):
        squared_norm = (input_tensor ** 2).sum(-1, keepdim=True)
        output_tensor = squared_norm *  input_tensor / ((1. + squared_norm) * torch.sqrt(squared_norm))
        return output_tensor

In [206]:
class Decoder(nn.Module):

    def __init__(self, hidden_size=16*10, output_size=(1, 28, 28), num_classes=10):

        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.output_size = output_size
        self.reconstraction_layers = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, 1024),
            nn.ReLU(inplace=True),
            nn.Linear(1024, self.output_size[0] * self.output_size[1] * self.output_size[2]),
            nn.Tanh() # from -1 to 1
        )

    def forward(self, x, data):
        classes = torch.sqrt((x ** 2).sum(2))
        classes = F.softmax(classes)
        max_values, max_indices = classes.max(dim=1)

        masked = Variable(torch.sparse.torch.eye(self.num_classes))
        if USE_CUDA:
            masked = masked.cuda()
        # masked = (batch_size, num_classes) 0/1, 1 is highest probability class
        masked = masked.index_select(dim=0, index=max_indices.squeeze(1).data)
        x = (x * masked[:, :, None, None]).view(x.size(0), -1)
        reconstructions = self.reconstraction_layers(x)
        reconstructions = reconstructions.view(-1, self.output_size[0], self.output_size[1], self.output_size[2])

        return reconstructions, masked

In [210]:
padding_size = 200

test_data_0 = torch.rand((5, 768, padding_size, 1))

test_layer_0 = ConvLayer(in_channels=768, out_channels=256, kernel_size=(9,1))
test_output_0 = test_layer_0(test_data_0)
print(test_output_0.shape)

test_layer_1 = PrimaryCaps(num_capsules=8, in_channels=256, out_channels=32, kernel_size=(9, 1))
test_output_1 = test_layer_1(test_output_0)
print(test_output_1.shape)

test_layer_2 = DigitCaps(num_routes=2944, in_channels=8, num_capsules=3, out_channels=16)
test_output_2 = test_layer_2(test_output_1)
print(test_output_2.shape)

test_layer_3 = Decoder(hidden_size=3*16, output_size=(768, 200, 1), num_classes=3)
test_output_31, test_output_32 = test_layer_3(test_output_2, test_data_0)
print(test_output_31.shape, test_output_31.shape)

# caps = CapsNet()
# caps(test_data_1)

torch.Size([5, 256, 192, 1])
torch.Size([5, 2944, 8])
torch.Size([5, 3, 16, 1])
torch.Size([5, 768, 200, 1]) torch.Size([5, 768, 200, 1])


In [225]:
class CapsNet(nn.Module):
    def __init__(self):
        super(CapsNet, self).__init__()

        self.phobert = AutoModel.from_pretrained("vinai/phobert-base")
        self.conv_layer = ConvLayer(in_channels=768, out_channels=256, kernel_size=(9,1))
        self.primary_capsules = PrimaryCaps(num_capsules=8, in_channels=256, out_channels=32, kernel_size=(9, 1))
        self.digit_capsules = DigitCaps(num_routes=2944, in_channels=8, num_capsules=3, out_channels=16)
        self.decoder = Decoder(hidden_size=3*16, output_size=(768, 200, 1), num_classes=3)

        self.mse_loss = nn.MSELoss()

        self.phobert.train()
        self.phobert.requires_grad_(True)

    def forward(self, ids, attn_mask):
        x = self.phobert(ids, attn_mask).last_hidden_state.squeeze(-1)
        output = self.digit_capsules(self.primary_capsules(self.conv_layer(x)))
        reconstructions, masked = self.decoder(output, x)
        return output, reconstructions, masked

    def loss(self, data, x, target, reconstructions):
        return self.margin_loss(x, target) + self.reconstruction_loss(data, reconstructions)

    def margin_loss(self, x, labels, size_average=True):
        batch_size = x.size(0)

        v_c = torch.sqrt((x**2).sum(dim=2, keepdim=True))

        left = F.relu(0.9 - v_c).view(batch_size, -1)
        right = F.relu(v_c - 0.1).view(batch_size, -1)

        loss = labels * left + 0.5 * (1.0 - labels) * right
        loss = loss.sum(dim=1).mean()

        return loss

    def reconstruction_loss(self, data, reconstructions):
        loss = self.mse_loss(reconstructions.view(reconstructions.size(0), -1), data.view(reconstructions.size(0), -1))
        return loss * 0.0005

In [227]:
capsule_net = CapsNet()

In [None]:
for batch in train_dataloader:
    print(batch)
    capsule_net(batch['input_ids'], batch['attention_mask'])
    break

{'input_ids': tensor([[    0,  1021,   944,  ...,     1,     1,     1],
        [    0,  3788,  2506,  ...,     1,     1,     1],
        [    0,  7220,  2052,  ...,     1,     1,     1],
        ...,
        [    0,  7304, 13373,  ...,     1,     1,     1],
        [    0,  1818,    83,  ...,     1,     1,     1],
        [    0,  2272,  2492,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 2, 0,
        2, 0, 0, 0, 2, 0, 0, 0])}
