In [1]:
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from sklearn.metrics import fbeta_score
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F
from torch.optim import Adam
from torch import nn
from tqdm import tqdm
import pandas as pd
import numpy as np
import random
import torch
import math
import json
import os

In [2]:
seed = 27
batch_size = 15
max_epochs = 1
device = "cuda"

In [3]:
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

In [4]:
train_ds = pd.read_json('/kaggle/input/pii-detection-removal-from-educational-data/train.json')
test = pd.read_json('/kaggle/input/pii-detection-removal-from-educational-data/test.json')

# train = train_ds.sample(frac=0.8, random_state=seed)
# val = train_ds.drop(train.index)
train = train_ds.copy()

In [5]:
def build_vocabulary(corpus):
    tokenizer = get_tokenizer("spacy", language="en_core_web_lg")

    def tokenizer_fn(data_iterator):
        for text in tqdm(data_iterator):
            yield tokenizer(text)

    vocab = build_vocab_from_iterator(tokenizer_fn(corpus), specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])

    return vocab, tokenizer

corpus = list(train_ds["full_text"].values) + list(test["full_text"].values)
vocab, tokenizer = build_vocabulary(corpus)
vocab_size = len(vocab)
print(f"\nVocabulary size: {vocab_size}")

100%|██████████| 6817/6817 [00:25<00:00, 264.45it/s]



Vocabulary size: 53986


In [6]:
def tokenize_text(dataframe, vocab, is_test=False):
    dataframe["token_num"] = dataframe["tokens"].apply(lambda x: np.array(vocab(x), dtype=np.int64))
    dataframe['seq_len'] = dataframe['tokens'].apply(lambda x: len(x))
    percentiles = [i * 0.1 for i in range(10)] + [.95, .99, .995]
    buckets = np.quantile(dataframe['seq_len'], percentiles)
    bucket_labels = [i for i in range(len(buckets) - 1)]
    dataframe['bucket'] = pd.cut(dataframe['seq_len'], bins=buckets, labels=bucket_labels)
    dataframe["bucket"] = dataframe["bucket"].fillna(0)
    dataframe["bucket"] = dataframe["bucket"].astype(int)
    dataframe["seq_len"] = dataframe["seq_len"].astype(int)
    if is_test:
        return dataframe[['token_num', 'seq_len', 'bucket', 'document']]
    return dataframe[['token_num', 'labels', "seq_len", "bucket"]]

In [7]:
train = tokenize_text(train, vocab, False)
# val = tokenize_text(val, vocab, False)
test = tokenize_text(test, vocab, True)

In [8]:
label_2_id = {
    "O": 0,
    "B-NAME_STUDENT": 1,
    "I-NAME_STUDENT": 2,
    "B-URL_PERSONAL": 3,
    "B-ID_NUM": 4,
    "B-EMAIL": 5,
    "I-STREET_ADDRESS": 6,
    "I-PHONE_NUM": 7,
    "B-USERNAME": 8,
    "B-PHONE_NUM": 9,
    "B-STREET_ADDRESS": 10,
    "I-URL_PERSONAL": 11,
    "I-ID_NUM": 12
}

id_2_label = {v: k for k, v in label_2_id.items()}

In [9]:
train["labels"] = train["labels"].apply(lambda x: [label_2_id[l] for l in x])
# val["labels"] = val["labels"].apply(lambda x: [label_2_id[l] for l in x])

In [10]:
def complete_batch(dataframe, batch_size):
    complete_buckets = []
    buckets = [bucket_df for _, bucket_df in dataframe.groupby('bucket')]

    for gr_id, bucket in enumerate(buckets):
        l = len(bucket)
        remainder = l % batch_size
        integer = l // batch_size

        if remainder != 0:
            bucket = pd.concat([bucket, pd.concat([bucket.iloc[:1]] * (batch_size - remainder))], ignore_index=True)
            integer += 1

        batch_ids = []
        for i in range(integer):
            batch_ids.extend([f'{i}_bucket{gr_id}'] * batch_size)

        bucket['batch_id'] = batch_ids
        complete_buckets.append(bucket)
    return pd.concat(complete_buckets, ignore_index=True)


def shuffle_batches(dataframe):
    batch_buckets = [df_new for _, df_new in dataframe.groupby('batch_id')]
    random.shuffle(batch_buckets)
    return pd.concat(batch_buckets).reset_index(drop=True)


def concater_collate(batch):
    (xx, yy, lengths, buckets) = zip(*batch)
    xx = torch.cat(xx, 0)
    yy = torch.from_numpy(np.array(yy))
    return xx, yy, list(lengths), list(buckets)

def concater_collate_test(batch):
    (xx, lengths, buckets, documents) = zip(*batch)
    xx = torch.cat(xx, 0)
    return xx, list(lengths), list(buckets), list(documents)


class PIIDataset(Dataset):
    def __init__(self, dataframe, batch_size, is_test=False):
        dataframe = complete_batch(dataframe=dataframe, batch_size=batch_size)
        dataframe = shuffle_batches(dataframe=dataframe)
        self.is_test = is_test
        if is_test:
            self.dataframe = dataframe[['token_num', 'seq_len', 'bucket', 'document']]
        else:
            self.dataframe = dataframe[['token_num', 'labels', 'seq_len', 'bucket']]

    def __getitem__(self, index):
        if self.is_test:
            X, seq_len, bucket, document = self.dataframe.iloc[index, :]
            X = torch.from_numpy(X)
            return X, seq_len, bucket, document
             
        else:
            X, Y, seq_len, bucket = self.dataframe.iloc[index, :]
            Y = torch.from_numpy(np.array(Y))
            padding = 3298 - len(Y)
            Y = F.pad(Y, (0, padding), value=99)
            X = torch.from_numpy(X)
            return X, Y, seq_len, bucket

    def __len__(self):
        return len(self.dataframe)

In [11]:
train_dataset = PIIDataset(train, batch_size, False)
# val_dataset = PIIDataset(val, batch_size, False)
test_dataset = PIIDataset(test, batch_size, True)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=concater_collate,
    drop_last=False,
    num_workers=2
)

# val_dataloader = DataLoader(
#     val_dataset,
#     batch_size=batch_size,
#     shuffle=False,
#     collate_fn=concater_collate,
#     drop_last=False,
#     num_workers=2
# )

test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=concater_collate_test,
    shuffle=False,
    drop_last=False,
    num_workers=2
)

In [12]:
class MLP(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class RotateChord(nn.Module):
    def __init__(self, n_tracks, track_size):
        super(RotateChord, self).__init__()
        self.n_tracks = n_tracks
        self.track_size = track_size

    def forward(self, x, lengths):

        ys = torch.split(
            tensor=x,
            split_size_or_sections=lengths,
            dim=0
        )

        zs = []

        # roll sequences separately
        for y in ys:
            y = torch.split(
                tensor=y,
                split_size_or_sections=self.track_size,
                dim=-1
            )
            z = [y[0]]
            for i in range(1, len(y)):
                offset = -2 ** (i - 1)
                z.append(torch.roll(y[i], shifts=offset, dims=0))
            z = torch.cat(z, -1)
            zs.append(z)

        z = torch.cat(zs, 0)
        assert z.shape == x.shape, 'shape mismatch'
        return z


class ChordMixerBlock(nn.Module):
    def __init__(
            self,
            embedding_size,
            n_tracks,
            track_size,
            hidden_size,
            mlp_dropout,
            layer_dropout
    ):
        super(ChordMixerBlock, self).__init__()

        self.mixer = MLP(
            embedding_size,
            hidden_size,
            embedding_size,
            act_layer=nn.GELU,
            drop=mlp_dropout
        )

        self.dropout = nn.Dropout(layer_dropout)

        self.rotator = RotateChord(n_tracks, track_size)

    def forward(self, data, lengths):
        res_con = data
        data = self.mixer(data)
        data = self.dropout(data)
        data = self.rotator(data, lengths)
        data = data + res_con
        return data


class ChordMixer(nn.Module):
    def __init__(self, vocab_size=53986, max_seq_len=3298, track_size=16, hidden_size=196, mlp_dropout=0.0, layer_dropout=0.0):
        super(ChordMixer, self).__init__()
        self.max_n_layers = math.ceil(np.log2(max_seq_len))
        n_tracks = math.ceil(np.log2(max_seq_len))
        embedding_size = int(n_tracks * track_size)
        self.embedding_size = embedding_size
        self.max_seq_len = max_seq_len
        
        self.embedding = nn.Embedding(
            vocab_size,
            embedding_size
        )

        self.chordmixer_blocks = nn.ModuleList(
            [
                ChordMixerBlock(
                    embedding_size,
                    n_tracks,
                    track_size,
                    hidden_size,
                    mlp_dropout,
                    layer_dropout
                )
                for _ in range(self.max_n_layers)
            ]
        )

        self.reshape = nn.Linear(embedding_size, max_seq_len * embedding_size)
        self.final = nn.Linear(embedding_size, 13)

    def forward(self, data, lengths):
        n_layers = math.ceil(np.log2(lengths[0]))

        data = self.embedding(data)
        for layer in range(n_layers):
            data = self.chordmixer_blocks[layer](data, lengths)

        data = [torch.mean(t, dim=0) for t in torch.split(data, lengths)]
        data = torch.stack(data)
        
        data = self.reshape(data)
        data = data.view(data.size(0), self.max_seq_len, -1)
        
        data = self.final(data)
        
        return data
    
class Trainer:
    def __init__(self, model, train_dataloader, criterion, optimizer):
        self.model = model
        self.train_dataloader = train_dataloader
#         self.val_dataloader = val_dataloader
        self.criterion = criterion
        self.optimizer = optimizer

    def train(self, current_epoch_nr):
        self.model.train()

        num_batches = len(self.train_dataloader)

        running_loss = 0.0
        items_processed = 0
        fbeta_score_total = 0.0

#         loop = tqdm(enumerate(self.train_dataloader), total=num_batches)
        for idx, (x, y, seq_len, bucket) in enumerate(self.train_dataloader):
            x = x.to(device)
            y = y.to(device)
            
            y_hat = self.model(x, seq_len)

            y_hat_flat = y_hat.view(-1, y_hat.size(-1))
            y_flat = y.view(-1)
            
            mask = y_flat.ne(99)
            y_hat_flat_masked = y_hat_flat[mask]
            y_flat_masked = y_flat[mask]

            loss = self.criterion(y_hat_flat_masked, y_flat_masked)
            loss.backward()

            self.optimizer.step()
            self.optimizer.zero_grad()

#             running_loss += loss.item()
#             items_processed += y.size(0)
            
#             y_pred = torch.argmax(y_hat_flat_masked, dim=1)
#             fbeta_score_total += fbeta_score(y_flat_masked.cpu().numpy(), y_pred.cpu().numpy(), beta=5, average='micro')
            
#             loop.set_description(f'Epoch {current_epoch_nr + 1}')
#             loop.set_postfix(train_loss=round(running_loss / items_processed, 6), fbeta_score=fbeta_score_total / (idx + 1))
            
#     def validate(self, current_epoch_nr):
#         self.model.eval()

#         num_batches = len(self.val_dataloader)

#         running_loss = 0.0
#         items_processed = 0
#         fbeta_score_total = 0.0

#         loop = tqdm(enumerate(self.val_dataloader), total=num_batches)
#         for idx, (x, y, seq_len, bucket) in loop:
#             x = x.to(device)
#             y = y.to(device)
            
#             y_hat = self.model(x, seq_len)

#             y_hat_flat = y_hat.view(-1, y_hat.size(-1))
#             y_flat = y.view(-1)
            
#             mask = y_flat.ne(99)
#             y_hat_flat_masked = y_hat_flat[mask]
#             y_flat_masked = y_flat[mask]

#             loss = self.criterion(y_hat_flat_masked, y_flat_masked)

#             running_loss += loss.item()
#             items_processed += y.size(0)

#             y_pred = torch.argmax(y_hat_flat_masked, dim=1)
#             fbeta_score_total += fbeta_score(y_flat_masked.cpu().numpy(), y_pred.cpu().numpy(), beta=5, average='micro')
            
#             loop.set_description(f'Epoch {current_epoch_nr + 1}')
#             loop.set_postfix(val_loss=round(running_loss / items_processed, 6), fbeta_score=fbeta_score_total / (idx + 1))
        

In [13]:
model = ChordMixer().to(device)
criterion = CrossEntropyLoss()
optimizer = Adam(lr=0.0003, params=model.parameters())

trainer = Trainer(model, train_dataloader, criterion, optimizer)

for epoch in range(max_epochs):
    print("Epoch ", epoch)
    trainer.train(epoch)
#     trainer.validate(epoch)

Epoch  0


In [14]:
model.eval()
predictions = []
for x, seq_len, bucket, document in test_dataloader:
    x = x.to(device)
    
    y_hat = model(x, seq_len)
    y_pred = torch.argmax(y_hat, dim=2)
    temp_preds = {
        "tokens": x.cpu().numpy(),
        "document": document,
        "labels": y_pred.cpu().numpy()
    }
    predictions.append(temp_preds)

In [15]:
result = []
for pred in predictions:
    documents = pred['document']
    labels = pred['labels']
    tokens = pred['tokens']
    for i in range(batch_size):
        document = documents[i]
        for idx, label in enumerate(labels[i]):
            if label != 0 and idx < len(tokens):
                result.append({
                    "document": document,
                    "token": idx,
                    "word": vocab.get_itos()[tokens[idx]],
                    "label": id_2_label[label]
                })

In [16]:
result_df = pd.DataFrame(result)
result_df = result_df.sort_values(by=['document']).reset_index(drop=True)
result_df["row_id"] = result_df.index
result_df = result_df[["row_id", "document", "token", "label"]]

In [17]:
result_df

Unnamed: 0,row_id,document,token,label
0,0,7,3288,B-NAME_STUDENT
1,1,7,3289,I-NAME_STUDENT
2,2,7,3288,B-NAME_STUDENT
3,3,7,3289,I-NAME_STUDENT
4,4,7,3288,B-NAME_STUDENT
...,...,...,...,...
265,265,123,3289,I-NAME_STUDENT
266,266,123,3288,B-NAME_STUDENT
267,267,123,3289,I-NAME_STUDENT
268,268,123,3288,B-NAME_STUDENT


In [18]:
result_df.to_csv("submission.csv", index=False)