In [1]:
from data import *
from utils import *
import pandas as pd

# Read the pipe-delimited sample file. The bare `df.shape` on the last line is
# what the cell displays as its output.
df = pd.read_csv(
    "data/data_sample.csv",
    sep="|",
)
df.shape

(29, 4)

Test the `MultiLabelDataset` class methods:
- build_vocab_from_data
- build_vocab_from_pretrain_emb
- build_with_transformer

In [2]:
# One document per row: stripped headline, a single space, stripped body text.
# A missing value in either column propagates through `+` to the joined string.
data = (
    df["headline"].str.strip()
    + " "
    + df["text"].str.strip()
)

# NOTE(review): in the recorded output the label tensor has 28 entries, while
# get_dataloaders() on the same file reports num_classes == 45 -- confirm that
# raw `df["label"].values` is the label format this constructor expects.
dataset = MultiLabelDataset.build_vocab_from_data(
    data=data.values,
    labels=df["label"].values,
    tokenizer=Tokenizer(),
)

# Show item 0 (a pair of tensors in the recorded output).
dataset[0]

(tensor([1, 1, 1,  ..., 1, 1, 1]),
 tensor([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0,
         0, 0, 1, 1], dtype=torch.int16))

In [3]:
# Same inputs as the previous constructor call, but the vocabulary is selected
# by `pretrained_name` (presumably the GloVe 6B / 50-d vectors -- verify against
# the class implementation). Rebinding `dataset` replaces the earlier instance.
dataset = MultiLabelDataset.build_vocab_from_pretrain_emb(
    data=data.values,
    labels=df["label"].values,
    tokenizer=Tokenizer(),
    pretrained_name="glove.6B.50d",
)

# Show item 1.
# NOTE(review): in the recorded output its label tensor is identical to the one
# printed for item 0 in the previous cell -- worth confirming that is expected.
dataset[1]

(tensor([    0,     0,     0,    12,     0,     0,  4217,     0,     0,     0,
             0,     0,     4,     0,     0,     0,     0,     0,  3069,  5749,
             4,  1087,     0,   919, 24025,     0,  1246,     5,     0,     0,
             0,    14,   970,     7,  1903,  2309,   588,     0,   134, 10393,
             0,     4,     0,     0,     0,     0,     0,   177,  7124,     4,
           408,     0,   997, 24025,     0,     0,     0,  3096,  1852,  2575,
             6,  9068,     5,     0,     0,     0,     0,     4,     0,     0,
             0,     0,     0,   233,  7124,     4,   207,     0,   997, 24025,
             0,     0,     0,   195,     5,     0,     0,     0,     0,     0,
             0]),
 tensor([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0,
         0, 0, 1, 1], dtype=torch.int16))

Test get_dataloaders()

In [4]:
# Build the loaders with the project Tokenizer and vocab_from="glove.6B.50d".
# The call returns three values, unpacked here in order.
train_loader, test_loader, num_classes = get_dataloaders(
    file="data/data_sample.csv",
    vocab_from="glove.6B.50d",
    tokenizer=Tokenizer(),
)

# Displayed as the cell output (45 in the recorded run).
num_classes

45

In [5]:
# Print only the first batch produced by train_loader; `break` ends the loop
# after a single iteration. In the recorded output the batch is a dict with
# keys 'x', 'y' and 'lengths'.
for i in train_loader:
    print(i)
    break

{'x': tensor([[   0,    0,    0,  ...,    0,    0,    0],
        [   0,  545, 3065,  ...,    0,    0,    0],
        [   0,  896,  108,  ...,    0,    0,    0],
        ...,
        [   0,    0,   12,  ...,    0,    0,    0],
        [   0,    0,    0,  ...,    0,    0,    0],
        [   0,    0,   12,  ...,    0,    0,    0]]), 'y': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 1,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 1, 0],
        ...,
        [0, 0, 1,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 1]], dtype=torch.int16), 'lengths': [6471, 754, 590, 457, 435, 353, 174, 135, 133, 122, 120, 120, 111, 109, 102, 100, 91, 86, 85, 74, 73, 69, 65]}


In [6]:
# Print only the first batch produced by test_loader; `break` ends the loop
# after a single iteration. When the notebook is run top to bottom, this is the
# loader returned by the get_dataloaders(..., vocab_from="glove.6B.50d") call.
for i in test_loader:
    print(i)
    break

{'x': tensor([[    0,     0,  4649,  ...,    55,    31, 25059],
        [    0,     4,     0,  ...,     0,     0,     0],
        [    0,     0,    12,  ...,     0,     0,     0],
        [    0,  7949,  2058,  ...,     0,     0,     0],
        [    0,     0,    12,  ...,     0,     0,     0],
        [    0,     0,    12,  ...,     0,     0,     0]]), 'y': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0,

In [7]:
# Rebuild the loaders, this time passing the string "bert" as the tokenizer and
# no `vocab_from`. This rebinds train_loader / test_loader / num_classes, so
# cells below see these loaders rather than the earlier ones.
train_loader, test_loader, num_classes = get_dataloaders(
    tokenizer="bert",
    file="data/data_sample.csv",
)

# Displayed as the cell output (45 in the recorded run).
num_classes

45

In [8]:
# Print only the first batch produced by test_loader; `break` ends the loop
# after a single iteration. When the notebook is run top to bottom, this is the
# loader returned by the get_dataloaders(..., tokenizer="bert") call; the
# recorded output shows a dict whose first key is 'texts'.
for i in test_loader:
    print(i)
    break

{'texts': tensor([[  101,  2924,  1997,  2710,  6205,  1011,  2154,  1056,  1011,  3021,
         16189, 20704,  2290,  1017,  1012, 24709,  7473,  2102,  1012,  1996,
          2924,  1997,  2710,  2056,  1996,  2779, 10750,  2012,  2023,  2733,
          1055,   102],
        [  101,  7987,  2063,  1011,  1060,  6661,  2097,  2025, 13746,  2006,
         24529,  2063,  2651,  1012,  7987,  2063,  1060, 13246,  5183,  6661,
          2097,  2025, 13746,  6202,  2006,  1996,  4361,  4518,  3863,  2006,
          9857,   102],
        [  101,  2470,  9499,  1011, 10090,  9857,  5790,  2992,  1012,  2412,
          2368, 12012,  4297,  2056,  9857,  2009,  2992,  2049,  2416,  3204,
          5790,  2006, 10090,  9857,  4297,  2000,  2041,  4842, 14192,  2013,
          3006,   102],
        [  101,  4361,  4518,  3863, 13746,  2015,  6202,  1012,  1996,  4361,
          4518,  3863,  7943,  3274,  6202,  2004,  1997, 12457,  2629,  9765,
         26081, 13938,  2102,  2044,  1037,  3053