In [1]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.8.0-py3-none-any.whl (452 kB)
[K     |████████████████████████████████| 452 kB 5.8 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 64.1 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
[K     |████████████████████████████████| 213 kB 73.0 MB/s 
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 60.6 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 75.5 MB/s 
Installing colle

In [3]:
import random
import numpy as np
import nltk
import gensim.downloader as api
import torch
import torch.nn as nn
import datasets

In [4]:
random.seed(42)
np.random.seed(42)
torch.random.manual_seed(42)
torch.cuda.random.manual_seed_all(42)

In [5]:
dataset = datasets.load_dataset("ag_news")
dataset["train"]

Downloading builder script:   0%|          | 0.00/4.06k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.65k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

Downloading and preparing dataset ag_news/default to /root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548...


Downloading data:   0%|          | 0.00/11.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/751k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Dataset ag_news downloaded and prepared to /root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 120000
})

In [6]:
dataset["train"][0]

{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.",
 'label': 2}

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [9]:
print("\n".join(api.info()['models'].keys()))

fasttext-wiki-news-subwords-300
conceptnet-numberbatch-17-06-300
word2vec-ruscorpora-300
word2vec-google-news-300
glove-wiki-gigaword-50
glove-wiki-gigaword-100
glove-wiki-gigaword-200
glove-wiki-gigaword-300
glove-twitter-25
glove-twitter-50
glove-twitter-100
glove-twitter-200
__testing_word2vec-matrix-synopsis


In [10]:
word2vec = api.load("glove-twitter-50")



In [11]:
from torch.utils.data import Dataset


class AgNewsDataset(Dataset):
    def __init__(self, word2vec, train=True, max_length=128):
        self.data = dataset["train"] if train else dataset["test"]
        self.tokenizer = nltk.WordPunctTokenizer()
        self.word2vec = word2vec
        self.max_length = max_length 
        self.mean = np.mean(word2vec.vectors, axis=0)
        self.std = np.std(word2vec.vectors, axis=0)

    def __getitem__(self, item):
        text = self.data[item]["text"]
        tokens = self.tokenizer.tokenize(text.lower())
        embeds = [
            self.word2vec.get_vector(token) 
            for token in tokens if token in self.word2vec
        ][:self.max_length]
        return {"inputs": (np.array(embeds) - self.mean) / self.std, "label": self.data[item]["label"]}

    def __len__(self):
        return len(self.data)

In [13]:
from torch.utils.data import Sampler, RandomSampler

class TextSampler(Sampler):
    def __init__(self, sampler, batch_size_tokens=1e5):
        self.sampler = sampler
        self.batch_size_tokens = batch_size_tokens

    def __iter__(self):
        batch = []
        max_len = 0
        for ix in self.sampler:
            row = self.sampler.data_source[ix]
            max_len = max(max_len, len(row["inputs"]))
            if (len(batch) + 1) * max_len > self.batch_size_tokens:
                yield batch
                batch = []
                max_len = len(row["inputs"])
            batch.append(ix)
        if len(batch) > 0:
            yield batch

    def __len__(self):
        return len(self.sampler)

In [14]:
def collate_fn(batch):
    max_len = max(len(row["inputs"]) for row in batch)
    input_embeds = np.zeros((len(batch), max_len, word2vec.vector_size))
    labels = np.zeros((len(batch),))
    for idx, row in enumerate(batch):
        input_embeds[idx][:len(row["inputs"])] += row["inputs"]
        labels[idx] = row["label"]
    return {"inputs": torch.FloatTensor(input_embeds), "labels": torch.LongTensor(labels)}


In [15]:
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler


train_dataset = AgNewsDataset(word2vec, train=True)
valid_dataset = AgNewsDataset(word2vec, train=False)
train_sampler = RandomSampler(train_dataset)
valid_sampler = SequentialSampler(valid_dataset)

train_loader = DataLoader(train_dataset, batch_sampler=TextSampler(train_sampler), collate_fn=collate_fn, num_workers=4)
valid_loader = DataLoader(valid_dataset, batch_sampler=TextSampler(valid_sampler), collate_fn=collate_fn, num_workers=4)



In [16]:
class CNNModel(nn.Module):
    def __init__(self, embed_size, hidden_size, num_classes=4):
        super().__init__()

        self.cnn = nn.Sequential(
            nn.Conv1d(embed_size, hidden_size, kernel_size=3, padding=1, stride=2),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1, stride=2),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1, stride=2),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(1),
            nn.Flatten(),
        )
        self.cl = nn.Sequential(
            nn.Linear(hidden_size, num_classes)
        )

    def forward(self, x):
        x = x.permute(0, 2, 1)
        features = self.cnn(x)
        prediction = self.cl(features)
        return prediction

In [22]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = CNNModel(word2vec.vector_size, 50).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

num_epochs = 1

In [23]:
import time
from tqdm.notebook import tqdm


def training(model, criterion, optimizer, num_epochs, max_grad_norm=0):
    for e in range(num_epochs):
        model.train()
        num_iter = 0
        pbar = tqdm(train_loader)
        for batch in pbar:
            input_embeds = batch["inputs"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            prediction = model(input_embeds)
            loss = criterion(prediction, labels)
            loss.backward()
            pbar.update(labels.size(0))
            if max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            num_iter += 1
        valid_loss = 0
        valid_acc = 0
        num_iter = 0
        model.eval()
        with torch.no_grad():
            correct = 0
            num_objects = 0
            
            for batch in valid_loader:
                input_embeds = batch["inputs"].to(device)
                labels = batch["labels"].to(device)

                prediction = model(input_embeds)
                valid_loss += criterion(prediction, labels)
                
                correct += (labels == prediction.argmax(-1)).float().sum()
                num_objects += len(labels)
                
                num_iter += 1

        print(f"Valid Loss: {valid_loss / num_iter}, accuracy: {correct / num_objects}")

In [24]:
training(model, criterion, optimizer, num_epochs)

  0%|          | 0/120000 [00:00<?, ?it/s]

Valid Loss: 1.793940782546997, accuracy: 0.21776314079761505


In [25]:
class RNN(nn.Module):
    def __init__(self, embed_size, hidden_size):
        super().__init__()

        self.embed_size = embed_size
        self.hidden_size = hidden_size

        self.w_h = nn.Parameter(torch.rand(hidden_size, hidden_size))
        self.b_h = nn.Parameter(torch.rand((1, hidden_size)))
        self.w_x = nn.Parameter(torch.rand(embed_size, hidden_size))
        self.b_x = nn.Parameter(torch.rand(1, hidden_size))

    def forward(self, x, hidden = None):

        hidden = torch.zeros((x.size(0), self.hidden_size)).to(x.device) if hidden is None else hidden
        for idx in range(x.size(1)):
            hidden = torch.tanh(x[:, idx] @ self.w_x + self.b_x + hidden @ self.w_h + self.b_h)
        return hidden

In [26]:
class RNNModel(nn.Module):
    def __init__(self, embed_size, hidden_size, num_classes=4):
        super().__init__()

        self.rnn = RNN(embed_size, hidden_size)
        self.cls = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        hidden = self.rnn(x)
        output = self.cls(hidden)
        return output

In [27]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = RNNModel(word2vec.vector_size, 50).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

num_epochs = 1
max_grad_norm = 1.0

In [28]:
training(model, criterion, optimizer, num_epochs, max_grad_norm)

  0%|          | 0/120000 [00:00<?, ?it/s]

Valid Loss: 1.3736284971237183, accuracy: 0.30710524320602417


In [29]:
class GRU(nn.Module):
    def __init__(self, embed_size, hidden_size):
        super().__init__()

        self.embed_size = embed_size
        self.hidden_size = hidden_size

        self.w_rh = nn.Parameter(torch.rand(hidden_size, hidden_size))
        self.b_rh = nn.Parameter(torch.rand((1, hidden_size)))
        self.w_rx = nn.Parameter(torch.rand(embed_size, hidden_size))
        self.b_rx = nn.Parameter(torch.rand(1, hidden_size))

        self.w_zh = nn.Parameter(torch.rand(hidden_size, hidden_size))
        self.b_zh = nn.Parameter(torch.rand((1, hidden_size)))
        self.w_zx = nn.Parameter(torch.rand(embed_size, hidden_size))
        self.b_zx = nn.Parameter(torch.rand(1, hidden_size))

        self.w_nh = nn.Parameter(torch.rand(hidden_size, hidden_size))
        self.b_nh = nn.Parameter(torch.rand((1, hidden_size)))
        self.w_nx = nn.Parameter(torch.rand(embed_size, hidden_size))
        self.b_nx = nn.Parameter(torch.rand(1, hidden_size))

    def forward(self, x, hidden = None):

        hidden = torch.zeros((x.size(0), self.hidden_size)).to(x.device) if hidden is None else hidden
        for idx in range(x.size(1)):
            r = torch.sigmoid(x[:, idx] @ self.w_rx + self.b_rx + hidden @ self.w_rh + self.b_rh)
            z = torch.sigmoid(x[:, idx] @ self.w_zx + self.b_zx + hidden @ self.w_zh + self.b_zh)
            n = torch.tanh(x[:, idx] @ self.w_zx + self.b_zx + r * (hidden @ self.w_zh + self.b_zh))
            hidden = (1 - z) * n + z * hidden
        return hidden

In [30]:
class GRUModel(nn.Module):
    def __init__(self, embed_size, hidden_size, num_classes=4):
        super().__init__()

        self.gru = GRU(embed_size, hidden_size)
        self.cls = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        hidden = self.gru(x)
        output = self.cls(hidden)
        return output

In [33]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = GRUModel(word2vec.vector_size, 50).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

num_epochs = 1
max_grad_norm = 1.0

In [32]:
training(model, criterion, optimizer, num_epochs, max_grad_norm)

  0%|          | 0/120000 [00:00<?, ?it/s]

Valid Loss: 1.309853434562683, accuracy: 0.3461841940879822


In [34]:
class AgNewsDatasetv2(Dataset):
    def __init__(self, train=True, max_length=128):
        self.data = dataset["train"] if train else dataset["test"]
        self.tokenizer = nltk.WordPunctTokenizer()
        self.max_length = max_length
        self.vocab = set(
            self.tokenizer.tokenize("".join(d["text"].lower() for d in self.data))
        )
        self.word2idx = {word: idx + 1 for idx, word in enumerate(self.vocab)}

    def __getitem__(self, item):
        text = self.data[item]["text"]
        tokens = self.tokenizer.tokenize(text.lower())
        embeds = [self.word2idx.get(token, 0) for token in tokens][:self.max_length]
        return {"inputs": embeds, "label": self.data[item]["label"]}

    def __len__(self):
        return len(self.data)

In [35]:
def collate_fn_v2(batch):
    max_len = max(len(row["inputs"]) for row in batch)
    input_embeds = np.zeros((len(batch), max_len))
    labels = np.zeros((len(batch),))
    for idx, row in enumerate(batch):
        input_embeds[idx][:len(row["inputs"])] += row["inputs"]
        labels[idx] = row["label"]
    return {"inputs": torch.LongTensor(input_embeds), "labels": torch.LongTensor(labels)}

In [36]:
train_dataset = AgNewsDatasetv2(train=True)
valid_dataset = AgNewsDatasetv2(train=False)
valid_dataset.vocab = train_dataset.vocab
valid_dataset.word2idx = train_dataset.word2idx
train_sampler = RandomSampler(train_dataset)
valid_sampler = SequentialSampler(valid_dataset)

train_loader = DataLoader(train_dataset, batch_sampler=TextSampler(train_sampler), collate_fn=collate_fn_v2, num_workers=4)
valid_loader = DataLoader(valid_dataset, batch_sampler=TextSampler(valid_sampler), collate_fn=collate_fn_v2, num_workers=4)

In [37]:
class GRUModelv2(nn.Module):
    def __init__(self, voc_size, embed_size, hidden_size, num_classes=4):
        super().__init__()

        self.emb = nn.Embedding(voc_size, embed_size)
        self.gru = GRU(embed_size, hidden_size)
        self.cls = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = self.emb(x)
        hidden = self.gru(x)
        output = self.cls(hidden)
        return output

In [38]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = GRUModelv2(len(train_dataset.vocab) + 1, word2vec.vector_size, 50).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

num_epochs = 1
max_grad_norm = 1.0

In [39]:
with torch.no_grad():
    for word in train_dataset.vocab:
        if word in word2vec:
            model.emb.weight[train_dataset.word2idx[word]] = torch.from_numpy(word2vec.get_vector(word))

  model.emb.weight[train_dataset.word2idx[word]] = torch.from_numpy(word2vec.get_vector(word))


In [40]:
training(model, criterion, optimizer, num_epochs, max_grad_norm)

  0%|          | 0/120000 [00:00<?, ?it/s]

Valid Loss: 1.1700891256332397, accuracy: 0.47999998927116394
