In [1]:
from model import *
from data import *
from utils import *
import pandas as pd

df = pd.read_csv("data/data_sample.csv", sep="|")
data = df["headline"].str.strip() + " " + df["text"].str.strip()

In [2]:
rnn_config = {
    'type': 'gru',
    'params': {
        'hidden_size':128,
        'num_layers': 1
    }
}

nn_config = {
    'in_features': 128,
}

# 1. Vocab from data

In [3]:
train_loader, test_loader, NUM_CLASSES, dataset = get_dataloaders(
    file="data/data_sample.csv",
    tokenizer=Tokenizer()
)

In [4]:
VOCAB_SIZE = len(dataset.vocab)
model = RNN.from_data(rnn_config=rnn_config, 
                      nn_config=nn_config, 
                      NUM_CLASSES=NUM_CLASSES,
                      vocab_size=VOCAB_SIZE,
                      embedding_dim=22)

In [5]:
optimizer = torch.optim.Adam(params=model.parameters())
criterion = nn.BCELoss()

In [6]:
losses = []
EPOCH = 10

for epoch in range(EPOCH):
    loss = train(optimizer=optimizer, criterion=criterion, model=model, train_loader=train_loader)
    print(f"epoch: {epoch}\tloss: {loss:.3f}")
    losses.append(loss)

epoch: 0	loss: 0.679
epoch: 1	loss: 0.662
epoch: 2	loss: 0.645
epoch: 3	loss: 0.629
epoch: 4	loss: 0.612
epoch: 5	loss: 0.593
epoch: 6	loss: 0.573
epoch: 7	loss: 0.552
epoch: 8	loss: 0.528
epoch: 9	loss: 0.502


In [19]:
evaluate(model=model, criterion=criterion, test_loader=test_loader)

{'loss': 0.4867036044597626, 'acc': tensor(39.6667)}

# 2. Vocab from GloVe

In [8]:
import torchtext

print("Available pretrained:")
for k, v in torchtext.vocab.pretrained_aliases.items():
    print(k,v)

Available pretrained:
charngram.100d functools.partial(<class 'torchtext.vocab.vectors.CharNGram'>)
fasttext.en.300d functools.partial(<class 'torchtext.vocab.vectors.FastText'>, language='en')
fasttext.simple.300d functools.partial(<class 'torchtext.vocab.vectors.FastText'>, language='simple')
glove.42B.300d functools.partial(<class 'torchtext.vocab.vectors.GloVe'>, name='42B', dim='300')
glove.840B.300d functools.partial(<class 'torchtext.vocab.vectors.GloVe'>, name='840B', dim='300')
glove.twitter.27B.25d functools.partial(<class 'torchtext.vocab.vectors.GloVe'>, name='twitter.27B', dim='25')
glove.twitter.27B.50d functools.partial(<class 'torchtext.vocab.vectors.GloVe'>, name='twitter.27B', dim='50')
glove.twitter.27B.100d functools.partial(<class 'torchtext.vocab.vectors.GloVe'>, name='twitter.27B', dim='100')
glove.twitter.27B.200d functools.partial(<class 'torchtext.vocab.vectors.GloVe'>, name='twitter.27B', dim='200')
glove.6B.50d functools.partial(<class 'torchtext.vocab.vecto

## 2.1 GloVe 50d

In [9]:
train_loader, test_loader, NUM_CLASSES, dataset = get_dataloaders(
    file="data/data_sample.csv",
    tokenizer=Tokenizer(),
    vocab_from="glove.6B.50d"
)

In [10]:
VOCAB_SIZE = len(dataset.vocab)
model = RNN.from_glove(rnn_config=rnn_config,
                       nn_config=nn_config,
                       NUM_CLASSES=dataset.NUM_CLASSES,
                       glove_vectors=dataset.vectors,
                       embedding_dim=50)

In [11]:
optimizer = torch.optim.Adam(params=model.parameters())
criterion = nn.BCELoss()

In [12]:
losses = []
EPOCH = 10

for epoch in range(EPOCH):
    loss = train(optimizer=optimizer, criterion=criterion, model=model, train_loader=train_loader)
    print(f"epoch: {epoch}\tloss: {loss:.3f}")
    losses.append(loss)

epoch: 0	loss: 0.682
epoch: 1	loss: 0.662
epoch: 2	loss: 0.643
epoch: 3	loss: 0.623
epoch: 4	loss: 0.605
epoch: 5	loss: 0.585
epoch: 6	loss: 0.566
epoch: 7	loss: 0.545
epoch: 8	loss: 0.524
epoch: 9	loss: 0.503


In [13]:
evaluate(model=model, criterion=criterion, test_loader=test_loader)

{'loss': 0.4879262149333954, 'acc': tensor(40.3333)}

## 2.2 GloVe 200d

In [14]:
train_loader, test_loader, NUM_CLASSES, dataset = get_dataloaders(
    file="data/data_sample.csv",
    tokenizer=Tokenizer(),
    vocab_from="glove.6B.200d"
)

In [15]:
VOCAB_SIZE = len(dataset.vocab)
model = RNN.from_glove(rnn_config=rnn_config,
                       nn_config=nn_config,
                       NUM_CLASSES=dataset.NUM_CLASSES,
                       glove_vectors=dataset.vectors,
                       embedding_dim=200)

In [16]:
optimizer = torch.optim.Adam(params=model.parameters())
criterion = nn.BCELoss()

In [17]:
losses = []
EPOCH = 10

for epoch in range(EPOCH):
    loss = train(optimizer=optimizer, criterion=criterion, model=model, train_loader=train_loader)
    print(f"epoch: {epoch}\tloss: {loss:.3f}")
    losses.append(loss)

epoch: 0	loss: 0.719
epoch: 1	loss: 0.694
epoch: 2	loss: 0.669
epoch: 3	loss: 0.648
epoch: 4	loss: 0.625
epoch: 5	loss: 0.604
epoch: 6	loss: 0.582
epoch: 7	loss: 0.561
epoch: 8	loss: 0.539
epoch: 9	loss: 0.518


In [18]:
evaluate(model=model, criterion=criterion, test_loader=test_loader)

{'loss': 0.48572081327438354, 'acc': tensor(39.3333)}