In [None]:
import numpy as np
import pandas as pd
import torch
import torchtext

from torchtext import data

# Prefer the GPU when CUDA is usable; otherwise everything stays on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Hyperparameters -- every tunable value of the notebook lives in this cell.
VOCAB_SIZE = 10_000  # `max_size` cap handed to each Field.build_vocab call
BATCH_SIZE = 16      # examples per batch produced by the iterators
EMBED_DIM = 32       # width of the embedding vectors
NUM_CLASS = 4        # number of output classes of the classifier
N_EPOCHS = 5         # passes over the training split
NGRAMS = 2           # NOTE(review): not referenced in the visible cells

In [None]:
# Load the raw training CSV with pandas purely for a visual sanity check;
# the torchtext pipeline below reads the file again on its own.
train = pd.read_csv(filepath_or_buffer='./data/train.csv')
train.head()

In [None]:
# Field definitions.
# LABEL is used as-is (no vocabulary lookup), so the CSV values must already be
# numeric.  NOTE(review): they are later fed straight into CrossEntropyLoss, so
# presumably they lie in 0..NUM_CLASS-1 -- confirm against the data files.
LABEL = data.Field(sequential = False, use_vocab = False)
# Both text columns are tokenized with spaCy; include_lengths makes each batch
# attribute a (token_ids, lengths) pair.
TITLE = data.Field(tokenize = "spacy", include_lengths = True)
DESCS = data.Field(tokenize = "spacy", include_lengths = True)

# One (name, field) pair per CSV column, in column order.  The leading
# (None, None) entry makes torchtext skip the first column.
fields = [(None, None), ('label', LABEL), ('title', TITLE), ('descs', DESCS)]

# Read train.csv / test.csv from ./data, dropping their header rows.
full_train_ds, test_ds = data.TabularDataset.splits(
    path = './data',
    train = 'train.csv',
    test = 'test.csv',
    format = 'csv',
    fields = fields,
    skip_header = True,
)

# Hold out a validation set using torchtext's default split ratio.
train_ds, valid_ds = full_train_ds.split()

In [None]:
# Build every field's vocabulary from the training split only, so no
# validation/test tokens leak into it.
# NOTE(review): `max_size` caps the counted tokens; torchtext appears to add its
# special tokens (<unk>, <pad>) on top, so len(field.vocab) may exceed VOCAB_SIZE
# -- confirm before sizing anything from VOCAB_SIZE directly.
for field in (LABEL, TITLE, DESCS):
    field.build_vocab(train_ds, max_size = VOCAB_SIZE)

In [None]:
# Wrap the three splits in BucketIterators that batch examples of similar
# `descs` length together.
#
# `sort` is intentionally left at its default.  Forcing `sort = True` applied to
# the training iterator as well, which made it emit the same length-sorted
# batches every epoch (sorting takes precedence over shuffling).  With the
# default, the training split is shuffled between epochs while the
# validation/test splits are still sorted by `sort_key`.
train_it, valid_it, test_it = data.BucketIterator.splits(
    (train_ds, valid_ds, test_ds),
    sort_key = lambda x: len(x.descs),
    batch_size = BATCH_SIZE,
    device = device
)

In [None]:
# Define the model
import torch.nn as nn
import torch.nn.functional as f

class TextSentiment(nn.Module):
    """Bag-of-embeddings classifier: an EmbeddingBag followed by one linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: width of each embedding vector.
        num_class: number of output logits.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Draw both weight matrices from U(-0.5, 0.5) and zero the linear bias."""
        bound = 0.5
        # Embedding first, then the linear layer, so the RNG is consumed in a
        # fixed order.
        for weights in (self.embedding.weight, self.fc.weight):
            weights.data.uniform_(-bound, bound)
        self.fc.bias.data.zero_()

    def forward(self, text):
        """Pool the embeddings of each row of `text` and map them to class logits.

        `text` is passed to the EmbeddingBag without offsets, so it is expected
        to be a 2-D tensor of token ids with one row per example.
        """
        pooled = self.embedding(text)
        logits = self.fc(pooled)
        return logits

In [None]:
def train_func(sub_train_):
    """Run one training epoch over `sub_train_` and step the LR scheduler once.

    Relies on the module-level `model`, `criterion`, `optimizer` and
    `scheduler` defined in the training cell.

    Args:
        sub_train_: iterable of batches; each batch exposes `descs`
            (a `(token_ids, lengths)` pair) and `label`.

    Returns:
        (mean loss per batch, fraction of correctly classified examples),
        both as Python floats.
    """
    total_loss = 0.0
    n_correct = 0
    n_examples = 0
    n_batches = 0

    # Iterate the iterator that was passed in (previously the argument was
    # ignored and the global `train_it` was always used).
    for d in sub_train_:
        optimizer.zero_grad()
        # descs[0] holds the token ids; transpose so each row is one example.
        output = model(d.descs[0].T)
        loss = criterion(output, d.label)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        n_correct += (output.argmax(1) == d.label).sum().item()
        n_examples += d.label.size(0)
        n_batches += 1

    # Adjust the learning rate once per epoch.
    scheduler.step()

    # Loss is averaged over batches; accuracy over examples, so that the
    # caller's `* 100` really yields a percentage.  max(..., 1) guards against
    # an empty iterator.
    return total_loss / max(n_batches, 1), n_correct / max(n_examples, 1)

def test(test_it):
    loss = 0
    acc = 0

    for t in test_it:
        with torch.no_grad():
            output = model(t.descs[0].T)
            loss = criterion(output, t.label)
            loss += loss.item()
            acc += (output.argmax(1) == t.label).sum().item()

    return loss / len(test_it), acc / len(test_it)

In [None]:
import time

# NOTE(review): never read or updated in the visible cells.
min_valid_loss = float('inf')

# Size the embedding table from the vocabulary that was actually built rather
# than from VOCAB_SIZE: `max_size` does not count torchtext's special tokens
# (<unk>, <pad>), so token ids can reach VOCAB_SIZE and above, which would index
# past the end of a VOCAB_SIZE-row embedding.
model = TextSentiment(len(DESCS.vocab), EMBED_DIM, NUM_CLASS).to(device)
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr = 1e-2)
# Multiply the learning rate by 0.9 on every scheduler step.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma = 0.9)

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train_func(train_it)
    valid_loss, valid_acc = test(valid_it)

    # Whole seconds elapsed, split into minutes and remaining seconds.
    mins, secs = divmod(int(time.time() - start_time), 60)

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

In [None]:
# Final evaluation on the held-out test split.
print('Checking the results of test dataset...')
test_metrics = test(test_it)
test_loss, test_acc = test_metrics
print(f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')