In [8]:
!pip install datasets

Defaulting to user installation because normal site-packages is not writeable
Collecting datasets
  Downloading datasets-2.9.0-py3-none-any.whl (462 kB)
     |████████████████████████████████| 462 kB 18.1 MB/s            
Collecting multiprocess
  Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
     |████████████████████████████████| 132 kB 59.9 MB/s            
Collecting pandas
  Downloading pandas-1.5.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.2 MB)
     |████████████████████████████████| 12.2 MB 67.2 MB/s            
Collecting aiohttp
  Downloading aiohttp-3.8.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
     |████████████████████████████████| 1.0 MB 80.6 MB/s            
Collecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
     |████████████████████████████████| 212 kB 149.2 MB/s            
Collecting responses<0.19
  Downloading responses-0.18.0-py3

In [9]:
import pickle
import numpy as np

import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

# Print the version string of the imported PyTorch package
print(torch.__version__)
# Print whether torch.cuda reports CUDA as available
print('Is CUDA available?', torch.cuda.is_available())

1.13.1+cu117


In [10]:
class Net(nn.Module):
    """Next-character model: embedding -> single-layer LSTM -> linear layer.

    For each input sequence the LSTM output at the last time step is mapped to
    one row of unnormalised scores over the vocabulary.

    Args:
        n_vocab: Number of distinct tokens; sizes the embedding table and the
            output layer.
        embedding_dim: Size of each embedding vector (LSTM input size).
        hidden_dim: Size of the LSTM hidden state.
        dropout: Probability of zeroing each element of the last hidden state
            while the module is in training mode.
    """

    # Class-level fallback: instances pickled before `dropout_p` existed have no
    # such instance attribute and will simply run forward() without dropout.
    dropout_p = 0.0

    def __init__(self, n_vocab, embedding_dim, hidden_dim, dropout=0.2):
        super().__init__()

        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim  # dim = dimension
        self.dropout_p = dropout

        self.embeddings = nn.Embedding(n_vocab, embedding_dim)

        # LSTM layer (input_size, hidden_size).
        # nn.LSTM's own `dropout` argument only acts between stacked layers; with
        # the single layer used here it does nothing and triggers a UserWarning,
        # so dropout is applied explicitly in forward() instead.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # Fully connected layer: maps the hidden state to one score per token
        self.hidden2out = nn.Linear(hidden_dim, n_vocab)

    def forward(self, seq_in):
        """Score the next token for each sequence in the batch.

        Args:
            seq_in: 2-D integer tensor of token indices. It is transposed
                before the LSTM, which is built with the default
                batch_first=False, so dim 0 is treated as the batch.

        Returns:
            Tensor with one row of `n_vocab` scores per input sequence.
        """
        embeddings = self.embeddings(seq_in.t())

        lstm_out, _ = self.lstm(embeddings)
        # Output of the last time step only
        ht = lstm_out[-1]

        # Identity in eval mode; random zeroing + rescaling in training mode
        ht = F.dropout(ht, p=self.dropout_p, training=self.training)

        return self.hidden2out(ht)


In [11]:
def parse_corpus(seq_length=100):
    """Load the corpus and build next-character prediction samples.

    Loads the 'test' split of the 'sepidmnorozy/Cantonese_sentiment' dataset,
    joins its 'text' column into one string, and slides a window of
    `seq_length` characters over it. Each window is one input; the character
    right after the window is its target.

    Args:
        seq_length: Number of characters in each input window.

    Returns:
        Tuple (dataX, dataY, char_to_int, int_to_char, chars):
            dataX: list of windows, each a list of `seq_length` char indices.
            dataY: list with the index of the character following each window.
            char_to_int: dict mapping each character to its index.
            int_to_char: dict mapping each index back to its character.
            chars: sorted list of the unique characters in the corpus.
    """
    # Local import: only this function needs the `datasets` package.
    from datasets import load_dataset

    source = 'sepidmnorozy/Cantonese_sentiment'
    split_test = load_dataset(source, split='test')

    # Only raw strings are read here, so no tensor formatting or device is
    # needed; this also keeps the function working on machines without CUDA.
    all_texts = ''.join(split_test['text'])

    # get unique characters
    chars = sorted(set(all_texts))

    # map char to int / int to char
    char_to_int = {c: i for i, c in enumerate(chars)}
    int_to_char = dict(enumerate(chars))

    # Encode the corpus once instead of re-encoding every char for every window
    encoded = [char_to_int[c] for c in all_texts]

    # for every <seq_length> chars, predict the 1 char after the sequence
    n_windows = max(len(encoded) - seq_length, 0)
    dataX = [encoded[i:i + seq_length] for i in range(n_windows)]  # N x seq_length
    dataY = [encoded[i + seq_length] for i in range(n_windows)]    # N x 1

    return (dataX, dataY, char_to_int, int_to_char, chars)

def format_data(dataX, dataY, n_classes, batch_size=64):
    """Group encoded samples into fixed-size mini-batches of LongTensors.

    Args:
        dataX: List of equal-length lists of integer token indices.
        dataY: List of integer targets, one per entry of `dataX`.
        n_classes: Not used by this function; kept so existing calls keep working.
        batch_size: Number of samples per batch.

    Returns:
        List of (X, Y) tuples, one per batch, where X is a LongTensor of shape
        (batch_size, seq_length) and Y a LongTensor of shape (batch_size,).
        Empty list when there are fewer than `batch_size` samples.
    """
    # for simplicity, discard trailing data not fitting into batch_size
    n_patterns = len(dataY)
    n_patterns -= n_patterns % batch_size
    if n_patterns == 0:
        # np.array([]) is 1-D, so the shape unpacking below would fail with an
        # unhelpful "not enough values to unpack" error.
        return []

    # Parse X -- int64 is forced so torch.LongTensor accepts the array
    # regardless of the platform's default numpy integer type
    X = np.array(dataX[:n_patterns], dtype=np.int64)
    _, seq_length = X.shape
    X = torch.LongTensor(X.reshape(-1, batch_size, seq_length))

    # Parse Y
    Y = np.array(dataY[:n_patterns], dtype=np.int64)
    Y = torch.LongTensor(Y.reshape(-1, batch_size))

    # Iterating the stacked tensors yields one (X_batch, Y_batch) pair per batch
    return list(zip(X, Y))

In [12]:
def load_data(seq_length, batch_size): # TODO: add back "path"
    """Parse the corpus and batch it in one call.

    Args:
        seq_length: Window length forwarded to `parse_corpus`.
        batch_size: Batch size forwarded to `format_data`.

    Returns:
        Tuple (data, dataX, dataY, char_to_int, int_to_char, chars), where
        `data` is the result of `format_data` and the remaining items are
        exactly what `parse_corpus` returned.
    """
    corpus = parse_corpus(seq_length=seq_length)
    dataX, dataY, char_to_int, int_to_char, chars = corpus

    batches = format_data(
        dataX,
        dataY,
        n_classes=len(chars),
        batch_size=batch_size,
    )

    return batches, dataX, dataY, char_to_int, int_to_char, chars

def save_pickle(data, path):
    """Pickle `data` into the file at `path`, replacing any existing content."""
    with open(path, mode='wb') as handle:
        pickle.dump(data, handle)

def train(model, optimizer, epoch, data, log_interval):
    """Run one training epoch over pre-batched data.

    Puts the model in training mode, then for every (input, target) batch
    computes the cross-entropy loss, back-propagates and takes one optimizer
    step.

    Args:
        model: Module called as `model(seq_in)`; its output is passed to
            `F.cross_entropy` together with `target`.
        optimizer: Optimizer whose `zero_grad()` / `step()` are called per batch.
        epoch: Epoch number, used only in the progress message.
        data: Sized iterable of (seq_in, target) tensor pairs.
        log_interval: Print progress every `log_interval` batches; a falsy
            value (0 or None) disables logging.
    """
    model.train()

    for batch_i, (seq_in, target) in enumerate(data):
        # Tensors are used directly: wrapping them in the deprecated
        # torch.autograd.Variable is a no-op.
        optimizer.zero_grad()

        output = model(seq_in)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()

        # Log training status; '\r' makes each message overwrite the previous one
        if log_interval and batch_i % log_interval == 0:
            print('Train epoch: {} ({:2.0f}%)\tLoss: {:.6f}'.format(epoch, 100. * batch_i / len(data), loss.item()), end='\r')

In [14]:
# Set-up: encoded corpus + mini-batches, then the model and its optimizer
train_data, dataX, dataY, char_to_int, int_to_char, chars = load_data(seq_length=100, batch_size=64)
model = Net(len(chars), embedding_dim=128, hidden_dim=64, dropout=0.2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)  # lr = learning rate

from pathlib import Path
model_bin = Path("model.bin")
if not model_bin.is_file():
    # No saved model on disk yet: train from scratch
    for epoch in range(5):  # 5 epochs for now
        train(model, optimizer, epoch, train_data, log_interval=10)

        # Periodic checkpoint after every 10th epoch
        # (never reached while the loop above runs only 5 epochs)
        if (epoch + 1) % 10 == 0:
            model.eval()
            torch.save(model, "model.bin")  # model output file

    # Persist the encoded inputs, both mappings and the vocabulary
    save_pickle((dataX, char_to_int, int_to_char, chars), "corpus.bin")  # corpus related output file

    # Final save of the whole model object, switched to eval mode first
    model.eval()
    torch.save(model, "model.bin")
else:
    # A saved model is already present; skip training so it is not overwritten
    print("model.bin exists. Stopping")

Using custom data configuration sepidmnorozy--Cantonese_sentiment-f2d23095df14a297
Found cached dataset csv (/home/jupyter-raptor/.cache/huggingface/datasets/sepidmnorozy___csv/sepidmnorozy--Cantonese_sentiment-f2d23095df14a297/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


Train epoch: 0 ( 0%)	Loss: 8.541375
Train epoch: 0 ( 0%)	Loss: 8.546684
Train epoch: 0 ( 0%)	Loss: 8.582096
Train epoch: 0 ( 0%)	Loss: 8.538011
Train epoch: 0 ( 0%)	Loss: 8.554176
Train epoch: 0 ( 0%)	Loss: 8.522991
Train epoch: 0 ( 0%)	Loss: 8.541509
Train epoch: 0 ( 0%)	Loss: 8.521584
Train epoch: 0 ( 0%)	Loss: 8.525615
Train epoch: 0 ( 0%)	Loss: 8.525001
Train epoch: 0 ( 0%)	Loss: 8.529032
Train epoch: 0 ( 0%)	Loss: 8.531771
Train epoch: 0 ( 1%)	Loss: 8.509805
Train epoch: 0 ( 1%)	Loss: 8.529984
Train epoch: 0 ( 1%)	Loss: 8.494984
Train epoch: 0 ( 1%)	Loss: 8.514459
Train epoch: 0 ( 1%)	Loss: 8.501097
Train epoch: 0 ( 1%)	Loss: 8.500638
Train epoch: 0 ( 1%)	Loss: 8.488669
Train epoch: 0 ( 1%)	Loss: 8.496985
Train epoch: 0 ( 1%)	Loss: 8.490474
Train epoch: 0 ( 1%)	Loss: 8.462014
Train epoch: 0 ( 1%)	Loss: 8.471298
Train epoch: 0 ( 1%)	Loss: 8.492016
Train epoch: 0 ( 1%)	Loss: 8.463352
Train epoch: 0 ( 1%)	Loss: 8.456412
Train epoch: 0 ( 1%)	Loss: 8.421924
Train epoch: 0 ( 1%)	Loss: 8