In [1]:
import torch #Main library
from torch.autograd import Variable # To create Tensors
import numpy as np
from torch.utils.data import Dataset, DataLoader # To create data entities

Let's see some properties

In [2]:
torch.cuda.is_available() # Check if GPU is available

True

In [3]:
torch.cuda.get_device_name(0) # See you GPU name

'GeForce GTX 1050 Ti'

Let's create some tensors

In [63]:
x = Variable(torch.Tensor([[i , j] for i, j in zip(range(100), range(1, 101))])).cuda() # .cuda() puts the tensor on GPU

In [23]:
x.shape # See the shape of your tensor

torch.Size([100, 2])

In [10]:
x.data[:10,1] # Access the values of your tenosr by .data and slicing like numpy

tensor([  1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.], device='cuda:0')

In [64]:
y = Variable(torch.Tensor([[2*i+3*j] for i, j in zip(range(100), range(1, 101))])).cuda() # Create an arbitrary target

In [62]:
y.shape

torch.Size([100, 1])

Let's create some models to predict y from x

# Linear Model

In [13]:
class model(torch.nn.Module): # To create a model just extend nn.Module and implement __init__ and forward methods
    def __init__(self):
        super(model, self).__init__()
        self.lin = torch.nn.Linear(2, 1) # in __init__ you define the elements of the network
    def forward(self, x):
        y = self.lin(x) # in forward just pass the input through the element you created in __init__
        return y

In [69]:
m = model() # instantiate you model
m.cuda() # put it on GPU
criterion = torch.nn.MSELoss() # define a loss 
optim = torch.optim.Adam(m.parameters(),0.2) # pass the parameters of your model to an optimizer

In [70]:
for i in range(2000): # define your training loop
    y_ = m(x) # pass the tensor you created before through the model
    loss = criterion(y_, y) # calculate the loss of the prediction from actual
    if i%100 ==0:
        print('epoch: ',i, 'loss: ', loss.item()) # .item is used with tensors with rank 0 i.e. scalers
    optim.zero_grad() # reset any gradient calculations if any
    loss.backward() # .backward() calculates the gradient for all the parameters involved in calculating the loss
    optim.step() # finally do the gradient descent update
    

epoch:  0 loss:  97722.359375
epoch:  100 loss:  1.993674635887146
epoch:  200 loss:  1.5573391914367676
epoch:  300 loss:  1.1968315839767456
epoch:  400 loss:  0.8683893084526062
epoch:  500 loss:  0.5970905423164368
epoch:  600 loss:  0.3897525668144226
epoch:  700 loss:  0.24170124530792236
epoch:  800 loss:  0.14239442348480225
epoch:  900 loss:  0.07965555042028427
epoch:  1000 loss:  0.042272165417671204
epoch:  1100 loss:  0.021254301071166992
epoch:  1200 loss:  0.010108155198395252
epoch:  1300 loss:  0.004539478570222855
epoch:  1400 loss:  0.0019210150931030512
epoch:  1500 loss:  0.0007641716510988772
epoch:  1600 loss:  0.0002848840958904475
epoch:  1700 loss:  9.940329618984833e-05
epoch:  1800 loss:  3.2302294130204245e-05
epoch:  1900 loss:  9.733891602081712e-06


# DataLoader

In [72]:
# Datasets are there to conveniently fetch data for training. To define them you have to implement 3 methods as shown below
class custom_dataset(Dataset):
    def __init__(self):
        self.x = torch.Tensor([[i , j] for i, j in zip(range(100), range(1, 101))])
        self.y = torch.Tensor([[2*i+3*j] for i, j in zip(range(100), range(1, 101))])
    def __getitem__(self, index):
        return self.x[index], self.y[index]
    def __len__(self):
        return self.x.shape[0]
dataset = custom_dataset()

In [73]:
dloader = DataLoader(dataset, 10, True) # Dataloader is built on datasets to fascilitate fetching data as you will see

In [88]:
m = model() # instantiate you model
m.cuda() # put it on GPU
criterion = torch.nn.MSELoss() # define a loss 
optim = torch.optim.Adam(m.parameters(),0.2) # pass the parameters of your model to an optimizer

In [89]:
for i in range(200):
    for j, data  in enumerate(dloader, 0): # This is very useful when you are using mini bathces
        x, y = data
        x, y = Variable(x).cuda(), Variable(y).cuda()
        y_ = m(x)
        loss = criterion(y_, y)
        if (10*i+j) %50 ==0:
            print('iter: ', 10*i+j, 'loss: ', loss.item())
        optim.zero_grad()
        loss.backward()
        optim.step()

iter:  0 loss:  82786.875
iter:  50 loss:  388.1860046386719
iter:  100 loss:  1.9986851215362549
iter:  150 loss:  0.30013787746429443
iter:  200 loss:  0.633473813533783
iter:  250 loss:  0.3923751413822174
iter:  300 loss:  0.6905971169471741
iter:  350 loss:  0.16636331379413605
iter:  400 loss:  0.3838123381137848
iter:  450 loss:  0.28431567549705505
iter:  500 loss:  0.3048263490200043
iter:  550 loss:  0.05782446265220642
iter:  600 loss:  0.06819303333759308
iter:  650 loss:  0.15540829300880432
iter:  700 loss:  0.05706700682640076
iter:  750 loss:  0.04764743521809578
iter:  800 loss:  0.0459921658039093
iter:  850 loss:  0.024443689733743668
iter:  900 loss:  0.015892446041107178
iter:  950 loss:  0.02069912850856781
iter:  1000 loss:  0.015077618882060051
iter:  1050 loss:  0.005280360579490662
iter:  1100 loss:  0.006497129797935486
iter:  1150 loss:  0.0035856044851243496
iter:  1200 loss:  0.002094197552651167
iter:  1250 loss:  0.0011256985599175096
iter:  1300 loss:  

# MNIST

Now let's try a model on MNIST dataset

In [90]:
from torchvision import datasets, transforms

In [91]:
# let's use built-in MNISt dataset from torch bu using the following syntax. We also make normalization transformation.
train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./mnist_data', train=True, download=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./mnist_data', train=False, transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=64, shuffle=True)

In [92]:
import torch.nn.functional as F # To access activation functions

In [122]:
class model(torch.nn.Module):
    def __init__(self):
        super(model, self).__init__()
        self.l1 = torch.nn.Linear(784, 520)
        self.l2 = torch.nn.Linear(520, 320)
        self.l3 = torch.nn.Linear(320, 240)
        self.l4 = torch.nn.Linear(240, 120)
        self.l5 = torch.nn.Linear(120, 10)
    def forward(self, x):
        x = x.view(-1, 784) # .view help us view a tensor in desired shape which is a vector 28x28
        x = F.relu(self.l1(x)) # between layers we apply activation
        x = F.relu(self.l2(x))
        x = F.relu(self.l3(x))
        x = F.relu(self.l4(x))
        x = self.l5(x) # Softmax is not needed as we use nn.CrossEntropyLoss()
        return x

In [123]:
m = model()
m.cuda()
criterion = torch.nn.CrossEntropyLoss() # This is the loss for multi-class classification
optim = torch.optim.SGD(m.parameters(), 0.01, .5)

In [124]:
n_batches = len(train_loader)
for i in range(4):
    for j, (x, y)  in enumerate(train_loader, 0):
        x, y = Variable(x).cuda(), Variable(y).cuda()
        y_ = m(x)
        loss = criterion(y_, y)
        optim.zero_grad()
        loss.backward()
        optim.step()
        if (i*n_batches+j)%100 ==0:
            print('iter: ', n_batches*i+j, 'loss: ', loss.item())

iter:  0 loss:  2.297628164291382
iter:  100 loss:  2.278578281402588
iter:  200 loss:  2.1996800899505615
iter:  300 loss:  1.76529860496521
iter:  400 loss:  0.811119019985199
iter:  500 loss:  0.4767480790615082
iter:  600 loss:  0.366688072681427
iter:  700 loss:  0.5400040149688721
iter:  800 loss:  0.6790627837181091
iter:  900 loss:  0.40882009267807007
iter:  1000 loss:  0.36237797141075134
iter:  1100 loss:  0.15433190762996674
iter:  1200 loss:  0.25847288966178894
iter:  1300 loss:  0.18450406193733215
iter:  1400 loss:  0.2636382579803467
iter:  1500 loss:  0.17533031105995178
iter:  1600 loss:  0.2104531079530716
iter:  1700 loss:  0.3601965308189392
iter:  1800 loss:  0.10654174536466599
iter:  1900 loss:  0.11978012323379517
iter:  2000 loss:  0.2898464500904083
iter:  2100 loss:  0.16514664888381958
iter:  2200 loss:  0.1471841037273407
iter:  2300 loss:  0.05693632364273071
iter:  2400 loss:  0.1773693561553955
iter:  2500 loss:  0.21616941690444946
iter:  2600 loss:  

Now we calculate the test accuracy

In [115]:
correct = 0
for (x, y) in test_loader:
    x, y = Variable(x).cuda(), Variable(y).cuda()
    y_=m(x)
    pred = torch.max(y_.data, 1)[1] # The 10 outputs for eacn input indicate the likelihood of the classes so we take the max
    correct += pred.eq(y.data.view_as(pred)).cpu().sum() # Bring the data on CPU to use with numpy
print(correct.numpy()/len(test_loader.dataset))

0.966


# Convolutional

In [125]:
class model(torch.nn.Module): # This is not much different from previous procedures
    def __init__(self):
        super(model, self).__init__()
        self.c1 = torch.nn.Conv2d(1, 10, 5)
        self.c2 = torch.nn.Conv2d(10, 20, 5)
        self.mp = torch.nn.MaxPool2d(2)
        self.l1 = torch.nn.Linear(2000, 10)
    def forward(self, x):
        x = F.relu(self.c1(x))
        x = F.relu(self.c2(x))
        x = F.relu(self.mp(x))
        x = x.view(x.size(0), -1)
        x = self.l1(x)
        return x

In [126]:
m = model()
m.cuda()
criterion = torch.nn.CrossEntropyLoss()
optim = torch.optim.SGD(m.parameters(), 0.01, .5)

In [127]:
n_batches = len(train_loader)
for i in range(4):
    for j, (x, y)  in enumerate(train_loader, 0):
        x, y = Variable(x).cuda(), Variable(y).cuda()
        y_ = m(x)
        loss = criterion(y_, y)
        optim.zero_grad()
        loss.backward()
        optim.step()
        if (i*n_batches+j)%100 ==0:
            print('iter: ', n_batches*i+j, 'loss: ', loss.item())

iter:  0 loss:  2.318005323410034
iter:  100 loss:  0.49953484535217285
iter:  200 loss:  0.3001226782798767
iter:  300 loss:  0.20771130919456482
iter:  400 loss:  0.06426689773797989
iter:  500 loss:  0.3679344356060028
iter:  600 loss:  0.08783812820911407
iter:  700 loss:  0.18321630358695984
iter:  800 loss:  0.05472637712955475
iter:  900 loss:  0.07104971259832382
iter:  1000 loss:  0.031181804835796356
iter:  1100 loss:  0.08725129812955856
iter:  1200 loss:  0.13378097116947174
iter:  1300 loss:  0.07342841476202011
iter:  1400 loss:  0.034585341811180115
iter:  1500 loss:  0.02790471911430359
iter:  1600 loss:  0.04863227158784866
iter:  1700 loss:  0.02295522391796112
iter:  1800 loss:  0.045062221586704254
iter:  1900 loss:  0.07014953345060349
iter:  2000 loss:  0.016720734536647797
iter:  2100 loss:  0.025574102997779846
iter:  2200 loss:  0.020966418087482452
iter:  2300 loss:  0.042456962168216705
iter:  2400 loss:  0.01690921187400818
iter:  2500 loss:  0.0144619196653

In [128]:
correct = 0
for (x, y) in test_loader:
    x, y = Variable(x).cuda(), Variable(y).cuda()
    y_=m(x)
    pred = torch.max(y_.data, 1)[1]
    correct += pred.eq(y.data.view_as(pred)).cpu().sum()
print(correct.numpy()/len(test_loader.dataset))

0.9852


Now shift our focus on sequences

# RNN

Lets teach the network to say 'hihello', by teaching it to predict the next character after each input 

In [243]:
idx_char = list('hielo')
char_idx = {char:i for i, char in enumerate(idx_char)}
def lookup(idx):
    out = [0]*num_classes
    out[idx] = 1
    return out
x = torch.Tensor([[lookup(char_idx[char]) for char in 'hihell']])
y = torch.LongTensor([char_idx[char] for char in 'ihello'])# the target is the input shifted one place to the left

In [217]:
batch_size = 1
hidden_size = 5 # arbitrary but good choice is the alphabet cardinality
num_layers = 1
input_size = 5 # one hot vector of input has dim 5
num_classes = 5 # we have 5 classes: h i e l o
sequence_len = 6 # The input sequence has 6 chars
class model(torch.nn.Module):
    def __init__(self):
        super(model, self).__init__()
        self.rnn = torch.nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
    def forward(self, x):
        x = x.view(batch_size, sequence_len, input_size)
        out, hidden = self.rnn(x)
        out = out.view(-1, num_classes) # this predict the next char
        return hidden, out

In [242]:
m = model()
m.cuda()
criterion = torch.nn.CrossEntropyLoss()
optim = torch.optim.Adam(m.parameters(), 0.1)

In [244]:
for i in range(40):
    x, y = Variable(x).cuda(), Variable(y).cuda()
    hidden, y_ = m(x)
    loss = criterion(y_, y)
    optim.zero_grad()
    loss.backward()
    optim.step()
    _, outputs = y_.max(1)
    outputs = outputs.cpu().numpy()
    result = [idx_char[idx] for idx in outputs.squeeze()]
    if i%1 ==0:
        print('epoch:', i, 'loss:', loss.item(), 'output:', ''.join(result))

epoch: 0 loss: 1.6514673233032227 output: iiiiii
epoch: 1 loss: 1.5523813962936401 output: iiiiii
epoch: 2 loss: 1.4848428964614868 output: iiilll
epoch: 3 loss: 1.421666145324707 output: illlll
epoch: 4 loss: 1.354124903678894 output: ehelll
epoch: 5 loss: 1.2811726331710815 output: ehelll
epoch: 6 loss: 1.2022839784622192 output: ehelll
epoch: 7 loss: 1.1221809387207031 output: ehelll
epoch: 8 loss: 1.0482736825942993 output: ehelll
epoch: 9 loss: 0.9841535091400146 output: ehelll
epoch: 10 loss: 0.9335740208625793 output: ehelll
epoch: 11 loss: 0.8924670815467834 output: ihelll
epoch: 12 loss: 0.8565268516540527 output: ihelll
epoch: 13 loss: 0.8272908329963684 output: ihelll
epoch: 14 loss: 0.8011173605918884 output: ihelll
epoch: 15 loss: 0.7755684852600098 output: ihelll
epoch: 16 loss: 0.7538593411445618 output: ihelll
epoch: 17 loss: 0.7384834885597229 output: ihelll
epoch: 18 loss: 0.7260566353797913 output: ihelll
epoch: 19 loss: 0.715537965297699 output: ihelll
epoch: 20 los

# With Embedding

let's see how we can use embedding with previous problem

In [95]:
batch_size = 1
hidden_size = 5
num_layers = 1
num_classes = 5
sequence_len = 6
embed_dim = 4 # choose an aribtrary embedding dim
class model(torch.nn.Module):
    def __init__(self):
        super(model, self).__init__()
        self.embed = torch.nn.Embedding(num_classes, embed_dim,
                                        _weight=torch.Tensor(np.eye(num_classes, embed_dim)))# initialize it with one-hot
        self.rnn = torch.nn.LSTM(input_size=embed_dim, hidden_size=hidden_size, batch_first=True)
    def forward(self, x):
        x = x.view(batch_size, sequence_len)
        x = self.embed(x)
        out, hidden = self.rnn(x)
        out = out.view(-1, num_classes)
        return hidden, out
    def init_hidden(self):
        return Variable(torch.zeros(1, num_layers, batch_size, hidden_size)).cuda()

In [96]:
m = model()
m.cuda()
criterion = torch.nn.CrossEntropyLoss()
optim = torch.optim.Adam(m.parameters(), 0.1)

In [97]:
idx_char = list('hielo')
char_idx = {char:i for i, char in enumerate(idx_char)}
x = torch.LongTensor([[char_idx[char] for char in 'hihell']])
y = torch.LongTensor([char_idx[char] for char in 'ihello'])

In [98]:
for i in range(40):
#     hidden = m.init_hidden()
    x, y = Variable(x).cuda(), Variable(y).cuda()
    hidden, y_ = m(x)
    loss = criterion(y_, y)
    optim.zero_grad()
    loss.backward()
    optim.step()
    _, outputs = y_.max(1)
    outputs = outputs.cpu().numpy()
    result = [idx_char[idx] for idx in outputs.squeeze()]
    if i%1 ==0:
        print('epoch:', i, 'loss:', loss.item(), 'output:', ''.join(result))

epoch: 0 loss: 1.6220346689224243 output: oeoooo
epoch: 1 loss: 1.512691617012024 output: oooooo
epoch: 2 loss: 1.4197088479995728 output: oheooo
epoch: 3 loss: 1.329732894897461 output: eheloo
epoch: 4 loss: 1.2419105768203735 output: eheloo
epoch: 5 loss: 1.1701480150222778 output: eheloo
epoch: 6 loss: 1.1195186376571655 output: ehello
epoch: 7 loss: 1.0829476118087769 output: ehello
epoch: 8 loss: 1.0513219833374023 output: ehello
epoch: 9 loss: 1.024389386177063 output: ehelll
epoch: 10 loss: 1.0052191019058228 output: ehelll
epoch: 11 loss: 0.977078914642334 output: ehelll
epoch: 12 loss: 0.9521985650062561 output: ehelll
epoch: 13 loss: 0.9315541386604309 output: ehelll
epoch: 14 loss: 0.9066872596740723 output: ihelll
epoch: 15 loss: 0.8779399991035461 output: ihelll
epoch: 16 loss: 0.8530540466308594 output: ihelll
epoch: 17 loss: 0.8387728333473206 output: ihelll
epoch: 18 loss: 0.8274298310279846 output: ihelll
epoch: 19 loss: 0.8180227875709534 output: ihelll
epoch: 20 loss

In [99]:
m.embed.weight # the last row of embedding matrix does not change becuase the network never sees o as input

Parameter containing:
tensor([[ 2.1113, -0.4399, -2.2628,  0.9665],
        [-1.4425,  2.8398, -1.7243,  1.5508],
        [ 1.5335, -1.4601,  1.4267,  2.3674],
        [ 0.4673, -1.3563, -1.1912,  2.7823],
        [ 0.0000,  0.0000,  0.0000,  0.0000]], device='cuda:0')

# Name Country Classification

Can we predict which country a name comes from?

In [100]:
from name_dataset import NameDataset # import data from helper file 

In [118]:
import torch.nn as nn
import time
import math

In [155]:
HIDDEN_SIZE = 100
N_LAYERS = 1
BATCH_SIZE = 256
N_EPOCHS = 100

In [102]:
test_dataset = NameDataset(is_train_set=False)
test_loader = DataLoader(dataset=test_dataset,
                         batch_size=BATCH_SIZE, shuffle=True)


train_dataset = NameDataset(is_train_set=True)
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=BATCH_SIZE, shuffle=True)

In [116]:
N_COUNTRIES = len(train_dataset.get_countries())
print(N_COUNTRIES, "countries")
N_CHARS = 128  # ASCII

18 countries


In [151]:
class RNNClassifier(nn.Module):

    def __init__(self, input_size, hidden_size, output_size, n_layers=1, bidirectional=True):
        super(RNNClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.n_directions = int(bidirectional) + 1

        self.embedding = nn.Embedding(input_size, hidden_size, 0)
        self.lstm = nn.GRU(hidden_size, hidden_size, n_layers,
                          bidirectional=bidirectional, batch_first = True)
        self.fc = nn.Linear(hidden_size*self.n_directions, output_size)

    def forward(self, input, seq_lengths):
        
        batch_size = input.size(0)

        embedded = self.embedding(input)

#         lstm_input = pack_padded_sequence(
#             embedded, seq_lengths.data.cpu().numpy())

        # To compact weights again call flatten_parameters().
#         self.lstm.flatten_parameters()
        output, hidden = self.lstm(embedded)

        # Use the last layer output as FC's input
        # No need to unpack, since we are going to use hidden
        fc_output = self.fc(output[:,-1,:])
        return fc_output

In [152]:
def train():
    total_loss = 0

    for i, (names, countries) in enumerate(train_loader, 1):
        input, seq_lengths, target = make_variables(names, countries)
        output = classifier(input, seq_lengths)
#         print(input.shape, output.shape)
        loss = criterion(output, target)
        total_loss += loss.item()

        classifier.zero_grad()
        loss.backward()
        optimizer.step()

        if i % 10 == 0:
            print('[{}] Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.2f}'.format(
                time_since(start), epoch,  i *
                len(names), len(train_loader.dataset),
                100. * i * len(names) / len(train_loader.dataset),
                total_loss / i * len(names)))

    return total_loss


# Testing cycle
def test(name=None):
    # Predict for a given name
    if name:
        input, seq_lengths, target = make_variables([name], [])
        output = classifier(input, seq_lengths)
        pred = output.data.max(1, keepdim=True)[1]
        country_id = pred.cpu().numpy()[0][0]
        print(name, "is", train_dataset.get_country(country_id))
        return

    print("evaluating trained model ...")
    correct = 0
    train_data_size = len(test_loader.dataset)

    for names, countries in test_loader:
        input, seq_lengths, target = make_variables(names, countries)
        output = classifier(input, seq_lengths)
        pred = output.data.max(1, keepdim=True)[1]
        correct += pred.eq(target.data.view_as(pred)).cpu().sum()

    print('\nTest set: Accuracy: {}/{} ({:.0f}%)\n'.format(
        correct, train_data_size, 100. * correct / train_data_size))

In [153]:
def time_since(since):
    s = time.time() - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def create_variable(tensor):
    # Do cuda() before wrapping with variable
    if torch.cuda.is_available():
        return Variable(tensor.cuda())
    else:
        return Variable(tensor)


# pad sequences and sort the tensor
def pad_sequences(vectorized_seqs, seq_lengths, countries):
    seq_tensor = torch.zeros((len(vectorized_seqs), seq_lengths.max())).long()
    for idx, (seq, seq_len) in enumerate(zip(vectorized_seqs, seq_lengths)):
        seq_tensor[idx, :seq_len] = torch.LongTensor(seq)

    # Sort tensors by their length
    seq_lengths, perm_idx = seq_lengths.sort(0, descending=True)
    seq_tensor = seq_tensor[perm_idx]

    # Also sort the target (countries) in the same order
    target = countries2tensor(countries)
    if len(countries):
        target = target[perm_idx]

    # Return variables
    # DataParallel requires everything to be a Variable
    return create_variable(seq_tensor), \
        create_variable(seq_lengths), \
        create_variable(target)


# Create necessary variables, lengths, and target
def make_variables(names, countries):
    sequence_and_length = [str2ascii_arr(name) for name in names]
    vectorized_seqs = [sl[0] for sl in sequence_and_length]
    seq_lengths = torch.LongTensor([sl[1] for sl in sequence_and_length])
    return pad_sequences(vectorized_seqs, seq_lengths, countries)


def str2ascii_arr(msg):
    arr = [ord(c) for c in msg]
    return arr, len(arr)


def countries2tensor(countries):
    country_ids = [train_dataset.get_country_id(
        country) for country in countries]
    return torch.LongTensor(country_ids)

In [156]:
classifier = RNNClassifier(N_CHARS, HIDDEN_SIZE, N_COUNTRIES, N_LAYERS)

if torch.cuda.is_available():
    classifier.cuda()

optimizer = torch.optim.Adam(classifier.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

start = time.time()
print("Training for %d epochs..." % N_EPOCHS)
for epoch in range(1, N_EPOCHS + 1):
    # Train cycle
    train()

    # Testing
    test()

Training for 100 epochs...
evaluating trained model ...

Test set: Accuracy: 3134/6700 (46%)

Sung is Russian
Jungwoo is Russian
Soojin is Russian
Nako is Russian
evaluating trained model ...

Test set: Accuracy: 3839/6700 (57%)

Sung is English
Jungwoo is Russian
Soojin is Russian
Nako is Russian
evaluating trained model ...

Test set: Accuracy: 4316/6700 (64%)

Sung is German
Jungwoo is Russian
Soojin is Russian
Nako is Russian
evaluating trained model ...

Test set: Accuracy: 4636/6700 (69%)

Sung is English
Jungwoo is Russian
Soojin is Russian
Nako is Russian
evaluating trained model ...

Test set: Accuracy: 4826/6700 (72%)

Sung is English
Jungwoo is Russian
Soojin is Russian
Nako is Russian
evaluating trained model ...

Test set: Accuracy: 5007/6700 (74%)

Sung is Irish
Jungwoo is Russian
Soojin is Russian
Nako is Russian
evaluating trained model ...

Test set: Accuracy: 5105/6700 (76%)

Sung is Irish
Jungwoo is Russian
Soojin is Russian
Nako is Russian
evaluating trained model .

evaluating trained model ...

Test set: Accuracy: 5438/6700 (81%)

Sung is Dutch
Jungwoo is Russian
Soojin is Russian
Nako is Russian
evaluating trained model ...

Test set: Accuracy: 5455/6700 (81%)

Sung is Dutch
Jungwoo is English
Soojin is Russian
Nako is Japanese
evaluating trained model ...

Test set: Accuracy: 5486/6700 (81%)

Sung is Dutch
Jungwoo is Russian
Soojin is Russian
Nako is Russian
evaluating trained model ...

Test set: Accuracy: 5463/6700 (81%)

Sung is Dutch
Jungwoo is English
Soojin is Russian
Nako is Japanese
evaluating trained model ...

Test set: Accuracy: 5476/6700 (81%)

Sung is Dutch
Jungwoo is Russian
Soojin is Russian
Nako is Russian
evaluating trained model ...

Test set: Accuracy: 5469/6700 (81%)

Sung is Dutch
Jungwoo is Russian
Soojin is Russian
Nako is Russian
evaluating trained model ...

Test set: Accuracy: 5484/6700 (81%)

Sung is Dutch
Jungwoo is Russian
Soojin is Russian
Nako is Russian
evaluating trained model ...

Test set: Accuracy: 5486/6700 

evaluating trained model ...

Test set: Accuracy: 5466/6700 (81%)

Sung is Dutch
Jungwoo is English
Soojin is Russian
Nako is Russian
evaluating trained model ...

Test set: Accuracy: 5452/6700 (81%)

Sung is Dutch
Jungwoo is English
Soojin is Russian
Nako is Russian
evaluating trained model ...

Test set: Accuracy: 5435/6700 (81%)

Sung is Dutch
Jungwoo is English
Soojin is Russian
Nako is Russian
evaluating trained model ...

Test set: Accuracy: 5449/6700 (81%)

Sung is Dutch
Jungwoo is English
Soojin is Russian
Nako is Russian
evaluating trained model ...

Test set: Accuracy: 5437/6700 (81%)

Sung is Dutch
Jungwoo is Russian
Soojin is Russian
Nako is Russian
evaluating trained model ...

Test set: Accuracy: 5453/6700 (81%)

Sung is Dutch
Jungwoo is English
Soojin is Russian
Nako is Russian
evaluating trained model ...

Test set: Accuracy: 5460/6700 (81%)

Sung is Dutch
Jungwoo is Russian
Soojin is Russian
Nako is Russian
evaluating trained model ...

Test set: Accuracy: 5456/6700 (8

KeyboardInterrupt: 