In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Seed PyTorch's global RNG so the random tensors and the randomly
# initialised layer weights created below are repeatable across runs.
torch.manual_seed(1)

<torch._C.Generator at 0x10c895550>

In [2]:
lin = nn.Linear(5, 3)  # affine map from R^5 to R^3, parameters A (weight) and b (bias)
# data is 2x5: two rows, each a vector in R^5.  nn.Linear acts on the last
# dimension, so every row is mapped independently and the result is 2x3.
data = torch.randn(2, 5)
# Evaluate the map as the cell's last expression so its result is displayed:
# yes, "data" can be mapped under A.
lin(data)


tensor([[ 0.1755, -0.3268, -0.5069],
        [-0.6602,  0.2260,  0.1089]], grad_fn=<AddmmBackward>)


In [4]:
# Two input rows go in, so two 3-dimensional outputs come out.
lin(data).shape

torch.Size([2, 3])

In [5]:
# The input tensor keeps its 2x5 shape; applying `lin` produced a new tensor.
data.shape

torch.Size([2, 5])

In [7]:
# Rebind `data` to a fresh 2x2 random tensor for the non-linearity demo.
data = torch.randn(2, 2)

In [8]:
# Display the raw values so they can be compared with the ReLU result.
data

tensor([[ 1.3800, -1.3505],
        [ 0.3455,  0.5046]])

In [9]:
# ReLU works elementwise: negative entries become 0, all others are unchanged.
F.relu(data)

tensor([[1.3800, 0.0000],
        [0.3455, 0.5046]])

In [10]:
# Softmax turns a vector of arbitrary real scores into a probability
# distribution over its entries.
data = torch.randn(5)
probs = F.softmax(data, dim=0)
print(data)
print(probs)
print(probs.sum())  # Sums to 1 because it is a distribution!
print(F.log_softmax(data, dim=0))  # there's also log_softmax

tensor([ 1.8213, -0.1814, -0.9515,  0.4057, -1.5164])
tensor([0.6776, 0.0915, 0.0423, 0.1645, 0.0241])
tensor(1.0000)
tensor([-0.3892, -2.3919, -3.1620, -1.8048, -3.7269])


## Bag of words example

In [40]:
# Tiny labelled corpus: every example is a (list of tokens, language label) pair.
data = [
    ("me gusta comer en la cafeteria".split(), "SPANISH"),
    ("Give it to me".split(), "ENGLISH"),
    ("No creo que sea una buena idea".split(), "SPANISH"),
    ("No it is not a good idea to get lost at sea".split(), "ENGLISH"),
]

test_data = [
    ("Yo creo que si".split(), "SPANISH"),
    ("it is lost on me".split(), "ENGLISH"),
]

# word_to_ix gives every distinct word a unique integer: its position in the
# Bag of words vector.  Test sentences are included so that their words have
# an index too.
word_to_ix = {}
for sent, _ in data + test_data:
    for word in sent:
        # setdefault only inserts unseen words; the next free index is the
        # size of the vocabulary collected so far.
        word_to_ix.setdefault(word, len(word_to_ix))
print(word_to_ix)

NUM_LABELS = 2
VOCAB_SIZE = len(word_to_ix)


class BoWClassifier(nn.Module):  # inheriting from nn.Module!
    """Logistic-regression style classifier over bag-of-words count vectors.

    The model is a single affine map followed by log-softmax, i.e.
    ``log_softmax(A x + b)`` where ``x`` is a vocabulary-sized count vector.

    Args:
        num_labels: number of output classes.
        vocab_size: length of the bag-of-words input vectors.
    """

    def __init__(self, num_labels, vocab_size):
        # Initialise nn.Module first; it must be set up before sub-modules
        # are assigned so that their parameters get registered.
        super().__init__()

        # A and b, the parameters of the affine mapping.  The input dimension
        # is vocab_size (one count per word) and the output dimension is
        # num_labels (one score per class).
        self.linear = nn.Linear(vocab_size, num_labels)

        # NOTE! The non-linearity log softmax does not have parameters, so
        # nothing needs to be created for it here.

    def forward(self, bow_vec):
        """Return log-probabilities over the labels.

        Args:
            bow_vec: float tensor whose last dimension has size ``vocab_size``,
                e.g. a ``(1, vocab_size)`` row or an unbatched
                ``(vocab_size,)`` vector.

        Returns:
            Tensor with the same leading dimensions as ``bow_vec`` and a last
            dimension of size ``num_labels``; exponentiating it gives values
            that sum to 1 along that last dimension.
        """
        # Normalise over the label dimension, which nn.Linear always puts
        # last.  Using -1 instead of 1 gives the same result for 2-D input and
        # also handles unbatched 1-D input.
        return F.log_softmax(self.linear(bow_vec), dim=-1)


def make_bow_vector(sentence, word_to_ix):
    """Build the bag-of-words count vector for one tokenised sentence.

    Args:
        sentence: iterable of word tokens.
        word_to_ix: mapping from word to its index in the vector; a word that
            is missing from it raises ``KeyError``.

    Returns:
        Float tensor of shape ``(1, len(word_to_ix))`` whose entry ``i`` is the
        number of times the word with index ``i`` occurs in ``sentence``.
    """
    counts = torch.zeros(len(word_to_ix))
    for index in (word_to_ix[token] for token in sentence):
        counts[index] += 1
    # Reshape to a single row so the result carries a leading batch dimension.
    return counts.view(1, -1)


def make_target(label, label_to_ix):
    """Wrap a label's integer index in a one-element int64 tensor.

    Args:
        label: label name to look up.
        label_to_ix: mapping from label name to integer class index; an
            unknown label raises ``KeyError``.

    Returns:
        Tensor of shape ``(1,)`` and dtype ``torch.long`` holding the index.
    """
    label_index = label_to_ix[label]
    return torch.tensor([label_index], dtype=torch.long)


model = BoWClassifier(NUM_LABELS, VOCAB_SIZE)

# The model knows its parameters.  When they are printed (next cell), the
# first tensor is A (the weight matrix) and the second is b (the bias).
# Assigning a sub-module to an instance attribute inside __init__, which was
# done with the line
#     self.linear = nn.Linear(...)
# registers it with the enclosing nn.Module, so your module (in this case,
# BoWClassifier) also reports the nn.Linear's parameters as its own.

{'me': 0, 'gusta': 1, 'comer': 2, 'en': 3, 'la': 4, 'cafeteria': 5, 'Give': 6, 'it': 7, 'to': 8, 'No': 9, 'creo': 10, 'que': 11, 'sea': 12, 'una': 13, 'buena': 14, 'idea': 15, 'is': 16, 'not': 17, 'a': 18, 'good': 19, 'get': 20, 'lost': 21, 'at': 22, 'Yo': 23, 'si': 24, 'on': 25}


In [41]:
# Walk over the parameters the model has registered: first the weight matrix
# (one row per label, one column per vocabulary word), then the bias vector.
for param in model.parameters():
    print(param)

Parameter containing:
tensor([[ 0.1553,  0.1855, -0.0398, -0.1524,  0.1931, -0.0418, -0.0807,  0.0478,
         -0.1371,  0.1289,  0.1229, -0.1556, -0.1611, -0.0172,  0.0824, -0.0057,
         -0.0994,  0.0045, -0.1843, -0.1386, -0.1306,  0.1615,  0.1729, -0.0666,
          0.0088,  0.0875],
        [ 0.0235, -0.0982,  0.1131,  0.1206, -0.0114, -0.0241,  0.1782,  0.1714,
         -0.1112,  0.1919,  0.0485, -0.1303,  0.1074, -0.1464,  0.1812, -0.1261,
          0.0555,  0.0597,  0.0466,  0.1627, -0.0815, -0.0828, -0.1699, -0.0080,
         -0.0929,  0.0079]], requires_grad=True)
Parameter containing:
tensor([-0.0402,  0.0651], requires_grad=True)


In [42]:
# Running the model just means calling it on a BoW vector.
# This is a plain forward pass with no training, so autograd bookkeeping is
# switched off with torch.no_grad().
sample = data[0]
with torch.no_grad():
    bow_vector = make_bow_vector(sample[0], word_to_ix)
    log_probs = model(bow_vector)
print(log_probs)

tensor([[-0.6582, -0.7294]])


In [43]:
# Class index of each label: used as the NLLLoss target and as the position of
# that label's entry in the model's log-probability output.
label_to_ix = {"SPANISH": 0, "ENGLISH": 1}

In [44]:
# Baseline: score the test sentences before any training, so there is a
# before-and-after comparison.
with torch.no_grad():
    for instance, label in test_data:
        bow_vec = make_bow_vector(instance, word_to_ix)
        log_probs = model(bow_vec)
        print(log_probs)

# Print the weight-matrix column corresponding to "creo"
creo_ix = word_to_ix["creo"]
print(next(model.parameters())[:, creo_ix])

# Usually you want to pass over the training data several times.
# 100 epochs is far more than you would use on a real data set, but real
# datasets have more than a handful of instances.  Usually, somewhere between
# 5 and 30 epochs is reasonable.
NUM_EPOCHS = 100
LEARNING_RATE = 0.1

loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

for epoch in range(NUM_EPOCHS):
    for instance, label in data:
        # Step 1. PyTorch accumulates gradients across backward() calls, so
        # clear them out before each instance.
        model.zero_grad()

        # Step 2. Build the BOW vector and wrap the target class index in a
        # tensor.  For example, if the target is SPANISH the wrapped integer
        # is 0, which tells the loss function that element 0 of the log
        # probabilities is the one belonging to the correct label.
        bow_vec = make_bow_vector(instance, word_to_ix)
        target = make_target(label, label_to_ix)

        # Step 3. Forward pass.
        log_probs = model(bow_vec)

        # Step 4. Compute the loss and its gradients, then let the optimizer
        # update the parameters.
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()

tensor([[-0.6997, -0.6866]])
tensor([[-0.6578, -0.7298]])
tensor([0.1229, 0.0485], grad_fn=<SelectBackward>)


In [45]:
# Score the test sentences again now that the model has been trained.
# In each printed row, entry 0 is the SPANISH log-probability and entry 1 the
# ENGLISH one (the indices assigned in label_to_ix).
with torch.no_grad():
    for instance, label in test_data:
        bow_vec = make_bow_vector(instance, word_to_ix)
        log_probs = model(bow_vec)
        print(log_probs)

tensor([[-0.1292, -2.1104]])
tensor([[-2.3582, -0.0994]])


In [46]:
import numpy as np

In [48]:
# Turn the SPANISH log-probability of the second test sentence (-2.3582,
# copied by hand from the output printed after training) into a probability.
np.exp(-2.3582)

0.09459033265063503

In [50]:
# Turn the ENGLISH log-probability of the second test sentence (-0.0994,
# copied by hand from the output printed after training) into a probability.
np.exp(-0.0994)

0.9053804833900955

In [59]:
# Weight column for "creo" after training: compared with the values printed
# before training, the entry for SPANISH (index 0) went up and the entry for
# ENGLISH (index 1) went down!
print(next(model.parameters())[:, word_to_ix["creo"]])

tensor([ 0.5585, -0.3870], grad_fn=<SelectBackward>)


In [57]:
# Row 1 of the weight matrix: the trained per-word weights for label index 1
# (ENGLISH in label_to_ix), one entry per vocabulary word.
next(model.parameters())[1]

tensor([ 0.1015, -0.5390, -0.3277, -0.3202, -0.4521, -0.4649,  0.6970,  0.9217,
         0.6391, -0.0121, -0.3870, -0.5658, -0.0966, -0.5820, -0.2544, -0.3301,
         0.2870,  0.2913,  0.2782,  0.3942,  0.1501,  0.1488,  0.0616, -0.0080,
        -0.0929,  0.0079], grad_fn=<SelectBackward>)