In [1]:
import pandas as pd

In [2]:
from google.colab import files
uploaded = files.upload()

Saving opus_books.csv to opus_books.csv


In [3]:
import io
opus = pd.read_csv(io.BytesIO(uploaded['opus_books.csv']))
opus.head()

Unnamed: 0,en,es
0,"In the society of his nephew and niece, and th...","En compañía de su sobrino y sobrina, y de los ..."
1,"By a former marriage, Mr. Henry Dashwood had o...","De un matrimonio anterior, el señor Henry Dash..."
2,"By his own marriage, likewise, which happened ...","Además, su propio matrimonio, ocurrido poco de..."
3,"But the fortune, which had been so tardy in co...","Pero la fortuna, que había tardado tanto en ll..."
4,But Mrs. John Dashwood was a strong caricature...,Pero la señora de John Dashwood era una áspera...


In [4]:
import re
from collections import defaultdict

special_tokens={
    "<PAD>":0,
    "<UNK>":1,
    "<BOS>":2,
    "<EOS>":3
}

vocab=special_tokens.copy()

In [5]:
# Now, defining a sentence cleaner and a tokenizer

token_limit=11

def clean(text):
  text=re.sub(r'[^\w\s,?.!]',' ', text)
  text = text.strip()
  return text


def tokenize(text):
    # Split on consecutive whitespace and punctuation
    tokens = re.findall(r'\w+|[^\w\s]+|[\s]+', text)

    # Pad sentences that are too short
    if len(tokens) < token_limit:
        pad_count = token_limit - len(tokens)
        tokens = ["<PAD>"] * pad_count + tokens

    # Only take tokens up to the limit
    tokens = tokens[:token_limit]
    return tokens

In [6]:

# Example tokenization
tokenize("This is fucking amazing.")

['<PAD>',
 '<PAD>',
 '<PAD>',
 'This',
 ' ',
 'is',
 ' ',
 'fucking',
 ' ',
 'amazing',
 '.']

In [7]:
opus_tokens = defaultdict(int) # As this will not raise a key error and will return a defualt value for a key that does not exist

for index,row in opus.iterrows():
  cleaned=clean(row["en"])
  tokens=tokenize(cleaned)
  for token in tokens:
    opus_tokens[token] +=1 # Storing the count of occurence for each token

# Now creating the vocabulary

# Since we have some special tokens already present, adjusting the counter accordingly

counter=len(vocab)

# Assigning a unique if for the tokens appearing more than once


for index, token in enumerate(opus_tokens):
  if opus_tokens[token]>1:
    vocab[token]=counter
    counter += 1
  else:
    vocab[token] = 1 # Else assigning an unknown id

In [8]:
len(vocab)

11731

In [9]:
# Now creating a reverse dictionary

reverse_vocab = {v: k for k, v in vocab.items()}

# Several tokens could be mapped to the <UNK> token id, so make sure we set the reverse mapping correctly
for k, v in special_tokens.items():
    reverse_vocab[v] = k

In [10]:
len(reverse_vocab)

5628

In [11]:
import torch

def encode(text):
    # Yokenize input text
    tokens = tokenize(clean(text))
    # Convert to token ids
    encoded = torch.tensor([vocab[token] for token in tokens])
    return encoded

def decode(encoded):
    # The input is a torch tensor - convert it to a list
    encoded = encoded.detach().cpu().tolist()
    # Decode a list of integers into text
    decoded = "".join([reverse_vocab[token] for token in encoded])
    return decoded

In [12]:
tokenized = []
for index, row in opus.iterrows():
    # Encode the English sentences
    en_text = row["en"]
    en = encode(en_text)
    tokenized.append(en)

In [13]:
tokenized[1]

tensor([11,  5, 12,  5, 13,  5, 14, 15,  5, 16, 17])

In [14]:
# For now, we will be predicting the 11th word in the sentence.

# Creating a tensor dataset iterator which will be used for training the neural network


from torch.utils.data import DataLoader, Dataset

class TextData(Dataset):
  def __init__(self,data):
    self.tokens=torch.vstack(data).long()

  def __len__(self):
    return len(self.tokens)

  def __getitem__(self, index):
     x = self.tokens[index][:10]
     y = self.tokens[index][10]
     return x, y

train_ds = TextData(tokenized)
# Initialize dataloader with a high batch size
train = DataLoader(train_ds, batch_size=16)


In [15]:
train_ds[1]

(tensor([11,  5, 12,  5, 13,  5, 14, 15,  5, 16]), tensor(17))

In [16]:
# The dataloader is an iterator
# next(iter()) will get the first batch
batch = next(iter(train))
batch

# This is just to get the first batch - Getting a better understanding of how the text data flows

[tensor([[ 4,  5,  6,  5,  7,  5,  8,  5,  9,  5],
         [11,  5, 12,  5, 13,  5, 14, 15,  5, 16],
         [11,  5,  9,  5, 18,  5, 14, 15,  5, 19],
         [20,  5,  6,  5, 21, 15,  5, 22,  5, 23],
         [20,  5, 24, 17,  5, 25,  5, 26,  5, 27],
         [28,  5, 29,  5, 30,  5, 31, 15,  5, 32],
         [33,  5, 27,  5, 34,  5, 35,  5, 36, 37],
         [33,  5, 27,  5,  1, 15,  5, 39, 15,  5],
         [41,  5, 42, 15,  5, 43,  5, 44, 15,  5],
         [45,  5, 46,  5, 47,  5, 48,  5, 49,  5],
         [50,  5, 51,  5,  8,  5, 52,  5, 22,  5],
         [41, 15,  5, 54, 15,  5, 27,  5, 55,  5],
         [57,  5,  1,  5, 32,  5, 12,  5, 58,  5],
         [24, 17,  5, 25,  5, 26,  5, 60,  5, 61],
         [62,  5, 63,  5, 64,  5, 65,  5, 66,  5],
         [68,  5, 69,  5, 70,  5, 71,  5, 72,  5]]),
 tensor([10, 17, 15,  5,  5,  5, 38, 40,  6, 32, 53, 56, 59,  5, 67, 73])]

In [18]:
batch[0]

tensor([[ 4,  5,  6,  5,  7,  5,  8,  5,  9,  5],
        [11,  5, 12,  5, 13,  5, 14, 15,  5, 16],
        [11,  5,  9,  5, 18,  5, 14, 15,  5, 19],
        [20,  5,  6,  5, 21, 15,  5, 22,  5, 23],
        [20,  5, 24, 17,  5, 25,  5, 26,  5, 27],
        [28,  5, 29,  5, 30,  5, 31, 15,  5, 32],
        [33,  5, 27,  5, 34,  5, 35,  5, 36, 37],
        [33,  5, 27,  5,  1, 15,  5, 39, 15,  5],
        [41,  5, 42, 15,  5, 43,  5, 44, 15,  5],
        [45,  5, 46,  5, 47,  5, 48,  5, 49,  5],
        [50,  5, 51,  5,  8,  5, 52,  5, 22,  5],
        [41, 15,  5, 54, 15,  5, 27,  5, 55,  5],
        [57,  5,  1,  5, 32,  5, 12,  5, 58,  5],
        [24, 17,  5, 25,  5, 26,  5, 60,  5, 61],
        [62,  5, 63,  5, 64,  5, 65,  5, 66,  5],
        [68,  5, 69,  5, 70,  5, 71,  5, 72,  5]])

In [29]:
# Now creating the embedding later


import math
from torch import nn

class Embedding(nn.Module):
  def __init__(self,vocab_size,embed_dim):
    super().__init__()
    k = 1/math.sqrt(embed_dim) #Xavier/Glorot Initialization
    self.weights =  torch.rand(vocab_size, embed_dim) * 2 * k - k # Defining weights centered around 0 in the range (-k,k). This is helpful in efficient convergence, stability of weight update
    self.weights[0] = 0 # Zero out the padding embedding
        # Using nn.Parameter tells torch to update this value in the backward pass
    self.weights = nn.Parameter(self.weights)

  def forward(self,token_ids):
    return self.weights[token_ids] # Wil return a 3D matrix. (B,T,E) - B- Batch size, T- tokens in each sentence, E- Encoding for tokens (embed_dim)


array([[0.89526292, 0.59138934, 0.17612222, ..., 0.41971541, 0.25957032,
        0.72454456],
       [0.08089466, 0.06763467, 0.06935093, ..., 0.85823329, 0.54371574,
        0.85797862],
       [0.06509178, 0.63566992, 0.7541466 , ..., 0.20550716, 0.9141946 ,
        0.04202733],
       ...,
       [0.52854091, 0.08820994, 0.97176282, ..., 0.10825685, 0.10440298,
        0.3383994 ],
       [0.25624182, 0.95547562, 0.67998267, ..., 0.68865402, 0.00142333,
        0.00168771],
       [0.09015057, 0.60996736, 0.59030291, ..., 0.07842738, 0.06210588,
        0.69879868]])

In [30]:
# Now defining the token predictor layer

class TokenPredictor(nn.Module):
  def __init__(self,vocab_size, input_token_count, hidden_units): # This is B,T,E
    super().__init__()
    torch.manual_seed(100)

    # Starting with the embedding layer
    self.embedding=Embedding(vocab_size,hidden_units)
    # Next is a Dense layer
    self.dense1=nn.Linear(hidden_units,hidden_units)
    # Next is the activation layer (ReLU)
    self.relu=nn.ReLU()
    # Defining the output layer
    # Output layer looks at all embedding vectors and generates a prediction
    self.output=nn.Linear(hidden_units*input_token_count, hidden_units)

  def forward(self,x):
    embedded=self.embedding(x) # Creating the vector embeddings
    # Run the network
    x=self.relu(self.dense1(embedded))

    # Flatten the vectors into one large vector per sentence for the final layer

    flat=torch.flatten(x,start_dim=1)

    # output layer
    network_out=self.output(flat)

    # Unembed the vector (Output should be batch_size,vocab_size) - will project onto this space using the embedding weights
    # Argmax against this output will give the predicted token
    out_vector=network_out @ self.embedding.weights.T
    return out_vector

In [34]:
from statistics import mean

def train_loop(net, optimizer, epochs):
    # We're doing classification, so we use crossentropy loss.
    loss_fn = nn.CrossEntropyLoss(ignore_index=0)
    train_losses = []
    for epoch in range(epochs):
        for batch, (x, y) in enumerate(train):
            # zero_grad will set all the gradients to zero
            # We need this because gradients will accumulate in the backward pass
            optimizer.zero_grad()
            # Make a prediction using the network
            pred = net(x)
            # Calculate the loss
            loss = loss_fn(pred, y)
            # Call loss.backward to run backpropagation
            loss.backward()
            # Step the optimizer to update the parameters
            optimizer.step()
            train_losses.append(loss.item())


    return train_losses

In [None]:
# Define our hyperparameters
epochs = 50
lr = 1e-3

# Initialize our network
net = TokenPredictor(len(vocab), 10, 256)
# Optimizer
optimizer = torch.optim.SGD(net.parameters(), lr=lr)
losses = train_loop(net, optimizer, epochs)