<a href="https://colab.research.google.com/github/sanjeevr5/NLP_Excercises/blob/main/DL_NLP_With_Torch_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Football name generator using character RNN

A simple character-level name generator that is not conditioned on the player's country.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import torchtext

In [None]:
SEED = 10

# Seed every random number generator used in this notebook so reruns are repeatable.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)
torch.backends.cudnn.deterministic = True

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

In [None]:
# Load the FIFA 22 player table from the working directory and preview the first rows.
data = pd.read_csv('./players_fifa22.csv')
data.head()

Unnamed: 0,ID,Name,FullName,Age,Height,Weight,PhotoUrl,Nationality,Overall,Potential,...,LMRating,CMRating,RMRating,LWBRating,CDMRating,RWBRating,LBRating,CBRating,RBRating,GKRating
0,158023,L. Messi,Lionel Messi,34,170,72,https://cdn.sofifa.com/players/158/023/22_60.png,Argentina,93,93,...,93,90,93,69,67,69,64,53,64,22
1,188545,R. Lewandowski,Robert Lewandowski,32,185,81,https://cdn.sofifa.com/players/188/545/22_60.png,Poland,92,92,...,87,83,87,67,69,67,64,63,64,22
2,20801,Cristiano Ronaldo,C. Ronaldo dos Santos Aveiro,36,187,83,https://cdn.sofifa.com/players/020/801/22_60.png,Portugal,91,91,...,89,81,89,66,62,66,63,56,63,23
3,231747,K. Mbappé,Kylian Mbappé,22,182,73,https://cdn.sofifa.com/players/231/747/22_60.png,France,91,95,...,92,84,92,70,66,70,66,57,66,21
4,200389,J. Oblak,Jan Oblak,28,188,87,https://cdn.sofifa.com/players/200/389/22_60.png,Slovenia,91,93,...,38,41,38,35,39,35,35,36,35,92


In [None]:
# Only the FullName column is used as training text; show how many names there are.
names = data.FullName
names.shape

(19260,)

In [None]:
# Start from the three special tokens, then add every character seen in any name.
allowed_chars = {'<bos>', '<eos>', '<pad>'}
for name in names.values:
  allowed_chars.update(name)

# Sorting gives each symbol a stable integer id; s2i is the inverse lookup of i2s.
i2s = dict(enumerate(sorted(allowed_chars)))
s2i = {symbol: index for index, symbol in i2s.items()}

print('The total unique characters are :', len(s2i))

The total unique characters are : 148


In [None]:
# Length, in characters, of the longest full name in the dataset.
names.apply(len).max()

35

In [None]:
class NamesDataGenerator:
  """Map-style dataset that turns names into (input, target) index sequences.

  For a name ``c1 c2 ... cn`` the pair is::

      x = <bos> c1 c2 ... cn <pad> ...
      y = c1 c2 ... cn <eos> <pad> ...

  so the target at every position is the next symbol of the input, and
  ``<eos>`` directly follows the last character of the name. Both sequences
  are padded with ``<pad>`` to exactly ``max_len`` entries.

  Args:
    names: indexable collection of strings.
    max_len: length of every returned sequence (must be >= 1). Names longer
      than ``max_len - 1`` characters are truncated so that ``<bos>`` and
      ``<eos>`` always fit.
    vocab: optional symbol -> index mapping containing ``<bos>``, ``<eos>``,
      ``<pad>`` and every character of the names. When omitted, the
      module-level ``s2i`` mapping is looked up at access time.
  """
  def __init__(self, names, max_len = 35, vocab = None):
    if max_len < 1:
      raise ValueError('max_len must be at least 1')
    self.names = names
    self.max_len = max_len
    self.vocab = vocab

  def __len__(self):
    """Number of names in the dataset."""
    return len(self.names)

  def __getitem__(self, idx):
    """Return the ``(x, y)`` tensor pair for the name at position ``idx``."""
    vocab = s2i if self.vocab is None else self.vocab
    # One slot is reserved for <bos> in x and for <eos> in y.
    chars = [vocab[char] for char in self.names[idx][:self.max_len - 1]]
    x = [vocab['<bos>'], *chars]
    y = [*chars, vocab['<eos>']]
    # x and y always have the same length, so they share the same padding.
    padding = [vocab['<pad>']] * (self.max_len - len(x))
    return torch.tensor(x + padding), torch.tensor(y + padding)

# Wrap the raw array of names in the dataset class defined above.
train_dataset = NamesDataGenerator(names.values)

# Sanity check: print the (input, target) tensors produced for the first name.
train_iter = iter(train_dataset)
print(next(train_iter))
    

(tensor([ 4, 18, 41, 47, 46, 37, 44,  0, 19, 37, 51, 51, 41,  6,  6,  6,  6,  6,
         6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6]), tensor([18, 41, 47, 46, 37, 44,  0, 19, 37, 51, 51, 41,  6,  6,  6,  6,  6,  6,
         6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5]))


In [None]:
from torch.utils.data import DataLoader

# Serve the dataset in shuffled mini-batches of BATCH_SIZE examples.
BATCH_SIZE = 64
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
# Pull a single batch to confirm the tensor shapes coming out of the loader.
train_iter = iter(train_loader)
# Use the built-in next(): recent PyTorch releases no longer provide the
# Python-2 style .next() method on DataLoader iterators.
X, Y = next(train_iter)
print(X.size(), Y.size())

torch.Size([64, 35]) torch.Size([64, 35])


In [None]:
class NameGeneratorModel(nn.Module):
  """Character-level language model: embedding -> stacked LSTM -> linear layer.

  Args:
    vocab_size: number of distinct symbols (size of the embedding table and
      of the output layer).
    embed_dim: size of each symbol embedding.
    hidden_units: size of the LSTM hidden state.
    num_layers: number of stacked LSTM layers.
  """
  def __init__(self, vocab_size, embed_dim, hidden_units, num_layers):
    super().__init__()
    self.vocab_size = vocab_size
    self.hu = hidden_units
    self.num_layers = num_layers
    self.embed = nn.Embedding(vocab_size, embed_dim)
    self.rnn = nn.LSTM(embed_dim, hidden_units, num_layers = num_layers, batch_first = True)
    self.fc = nn.Linear(hidden_units, vocab_size)

  def forward(self, data, hidden = None, cell = None):
    """Score the next symbol at every position of ``data``.

    Args:
      data: integer tensor of symbol indices, batch dimension first.
      hidden: initial LSTM hidden state, or a ``(hidden, cell)`` pair when
        ``cell`` is omitted.
      cell: initial LSTM cell state.

    Returns:
      ``(out, (hidden, cell))`` where ``out`` holds ``vocab_size`` scores per
      input position and ``(hidden, cell)`` is the final LSTM state. When no
      complete initial state is given, the LSTM starts from its default state.
    """
    # Accept the state packed as one (hidden, cell) pair. Previously such a
    # call left `cell` as None and the supplied state was silently ignored.
    if cell is None and isinstance(hidden, (tuple, list)):
      hidden, cell = hidden

    out = self.embed(data)
    # Identity checks: comparing tensors with != is not a reliable None test.
    if hidden is not None and cell is not None:
      out, (hidden, cell) = self.rnn(out, (hidden, cell))
    else:
      out, (hidden, cell) = self.rnn(out)
    out = self.fc(out)
    return out, (hidden, cell)

# Build the network with one output score per vocabulary symbol and report its size.
model = NameGeneratorModel(vocab_size = len(s2i), embed_dim = 64,  hidden_units= 128, num_layers = 2)
print(f'The total number of trainable parameters are : {sum(p.numel() for p in model.parameters() if p.requires_grad):,}')

The total number of trainable parameters are : 259,988


In [None]:
# Learning rate handed to the optimizer below.
lr = 0.005

# Cross-entropy over the vocabulary. No ignore_index is set, so every target
# position (padding included) contributes to the loss.
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)
# Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=lr)

## !!--IMPORTANT--!!

<b> CROSS ENTROPY LOSS REQUIRES THE PREDICTIONS IN THE FORMAT (BATCH_SIZE, N_CLASSES, TIME_STEP) </b>

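The labels have the shape (batch_size, n_timesteps): one correct class index per time step.

The model's predictions have the shape (batch_size, n_timesteps, n_classes), so they must be transposed to (batch_size, n_classes, n_timesteps) before being passed to the loss.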

In [None]:
# Move the model to the selected device (GPU when available, otherwise CPU).
model = model.to(device)

In [None]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns a ``(minutes, seconds)`` tuple of ints; fractional seconds are dropped.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

def train(model, iterator = train_loader, loss_fn = criterion, optimizer = None):
  """Run one training pass over ``iterator`` and return the mean per-batch loss.

  Args:
    model: network called as ``model(inputs)`` and returning ``(predictions, state)``.
    iterator: iterable of ``(inputs, labels)`` batches.
    loss_fn: loss applied to the transposed predictions and the labels.
    optimizer: optimizer stepped once per batch; must be supplied by the caller.
  """
  model.train()
  running_loss = 0
  n_batches = 0
  for n_batches, (batch_inputs, batch_labels) in enumerate(iterator, start = 1):
    batch_inputs = batch_inputs.to(device)
    batch_labels = batch_labels.to(device)

    optimizer.zero_grad()
    scores, _ = model(batch_inputs)
    # Swap the last two axes so the class dimension comes second, as the loss expects.
    batch_loss = loss_fn(scores.transpose(1,2), batch_labels.long())
    batch_loss.backward()
    optimizer.step()

    running_loss += batch_loss.item()
  return running_loss/n_batches

In [None]:
# Number of full passes over the training data.
N_EPOCHS = 10

# NOTE(review): assigned here but never read in this cell; no validation pass is run.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    # Time each epoch so the duration can be reported alongside the loss.
    start_time = time.time()
    
    train_loss = train(model, optimizer = optimizer)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f'Epoch: {epoch+1:02} / {N_EPOCHS} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f}')

Epoch: 01 / 10 | Epoch Time: 0m 2s
	Train Loss: 0.942
Epoch: 02 / 10 | Epoch Time: 0m 1s
	Train Loss: 0.892
Epoch: 03 / 10 | Epoch Time: 0m 1s
	Train Loss: 0.859
Epoch: 04 / 10 | Epoch Time: 0m 1s
	Train Loss: 0.837
Epoch: 05 / 10 | Epoch Time: 0m 1s
	Train Loss: 0.817
Epoch: 06 / 10 | Epoch Time: 0m 1s
	Train Loss: 0.802
Epoch: 07 / 10 | Epoch Time: 0m 1s
	Train Loss: 0.791
Epoch: 08 / 10 | Epoch Time: 0m 1s
	Train Loss: 0.780
Epoch: 09 / 10 | Epoch Time: 0m 1s
	Train Loss: 0.769
Epoch: 10 / 10 | Epoch Time: 0m 1s
	Train Loss: 0.759


In [None]:
def generate_name(model, start='<bos> L', k = 7, max_len = 35):
  """Sample a name one character at a time from a trained model.

  Args:
    model: network called as ``model(input_ids, hidden, cell)`` that returns
      ``(scores, (hidden, cell))`` and exposes ``num_layers`` and ``hu``.
    start: space-separated seed tokens; each must be a key of ``s2i``.
    k: at every step the next character is drawn uniformly from the ``k``
      highest-scoring candidates.
    max_len: upper bound on the number of tokens, seed tokens included.

  Returns:
    ``start`` followed by the generated characters. Generation stops at the
    first ``<eos>`` or ``<pad>`` (which is not appended) or once ``max_len``
    tokens have been produced.

  Raises:
    ValueError: if ``start`` contains no tokens.
  """
  tokens = [token for token in start.split(' ') if token]
  if not tokens:
    raise ValueError('start must contain at least one token')

  # Create inputs and state on whatever device the model lives on.
  dev = next(model.parameters()).device
  name = start

  with torch.no_grad():
    # Zero initial state sized from the model rather than hard-coded.
    ht = torch.zeros((model.num_layers, 1, model.hu), device=dev)
    ct = torch.zeros((model.num_layers, 1, model.hu), device=dev)

    # Warm up the state on the seed tokens. hidden and cell are passed as
    # separate arguments so the model actually uses them.
    for token in tokens:
      step_input = torch.tensor([[s2i[token]]], device=dev)  # (batch=1, time=1)
      out, (ht, ct) = model(step_input, ht, ct)
    length = len(tokens)

    while length < max_len:
      # Scores for the most recent time step of the only batch element.
      _, idxs = torch.topk(out[0, -1], min(k, out.size(-1)))
      idx = int(np.random.choice(idxs.cpu().numpy()))
      char = i2s[idx]
      if char in ('<eos>', '<pad>'):
        break
      # Record every sampled character before feeding it back to the model.
      name += char
      length += 1
      out, (ht, ct) = model(torch.tensor([[idx]], device=dev), ht, ct)
  return name

In [None]:
# Sample one name with the default seed tokens and top-k setting.
generate_name(model)

'<bos> Ly-MoqveyangalalloūlivázjethMogP<pad><eos>'

## Markov Chain Text Generation

- Uses bigrams to generate text

In [None]:
def bigrams(tokenized_text):
    """Return every pair of adjacent tokens, in order, as a list of 2-tuples."""
    return [
        (tokenized_text[position], tokenized_text[position + 1])
        for position in range(len(tokenized_text) - 1)
    ]

import string
from collections import defaultdict

# Toy corpus: the ASCII letters in order, split into adjacent-letter bigrams.
chars = list(string.ascii_letters)
grams = bigrams(chars)

# d[current bigram][following bigram] -> count, later rescaled to a probability.
d = defaultdict(lambda : defaultdict(int))

# Count how often each bigram is immediately followed by each other bigram.
for current_gram, following_gram in zip(grams, grams[1:]):
    d[' '.join(current_gram)][' '.join(following_gram)] += 1

# Normalise every row so its transition values sum to 1.
for transitions in d.values():
    total = sum(transitions.values())
    for next_s in transitions:
        transitions[next_s] /= total