In [444]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from tqdm import tqdm
import torch.optim as optim
!pip install torchsummary
from torchsummary import summary

[0m

In [445]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [446]:
with open('/kaggle/input/popular-names/names.txt','r') as file:
    data = file.read().split('\n')

In [447]:
data[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [448]:
train_data = data[:int(len(data)*0.8)]
val_data = data[int(len(data)*0.8):int(len(data)*0.9)]
test_data = data[int(len(data)*0.9):int(len(data))]

In [449]:
print(len(train_data) , len(val_data) , len(test_data))

25626 3203 3204


**dataset**
*the below function create a dataset given data and chuck size (here a chunck mean how  many char given to predict the next char)*

In [450]:
def makedataset(data,chunck):
    x = []
    y = []
    dummy=''
    for i in range(chunck):
        dummy+='.'
    for word in data:
        word = dummy + word + '.'
        for i in range(len(word)-chunck-1):
            x.append(word[i:i+chunck])
            y.append(word[i+chunck])
    return x , y

In [451]:
x , y = makedataset(train_data,4)

**use some type maping to make them into number so that we can do operations on them**

*one way is one hot encodding where echar will be represented by a list of size with vocublary length*
*othre way is we can represent each char into a lower dimensional vector which helps in increasing computaional speed and data requriments*

In [452]:
hashmap = {}
for i , ch in enumerate(range(97,123)):
    hashmap[chr(ch)] = i
hashmap['.']=26
hashmap

{'a': 0,
 'b': 1,
 'c': 2,
 'd': 3,
 'e': 4,
 'f': 5,
 'g': 6,
 'h': 7,
 'i': 8,
 'j': 9,
 'k': 10,
 'l': 11,
 'm': 12,
 'n': 13,
 'o': 14,
 'p': 15,
 'q': 16,
 'r': 17,
 's': 18,
 't': 19,
 'u': 20,
 'v': 21,
 'w': 22,
 'x': 23,
 'y': 24,
 'z': 25,
 '.': 26}

In [453]:
x_num = []
y_num = []
for i in range(len(x)):
    tmp = []
    for ch in x[i]:
        tmp.append(hashmap[ch])
    y_num.append(hashmap[y[i]])
    x_num.append(tmp)

In [454]:
x_num[0:5]

[[26, 26, 26, 26],
 [26, 26, 26, 4],
 [26, 26, 4, 12],
 [26, 4, 12, 12],
 [26, 26, 26, 26]]

In [455]:
y_num[0:5]

[4, 12, 12, 0, 14]

In [456]:
x_num = torch.tensor(x_num)
y_num = torch.tensor(y_num)

In [457]:
x_num.shape
print(x_num[:10])

tensor([[26, 26, 26, 26],
        [26, 26, 26,  4],
        [26, 26,  4, 12],
        [26,  4, 12, 12],
        [26, 26, 26, 26],
        [26, 26, 26, 14],
        [26, 26, 14, 11],
        [26, 14, 11,  8],
        [14, 11,  8, 21],
        [11,  8, 21,  8]])


In [458]:
y_num.shape
print(y_num[:10])

tensor([ 4, 12, 12,  0, 14, 11,  8, 21,  8,  0])


**approach**
1. *a trigram laungauge model is a model where it take input of 3 sequence characters and predicts the next char in the sequence*
2. *this can be used as document filling model*
3. *here i will be implementing a trigram model with wavenet architecture*  

*input after will be (3,3) encoded vector out put will be a softmax of dim->(27,1) each giving probs of each char*
*considering batch size input will be (batch_size , 3) -> (batch_size,3,3) -> (batch_size,9) -> *

In [478]:
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.embed = nn.Embedding(27,4)
        self.linear64 = nn.Linear(16,64)
#         self.norm1= nn.BatchNorm1d(2)
        self.linear256 = nn.Linear(64,256)
#         self.norm2=nn.BatchNorm1d(1)
        self.linear128 =nn.Linear(256,128)
#         self.norm3 = nn.BatchNorm1d(1)
        self.linear = nn.Linear(128,27)
        self.logits = nn.Softmax(dim=1)
    def forward(self, x):
        # 64 , 4 ,4
#         x = x.to('cpu')
        x = self.embed(x)
        x = x.view(64,16)
        x = self.linear64(x)
#         x = self.norm1(x)
#         x = x.view(64,1,128)
        x = self.linear256(x)
#         x = self.norm2(x)
        # 64 1 256
        x = self.linear128(x)
#         x = self.norm3(x)
        x = self.linear(x)
        x = self.logits(x)
        return x

In [460]:
class textset(Dataset):
    def __init__(self,x_num,y_num):
        self.x=x_num
        self.y=y_num
    def __len__(self):
        return len(self.x)
    def __getitem__(self,idx):
        x=self.x[idx]
        y=self.y[idx]
        return x.to(device) , y.to(device)

In [461]:
data = textset(x_num,y_num)

In [462]:
loader = DataLoader(data,batch_size=64)

In [479]:
trigram = Model().to(device)

In [474]:
summary(trigram,(1,4),64)

RuntimeError: shape '[64, 16]' is invalid for input of size 32

In [465]:
optimizer = optim.Adam(trigram.parameters(),lr=0.001)

In [480]:
for parm in trigram.parameters():
    print(parm[:5])

tensor([[ 1.1846,  1.1537,  0.3813,  2.2242],
        [ 0.7513,  1.0656,  1.0289, -0.4807],
        [-0.5596,  0.8705,  0.4391,  0.0615],
        [-1.0894, -0.1862, -0.1217,  1.2825],
        [ 1.7309,  1.1174, -0.0812,  0.0745]], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([[-2.1493e-01,  2.8640e-05, -1.7397e-02,  7.0398e-02, -1.1500e-01,
          1.1547e-01,  2.4832e-01, -3.9346e-02, -3.3767e-02, -3.5864e-02,
          1.4336e-01,  2.9966e-02,  9.1847e-02,  8.9076e-03,  4.3572e-02,
         -4.0713e-02],
        [ 2.3830e-01, -6.9435e-02,  1.1514e-01,  2.1991e-01,  1.4444e-01,
          2.2016e-02, -4.8118e-02,  2.7755e-02,  1.4928e-01,  6.3503e-02,
          2.1662e-02,  2.3529e-01, -6.8684e-02, -6.0639e-02,  5.7035e-02,
          1.2135e-01],
        [ 5.7495e-02,  2.3843e-01, -5.2311e-02,  1.2087e-01, -9.4072e-02,
         -1.7996e-01,  1.1324e-01,  1.8237e-01,  1.3722e-01,  1.4455e-01,
          4.7336e-02, -2.1342e-01, -1.3485e-01, -3.9537e-02,  6.5599e-02,
       

In [482]:
trigram.train()

Model(
  (embed): Embedding(27, 4)
  (linear64): Linear(in_features=16, out_features=64, bias=True)
  (linear256): Linear(in_features=64, out_features=256, bias=True)
  (linear128): Linear(in_features=256, out_features=128, bias=True)
  (linear): Linear(in_features=128, out_features=27, bias=True)
  (logits): Softmax(dim=1)
)

In [494]:
epochs = 2000
losses = []
for i in range(epochs):
    running_loss = 0
    for x , y in tqdm(loader):
        if(x.shape[0]!=64):
            print(f"loss at {i}th epoch is : {running_loss}")
            continue
        pred = trigram(x)
        print(pred)
        loss = F.cross_entropy(pred,y)
        loss.backward()
        print(loss.grad)
        break
        running_loss+=loss.item()
#         tqdm.set_description(f"current batch running loss: {running_loss}")
    break
    print(f"loss at {i}th epoch is : {running_loss}")
    losses.append(running_loss)

  print(loss.grad)
  0%|          | 0/2456 [00:00<?, ?it/s]

tensor([[0.0431, 0.0408, 0.0395,  ..., 0.0406, 0.0402, 0.0306],
        [0.0414, 0.0373, 0.0392,  ..., 0.0420, 0.0410, 0.0344],
        [0.0437, 0.0404, 0.0349,  ..., 0.0394, 0.0406, 0.0333],
        ...,
        [0.0368, 0.0334, 0.0383,  ..., 0.0351, 0.0381, 0.0429],
        [0.0341, 0.0321, 0.0328,  ..., 0.0404, 0.0390, 0.0406],
        [0.0411, 0.0359, 0.0314,  ..., 0.0359, 0.0403, 0.0341]],
       device='cuda:0', grad_fn=<SoftmaxBackward0>)
None





In [None]:
# plt.plot(losses)