<a href="https://colab.research.google.com/github/sukritikala/Deep-Learning-Models/blob/main/DL4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torchdata==0.6.0 # to be compatible with torch 2.0
!pip install portalocker>=2.0.0

Collecting torchdata==0.6.0
  Downloading torchdata-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m45.0 MB/s[0m eta [36m0:00:00[0m
Collecting torch==2.0.0 (from torchdata==0.6.0)
  Downloading torch-2.0.0-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.0->torchdata==0.6.0)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl (21.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m77.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.0->torchdata==0.6.0)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os

#torch specific
import torch
import torch.nn as nn
import torch.nn.functional as F

#Data loader
from torch.utils.data import Dataset,DataLoader


#text lib
import torchtext

#fetch data
from torchtext.datasets import AG_NEWS

# tokenizer
from torchtext.data.utils import get_tokenizer

#build vocabulary
from torchtext.vocab import vocab
from torchtext.vocab import build_vocab_from_iterator

# get input_ids (numericalization)
from torchtext.transforms import VocabTransform

# get embeddings
from torch.nn import Embedding

# get rnn model and layers
from torch.nn import RNN, Linear, Sigmoid, Softmax

# optimizer
import torch.optim as optim

# utils
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence

In [None]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Make sure the dataset root directory exists, then load the AG_NEWS training split.
os.makedirs('./data', exist_ok=True)
train_samples = AG_NEWS(split='train', root='./data')

# The split is consumed by iteration, so it is walked once to count its samples.
print('Number of training samples: ', sum(1 for _ in train_samples))
# Each sample is a (label, sentence) pair; show the first one.
print('A sample: \n', next(iter(train_samples)))

Number of training samples:  120000
A sample: 
 (3, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")


In [None]:
# torchtext's built-in "basic_english" tokenizer (a callable: sentence -> list of tokens).
tokenizer = get_tokenizer("basic_english", language='en')

In [None]:
# Two toy sentences to illustrate what the tokenizer returns.
text = [
    'This is called tokenization!',
    'this is not the best approach by the way',
]
# One list of tokens per sentence, in the same order as `text`.
token_list = list(map(tokenizer, text))
print(token_list)

[['this', 'is', 'called', 'tokenization', '!'], ['this', 'is', 'not', 'the', 'best', 'approach', 'by', 'the', 'way']]


In [None]:
def yield_tokens(corpus):
  """Lazily tokenize a corpus of (label, sentence) pairs.

  Yields one list of tokens per pair; the label is ignored.
  """
  yield from (tokenizer(sentence_text) for _label, sentence_text in corpus)

In [None]:
# Build the vocabulary from the training sentences, keeping only tokens that
# occur at least 100 times, plus the two special tokens.
v = build_vocab_from_iterator(
    yield_tokens(train_samples),
    specials=['<pad>', '<unk>'],
    min_freq=100,
)
# Any token missing from the vocabulary (OOV) resolves to the '<unk>' index.
v.set_default_index(v['<unk>'])



In [None]:
# Look up the integer ids assigned to two example words.
print(*(v[word] for word in ('deep', 'learning')))

2162 4700


In [None]:
# Wrap the vocabulary so a list of tokens can be mapped to ids in a single call.
vocab_transform = VocabTransform(v)

# Numericalize only the first training sample as a sanity check.
for sample in train_samples:
  sentence_tokens = tokenizer(sample[1])  # sample[0] is the label
  input_ids = vocab_transform(sentence_tokens)
  print(input_ids)
  break

[432, 426, 2, 1606, 1, 114, 67, 3, 849, 14, 28, 15, 28, 16, 1, 4, 432, 375, 17, 10, 1, 7, 1, 4, 43, 4010, 784, 326, 2]


In [None]:
def get_input_ids(sample):
  """Return the token ids of one (label, sentence) sample as a LongTensor.

  Only the sentence at index 1 is used; the label at index 0 is ignored.
  """
  token_ids = vocab_transform(tokenizer(sample[1]))
  return torch.LongTensor(token_ids)

In [None]:
# Small 6-dimensional embedding table, one row per vocabulary entry, used only
# for the demonstration below; index 0 is declared as the padding index.
embedding = Embedding(len(v), 6, padding_idx=0)

In [None]:
# Show the ids of the first training sample followed by the vectors looked up for them.
for sample in train_samples:
  input_ids = get_input_ids(sample)
  print(input_ids, embedding(input_ids), sep='\n')
  break

tensor([ 432,  426,    2, 1606,    1,  114,   67,    3,  849,   14,   28,   15,
          28,   16,    1,    4,  432,  375,   17,   10,    1,    7,    1,    4,
          43, 4010,  784,  326,    2])
tensor([[ 0.6501, -0.1928, -2.2136,  0.2353, -1.4875, -0.5320],
        [ 1.1267,  1.8755, -0.0230,  0.1487,  0.1805, -0.1433],
        [-0.7042, -0.1911,  1.3894, -0.5377, -0.6318,  0.0080],
        [ 0.3692, -1.0841, -0.1932, -0.3323, -0.0788, -0.2880],
        [-0.9397, -0.5662, -0.0467,  0.2801, -0.7259,  0.0396],
        [ 0.8393,  0.3802,  0.2668, -0.2152, -0.8384, -1.2904],
        [ 0.5632, -0.3148, -1.0201,  1.3652, -0.2572, -0.8222],
        [ 1.5739, -1.0177,  0.2626, -0.3058, -0.6208, -0.2340],
        [-0.9553,  0.3795, -0.2180,  0.3681, -0.1209, -0.7572],
        [ 0.4704, -0.2824,  0.9410,  0.0526, -1.6643,  0.7100],
        [-0.1172, -0.0592,  0.3631,  0.8544,  1.3885, -0.3067],
        [-1.3740, -0.6994, -2.7269,  1.1591,  0.2955,  1.4775],
        [-0.1172, -0.0592,  0.363

In [None]:
# Sequence lengths differ from sample to sample, which is why batches need
# padding. Print the shapes of the first few samples without prompting the
# user, so the notebook still runs unattended (Restart & Run All).
from itertools import islice

for sample in islice(train_samples, 3):
  input_ids = get_input_ids(sample)
  print(input_ids.shape)

torch.Size([29])
Continue?y
torch.Size([42])
Continue?y
torch.Size([40])


KeyboardInterrupt: ignored

In [None]:
# Two toy (label, sentence) samples whose sentences have different lengths.
examples = [
    (1, 'padding is necessary'),
    (4, 'you know the reason right?'),
]
batch_input_ids = list(map(get_input_ids, examples))
# Stack into one (batch, longest sequence) tensor, filling the gaps with 0.
padded_input_ids = pad_sequence(batch_input_ids, padding_value=0.0, batch_first=True)

In [None]:
# The shorter sentence is right-padded with 0 up to the length of the longer one.
print(padded_input_ids)

tensor([[   1,   22, 4425,    0,    0,    0],
        [ 166, 1200,    3, 2257,  480,   81]])


In [None]:
def collate_function(batch_samples):
  '''
  Collate a list of samples into one padded batch.

  Input  : batch_samples - list of (label, sentence) pairs
  Return : (labels, padded_input_ids, lengths)
           labels           - LongTensor with one entry per sample, label - 1
           padded_input_ids - (batch, longest sequence) LongTensor, padded with 0
           lengths          - list with the unpadded length of every sequence
  '''

  # Numericalize every sentence exactly once; the lengths below reuse this.
  batch_input_ids = [get_input_ids(sample) for sample in batch_samples]

  padded_input_ids = pad_sequence(batch_input_ids, batch_first=True, padding_value=0.0)

  # -1 is applied so that class numbers start from 0, as required for one-hot
  # encoding. Built straight from Python ints rather than from a list of
  # one-element tensors.
  labels = torch.tensor([sample[0] - 1 for sample in batch_samples], dtype=torch.long)

  # Unpadded lengths come from the id tensors already built above, instead of
  # tokenizing every sentence a second time.
  lengths = [len(input_ids) for input_ids in batch_input_ids]

  return (labels, padded_input_ids, lengths)

In [None]:
# Run the collate function on a two-sample toy batch and show each returned piece.
toy_batch = [(1, 'this is great'), (2, 'why is this taking such a long time?')]
label, sample, lengths = collate_function(toy_batch)
for caption, value in (
    ('label tensor: \n ', label),
    ('Padded sequence: \n', sample),
    ('Actual lengths: ', lengths),
):
  print(caption, value)

label tensor: 
  tensor([0, 1])
Padded sequence: 
 tensor([[  53,   22,  811,    0,    0,    0,    0,    0,    0],
        [1165,   22,   53,  608,  560,    6,  443,  102,   81]])
Actual lengths:  [3, 9]


In [None]:
class RNNClassifier(torch.nn.Module):
    """Text classifier: embedding lookup -> RNN -> linear layer on the final hidden state.

    Args:
        vocab_size: number of rows of the embedding table.
        embed_dim: size of each embedding vector (input size of the RNN).
        hidden_dim: size of the RNN hidden state.
        num_class: number of output logits per sequence.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_class):
        super().__init__()
        # Index 0 is reserved for padding.
        self.embedding = Embedding(
            num_embeddings=vocab_size, embedding_dim=embed_dim, padding_idx=0
        )
        self.rnn = RNN(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = Linear(in_features=hidden_dim, out_features=num_class)

    def forward(self, x, lengths):
        """Compute class logits for a padded batch of token ids.

        Args:
            x: padded token ids, batch-first.
            lengths: unpadded length of every sequence in `x` (any order).

        Returns:
            The linear layer applied to the RNN's final hidden state (logits).
        """
        embedded = self.embedding(x)
        # Packing lets the RNN stop at each sequence's true length, so padded
        # positions do not contribute to the final hidden state.
        packed = pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        # The RNN returns (outputs for all time steps, final hidden state);
        # only the final hidden state feeds the classifier.
        _all_steps, final_state = self.rnn(packed)
        return self.fc(final_state)

In [None]:
# Mini-batches of 32 shuffled samples, padded and packed into tensors by collate_function.
batch_size = 32
dataloader = DataLoader(
    dataset=train_samples,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_function,
)

In [None]:
# Hyper-parameters of the classifier.
vocab_size = len(v)   # one embedding row per vocabulary entry
embedding_dim = 300
hidden_dim = 60
num_classes = 4

model = RNNClassifier(
    vocab_size=vocab_size,
    embed_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_class=num_classes,
)
# Last expression of the cell: moves the model to `device` and displays its layers.
model.to(device)

RNNClassifier(
  (embedding): Embedding(5002, 300, padding_idx=0)
  (rnn): RNN(300, 60, batch_first=True)
  (fc): Linear(in_features=60, out_features=4, bias=True)
)

In [None]:
# Cross-entropy on the logits, optimised with SGD plus momentum.
Loss = F.cross_entropy
optimizer = optim.SGD(params=model.parameters(), momentum=0.9, lr=0.001)

In [None]:
from torch.nn.functional import one_hot

# Number of mini-batches between two progress lines. The running sums are
# averaged over exactly this many batches.
LOG_EVERY = 100

for epoch in range(1):  # single pass over the data; raise the range for more epochs

    running_loss = 0.0
    running_acc = 0.0
    for i, data in enumerate(dataloader, 0):
        # each batch produced by collate_function is (labels, padded ids, lengths)
        labels, samples, lengths = data
        # Float one-hot targets. `.float()` casts the tensor directly rather
        # than re-wrapping it in torch.tensor(), which emitted a UserWarning.
        labels_ohe = one_hot(labels, num_classes=num_classes).float()
        labels_ohe = labels_ohe.to(device)
        samples = samples.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(samples, lengths)
        # The model returns (1, batch, num_classes). Only the leading axis is
        # dropped, so a batch holding a single sample keeps its batch axis.
        logits = outputs.squeeze(0)

        loss = Loss(logits, labels_ohe)
        loss.backward()
        optimizer.step()

        # Loss
        running_loss += loss.item()

        # Accuracy of this batch, normalised by its actual size so that a
        # smaller final batch is not under-counted.
        predictions = torch.argmax(logits.detach().cpu(), dim=1)
        running_acc += (predictions == labels).float().mean().item()

        if i % LOG_EVERY == LOG_EVERY - 1:    # print every LOG_EVERY mini-batches
            print('[%d, %5d] loss: %.3f Accuracy:%.3f' %
                  (epoch + 1, i + 1, running_loss / LOG_EVERY, running_acc / LOG_EVERY))
            running_loss = 0.0
            running_acc = 0.0


  labels_ohe = torch.tensor(one_hot(labels,num_classes=4),dtype=torch.float32)


[1,   100] loss: 1.462 Accuracy:0.246
[1,   200] loss: 1.460 Accuracy:0.234
[1,   300] loss: 1.471 Accuracy:0.238
[1,   400] loss: 1.456 Accuracy:0.234
[1,   500] loss: 1.465 Accuracy:0.227
[1,   600] loss: 1.458 Accuracy:0.233
[1,   700] loss: 1.461 Accuracy:0.244
[1,   800] loss: 1.456 Accuracy:0.240
[1,   900] loss: 1.462 Accuracy:0.240
[1,  1000] loss: 1.452 Accuracy:0.251
[1,  1100] loss: 1.459 Accuracy:0.250
[1,  1200] loss: 1.458 Accuracy:0.246
[1,  1300] loss: 1.455 Accuracy:0.251
[1,  1400] loss: 1.470 Accuracy:0.236
[1,  1500] loss: 1.443 Accuracy:0.250
[1,  1600] loss: 1.452 Accuracy:0.247
[1,  1700] loss: 1.464 Accuracy:0.242
[1,  1800] loss: 1.451 Accuracy:0.245
[1,  1900] loss: 1.457 Accuracy:0.261
[1,  2000] loss: 1.456 Accuracy:0.244
[1,  2100] loss: 1.457 Accuracy:0.242
[1,  2200] loss: 1.450 Accuracy:0.259
[1,  2300] loss: 1.447 Accuracy:0.250
[1,  2400] loss: 1.447 Accuracy:0.261
[1,  2500] loss: 1.450 Accuracy:0.246
[1,  2600] loss: 1.467 Accuracy:0.243
[1,  2700] l

In [None]:
# Alternative sentence to try:
#text = "all the focus is now on the biggest T20 league in the world "
# Sentence that gets classified further down; note that this rebinds `text`,
# which earlier held a list of demo sentences.
text = "The league uses a lot of technologies to trace the ball "

In [None]:
# Human-readable class names, indexed by the 0-based class id predicted by the model.
# NOTE(review): presumably ordered like the AG_NEWS labels 1..4 — confirm against the dataset docs.
classes = ['World','Sports','Business','Sci-Tech']

In [None]:
def get_input_ids_inf(text):
  """Numericalize a raw sentence for inference.

  Returns the token ids as a LongTensor with an extra leading axis of size 1,
  i.e. a batch that contains only this sentence.
  """
  token_ids = vocab_transform(tokenizer(text))
  single_sequence = torch.LongTensor(token_ids)
  return single_sequence.unsqueeze(0)

In [None]:
# Run the prediction on the CPU.
model_inference = model.to('cpu')
with torch.inference_mode(True):
  query_ids = get_input_ids_inf(text)
  query_lengths = [len(tokenizer(text))]   # the single sequence is unpadded
  logits = model_inference(query_ids, query_lengths)
  # The class scores live on the last axis (index 2) of the model output.
  scores = F.softmax(logits, dim=2)
  print(scores)
  best_class = torch.argmax(scores)
  print(classes[best_class])

tensor([[[0.3417, 0.1478, 0.2527, 0.2577]]])
World


In [None]:
# Random integer tensor of shape (4, 6, 3) with values in the pixel range 0..255.
# `high` is exclusive in torch.randint, so it must be 256 for 255 to be reachable.
img = torch.randint(low=0, high=256, size=(4, 6, 3))
print(img.shape)
print(img)

torch.Size([4, 6, 3])
tensor([[[  8, 155, 197],
         [ 60, 101, 240],
         [134,  35, 118],
         [ 49, 216, 148],
         [112, 194, 123],
         [191, 249, 202]],

        [[159, 195, 244],
         [ 43,  39, 208],
         [161,   2, 148],
         [196, 220, 132],
         [202,  31, 113],
         [ 23,  19, 204]],

        [[ 40, 242,  74],
         [ 70, 153, 123],
         [167, 138,  56],
         [170, 213, 168],
         [207,  64, 137],
         [ 55,  90, 130]],

        [[116, 240,  90],
         [ 22, 125, 235],
         [ 56, 125, 162],
         [191,  40, 156],
         [154,  92,  80],
         [104, 169, 247]]])


In [None]:
# Move the last axis to the front and display the resulting shape.
img.permute(2, 0, 1).shape

torch.Size([3, 4, 6])

In [None]:
# Sum over axis 1 (the axis of size 6), leaving a (4, 3) tensor.
img.sum(dim=1)

tensor([[ 554,  950, 1028],
        [ 784,  506, 1049],
        [ 709,  900,  688],
        [ 643,  791,  970]])