# LipReading using Convolution neural network and Bidirectional LSTM

In [1]:
!pip install gdown

Collecting gdown
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Installing collected packages: gdown
Successfully installed gdown-4.7.1
[0m

In [2]:
# importing important libraries
import gdown
import torch
from torch.utils.data import Dataset , DataLoader
import os
import glob
import cv2
import numpy as np
import torch
import torch.nn as nn
import torchvision.models as models
from tqdm import tqdm

In [3]:
# download and extracting all to data.zip
url = 'https://drive.google.com/uc?id=1YlvpDLix3S-U8fd-gqRwPcWXAXm8JwjL'
output = 'data.zip'
gdown.download(url, output, quiet=False)
gdown.extractall('data.zip')

Downloading...
From (uriginal): https://drive.google.com/uc?id=1YlvpDLix3S-U8fd-gqRwPcWXAXm8JwjL
From (redirected): https://drive.google.com/uc?id=1YlvpDLix3S-U8fd-gqRwPcWXAXm8JwjL&confirm=t&uuid=5e6198b1-cc89-4b42-8437-87841d1e34e1
To: /kaggle/working/data.zip
100%|██████████| 423M/423M [00:03<00:00, 117MB/s] 


['data/',
 'data/alignments/',
 'data/alignments/s1/',
 'data/alignments/s1/bbaf2n.align',
 'data/alignments/s1/bbaf3s.align',
 'data/alignments/s1/bbaf4p.align',
 'data/alignments/s1/bbaf5a.align',
 'data/alignments/s1/bbal6n.align',
 'data/alignments/s1/bbal7s.align',
 'data/alignments/s1/bbal8p.align',
 'data/alignments/s1/bbal9a.align',
 'data/alignments/s1/bbas1s.align',
 'data/alignments/s1/bbas2p.align',
 'data/alignments/s1/bbas3a.align',
 'data/alignments/s1/bbaszn.align',
 'data/alignments/s1/bbaz4n.align',
 'data/alignments/s1/bbaz5s.align',
 'data/alignments/s1/bbaz6p.align',
 'data/alignments/s1/bbaz7a.align',
 'data/alignments/s1/bbbf6n.align',
 'data/alignments/s1/bbbf7s.align',
 'data/alignments/s1/bbbf8p.align',
 'data/alignments/s1/bbbf9a.align',
 'data/alignments/s1/bbbm1s.align',
 'data/alignments/s1/bbbm2p.align',
 'data/alignments/s1/bbbm3a.align',
 'data/alignments/s1/bbbmzn.align',
 'data/alignments/s1/bbbs4n.align',
 'data/alignments/s1/bbbs5s.align',
 'data/al

**Now the two important functions get_stoi and itos (string to integer and integer to string) are defined. get_stoi function takes the alignments file as a string and returns the index of each element according to the vocabulary whereas the itos does the opposite.**

In [4]:
def get_stoi(file):
  vocabulary = "abcdefghijklmnopqrstuvwxyz1234567890!?' "
  f = open(file, "r")
  list = []
  vocab = []
  for line in f:
    line = line.strip()
    txt = line.split(" ")[2]
    if txt != "sil":
        list.append(txt)
        list.append(" ")
  for ls in list:
    for t in ls:
      if t in vocabulary:
          vocab.append(vocabulary.index(t))
  # The length of the sequence is 35
  if len(vocab) < 35:
    for i in range(35-len(vocab)):
      vocab.append(38)
  vocab = np.array(vocab)
  return vocab

def itos(vec):
  vocabulary = "abcdefghijklmnopqrstuvwxyz1234567890!?' "
  sentence = ""
  for elem in vec:
    sentence += vocabulary[elem]
    
  return sentence

In [5]:
# reading all the files
file = glob.glob("/kaggle/working/data/alignments/s1/*.align")

In [6]:
# Lets check if the functions defined above
sentence_indx = get_stoi(file[0])
sentence = itos(sentence_indx)
print(sentence_indx)
print(sentence)

[11  0 24 39  6 17  4  4 13 39  8 13 39 18 39 14 13  4 39  0  6  0  8 13
 39 38 38 38 38 38 38 38 38 38 38]
lay green in s one again ''''''''''


In [7]:
frames = []

path = "/kaggle/working/data/s1/bbaf5a.mpg"
cap = cv2.VideoCapture(path)
ret = True
while ret:
    ret, img = cap.read() # read one frame from the 'capture' object; img is (H, W, C)
    if ret:
        frames.append(img[190:236, 80:220])
mpg = np.stack(frames, axis=0)
frames = torch.from_numpy(mpg)

In [8]:
class CustomDataset(Dataset):
  def __init__(self , files):
    self.files = files 

  def __len__(self):
    return len(self.files)

  def __getitem__(self, idx):
    path = self.files[idx]
    vocab = get_stoi(path)
    mpgpath = "/working/data/s1/"
    mpgpath = mpgpath + path.split("/")[5].split(".")[0] + ".mpg"
    frames = []
    cap = cv2.VideoCapture(mpgpath)
    ret = True
    size = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    while ret:
      ret, img = cap.read() # read one frame from the 'capture' object; img is (H, W, C)
      if ret:
         img = cv2.cvtColor(img , cv2.COLOR_BGR2GRAY)
         img = np.reshape(img , ( img.shape[0] , img.shape[1] , 1 ))
         frames.append(img[190:236, 80:220, :])

    for i in range(75-int(size)):
        frames.append(np.zeros((46, 140, 1)))
    mpg = np.stack(frames, axis=0)
    frames = torch.from_numpy(mpg)
    frames = torch.permute(frames , (3 , 0 , 1 , 2))
    return frames , vocab

In [9]:
train = CustomDataset(file[:900])
val = CustomDataset(file[900:])

In [10]:
train_data = DataLoader(train, batch_size=16, shuffle = True )
val_data = DataLoader(val, batch_size = 16, shuffle = True)

In [11]:
class LSTM(nn.Module):
  def __init__(self, vocab_size, embed_size, hidden_size):
    super(LSTM, self).__init__()
    self.vocab_size = vocab_size
    self.embed_size = embed_size
    self.hidden_size = hidden_size
    self.embed = nn.Embedding(vocab_size , embed_size)
    self.model = nn.LSTM(self.embed_size, self.hidden_size, batch_first=True, bidirectional = True)
    self.linear = nn.Linear(hidden_size , vocab_size)
    self.dropout = nn.Dropout(0.5)
    self.conv1 = self._conv_layer_set(1 , 32)
    self.conv2 = self._conv_layer_set(32, 64)
    self.linear = nn.Sequential(
                       nn.Linear(25088, 1024),
                       nn.ReLU(),
                       nn.Linear(1024, 256),
                       nn.ReLU(),
                       nn.Linear(256, 128)
                              )
    self.ln = nn.Linear(self.hidden_size*2, self.vocab_size)
  def _conv_layer_set(self, in_c, out_c):
        conv_layer = nn.Sequential(
        nn.Conv3d(in_c, out_c, kernel_size=(3, 3, 3), padding=0),
        nn.LeakyReLU(),
        nn.MaxPool3d((3, 3, 3)),
        )
        return conv_layer
    
  def forward(self , x, sentence):
    x = self.conv1(x)
    x = self.conv2(x)
    x = x.view(x.shape[0] , -1)
    features = self.linear(x)
    embedding = self.embed(sentence)
    embedding = torch.cat((features.unsqueeze(1), embedding) , dim = 1 )
    embedding = self.dropout(embedding)
    embedding = embedding.cuda()
    out , _ = self.model(embedding)
    lipRead = self.ln(out.squeeze(1))
    
    return lipRead

In [12]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [13]:
model = LSTM(40 , 128 , 10).to(device)

In [15]:
import torch.optim as optim
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters() , 0.001)

In [16]:
for j in range(20):
  print(j , "EPOCHS")
  for m , (x , y) in enumerate(tqdm(train_data)):
    x = x.type(torch.cuda.FloatTensor)
    x = x.to(device)
    y = np.array(y)
    y = torch.from_numpy(y)
    y = y.to(device)

    sen = model(x , y)
    sentence = sen[: , 1: , :] #the model returns tensor shape (36 , 40)
    word = [ ]
    words= [ ]
    for i in range(y.shape[0]):
      for n in range(35):
         max = torch.argmax(sentence[i][n])
         word.append(max.cpu().detach().numpy())
      words.append(word)
      word = []
    words = np.stack(words , axis=0)
     
       
    
#   print(words.shape)
    word = itos(words[0])
    
    words = torch.from_numpy(words)
    words = words.to(device)
    loss = criterion(sentence.reshape(-1 , sentence.shape[2]) , y.reshape(-1)) 

    optimizer.zero_grad()
    loss.backward(loss)
    optimizer.step()
  print(word)
  print(itos(y[0]))

0 EPOCHS


100%|██████████| 57/57 [00:26<00:00,  2.17it/s]


ean b ne        nine   o  '''''''''
lay blue with k nine soon '''''''''
1 EPOCHS


100%|██████████| 57/57 [00:20<00:00,  2.84it/s]


 e    ee   it    ae o  eeaee ''''''
set green with q zero please ''''''
2 EPOCHS


100%|██████████| 57/57 [00:19<00:00,  2.91it/s]


tet  reen bb   teeen ooon '''''''''
set green by c seven soon '''''''''
3 EPOCHS


100%|██████████| 57/57 [00:19<00:00,  2.94it/s]


bin re   itl a tlree toon '''''''''
bin red with a three soon '''''''''
4 EPOCHS


100%|██████████| 57/57 [00:18<00:00,  3.02it/s]


bin white in t eero slease ''''''''
bin white in t zero please ''''''''
5 EPOCHS


100%|██████████| 57/57 [00:18<00:00,  3.01it/s]


plaee blue in n aero please '''''''
place blue in v zero please '''''''
6 EPOCHS


100%|██████████| 57/57 [00:19<00:00,  2.95it/s]


bin ihite at n one again ''''''''''
bin white at n one again ''''''''''
7 EPOCHS


100%|██████████| 57/57 [00:20<00:00,  2.83it/s]


lay red with l sii please '''''''''
lay red with l six please '''''''''
8 EPOCHS


100%|██████████| 57/57 [00:18<00:00,  3.07it/s]


plaae green at   three soon '''''''
place green at k three soon '''''''
9 EPOCHS


100%|██████████| 57/57 [00:19<00:00,  2.97it/s]


lay blue at   nine soon '''''''''''
lay blue at x nine soon '''''''''''
10 EPOCHS


100%|██████████| 57/57 [00:17<00:00,  3.19it/s]


lay white by l sii now ''''''''''''
lay white by l six now ''''''''''''
11 EPOCHS


100%|██████████| 57/57 [00:17<00:00,  3.21it/s]


bin green in a two now ''''''''''''
bin green in a two now ''''''''''''
12 EPOCHS


100%|██████████| 57/57 [00:19<00:00,  3.00it/s]


set red at b eero now '''''''''''''
set red at b zero now '''''''''''''
13 EPOCHS


100%|██████████| 57/57 [00:21<00:00,  2.67it/s]


lay blue sp with e sii please '''''
lay blue sp with e six please '''''
14 EPOCHS


100%|██████████| 57/57 [00:19<00:00,  2.92it/s]


lay red by e senen soon '''''''''''
lay red by e seven soon '''''''''''
15 EPOCHS


100%|██████████| 57/57 [00:19<00:00,  3.00it/s]


set green by c seven soon '''''''''
set green by c seven soon '''''''''
16 EPOCHS


100%|██████████| 57/57 [00:20<00:00,  2.76it/s]


set white at h aero please ''''''''
set white at v zero please ''''''''
17 EPOCHS


100%|██████████| 57/57 [00:19<00:00,  2.86it/s]


place red with i nine soon ''''''''
place red with j nine soon ''''''''
18 EPOCHS


100%|██████████| 57/57 [00:18<00:00,  3.09it/s]


place white at e seven again ''''''
place white at x seven again ''''''
19 EPOCHS


100%|██████████| 57/57 [00:19<00:00,  2.95it/s]


set red by u eight please '''''''''
set red by u eight please '''''''''
