# Importing dependencies

In [None]:
import tensorflow as tf
import os
import cv2
import numpy as np
from matplotlib import pyplot as plt
import imageio
import gdown # to download data from google drive
from google.colab import drive
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.callbacks import LearningRateScheduler, ModelCheckpoint, Callback
from tensorflow.data import Dataset, AUTOTUNE
from tensorflow.keras import backend

In [None]:
# Ask TensorFlow to allocate GPU memory on demand on every visible GPU instead
# of reserving all of it up front. With no GPU present the loop is a no-op.
device = tf.config.list_physical_devices("GPU")
for gpu in device:
  try:
    tf.config.experimental.set_memory_growth(gpu, True)
  except (RuntimeError, ValueError) as err:
    # Memory growth has to be configured before the device is initialised;
    # report the problem rather than silently ignoring it.
    print(f"Could not enable memory growth for {gpu}: {err}")

In [None]:
# Mount Google Drive into the Colab filesystem; the mouth cascade file used
# below is read from a path under /content/drive.
drive.mount("/content/drive")

Mounted at /content/drive


# Data loading

In [None]:
# Fetch the dataset archive once and unpack it once. The two steps are guarded
# separately so that an archive which is already present but was never
# extracted (e.g. after an interrupted run) still gets unpacked.
url = "https://drive.google.com/uc?id=1YlvpDLix3S-U8fd-gqRwPcWXAXm8JwjL"
output = "data.zip"
if not os.path.exists(output):
  gdown.download(url, output)
# The later cells read from the "data" directory next to the archive.
if not os.path.isdir("data"):
  gdown.extractall(output)

Downloading...
From: https://drive.google.com/uc?id=1YlvpDLix3S-U8fd-gqRwPcWXAXm8JwjL
To: /content/data.zip
100%|██████████| 423M/423M [00:14<00:00, 28.8MB/s]


## preprocessing video

In [None]:
# Haar cascade used to locate the mouth in the first frame of each video.
cascade_path = '/content/drive/MyDrive/lip-reader/mouth-cascade.xml'
mouth_cascade = cv2.CascadeClassifier(cascade_path)
# CascadeClassifier does not raise when the file is missing or unreadable; it
# just stays empty. Surface that here so the problem is visible immediately.
if mouth_cascade.empty():
  print(f"WARNING: could not load a cascade from {cascade_path}; "
        "mouth detection will not work.")

In [None]:
def extract_mouth_loc(img, margin=10):
  """Locate the mouth in `img` and return a padded crop window.

  Args:
    img: tensor exposing `.numpy()`; its array is passed to the module-level
      `mouth_cascade` detector.
    margin: number of pixels added on every side of the first detection.

  Returns:
    Tuple `(y_start, y_end, x_start, x_end)` of Python ints. The start values
    are clamped to 0 so that slicing with them never wraps around to the
    opposite edge of the image.

  Raises:
    IndexError: if the detector returns no candidates (same exception type the
      original indexing raised, now with an explicit message).
  """
  lips = mouth_cascade.detectMultiScale(img.numpy(), 1.1, 50)
  if len(lips) == 0:
    raise IndexError("no mouth detected in frame")
  x, y, w, h = (int(v) for v in lips[0])
  # A detection closer than `margin` to the top/left border would otherwise
  # yield a negative start index, which slicing interprets as "from the end".
  return max(y - margin, 0), y + h + margin, max(x - margin, 0), x + w + margin

In [None]:
def preprocess_frame(frame, x_range, y_range):
  """Crop `frame` to a window, cast it to float32 and resize it to 60x90.

  Args:
    frame: image indexed as `frame[rows, cols]`.
    x_range: `(start, end)` column bounds of the crop.
    y_range: `(start, end)` row bounds of the crop.

  Returns:
    The cropped frame as float32, resized to height 60 and width 90.
  """
  col_start, col_stop = x_range
  row_start, row_stop = y_range
  window = frame[row_start: row_stop, col_start: col_stop]
  return tf.image.resize(tf.cast(window, "float32"), [60, 90])

In [None]:
def _read_gray_frame(cap):
  """Read the next frame from `cap` as a grayscale tensor, or None if the read fails."""
  ok, frame = cap.read()
  if not ok or frame is None:
    return None
  # OpenCV decodes frames in BGR channel order; convert to RGB so that
  # tf.image.rgb_to_grayscale weights the channels it expects.
  frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
  return tf.image.rgb_to_grayscale(frame)


def load_video(path):
  """Load a video as a normalised stack of cropped grayscale mouth frames.

  The crop window is determined once from the first frame via
  `extract_mouth_loc`; if that fails for any reason a fixed fallback window
  is used. Every frame is cropped and resized with `preprocess_frame`, then
  the whole clip is standardised with its own mean and standard deviation.

  Args:
    path: path of the video file, passed to `cv2.VideoCapture`.

  Returns:
    float32 tensor of shape (num_frames, 60, 90, 1).

  Raises:
    IOError: if not even the first frame can be read.
  """
  cap = cv2.VideoCapture(path)
  try:
    first = _read_gray_frame(cap)
    if first is None:
      raise IOError(f"could not read any frame from {path}")

    try:
      ys, ye, xs, xe = extract_mouth_loc(first)
    except Exception:
      # Detection failed: fall back to a fixed crop window.
      ys, ye, xs, xe = 190, 250, 80, 170

    frames = [preprocess_frame(first, (xs, xe), (ys, ye))]
    for _ in range(1, int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
      gray = _read_gray_frame(cap)
      if gray is None:
        # The reported frame count can exceed what is actually decodable;
        # stop cleanly instead of failing on a missing frame.
        break
      frames.append(preprocess_frame(gray, (xs, xe), (ys, ye)))
  finally:
    # Always free the capture handle, including when a read raises.
    cap.release()

  frames = tf.stack(frames)
  mu = tf.cast(tf.math.reduce_mean(frames), tf.float32)
  sigma = tf.cast(tf.math.reduce_std(frames), tf.float32)
  frame_normalized = tf.cast((frames - mu), tf.float32)/sigma
  return frame_normalized

## preprocessing alignments

In [None]:
# Every character that may appear in a transcript: lowercase letters, four
# punctuation marks, digits and the space. `vocab` holds them one per entry,
# in the same order.
possible_chars = "abcdefghijklmnopqrstuvwxyz?!,.0123456789 "
vocab = [char for char in possible_chars]

In [None]:
# Forward lookup: maps single characters to integer ids, with "_" as the
# out-of-vocabulary token.
txt2vec = layers.StringLookup(vocabulary=vocab,
                             oov_token="_")
# Inverse lookup built from the forward layer's vocabulary, so that ids
# produced by `txt2vec` map back to the same characters.
vec2txt = layers.StringLookup(vocabulary=txt2vec.get_vocabulary(),
                              oov_token="_",
                              invert=True)

In [None]:
def load_alignments(path):
  """Read an alignment file and encode its transcript as character ids.

  Each line is split on whitespace and its third field is taken as the
  token. Tokens equal to "sil" are dropped, the rest are joined with single
  spaces, and the resulting sentence is split into characters and mapped
  through `txt2vec`.

  Args:
    path: path of the alignment text file.

  Returns:
    1-D tensor of character ids as produced by `txt2vec`.
  """
  words = []
  with open(path, 'r') as alignment_file:
    for line in alignment_file:
      fields = line.split()
      # Blank or truncated lines have no third field; skip them instead of
      # failing with an IndexError.
      if len(fields) < 3 or fields[2] == "sil":
        continue
      words.append(fields[2])
  # Splitting one scalar string yields a flat tensor of characters directly,
  # so no reshape and no stripping of a leading separator is needed.
  chars = tf.strings.unicode_split(" ".join(words),
                                   input_encoding="UTF-8")
  return txt2vec(chars)

## Combining them together

In [None]:
def load_data(path):
  """Load the video frames and the encoded transcript for one sample.

  Only the file stem of `path` is used; the video and alignment locations
  are rebuilt from it under the fixed dataset directories.

  Args:
    path: a `str`, `bytes`, or a string tensor exposing `.numpy()` (the form
      in which `tf.py_function` hands the path over).

  Returns:
    Tuple `(frames, aligns)` from `load_video` and `load_alignments`.
  """
  # Unwrap tensors/bytes explicitly rather than parsing the tensor's repr.
  if hasattr(path, "numpy"):
    path = path.numpy()
  if isinstance(path, bytes):
    path = path.decode("utf-8")

  filename = os.path.basename(str(path)).split(".")[0]
  video_path = f"/content/data/s1/{filename}.mpg"
  align_path = f"/content/data/alignments/s1/{filename}.align"

  frames = load_video(video_path)
  aligns = load_alignments(align_path)

  return frames, aligns

In [None]:
# Smoke test: load a single sample eagerly to check that the video and
# alignment loaders work together.
test_path = "/content/data/s1/bbal7s.mpg"
frames, aligns = load_data(test_path)

In [None]:
# `load_data` relies on plain-Python string handling, file reads and OpenCV,
# so it is wrapped in tf.py_function to make it usable from Dataset.map.
def mappable_dataload(path):
  """Run `load_data` on `path` via tf.py_function.

  Returns:
    A `(frames, aligns)` pair declared as float32 and int64 respectively.
  """
  output_dtypes = (tf.float32, tf.int64)
  return tf.py_function(load_data, [path], output_dtypes)

## Creating data pipeline

In [None]:
# Input pipeline: list the videos, shuffle with a buffer of 500, load each
# sample, batch in pairs (frames padded to 75 along the first axis, labels
# padded to 40 ids) and prefetch.
data = (
    Dataset.list_files("/content/data/s1/*.mpg")
    .shuffle(500)
    .map(mappable_dataload)
    .padded_batch(2, padded_shapes=([75, None, None, None], [40]))
    .prefetch(AUTOTUNE)
)

In [None]:
# Pull one batch out of the pipeline to inspect it.
frame, alignment = next(data.as_numpy_iterator())

In [None]:
# Shape of one label batch: (batch size, padded label length).
alignment.shape

(2, 40)

# Creating the model

In [None]:
# Fix TensorFlow's global random seed before the model is built.
tf.random.set_seed(42)

In [None]:
# Network: three Conv3D + spatial max-pool stages, a per-time-step flatten,
# two bidirectional LSTMs with dropout, and a per-time-step softmax.
model = models.Sequential([
    # Input: 75 frames of 60x90 single-channel images.
    layers.Conv3D(128, 3,
                  input_shape=(75, 60, 90, 1),
                  padding="same",
                  activation="relu"),
    # Pool size 1 on the first axis keeps all 75 time steps; only height
    # and width are halved.
    layers.MaxPool3D((1, 2, 2)),

    layers.Conv3D(256, 3,
                  padding="same",
                  activation="relu"),
    layers.MaxPool3D((1, 2, 2)),

    layers.Conv3D(75, 3,
                  padding="same",
                  activation="relu"),
    layers.MaxPool3D((1, 2, 2)),

    # Flatten each time step's feature map into one vector.
    layers.TimeDistributed(layers.Flatten()),

    # return_sequences=True keeps one output per time step.
    layers.Bidirectional(layers.LSTM(128,
                                     kernel_initializer="Orthogonal",
                                     return_sequences=True)),
    layers.Dropout(0.5),

    layers.Bidirectional(layers.LSTM(128,
                                     kernel_initializer="Orthogonal",
                                     return_sequences=True)),
    layers.Dropout(0.5),

    # One class per vocabulary entry plus one extra class
    # (presumably the CTC blank - TODO confirm).
    layers.Dense(txt2vec.vocabulary_size()+1,
                 kernel_initializer="he_normal",
                 activation="softmax")
])

In [None]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv3d (Conv3D)             (None, 75, 60, 90, 128)   3584      
                                                                 
 max_pooling3d (MaxPooling3  (None, 75, 30, 45, 128)   0         
 D)                                                              
                                                                 
 conv3d_1 (Conv3D)           (None, 75, 30, 45, 256)   884992    
                                                                 
 max_pooling3d_1 (MaxPoolin  (None, 75, 15, 22, 256)   0         
 g3D)                                                            
                                                                 
 conv3d_2 (Conv3D)           (None, 75, 15, 22, 75)    518475    
                                                                 
 max_pooling3d_2 (MaxPoolin  (None, 75, 7, 11, 75)     0

## setting up training and prediction options

The following are set up in this section:
1. a custom learning rate scheduler
2. CTC loss
3. A custom callback that prints a random original caption and a predicted one to assess the performance of the model.

In [None]:
def decode_string(y):
  """Turn a sequence of character ids into a single joined byte string.

  The ids are mapped back to characters with `vec2txt` and concatenated.
  """
  characters = vec2txt(y).numpy()
  return tf.strings.reduce_join(characters).numpy()

In [None]:
def scheduler(epoch, lr):
    """Learning-rate schedule for `LearningRateScheduler`.

    Args:
        epoch: zero-based epoch index.
        lr: current learning rate.

    Returns:
        A Python float: `lr` unchanged for the first 30 epochs, afterwards
        `lr` multiplied by exp(-0.1) on every call.
    """
    # Local import so the cell does not depend on an extra notebook-level import.
    import math

    if epoch < 30:
        return float(lr)
    # math.exp keeps the result a plain float; the previous tf.math.exp turned
    # the return value into a Tensor.
    return float(lr) * math.exp(-0.1)

In [None]:
def CTCLoss(y_true, y_pred):
    """CTC loss for a batch of padded label sequences.

    Args:
        y_true: (batch, padded_label_len) integer ids, right-padded with 0.
        y_pred: (batch, time_steps, classes) per-step class probabilities.

    Returns:
        The per-sample loss from `tf.keras.backend.ctc_batch_cost`.
    """
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_len = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    # Every sample uses all prediction time steps.
    input_len = input_len * tf.ones(shape=(batch_len, 1),
                                    dtype="int64")

    # Per-sample label length = position of the last non-zero id. Passing
    # the padded width instead would make the zero padding part of the
    # target, training the model to emit padding.
    # NOTE(review): assumes id 0 never occurs inside a real transcript - confirm.
    positions = tf.cast(tf.range(1, tf.shape(y_true)[1] + 1), "int64")
    is_label = tf.cast(tf.not_equal(y_true, 0), "int64")
    label_len = tf.reduce_max(is_label * positions, axis=1, keepdims=True)

    return tf.keras.backend.ctc_batch_cost(y_true, y_pred,
                                           input_len, label_len)

In [None]:
# Scratch check of the (batch, 1) length tensors built the same way as in
# CTCLoss, using batch size 2, 75 time steps and a label length of 42.
# NOTE(review): 42 differs from the label padding of 40 used in
# padded_batch - confirm which value is intended.
batch_len = tf.cast(2, dtype="int64")
input_len = tf.cast(75, dtype="int64")
label_len = tf.cast(42, dtype="int64")

input_len = input_len * tf.ones(shape=(batch_len, 1), dtype="int64")
label_len = label_len * tf.ones(shape=(batch_len, 1), dtype="int64")
input_len, label_len

(<tf.Tensor: shape=(2, 1), dtype=int64, numpy=
 array([[75],
        [75]])>,
 <tf.Tensor: shape=(2, 1), dtype=int64, numpy=
 array([[42],
        [42]])>)

In [None]:
class RandomExample(Callback):
  """Callback that prints true vs. predicted transcripts after every epoch.

  At the end of each epoch one batch is drawn from `dataset`, run through
  the model, decoded with CTC beam search, and printed next to the original
  labels.
  """

  def __init__(self, dataset):
    # Let the Keras base class initialise its own attributes.
    super().__init__()
    # Keep the dataset itself so a fresh iterator can be made when the
    # current one runs out.
    self._source = dataset
    self.dataset = dataset.as_numpy_iterator()

  def on_epoch_end(self, epoch, logs=None):
    try:
      data = next(self.dataset)
    except StopIteration:
      self.dataset = self._source.as_numpy_iterator()
      data = next(self.dataset)
    yhat = self.model.predict(data[0])
    # One input length per sample, taken from the prediction itself, so the
    # callback also works for batch sizes other than 2.
    input_lengths = [yhat.shape[1]] * len(yhat)
    decoded = backend.ctc_decode(yhat, input_lengths,
                                 greedy=False)[0][0].numpy()
    for i in range(len(yhat)):
      print(f"""
      Original: {decode_string(data[1][i])}
      Prdicted: {decode_string(decoded[i])}""")
      print("~"*250)

In [None]:
# Compile with Adam at a learning rate of 1e-4 and the custom CTC loss.
adam = optimizers.Adam(learning_rate=1e-4)
model.compile(optimizer=adam, loss=CTCLoss)

In [None]:
# Callbacks handed to model.fit: learning-rate schedule, per-epoch example
# printer, and a weights-only checkpoint.
lr_sched = LearningRateScheduler(scheduler)
example = RandomExample(data)
train_checkpoint = ModelCheckpoint(
    filepath="/content/checkpoints",
    monitor="loss",
    save_weights_only=True,
)

In [None]:
# Grab one batch to sanity-check the untrained model's predictions.
val = next(data.as_numpy_iterator())

In [None]:
# Run the model on the frames (first element) of the sampled batch.
pred = model.predict(val[0])



In [None]:
# Most likely class at every time step, decoded character by character.
# This is a raw per-step readout: repeated characters are not merged here.
output_string = tf.argmax(pred, axis=2)
for s in output_string:
  print(decode_string(s))

b',,,,,,,,,,,,,,,,,,,,,,,9999999999999999999,,,99999999999999999999999aaa3...'
b',,,,,99999999999999999999999999999999999999999999999999999999999999999999..'


In [None]:
# Train for up to 100 epochs with the LR schedule, example printer and
# checkpoint callbacks.
# NOTE(review): the recorded run below stops with `UnknownError`; presumably
# raised inside the py_function data loader - confirm.
model.fit(data,
          epochs=100,
          callbacks=[lr_sched, example, train_checkpoint])

Epoch 1/100

      Original: b'bin red in f five soon__________________'
      Prdicted: b'le e e e no________________________________________________________________'
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

      Original: b'set blue in t two now___________________'
      Prdicted: b'le e e e e o_______________________________________________________________'
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 2/100

      Original: b'bin white with n nine again_____________'
      Prdicted: b'la e e e eo________________________________________________________________'
~~~~~~

UnknownError: ignored