In [None]:
import os
import glob

import torch
import torchvision
import cv2
from torchvision import models
import torch.nn as nn
from torch.autograd import Variable

from google.colab import drive
from google.colab.patches import cv2_imshow

In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Work from the project folder on the mounted Drive; the relative 'data/...' paths
# used further down in the notebook are resolved against this directory.
os.chdir('/content/gdrive/My Drive/lipreading')

In [None]:
# Let's investigate the pre-train split, where we have word annotations with timestamps,
# which will probably be necessary for training the model accurately.

# glob returns matches in arbitrary filesystem order; sorting makes positional lookups
# such as pretrain_paths[0] point at the same clip on every run.
pretrain_paths = sorted(glob.glob('data/pretrain/*/*.mp4'))

# Swap only the file extension. str.replace would also rewrite a '.mp4' that happens to
# appear earlier in the path (for example inside a directory name).
pretrain_transcripts = [os.path.splitext(video_path)[0] + '.txt' for video_path in pretrain_paths]

In [None]:
def get_frames(video_path, desired_frames=None):
  """Decode frames from a video file with OpenCV.

  Prints the FPS, frame count and duration reported by the container before decoding.

  Args:
    video_path: Path handed to cv2.VideoCapture.
    desired_frames: Number of leading frames to read. When falsy (None or 0) every
      frame reported by the container is read.

  Returns:
    A list of decoded frames in file order. The list is empty when more frames are
    requested than the video reports, and can be shorter than requested when
    decoding stops early.
  """
  video_cap = cv2.VideoCapture(video_path)

  # try/finally guarantees the capture handle is released on every exit path,
  # including the early return for an over-sized request.
  try:
    # Named property ids instead of the bare 5 / 7.
    fps_exact = video_cap.get(cv2.CAP_PROP_FPS)
    fps = int(fps_exact)
    frame_count = int(video_cap.get(cv2.CAP_PROP_FRAME_COUNT))

    if desired_frames:
      if desired_frames > frame_count:
        print(f'Requested {desired_frames} frames, the video only contains {frame_count} frames')
        return []

    print('FPS: ', fps)
    print('Frames: ', frame_count)
    if fps_exact > 0:
      # Use the un-truncated rate so fractional frame rates give the right duration.
      print(f'Duration: {(frame_count/fps_exact)}s')
    else:
      # A capture that could not be opened reports 0 FPS; avoid dividing by zero.
      print('Duration: unknown (video reports 0 FPS)')

    frame_limit = desired_frames if desired_frames else frame_count

    frames = []
    for _ in range(int(frame_limit)):
      ret, frame = video_cap.read()
      if not ret:
        # The decoder ran out of frames (or failed) before the reported count.
        print('Error occured')
        break
      frames.append(frame)

    return frames
  finally:
    video_cap.release()

In [None]:
# Show the first video path next to its derived transcript path to check they pair up.
pretrain_paths[0], pretrain_transcripts[0]

('data/pretrain/ZzSVEj5RLWM/00005.mp4', 'data/pretrain/ZzSVEj5RLWM/00005.txt')

In [None]:
# Decode every frame of the first clip; get_frames also prints its FPS, frame count and duration.
frames = get_frames(pretrain_paths[0])

FPS:  25
Frames:  692
Duration: 27.68s


In [None]:
# This will let us know what the lowest number of frames is in the videos and the location in the array
def findLowestNumFrames(paths=None):
  """Print the smallest decoded frame count among the videos and its index.

  Args:
    paths: Video paths to scan. Defaults to the notebook-level pretrain_paths list.

  Returns:
    None. The results are printed.
  """
  if paths is None:
    paths = pretrain_paths

  # Keep only the count per video; the decoded frames are not retained.
  numFrames = [len(get_frames(path)) for path in paths]

  if not numFrames:
    # min() raises ValueError on an empty list, e.g. when the glob matched nothing.
    print("No videos found, cannot determine the lowest number of frames")
    return

  lowest = min(numFrames)
  print("Lowest Number of Frames in All Videos: " + str(lowest))
  print("Index of the Lowest Number of Frames in All Videos: " + str(numFrames.index(lowest)))

# Scan every pre-train video; this decodes each clip in full, so it is slow on large folders.
findLowestNumFrames() 

FPS:  25
Frames:  692
Duration: 27.68s
FPS:  25
Frames:  1117
Duration: 44.68s
FPS:  25
Frames:  1184
Duration: 47.36s
FPS:  25
Frames:  221
Duration: 8.84s
FPS:  25
Frames:  204
Duration: 8.16s
Lowest Number of Frames in All Videos: 204
Index of the Lowest Number of Frames in All Videos: 4


In [None]:
# Load the transcript that belongs to the first video and print it in full.
with open(pretrain_transcripts[0]) as transcript_file:
  s = transcript_file.read()

print(s)

Text:  OUTFIT I'M PASSIONATE ABOUT CHANGING THE FASHION INDUSTRY ABOUT PROVIDING SUSTAINABLE CLOTHING OPTIONS AND SUPPORTING DESIGNERS WHO ARE DOING THE RIGHT THING EVERYTHING YOU ARE WEARING TODAY HAS A STORY WITH CONTRIBUTORS AND CHANGE COMES FROM PEOPLE LIKE YOU THE EVERYDAY CONSUMERS OF CLOTHING AND TOGETHER WE CAN CHANGE THE FASHION INDUSTRY 
Conf:  4

WORD START END ASDSCORE
OUTFIT 0.08 0.58 4.5
I'M 4.05 4.38 5.0
PASSIONATE 4.38 4.86 3.8
ABOUT 4.86 5.20 2.7
CHANGING 5.20 5.65 2.6
THE 5.65 5.78 1.8
FASHION 5.78 6.14 4.4
INDUSTRY 6.14 6.69 0.5
ABOUT 7.24 7.54 2.2
PROVIDING 7.54 8.10 4.9
SUSTAINABLE 8.10 8.71 5.3
CLOTHING 8.71 9.04 7.0
OPTIONS 9.04 9.55 3.9
AND 9.93 10.04 13.0
SUPPORTING 10.04 10.57 8.4
DESIGNERS 10.57 11.08 4.1
WHO 11.08 11.28 3.6
ARE 11.28 11.35 2.9
DOING 11.35 11.79 2.7
THE 11.79 11.90 3.0
RIGHT 11.90 12.22 6.1
THING 12.25 12.56 2.1
EVERYTHING 13.59 14.15 2.6
YOU 14.15 14.23 3.5
ARE 14.23 14.36 6.7
WEARING 14.36 14.67 4.9
TODAY 14.67 15.38 3.9
HAS 15.78 16.00 8.0

In [None]:
# Inspect the array shape of the first decoded frame
# (presumably (height, width, channels) for an OpenCV frame — confirm).
frames[0].shape

In [None]:
# Shape parameters for a synthetic batch of short clips.
example_length_seconds = 1
fps = 25
batch_size = 2
width, height = 224, 224
channels = 3


# Layout is (batch, time, channels, height, width): the model cells unpack x.shape as
# (..., c, h, w), so height goes before width.
data_shape = (batch_size, fps*example_length_seconds, channels, height, width)

# torch.randint's upper bound is exclusive, so 256 is needed to cover the 0..255 pixel range.
# The deprecated Variable wrapper is dropped; plain tensors carry autograd state themselves.
fake_data = torch.randint(0, 256, data_shape).float()
fake_data.shape

torch.Size([2, 25, 3, 224, 224])

In [None]:
# Load torchvision's ResNet-50 with pretrained weights.
# NOTE(review): `resnet` is not referenced by any other cell visible in this notebook — confirm it is still needed.
resnet = models.resnet50(pretrained=True)

In [None]:
class LREncoder(nn.Module):
  """Per-frame CNN embedding followed by a GRU over the time axis.

  Args:
    params: dict with the keys
      "dr_rate": dropout probability applied to the frame embeddings in forward(),
      "hidden_size": hidden size of the GRU,
      "rnn_num_layers": number of stacked GRU layers,
      "embedding_dim": size of the per-frame embedding fed to the GRU.
  """

  def __init__(self, params):
    super(LREncoder,self).__init__()

    dr_rate = params["dr_rate"]
    hidden_size = params["hidden_size"]
    rnn_num_layers = params["rnn_num_layers"]
    embedding_dim = params["embedding_dim"]

    # NOTE: torchvision >= 0.13 warns that `pretrained` is deprecated in favour of `weights`.
    basemodel = models.alexnet(pretrained=True)
    # AlexNet has no `fc` attribute: its head is the last layer of `classifier`.
    # Assigning `basemodel.fc` only registered an unused layer, so embedding_dim was
    # silently ignored and the network kept emitting its 1000 pretrained outputs.
    # NOTE(review): the old in_features=2048 matches ResNet-50's head — confirm the backbone.
    head_index = len(basemodel.classifier) - 1
    head_in_features = basemodel.classifier[head_index].in_features
    basemodel.classifier[head_index] = nn.Linear(
        in_features=head_in_features, out_features=embedding_dim, bias=True)
    self.basemodel = basemodel

    self.dropout = nn.Dropout(dr_rate)
    # Alternative considered: nn.Transformer(d_model=embedding_dim)
    # rnn_num_layers was read but never passed on; wire it into the GRU.
    self.encoder = nn.GRU(input_size=embedding_dim, hidden_size=hidden_size,
                          num_layers=rnn_num_layers, batch_first=True)

  def features(self, x):
    """Embed every frame with the CNN.

    Args:
      x: tensor unpacked as (batch, time, channels, height, width).

    Returns:
      Tensor of shape (batch, time, embedding_dim).
    """
    bs, ts, c, h, w = x.shape
    # One CNN pass per time step, then stack along a new time axis (dim 1).
    per_frame = [self.basemodel(x[:, step]) for step in range(ts)]
    return torch.stack(per_frame, 1)

  def forward(self, x):
    """Encode a batch of clips.

    Returns:
      The nn.GRU result unchanged: a tuple (output, h_n) where output is
      (batch, time, hidden_size) and h_n is (num_layers, batch, hidden_size).
    """
    # dr_rate was configured but the dropout layer was never applied; it is a no-op in eval mode.
    ys = self.dropout(self.features(x))
    return self.encoder(ys)


In [None]:
# Hyper-parameters read by LREncoder.__init__.
model_params = {
  'dr_rate': 0.1,
  'hidden_size': 32,
  'rnn_num_layers': 1,
  'embedding_dim': 1000,
}

model = LREncoder(model_params)

  f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "


In [None]:
# Run the two encoder stages separately and report the tensor shape after each one.
print(fake_data.shape)
fake_ys = model.features(fake_data)
print(fake_ys.shape)
fake_y = model.encoder(fake_ys)
# The encoder hands back several tensors; print one shape per line.
print(*(item.shape for item in fake_y), sep='\n')

torch.Size([2, 25, 3, 224, 224])
torch.Size([2, 25, 1000])
torch.Size([2, 25, 32])
torch.Size([1, 2, 32])


In [None]:
# Stand-alone nn.GRU shape check: input_size=10, hidden_size=20, num_layers=2,
# default sequence-first layout (seq_len, batch, features).
rnn = nn.GRU(10, 20, 2)
# Named gru_input rather than `input` so the built-in input() is not shadowed
# for the rest of the kernel session.
gru_input = torch.randn(5, 3, 10)
h0 = torch.randn(2, 3, 20)
output, hn = rnn(gru_input, h0)
output.shape, hn.shape

(torch.Size([5, 3, 20]), torch.Size([2, 3, 20]))

In [None]:
# LREncoder.forward returns the nn.GRU result, an (output, final_hidden) pair, so it has to be
# unpacked before asking for a shape; calling .shape on the pair itself raises AttributeError.
# fake_data is already a float tensor, so the deprecated Variable wrapper and .float() are dropped.
fake_y, fake_hidden = model(fake_data)
fake_data.shape, fake_y.shape

(torch.Size([2, 25, 3, 224, 224]), torch.Size([2, 24, 1000]))

In [None]:
class Identity(nn.Module):
    """Pass-through module: forward hands its argument back unchanged."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # No computation at all; the very same object is returned.
        return x

class Resnt18Rnn(nn.Module):
    """ResNet-18 frame encoder followed by an LSTM and a linear classifier.

    Args:
        params_model: dict with the keys
            "num_classes": size of the output layer,
            "dr_rate": dropout probability applied before the classifier,
            "pretrained": forwarded to models.resnet18(pretrained=...),
            "rnn_hidden_size": LSTM hidden size,
            "rnn_num_layers": number of stacked LSTM layers.
    """

    def __init__(self, params_model):
        super(Resnt18Rnn, self).__init__()
        num_classes = params_model["num_classes"]
        dr_rate = params_model["dr_rate"]
        pretrained = params_model["pretrained"]
        rnn_hidden_size = params_model["rnn_hidden_size"]
        rnn_num_layers = params_model["rnn_num_layers"]

        baseModel = models.resnet18(pretrained=pretrained)
        num_features = baseModel.fc.in_features
        # Drop the classification head so the backbone emits its pooled features.
        baseModel.fc = Identity()
        self.baseModel = baseModel
        self.dropout = nn.Dropout(dr_rate)
        # batch_first=True: forward() feeds (batch, 1, features) one frame at a time.
        # With the default sequence-first layout the batch axis was consumed as the
        # time axis, so the samples of a batch leaked recurrent state into each other.
        self.rnn = nn.LSTM(num_features, rnn_hidden_size, rnn_num_layers, batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, num_classes)

    def forward(self, x):
        """Classify a batch of clips.

        Args:
            x: tensor unpacked as (batch, time, channels, height, width).

        Returns:
            Tensor of shape (batch, num_classes) computed from the LSTM output
            at the last time step.

        Raises:
            ValueError: if the time axis is empty.
        """
        _, ts, _, _, _ = x.shape
        if ts == 0:
            raise ValueError("Resnt18Rnn.forward needs at least one frame on the time axis")

        # A single loop also covers ts == 1, where `out` used to be left unassigned.
        state = None  # None lets nn.LSTM start from a zero state
        out = None
        for ii in range(ts):
            y = self.baseModel(x[:, ii])
            out, state = self.rnn(y.unsqueeze(1), state)

        out = self.dropout(out[:, -1])
        return self.fc1(out)

In [None]:
# Hyper-parameters read by Resnt18Rnn.__init__.
params_model = {
    "num_classes": 10,
    "dr_rate": 0.1,
    # Boolean flag forwarded to models.resnet18(pretrained=...); the string 'pretrained'
    # only worked because any non-empty string is truthy.
    "pretrained": True,
    "rnn_hidden_size": 32,
    "rnn_num_layers": 1,
}

model = Resnt18Rnn(params_model)

  f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "


In [None]:
# Push the synthetic batch through the ResNet18+LSTM classifier and compare output and input shapes.
# fake_data is already a float tensor, so the deprecated Variable wrapper and .float() are dropped.
fake_y = model(fake_data)
fake_y.shape, fake_data.shape

torch.Size([1, 1, 32])


(torch.Size([2, 10]), torch.Size([2, 25, 3, 224, 224]))