In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import TensorDataset, Dataset, DataLoader, random_split
import torchvision
import torchvision.transforms as transforms

import matplotlib.pyplot as plt
import math
import numpy as np
from pathlib import Path
import cv2
import json
import pandas as pd
from tqdm import tqdm
from PIL import Image
from sklearn.model_selection import train_test_split
import datetime
import time

## Util

In [4]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

## Data Import

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dtype_float = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor

In [6]:
letters = ['A', 'I', 'U', 'E', 'O', 'N']

In [7]:
workdir = Path('/home/jphacks/LipNet-JP/')
youtube_id = '1'
# youtube_id = '2'
spk = 's{}'.format(youtube_id)
txtpath = workdir / 'data/align' / 'output{}word.align'.format(youtube_id)
aligned_lm_path = Path('/home/jphacks/LipNet-JP/data/processed2/{0}/{0}_aligned.csv'.format(youtube_id))
lm_path = Path('/home/jphacks/LipNet-JP/data/processed/{0}/{0}.csv'.format(youtube_id))
croppeddir = Path('/home/jphacks/LipNet-JP/data/processed2/{0}/{0}_aligned_aligned_cropped'.format(youtube_id))
assert croppeddir.exists()

datadir = Path('/home/jphacks/LipNet-JP/data')
videodir = datadir / 'lip_video'
txtdir = datadir / 'align_txt'

In [8]:
# preprocess = transforms.Compose([
#     transforms.Resize(152),
#     transforms.CenterCrop(152),
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
# ])

# preprocess = transforms.Compose([
#     transforms.RandomResizedCrop(224),
#     transforms.RandomHorizontalFlip(),
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
# ])

preprocess = transforms.Compose([
    transforms.Resize(152),
    transforms.CenterCrop(152),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# preprocess = transforms.Compose([
#     transforms.CenterCrop((122, 122)),
#     transforms.CenterCrop((112, 112)),
#     transforms.Grayscale(num_output_channels=1),
#     transforms.ToTensor(),
#     transforms.Normalize([0.4161,],[0.1688,]),
# ])

## Load

In [9]:
inwidth, inheight = 160, 80

In [19]:
aligned_lm_df = pd.read_csv(str(aligned_lm_path))
aligned_lm_df['timestamp'] = (aligned_lm_df['frame'] - 1) * (1/30)

with open(txtpath, 'r') as f:
    txt = json.load(f)

aligned_lm_df['target'] = -1

for word in txt:
    print("".join([c["word"] for c in word]))
    
    for c in word:
        aligned_lm_df.loc[(aligned_lm_df.timestamp >= c['start']) & (aligned_lm_df.timestamp < c['end']), 'target'] = letters.index(c['word'].upper())

nana
nee
oo
aao
oouu
eeu
ne
an
aai
iai
ani
anae
onai
aao
oo
nee
nuiuuu
ooa
ai
ooon
au
oe
no
ii
iu
a
ina
nai
ae
a
e
nani
ae
ina
enoo
iou
uu
nani
ae
ni
oo
aiei
io
aa
ae
a
o
oo
a
io
oo
a
ii
i
oe
no
anae
aaii
o
uunauoo
onenu
aiin
i
iun
no
oiii
o
i
a
oee
uuau
o
ne
e
io
a
oo
aao
oaa
i
aiau
iu
eo
nani
iun
ooi
au
oo
oa
iooenei
anae
anaa
nau
iun
ui
ni
ii
ie
e
oe
anaeaa
a
oion
eu
ono
nai
aa
uuau
aa
ooo
oie
ue
aoea
oeu
ioo
uia
iu
aeo
ion
ni
nee
o
oiu
e
a
anoo
oa
ooao
oa
iu
iii
i
euo
eo
ae
oau
e
iu
ae
a
euni
oo
ia
a
ue
aiaa
i
oa
au
e
aa
ia
na
aa
ea
anoii
e
iu
a
ia
ai
oe
o
uuau
uen
e
io
a
uou
anoii
o
ooini
eoo
uae
iu
ooini
eoo
nan
eeoo
nana
nee
aao
o
oouu
eeu
ei
ne
an
ie
aai
iai
ani
anae
oe
onai
aao
ono
eeu
iun
no
ooi
au
oo
anan
ono
ian
uoeu
oou
aoea
oe
aiin
nana
eain
no
enoo
oa
eain
enoo
e
oo
no
ae
eo
nai
aeo
nana
eoo
aai
oa
ooi
a
eain
iai
ooi
o
uoi
ooi
a
enen
oouu
oa
oouu
no
enen
nani
nana
eain
ui
oo
enoo
i
ona
an
eoo
oouu
eain
ae
o
eoo
ne
no
on
oo
oouu
ai
eain
oooi
oo
ni
iai
oa
on
i
ooioi
oe
a
n

In [9]:
aligned_lm_df

Unnamed: 0,frame,face_id,timestamp,confidence,success,gaze_0_x,gaze_0_y,gaze_0_z,gaze_1_x,gaze_1_y,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,timestamp.1,target
0,1,0,0.0,0.98,1,0.077917,0.293945,-0.952641,-0.316540,0.281786,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.000000,-1
1,2,0,0.0,0.98,1,0.077251,0.272675,-0.959000,-0.317211,0.262823,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.033333,-1
2,3,0,0.0,0.98,1,0.078179,0.285228,-0.955266,-0.303922,0.250068,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.066667,-1
3,4,0,0.0,0.98,1,0.082503,0.286471,-0.954530,-0.309884,0.242810,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.100000,-1
4,5,0,0.0,0.98,1,0.072832,0.289409,-0.954431,-0.322312,0.255926,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.133333,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8833,8834,0,0.0,0.93,1,0.211204,0.278979,-0.936784,-0.193433,0.300300,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,294.433333,-1
8834,8835,0,0.0,0.88,1,0.156352,0.355814,-0.921385,-0.199006,0.464844,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,294.466667,-1
8835,8836,0,0.0,0.88,1,0.206907,0.507335,-0.836541,-0.249781,0.325380,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,294.500000,-1
8836,8837,0,0.0,0.88,1,0.195739,0.543425,-0.816318,-0.214373,0.394439,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,294.533333,-1


In [43]:
imglist = []
targetlist = []
_lett_counter = {l: 0 for l in letters}
for idx, row in aligned_lm_df.iterrows():
    if row.target < 0:
        continue
    imgpath = croppeddir / 'frame_det_00_{:06d}.bmp'.format(int(row.frame))
    img = Image.open(str(imgpath))
    input_tensor = preprocess(img)
    size = np.asarray(img).shape
    if size != (inheight, inwidth, 3):
        continue
    # img = np.moveaxis(img, 2, 0)  # (80, 160, 3) -> (3, 80, 160)
    imglist.append(input_tensor)
    targetlist.append(int(row.target))
    _lett_counter[letters[int(row.target)]] += 1

## Dataset

In [44]:
class LipDataset(Dataset):
    def __init__(self, imglist, targetlist, idxlist):
        self.imglist = imglist
        self.targetlist = targetlist
        self.idxlist = idxlist
    
    def __len__(self):
        return len(self.idxlist)
    
    def __getitem__(self, idx):
        return (self.imglist[self.idxlist[idx]], self.targetlist[self.idxlist[idx]])
    
    def _to_list(self):
        return [self[i] for i in range(len(self))]

In [45]:
train_idxlist, test_idxlist = train_test_split(list(range(len(imglist))), test_size=.2, shuffle=True)
train_idxlist, validate_idxlist = train_test_split(train_idxlist, test_size=.2, shuffle=True)

print(len(train_idxlist), len(validate_idxlist), len(test_idxlist))

3196 799 999


In [46]:
train_lipdataset = LipDataset(imglist, targetlist, train_idxlist)
validate_lipdataset = LipDataset(imglist, targetlist, validate_idxlist)
test_lipdataset = LipDataset(imglist, targetlist, test_idxlist)

In [None]:
BATCH_SIZE = 18

In [47]:
trainloader = torch.utils.data.DataLoader(train_lipdataset, batch_size=BATCH_SIZE,
                                          shuffle=False, num_workers=2)
validateloader = torch.utils.data.DataLoader(validate_lipdataset, batch_size=1,
                                         shuffle=False, num_workers=2)
testloader = torch.utils.data.DataLoader(test_lipdataset, batch_size=1,
                                         shuffle=False, num_workers=2)

## Model

In [56]:
# net = torchvision.models.resnet34(pretrained=True, num_classes=6).to(device)
net = torchvision.models.resnet34(pretrained=True).to(device)

In [57]:
criterion = nn.CrossEntropyLoss()
# optimizer = optim.SGD(net.parameters(), lr=0.5, momentum=0.9)
optimizer = optim.Adam(net.parameters(), lr=0.001)

## Train

In [50]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch_idx, (data, target) in enumerate(iterator):
        
        data, target = data.to(device), target.to(device)
        
        optimizer.zero_grad()
        
        output = net(data)
        
        loss = criterion(output, target)
        _, predicted = torch.max(output, 1)
        acc = (predicted == target.to(device)).sum()
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item() / dtype_float(BATCH_SIZE)
        epoch_acc += acc.item() / dtype_float(BATCH_SIZE)
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [51]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch_idx, (data, target) in enumerate(iterator):
            
            data, target = data.to(device), target.to(device)
                        
            output = model(data.type(dtype_float))
            
            loss = criterion(output, target)
            
            _, predicted = torch.max(output.data, 1)
            
            acc = (predicted == target.to(device)).sum()
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [58]:
N_EPOCHS = 20

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(net, trainloader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(net, validateloader, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(net.state_dict(), 'tut5-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 1m 25s
	Train Loss: 0.109 | Train Acc: 27.81%
	 Val. Loss: 0.097 |  Val. Acc: 1.39%
Epoch: 02 | Epoch Time: 1m 25s
	Train Loss: 0.093 | Train Acc: 33.08%
	 Val. Loss: 0.105 |  Val. Acc: 1.34%
Epoch: 03 | Epoch Time: 1m 25s
	Train Loss: 0.090 | Train Acc: 36.86%
	 Val. Loss: 0.100 |  Val. Acc: 1.49%
Epoch: 04 | Epoch Time: 1m 25s
	Train Loss: 0.085 | Train Acc: 39.33%
	 Val. Loss: 0.105 |  Val. Acc: 1.40%
Epoch: 05 | Epoch Time: 1m 25s
	Train Loss: 0.079 | Train Acc: 44.38%
	 Val. Loss: 0.097 |  Val. Acc: 1.62%
Epoch: 06 | Epoch Time: 1m 25s
	Train Loss: 0.072 | Train Acc: 50.37%
	 Val. Loss: 0.102 |  Val. Acc: 1.86%
Epoch: 07 | Epoch Time: 1m 25s
	Train Loss: 0.061 | Train Acc: 59.39%
	 Val. Loss: 0.105 |  Val. Acc: 2.11%
Epoch: 08 | Epoch Time: 1m 25s
	Train Loss: 0.053 | Train Acc: 64.95%
	 Val. Loss: 0.104 |  Val. Acc: 2.12%
Epoch: 09 | Epoch Time: 1m 25s
	Train Loss: 0.043 | Train Acc: 71.82%
	 Val. Loss: 0.112 |  Val. Acc: 2.30%
Epoch: 10 | Epoch Time: 1m 2

## Test

In [1]:
def test(model, iterator, optimizer, criterion):
    model.eval()
    class_correct = [0.] * len(letters)
    class_total = [0.] * len(letters)
    with torch.no_grad():
        for data in iterator:
            images, labels = data
            outputs = net(images.type(dtype_float))
            _, predicted = torch.max(outputs, 1)
            c = (predicted == labels.to(device)).squeeze()
            class_correct[labels.item()] += c.item() * 1
            class_total[labels.item()] += 1

    for i, l in enumerate(letters):
        print('Accuracy of    {}: {:.4f} ({:4d}/{:4d})'.format(l, class_correct[i]/class_total[i] if class_total[i] > 0 else 0, int(class_correct[i]), int(class_total[i])))

In [None]:
test(net, testloader, optimizer, criterion)