In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import TensorDataset, Dataset, DataLoader, random_split
import torchvision
import torchvision.transforms as transforms

import matplotlib.pyplot as plt
import math
import numpy as np
from pathlib import Path
import cv2
import json
import pandas as pd
from tqdm import tqdm
from PIL import Image
from sklearn.model_selection import train_test_split
import datetime
import time

import xgboost
from sklearn.metrics import accuracy_score
import lightgbm as lgb

## Data Import

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dtype_float = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor

In [4]:
letters = ['A', 'I', 'U', 'E', 'O', 'N']

In [5]:
workdir = Path('/home/jphacks/LipNet-JP/')
youtube_id = '1'
# youtube_id = '2'
spk = 's{}'.format(youtube_id)
txtpath = workdir / 'data/align' / 'output{}word.align'.format(youtube_id)
aligned_lm_path = Path('/home/jphacks/LipNet-JP/data/processed2/{0}/{0}_aligned.csv'.format(youtube_id))
lm_path = Path('/home/jphacks/LipNet-JP/data/processed/{0}/{0}.csv'.format(youtube_id))
croppeddir = Path('/home/jphacks/LipNet-JP/data/processed2/{0}/{0}_aligned_aligned_cropped'.format(youtube_id))
assert croppeddir.exists()

datadir = Path('/home/jphacks/LipNet-JP/data')
videodir = datadir / 'lip_video'
txtdir = datadir / 'align_txt'

## Load

In [6]:
inwidth, inheight = 160, 80

In [7]:
aligned_lm_df = pd.read_csv(str(aligned_lm_path))
aligned_lm_df['timestamp'] = (aligned_lm_df['frame'] - 1) * (1/30)

with open(txtpath, 'r') as f:
    txt = json.load(f)

aligned_lm_df['target'] = -1

for word in txt:
    for c in word:
        aligned_lm_df.loc[(aligned_lm_df.timestamp >= c['start']) & (aligned_lm_df.timestamp < c['end']), 'target']         = letters.index(c['word'].upper())

In [8]:
aligned_lm_df

Unnamed: 0,frame,face_id,timestamp,confidence,success,gaze_0_x,gaze_0_y,gaze_0_z,gaze_1_x,gaze_1_y,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,timestamp.1,target
0,1,0,0.0,0.98,1,0.077917,0.293945,-0.952641,-0.316540,0.281786,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.000000,-1
1,2,0,0.0,0.98,1,0.077251,0.272675,-0.959000,-0.317211,0.262823,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.033333,-1
2,3,0,0.0,0.98,1,0.078179,0.285228,-0.955266,-0.303922,0.250068,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.066667,-1
3,4,0,0.0,0.98,1,0.082503,0.286471,-0.954530,-0.309884,0.242810,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.100000,-1
4,5,0,0.0,0.98,1,0.072832,0.289409,-0.954431,-0.322312,0.255926,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.133333,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8833,8834,0,0.0,0.93,1,0.211204,0.278979,-0.936784,-0.193433,0.300300,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,294.433333,-1
8834,8835,0,0.0,0.88,1,0.156352,0.355814,-0.921385,-0.199006,0.464844,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,294.466667,-1
8835,8836,0,0.0,0.88,1,0.206907,0.507335,-0.836541,-0.249781,0.325380,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,294.500000,-1
8836,8837,0,0.0,0.88,1,0.195739,0.543425,-0.816318,-0.214373,0.394439,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,294.533333,-1


In [9]:
## 話していないときのデータを削除
aligned_lm_df = aligned_lm_df[aligned_lm_df['target'] != -1]

In [10]:
aligned_lm_df.target.value_counts()

0    1227
4    1097
1     894
3     645
5     575
2     561
Name: target, dtype: int64

## Split

In [11]:
train_df, test_df = train_test_split(aligned_lm_df, test_size=.2, shuffle=True)

In [12]:
train_x = train_df[(set(train_df.columns) - {'target'})].values
train_y = train_df.target.values

test_x = test_df[(set(test_df.columns) - {'target'})].values
test_y = test_df.target.values

## Model

In [45]:
class NN(torch.nn.Module):
    def __init__(self, in_size, hidden_size, out_size):
        # クラスの初期化
        # :param in_size: 入力層のサイズ
        # :param hidden_size: 隠れ層のサイズ
        # :param out_size: 出力層のサイズ
        super(NN, self).__init__()
        self.xh = torch.nn.Linear(in_size, hidden_size)
        self.hL = [torch.nn.Linear(hidden_size, hidden_size) for _ in range(150)]
        self.hy = torch.nn.Linear(hidden_size, out_size)
 
    def __call__(self, x):
        # 順伝播を計算する関数
        # :param x: 入力値
        h = F.relu(self.xh(x))
        for hh in self.hL:
            h = F.relu(hh(h))
        y = F.log_softmax(self.hy(h))
        return y

## Dataset

In [13]:
train_x = np.array(train_x, dtype="float32")
train_y = np.array(train_y, dtype="int32")

test_x = np.array(test_x, dtype="float32")
test_y = np.array(test_y, dtype="int32")

train_x = torch.from_numpy(train_x)
train_y = torch.from_numpy(train_y)

test_x = torch.from_numpy(test_x)
test_y = torch.from_numpy(test_y)

In [30]:
EPOCH_NUM = 100
HIDDEN_SIZE = 20
BATCH_SIZE = 20
N = 100

In [19]:
train = torch.utils.data.TensorDataset(train_x, train_y)
train_loader = torch.utils.data.DataLoader(train, batch_size=BATCH_SIZE, shuffle=True)

## Model

In [39]:
in_size = train_x.shape[1]
out_size = len(letters)

In [47]:
model = NN(in_size=in_size, hidden_size=HIDDEN_SIZE, out_size=out_size)
# model = torchvision.models.resnet34(pretrained=False).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

## Train

In [43]:
# http://www.ie110704.net/2017/08/31/pytorch%E3%81%A7%E3%83%8B%E3%83%A5%E3%83%BC%E3%83%A9%E3%83%AB%E3%83%8D%E3%83%83%E3%83%88%E3%83%AF%E3%83%BC%E3%82%AF%E3%80%81rnn%E3%80%81cnn%E3%82%92%E5%AE%9F%E8%A3%85%E3%81%97%E3%81%A6%E3%81%BF/A

In [48]:
st = datetime.datetime.now()
for epoch in range(EPOCH_NUM):
    # ミニバッチ学習
    total_loss = 0
    for i, data in enumerate(train_loader):
        x, y = data
        x, y = Variable(x), Variable(y).long()
        
        y_ = model(x)
        loss = criterion(y_, y)
        total_loss += loss.data

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if (epoch+1) % 10 == 0:
        # accuracy
        x, y = Variable(train_x), Variable(train_y).long()
        _, y_ = torch.max(model(x).data, 1)
        accuracy = sum(y.data.numpy() == y_.numpy()) / N
        # test accuracy
        x, y = Variable(test_x), Variable(test_y)
        _, y_ = torch.max(model(x).data, 1)
        test_accuracy = sum(y.data.numpy() == y_.numpy()) / len(y.data.numpy())
        ed = datetime.datetime.now()
        print("epoch:\t{}\ttotal loss:\t{}\taccuracy:\t{}\tvaridation accuracy\t{}\ttime:\t{}".format(epoch+1, total_loss, accuracy, test_accuracy, ed-st))
        st = datetime.datetime.now()



epoch:	10	total loss:	349.3305969238281	accuracy:	9.79	varidation accuracy	0.248	time:	0:01:09.997629
epoch:	20	total loss:	349.3291015625	accuracy:	9.79	varidation accuracy	0.248	time:	0:01:09.535319
epoch:	30	total loss:	349.31939697265625	accuracy:	9.79	varidation accuracy	0.248	time:	0:01:10.262240
epoch:	40	total loss:	349.3290710449219	accuracy:	9.79	varidation accuracy	0.248	time:	0:01:09.764727
epoch:	50	total loss:	349.3013610839844	accuracy:	9.79	varidation accuracy	0.248	time:	0:01:10.075288
epoch:	60	total loss:	349.34320068359375	accuracy:	9.79	varidation accuracy	0.248	time:	0:01:09.801421
epoch:	70	total loss:	349.3209228515625	accuracy:	9.79	varidation accuracy	0.248	time:	0:01:06.731227
epoch:	80	total loss:	349.3235168457031	accuracy:	9.79	varidation accuracy	0.248	time:	0:01:05.312586
epoch:	90	total loss:	349.3182373046875	accuracy:	9.79	varidation accuracy	0.248	time:	0:01:03.439705
epoch:	100	total loss:	349.3143615722656	accuracy:	9.79	varidation accuracy	0.248	t

## XGBoost Model

In [32]:
model = xgboost.XGBClassifier()
model.fit(train_x, train_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

## LightGBM

In [14]:
model = lgb.LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
model.fit(train_x, train_y)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

## Evaluate

In [15]:
accuracy_score(model.predict(train_x), train_y)

0.9989997499374844

In [16]:
accuracy_score(model.predict(test_x), test_y)

0.652

In [17]:
pred = model.predict(test_x)

class_correct = [0] * len(letters)
class_total = [0] * len(letters)

for p, a in zip(pred, test_y):
    class_total[a] += 1
    if p == a:
        class_correct[a] += 1
        
for i, l in enumerate(letters):
    print('Accuracy of    {}: {:.4f} ({:4d}/{:4d})'.format(l, class_correct[i]/class_total[i], class_correct[i], class_total[i]))

Accuracy of    A: 0.7625 ( 183/ 240)
Accuracy of    I: 0.6117 ( 115/ 188)
Accuracy of    U: 0.6053 (  69/ 114)
Accuracy of    E: 0.5571 (  78/ 140)
Accuracy of    O: 0.7864 ( 162/ 206)
Accuracy of    N: 0.4018 (  45/ 112)
