In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import TensorDataset, Dataset, DataLoader, random_split
import torchvision
import torchvision.transforms as transforms

import matplotlib.pyplot as plt
import math
import numpy as np
from pathlib import Path
import cv2
import json
import pandas as pd
from tqdm import tqdm
from PIL import Image
from sklearn.model_selection import train_test_split
import datetime
import time

import xgboost
from sklearn.metrics import accuracy_score
import lightgbm as lgb

## Data Import

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dtype_float = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor

In [5]:
letters = ['A', 'I', 'U', 'E', 'O', 'N']

In [6]:
workdir = Path('/home/jphacks/LipNet-JP/')
youtube_id = '1'
# youtube_id = '2'
spk = 's{}'.format(youtube_id)
txtpath = workdir / 'data/align' / 'output{}word.align'.format(youtube_id)
aligned_lm_path = Path('/home/jphacks/LipNet-JP/data/processed2/{0}/{0}_aligned.csv'.format(youtube_id))
lm_path = Path('/home/jphacks/LipNet-JP/data/processed/{0}/{0}.csv'.format(youtube_id))
croppeddir = Path('/home/jphacks/LipNet-JP/data/processed2/{0}/{0}_aligned_aligned_cropped'.format(youtube_id))
assert croppeddir.exists()

datadir = Path('/home/jphacks/LipNet-JP/data')
videodir = datadir / 'lip_video'
txtdir = datadir / 'align_txt'

## Load

In [7]:
inwidth, inheight = 160, 80

In [8]:
aligned_lm_df = pd.read_csv(str(aligned_lm_path))
aligned_lm_df['timestamp'] = (aligned_lm_df['frame'] - 1) * (1/30)

with open(txtpath, 'r') as f:
    txt = json.load(f)

aligned_lm_df['target'] = -1

for word in txt:
    for c in word:
        aligned_lm_df.loc[(aligned_lm_df.timestamp >= c['start']) & (aligned_lm_df.timestamp < c['end']), 'target']         = letters.index(c['word'].upper())

In [9]:
aligned_lm_df

Unnamed: 0,frame,face_id,timestamp,confidence,success,gaze_0_x,gaze_0_y,gaze_0_z,gaze_1_x,gaze_1_y,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,timestamp.1,target
0,1,0,0.0,0.98,1,0.077917,0.293945,-0.952641,-0.316540,0.281786,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.000000,-1
1,2,0,0.0,0.98,1,0.077251,0.272675,-0.959000,-0.317211,0.262823,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.033333,-1
2,3,0,0.0,0.98,1,0.078179,0.285228,-0.955266,-0.303922,0.250068,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.066667,-1
3,4,0,0.0,0.98,1,0.082503,0.286471,-0.954530,-0.309884,0.242810,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.100000,-1
4,5,0,0.0,0.98,1,0.072832,0.289409,-0.954431,-0.322312,0.255926,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.133333,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8833,8834,0,0.0,0.93,1,0.211204,0.278979,-0.936784,-0.193433,0.300300,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,294.433333,-1
8834,8835,0,0.0,0.88,1,0.156352,0.355814,-0.921385,-0.199006,0.464844,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,294.466667,-1
8835,8836,0,0.0,0.88,1,0.206907,0.507335,-0.836541,-0.249781,0.325380,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,294.500000,-1
8836,8837,0,0.0,0.88,1,0.195739,0.543425,-0.816318,-0.214373,0.394439,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,294.533333,-1


In [10]:
## 話していないときのデータを削除
aligned_lm_df = aligned_lm_df[aligned_lm_df['target'] != -1]

In [11]:
aligned_lm_df.target.value_counts()

0    1227
4    1097
1     894
3     645
5     575
2     561
Name: target, dtype: int64

In [19]:
aligned_lm_df

Unnamed: 0,frame,face_id,timestamp,confidence,success,gaze_0_x,gaze_0_y,gaze_0_z,gaze_1_x,gaze_1_y,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,timestamp.1,target
8,9,0,0.0,0.98,1,0.082914,0.282764,-0.955599,-0.319702,0.258262,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.266667,5
9,10,0,0.0,0.98,1,0.084628,0.286519,-0.954329,-0.317676,0.263163,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.300000,5
10,11,0,0.0,0.98,1,0.082808,0.288818,-0.953796,-0.316511,0.267189,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.333333,0
11,12,0,0.0,0.98,1,0.082693,0.292147,-0.952792,-0.319675,0.269907,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.366667,0
12,13,0,0.0,0.98,1,0.071052,0.285099,-0.955861,-0.304504,0.275527,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.400000,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8821,8822,0,0.0,0.98,1,0.140809,0.326962,-0.934489,-0.198623,0.298263,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,294.033333,3
8822,8823,0,0.0,0.98,1,0.150457,0.322335,-0.934592,-0.191991,0.304547,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,294.066667,3
8823,8824,0,0.0,0.98,1,0.146324,0.311414,-0.938941,-0.193710,0.298439,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,294.100000,2
8824,8825,0,0.0,0.98,1,0.152923,0.322806,-0.934029,-0.195477,0.285684,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,294.133333,2


## Split

In [12]:
train_df, test_df = train_test_split(aligned_lm_df, test_size=.2, shuffle=True)

In [13]:
train_x = train_df[(set(train_df.columns) - {'target'})].values
train_y = train_df.target.values

test_x = test_df[(set(test_df.columns) - {'target'})].values
test_y = test_df.target.values

## XGBoost Model

In [32]:
model = xgboost.XGBClassifier()
model.fit(train_x, train_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

## LightGBM

In [14]:
model = lgb.LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
model.fit(train_x, train_y)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

## Evaluate

In [15]:
accuracy_score(model.predict(train_x), train_y)

0.9989997499374844

In [16]:
accuracy_score(model.predict(test_x), test_y)

0.652

In [17]:
pred = model.predict(test_x)

class_correct = [0] * len(letters)
class_total = [0] * len(letters)

for p, a in zip(pred, test_y):
    class_total[a] += 1
    if p == a:
        class_correct[a] += 1
        
for i, l in enumerate(letters):
    print('Accuracy of    {}: {:.4f} ({:4d}/{:4d})'.format(l, class_correct[i]/class_total[i], class_correct[i], class_total[i]))

Accuracy of    A: 0.7625 ( 183/ 240)
Accuracy of    I: 0.6117 ( 115/ 188)
Accuracy of    U: 0.6053 (  69/ 114)
Accuracy of    E: 0.5571 (  78/ 140)
Accuracy of    O: 0.7864 ( 162/ 206)
Accuracy of    N: 0.4018 (  45/ 112)
