### 00. 環境建置

In [None]:
# Mount to Porject Folder
from google.colab import drive
drive.mount('/content/drive', force_remount = True)
FOLDERNAME = 'Colab\ Notebooks/SC201LFINAL'
%cd drive/MyDrive/$FOLDERNAME

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/SC201LFINAL


In [None]:
import torch

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [None]:
# Locations of the video clips and of their caption (subtitle) files

# Full run: train split for training, val split for validation
TRAIN_DIRECTORY: str = '_train_rgb_front_clips'         # 31165 (presumably the number of clips - TODO confirm)
VAL_DIRECTORY: str = '_val_rgb_front_clips'             # 1741
LABEL_FILE_TRAIN = 'how2sign_realigned_train.csv'       # 31165
LABEL_FILE_VAL = 'how2sign_realigned_val.csv'           # 1741

# # Alternative run: use the val split for both training and validation
# TRAIN_DIRECTORY: str = '_val_rgb_front_clips'          # 1741
# VAL_DIRECTORY: str = '_val_rgb_front_clips'             # 1741
# LABEL_FILE_TRAIN = 'how2sign_realigned_val.csv'        # 1741
# LABEL_FILE_VAL = 'how2sign_realigned_val.csv'           # 1741


### 01. 字幕 (sentence): train.csv, val.csv

### 1. 資料讀取

#### 1-1. 讀取字幕資訊

In [None]:
import pandas as pd
import csv

# Read the caption tables (tab-separated text files).
# csv.QUOTE_NONE makes pandas treat quote characters inside captions as
# ordinary text. on_bad_lines='skip' was tried as an alternative and is not used.

# Training split: clip names and their sentences
label_cap_train = pd.read_csv(
    LABEL_FILE_TRAIN,
    delimiter='\t',
    quoting=csv.QUOTE_NONE,
    encoding='utf-8',
)
file_name_trains = label_cap_train['SENTENCE_NAME']
file_cap_train = label_cap_train['SENTENCE']

# Validation split: clip names and their sentences
label_cap_val = pd.read_csv(
    LABEL_FILE_VAL,
    delimiter='\t',
    quoting=csv.QUOTE_NONE,
    encoding='utf-8',
)
file_name_vals = label_cap_val['SENTENCE_NAME']
file_cap_val = label_cap_val['SENTENCE']

#### 1-2. 資料前處理(preprocessing): 字幕(sentence)

In [None]:
# Substrings stripped from every caption during preprocessing.
# Each pattern is paired by position with an entry of `replacements`;
# every pattern is currently replaced by the empty string.

patterns = [
    '<br />', '--', '.', ',', '!', '?', ')', '(', ';', ':',
    '*', '~', '_', "'", '"', '#', '$', '%', '&', '+',
]
replacements = [''] * 20

In [None]:
# def: 資料前處理(data preproessing)
import re

def preprocessing(reviews, patterns, replacements) -> list:
    """Lower-case every caption and apply literal substring replacements.

    Args:
        reviews: Iterable of captions (list, tuple, pandas Series, ...).
            Entries that are not strings (for example a missing caption read
            as NaN) become empty captions, so output positions stay aligned
            with the input instead of the whole run crashing.
        patterns: Substrings to search for. They are matched literally, not
            as regular expressions.
        replacements: Replacement text, paired with ``patterns`` by position.
            Inserted literally (backslashes are not interpreted).

    Returns:
        A list with one cleaned string per input entry, in input order.
    """
    cleaned = []
    # Iterate over the values themselves: positional `reviews[i]` lookups are
    # label-based on a pandas Series and break on a non-default index.
    for review in reviews:
        text = review.lower() if isinstance(review, str) else ''
        # Replacements are applied in order, so earlier patterns (e.g. '--')
        # are removed before later, shorter ones are considered.
        for pattern, replacement in zip(patterns, replacements):
            text = text.replace(pattern, replacement)
        cleaned.append(text)
    return cleaned

In [None]:
# Apply the caption preprocessing to the train and val sentences

file_cap_trains = preprocessing(file_cap_train, patterns, replacements)
file_cap_vals = preprocessing(file_cap_val, patterns, replacements)

#### 1-3. 定義: 資料筆數, token 最多字元數

In [None]:
# Number of samples per split and the maximum number of tokens kept per caption

num_train = len(file_cap_trains)
num_val = len(file_cap_vals)
longest_num_tokens = 20

#### 1-4. 將字幕 做記號(Tokens) / 索引(Indexing) / 填充(Padding) 處理

In [None]:
# def: 幫token加上, 索引(indexing)

def indexing_tokens(mode='train') -> dict:
    """Build the token -> integer id vocabulary from the training captions.

    Reads the module-level ``file_cap_trains`` (first ``num_train`` entries).
    Ids 0-3 are reserved for ``<SOS>``, ``<EOS>``, ``<PAD>`` and ``<UNK>``;
    every other whitespace-separated token receives the next free id in
    order of first appearance.

    Args:
        mode: Accepted for symmetry with ``get_data`` but not used; the
            vocabulary is always built from the training captions.

    Returns:
        The vocabulary dictionary.
    """
    vocabulary = {'<SOS>': 0, '<EOS>': 1, '<PAD>': 2, '<UNK>': 3}

    for row in range(num_train):
        for word in file_cap_trains[row].split():
            if word not in vocabulary:
                # Ids are dense, so the current size is the next unused id.
                vocabulary[word] = len(vocabulary)

    return vocabulary

In [None]:
# def: 幫字幕(sentence) indexing + Padding 去補足 所定義最多字元數 (longest_line_tokens)

def get_data(indices, longest_line_tokens, mode='train') -> tuple:
    """Encode one split's captions as fixed-length lists of token ids.

    Reads the module-level caption data: ``file_name_trains`` /
    ``file_cap_trains`` / ``num_train`` when ``mode == 'train'``, otherwise
    ``file_name_vals`` / ``file_cap_vals`` / ``num_val``.

    Each caption is split on whitespace, truncated to ``longest_line_tokens``
    tokens, padded with ``<PAD>`` up to that length and then wrapped as
    ``[<SOS>, token ids..., <PAD>..., <EOS>]`` (``longest_line_tokens + 2``
    ids in total; note that ``<EOS>`` comes after the padding).

    Args:
        indices: Token -> id vocabulary containing the special tokens.
        longest_line_tokens: Number of caption tokens kept per sentence.
        mode: ``'train'`` for the training split, anything else for val.

    Returns:
        ``(files, sentences)``: the clip names and, in the same order, the
        encoded captions.
    """
    if mode == 'train':
        names, captions, count = file_name_trains, file_cap_trains, num_train
    else:
        names, captions, count = file_name_vals, file_cap_vals, num_val

    # Slicing (instead of a break once the length is reached) also truncates
    # correctly when the limit is 0.
    body_length = max(longest_line_tokens, 0)

    files = []
    sentences = []
    for i in range(count):
        tokens = captions[i].split()[:body_length]
        # Tokens missing from the vocabulary map to <UNK>. Training tokens
        # are always present because the vocabulary is built from them.
        encoded = [
            indices[token] if token in indices else indices['<UNK>']
            for token in tokens
        ]
        shortfall = body_length - len(encoded)
        if shortfall > 0:
            encoded.extend([indices['<PAD>']] * shortfall)
        sentences.append([indices['<SOS>'], *encoded, indices['<EOS>']])
        files.append(names[i])

    return files, sentences

In [None]:
# Build the vocabulary, then encode the training and validation captions
indices = indexing_tokens()                                                     # {'<SOS>': 0, '<EOS>': 1, '<PAD>': 2, '<UNK>': 3, ...}

training_files, training_sentences = get_data(indices, longest_num_tokens)
val_files, val_sentences = get_data(indices, longest_num_tokens, mode='val')

# ---------------------------
# Reverse lookup (id -> token), used to turn predicted ids back into words
reversed_indices = {value: key for key, value in indices.items()}             # {0: '<SOS>', 1: '<EOS>', 2: '<PAD>', 3: '<UNK>', ...}

#### 1-6. 輸出dict = {檔案名稱:索引化字幕, ...}

In [None]:
# Convert the encoded train / val captions to tensors (kept for reference, currently disabled)

# import numpy as np

# Encoded captions (sentences) -- not used yet
# train_sentences_tensor = torch.tensor(np.array(training_sentences))
# val_sentences_tensor = torch.tensor(np.array(val_sentences))

# -----------
# File names -- cannot be used: strings cannot be converted to a tensor
# train_tensor = torch.tensor(np.array(training_files))
# val_tensor = torch.tensor(np.array(val_files))

In [None]:
# "P3": map each clip name to its encoded caption (similar to video_sequences)
# e.g. {'--7E2sU6zP4_10-5-rgb_front': [0, 4, 5, 6, ..., 2, 2, 1]}
# If a clip name occurs more than once, the last caption wins.

training_file_sentence_dict = dict(zip(training_files, training_sentences))
val_file_sentence_dict = dict(zip(val_files, val_sentences))

### 02. 影像 (video): rgb_front_clips

#### 2-1. 定義: 影像“幀數”和“尺寸大小”

In [None]:
# Constants
IMG_SIZE: int = 112             # frames are resized to IMG_SIZE x IMG_SIZE
MAX_FRAMES :int = 16            # number of frames kept per clip
BATCH_SIZE_VIDEO: int = 2       # not referenced in the visible code

#### 2-2. 資料前處理: 影像 (影像裁切, 轉張量; 每部影片為 torch.Size([16, 112, 112, 3]))

In [None]:
# def: 將影片 裁切成 大小為(IMG_SIZE*IMG_SIZE), 幀數為(MAX_FRAMES)

import cv2

def load_video_helper(cap, max_frames, resize):
    """Yield resized frames from an opened capture, at most ``max_frames``.

    Args:
        cap: Capture object providing ``read()`` and ``release()``
            (a ``cv2.VideoCapture`` in this file).
        max_frames: Stop after this many frames have been yielded.
        resize: Target size handed to ``cv2.resize``.

    Yields:
        One resized frame per successful ``cap.read()``.

    The capture is released when the generator finishes, is closed early,
    or fails.
    """
    emitted = 0
    try:
        while True:
            ok, image = cap.read()
            if not ok:
                # End of the stream, or the video could not be read.
                return
            resized = cv2.resize(image, resize)
            emitted += 1
            yield resized
            if emitted == max_frames:
                return
    finally:
        cap.release()

In [None]:
# def: 下載影片,並根據需求裁切 (F x H x W x C)

def load_video(path, max_frames=MAX_FRAMES, resize=(IMG_SIZE, IMG_SIZE)):
    """Load the first ``max_frames`` frames of a clip as a float32 array.

    Args:
        path: Path of the video file.
        max_frames: Number of frames a clip must provide to be usable.
        resize: ``(width, height)`` passed to ``cv2.resize`` for each frame.

    Returns:
        ``(can_use, frames)``. ``can_use`` is True only when exactly
        ``max_frames`` frames were read. ``frames`` has shape
        ``(max_frames, height, width, 3)``: pixel values scaled to [0, 1]
        when usable, otherwise an uninitialised placeholder of the same shape
        that callers are expected to discard via ``can_use``.

    NOTE(review): OpenCV normally decodes frames in BGR channel order, while
    the ImageNet-pretrained ResNet used later expects RGB - confirm whether a
    channel swap is wanted.
    """
    cap = cv2.VideoCapture(path)

    frames = list(load_video_helper(cap, max_frames=max_frames, resize=resize))
    can_use = (len(frames) == max_frames)
    if can_use:
        # float32 is what the tensors are cast to downstream; the default
        # float64 would double the memory held for the whole dataset.
        return can_use, np.asarray(frames, dtype=np.float32) / 255.0

    # cv2.resize takes (width, height) but the frames it returns are indexed
    # (height, width, channel); the placeholder must match that layout so
    # usable and unusable clips can be stacked into one array.
    width, height = resize
    shape = (max_frames, height, width, 3)

    return can_use, np.empty(shape, dtype=np.float32)

In [None]:
def process_batch(batch_files, batch_labels):
    """Load a batch of clips and drop the ones that could not be used.

    Args:
        batch_files: Paths of the clips in the batch (must not be empty).
        batch_labels: Labels aligned by position with ``batch_files``.

    Returns:
        ``(valid_data, valid_labels)`` as NumPy arrays containing only the
        entries whose clip ``load_video`` reported as usable.
    """
    loaded = [
        load_video(path, MAX_FRAMES, (IMG_SIZE, IMG_SIZE))
        for path in batch_files
    ]
    can_use_mask, batch_data = zip(*loaded)

    # Boolean mask: keep only the rows whose video loaded completely.
    keep = np.array(can_use_mask)
    return np.array(batch_data)[keep], np.array(batch_labels)[keep]

In [None]:
# 資料前處理(影像) + 資料裝填 {影像, 字幕}

import time
import os
import numpy as np
from concurrent.futures import ProcessPoolExecutor
from typing import *

def prepare_data(directory: str, file_sentence_dict, shuffle=True, seed=42) -> List[Tuple[torch.Tensor, torch.Tensor]]:
    """Load every captioned clip under ``directory`` and pair it with its caption.

    Args:
        directory: Folder searched recursively for ``.mp4`` files.
        file_sentence_dict: Maps a clip's file name (without extension) to its
            encoded caption, a list of token ids.
        shuffle: When True, the samples are shuffled with NumPy's global
            random generator after it has been seeded with ``seed``.
        seed: Seed used for the shuffle.

    Returns:
        A list of ``(video, caption)`` tensor pairs: ``video`` is float32
        (one clip as produced by ``load_video``), ``caption`` is a long tensor
        of token ids. Clips without a caption entry and clips that
        ``load_video`` reports as unusable are left out. The list is empty
        when nothing usable was found.
    """
    start = time.time()
    # ****************************

    # Pair every clip with its caption up front. Clips without a caption are
    # skipped: a scalar placeholder mixed with token-id lists would make the
    # label array ragged and break the tensor conversion below.
    labelled_clips = []
    skipped = 0
    for root, _dirs, filenames in os.walk(directory):
        for file in filenames:
            if not file.endswith('.mp4'):
                continue
            label = file_sentence_dict.get(os.path.splitext(file)[0])
            if label is None:
                skipped += 1
                continue
            labelled_clips.append((os.path.join(root, file), label))

    # os.walk order depends on the file system; sorting makes the seeded
    # shuffle below reproducible across machines.
    labelled_clips.sort(key=lambda pair: pair[0])

    if skipped:
        print(f'Skipped {skipped} clip(s) without a caption entry.')

    # Load the videos in chunks to bound the size of the temporary arrays.
    video_batches = []
    label_batches = []
    batch_size = 200  # Adjust as needed based on memory constraints
    for i in range(0, len(labelled_clips), batch_size):
        chunk = labelled_clips[i:i + batch_size]
        batch_files = np.array([path for path, _ in chunk])
        batch_labels = np.array([label for _, label in chunk])
        batch_data, batch_labels = process_batch(batch_files, batch_labels)
        video_batches.append(batch_data)
        label_batches.append(batch_labels)

    if video_batches:
        video_data = np.concatenate(video_batches, axis=0)
        true_labels = np.concatenate(label_batches, axis=0)

        if shuffle:
            np.random.seed(seed)
            order = np.arange(len(true_labels))
            np.random.shuffle(order)
            video_data = video_data[order]
            true_labels = true_labels[order]

        # as_tensor avoids a second full copy when the dtype already matches.
        video_data = torch.as_tensor(video_data, dtype=torch.float32)
        true_labels = torch.as_tensor(true_labels, dtype=torch.long)

        # [(video[0], caption[0]), (video[1], caption[1]), ...]
        result = list(zip(video_data, true_labels))
    else:
        # np.concatenate cannot handle an empty list of batches.
        result = []

    # ****************************
    end = time.time()
    print(f'This function took {end-start} second to complete.')

    return result

### 03.訓練資料(Train data) & 驗證資料(val data)

In [None]:
# Mini-batch size used by the DataLoaders
BATCH_SIZE: int = 32

In [None]:
# Load train_data and val_data

train_data = prepare_data(TRAIN_DIRECTORY, training_file_sentence_dict)
# print(f'Number of training samples: {len(train_data)}')

val_data = prepare_data(VAL_DIRECTORY, val_file_sentence_dict)
# print(f'Number of validation samples: {len(val_data)}')

# Resulting structure (similar to video_sequences):

# [(tensor(video frames [16, 112, 112, 3]), tensor(encoded caption 01)), (tensor(video frames [16, 112, 112, 3]), tensor(encoded caption 02))]

In [None]:
# Build mini_trains / mini_vals: DataLoaders that batch the samples by BATCH_SIZE
# (the training loader reshuffles the samples, the validation loader does not)

from torch.utils.data import DataLoader, TensorDataset

# # ---
# train_videos = np.array([item[0] for item in train_data])
# train_labels = np.array([item[1] for item in train_data])
# train_dataset = TensorDataset(torch.tensor(train_videos).float(), torch.tensor(train_labels).long())
# # ---

mini_trains = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
mini_vals = DataLoader(val_data, batch_size=BATCH_SIZE)

In [None]:
# Sanity check: shape of the video tensor in the first training batch

print('[D_train.shape]', next(iter(mini_trains))[0].shape)

[D_train.shape] torch.Size([32, 16, 112, 112, 3])


### 04. 模型建立

#### 4-1. 定義: 模型常數

In [None]:
# Model constants

LONGEST_NUM_TOKENS: int = longest_num_tokens   # caption tokens kept per sentence (20)
EMBEDDING_DIM: int = 300

HIDDEN_DIM: int = 256          # hidden_size (NOTE: the model below hard-codes 256 instead of reading this)
VOCAB_SIZE = len(indices)      # number of distinct tokens (15983 in an earlier run; the recorded output below prints 3237)
OUTPUT_DIM = len(indices)      # same value as VOCAB_SIZE

NUM_EPOCHS: int = 500
PRINT_EVERY_EPOCH: int = 400   # NOTE: not referenced in the visible code; train() prints every 10 epochs
TEACHER_FORCING_RATIO = 1.0

# 4608 = 512*3*3

#### 4.1.1.Pretrained word_embedding_stack_pt by Glove300

In [None]:
# 利用Glove300製作embedding_stack_pt 給模型使用 (將indices索引表 拿給 Glove300 一起pretrained)

# 下載並讀取 GloVe 300d 文件
def load_glove_embeddings(file_path):
    """Read a GloVe text file into a ``word -> vector`` dictionary.

    Each non-blank line is expected to hold a word followed by its
    whitespace-separated coefficients.

    Args:
        file_path: Path of the UTF-8 encoded GloVe file.

    Returns:
        Dictionary mapping each word to a float32 NumPy vector. When a word
        occurs more than once, the last occurrence wins.
    """
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (for example a trailing newline at the end of
                # the file): nothing to parse.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index

# 創建嵌入矩陣
def create_embedding_matrix(word_index, embeddings_index, embedding_dim):
    """Assemble the pretrained embedding matrix for a vocabulary.

    Args:
        word_index: Maps each token to its row number in the matrix.
        embeddings_index: Maps words to their pretrained vectors.
        embedding_dim: Length of every vector.

    Returns:
        Array of shape ``(len(word_index) + 1, embedding_dim)``. Rows of
        tokens without a pretrained vector stay all-zero.

    NOTE(review): with ids starting at 0 the extra ``+ 1`` row is never
    addressed; it is kept so the matrix shape does not change.
    """
    matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for token, row in word_index.items():
        vector = embeddings_index.get(token)
        if vector is None:
            continue
        matrix[row] = vector
    return matrix

# 初始化嵌入層
def init_embedding_layer(embedding_matrix):
    """Wrap an embedding matrix in a trainable ``nn.Embedding`` layer.

    Args:
        embedding_matrix: Values accepted by ``torch.FloatTensor``, one row
            per token.

    Returns:
        An ``nn.Embedding`` initialised from the matrix; ``freeze=False``
        keeps the weights trainable.
    """
    weights = torch.FloatTensor(embedding_matrix)
    return nn.Embedding.from_pretrained(weights, freeze=False)

# Path of the GloVe 300d file
glove_file_path = 'glove.6B.300d.txt'

# Load the GloVe embeddings
embeddings_index = load_glove_embeddings(glove_file_path)

# Build the embedding matrix for the caption vocabulary
embedding_dim = EMBEDDING_DIM
embedding_stack_pt_0 = create_embedding_matrix(indices, embeddings_index, embedding_dim)

# indices = indexing_tokens()                                                     # {'<SOS>': 0, '<EOS>': 1, '<PAD>': 2, '<UNK>': 3}

# Convert the embedding matrix to a PyTorch tensor
embedding_stack_pt = torch.FloatTensor(embedding_stack_pt_0)

# Print the size of the embedding matrix
print(embedding_stack_pt.size())

torch.Size([3238, 300])


#### 4-2. 模型建立

In [None]:
# 建立模型

import torch
from torch import nn
import torchvision.models as models
from torchvision.models import ResNet50_Weights

class VideoTextModel(nn.Module):
    """Video-to-text model: ResNet-50 + LSTM encoder, LSTM decoder.

    Each frame is encoded by a pretrained ResNet-50 (classification head
    removed) into a 2048-d vector. A 2-layer bidirectional LSTM reads the
    frame vectors, and its final hidden/cell states initialise a second
    2-layer bidirectional LSTM that decodes the caption one token at a time.

    Args:
        embedding_stack: Pretrained word-embedding matrix (float tensor, one
            row per token id); it stays trainable.
        embedding_dim: Width of the word embeddings (decoder input size).
        vocab_size: Number of output classes of the final linear layer.
        teacher_forcing_ratio: Probability, drawn once per forward pass, of
            feeding ground-truth tokens (instead of the model's own
            predictions) to the decoder.
    """

    def __init__(self, embedding_stack, embedding_dim, vocab_size, teacher_forcing_ratio = 0.5):
        super().__init__()

        # -- video (encoder)
        resnet = models.resnet50(weights=ResNet50_Weights.DEFAULT)
        # Drop the last two children (global average pool + fully connected
        # classifier) and keep the convolutional trunk only.
        self.resnet = nn.Sequential(*list(resnet.children())[:-2])
        self.resnet_avgpool = nn.AdaptiveAvgPool2d((1,1))

        # LSTM over the per-frame features; input_size matches the 2048
        # channels produced by the ResNet-50 trunk.
        self.lstm_video = nn.LSTM(input_size=2048, hidden_size=256, batch_first=True, num_layers=2, bidirectional=True)

        # -- text (decoder)
        # Word embeddings initialised from the pretrained matrix, fine-tuned
        # during training (freeze=False).
        self.embedding = nn.Embedding.from_pretrained(embedding_stack, freeze=False)

        # Decoder LSTM; same layer count, width and directions as the encoder
        # so the encoder's final states can be used as its initial states.
        self.lstm_text = nn.LSTM(input_size=embedding_dim, hidden_size=256, batch_first=True, num_layers=2, bidirectional=True)

        # Projects the decoder output (2 directions * 256) to vocabulary logits.
        self.fc = nn.Linear(256*2, vocab_size)

        self._initialize_weights()

        self.teacher_forcing_ratio = teacher_forcing_ratio

    def _initialize_weights(self):
        """Xavier-initialise Linear and LSTM weights and zero their biases.

        Convolutional layers (the pretrained ResNet trunk) and the embedding
        are not touched.
        """
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.LSTM):
                for name, param in m.named_parameters():
                    if 'weight' in name:
                        nn.init.xavier_uniform_(param)
                    elif 'bias' in name:
                        nn.init.constant_(param, 0)

    def forward(self, x_video, sentence):
        """Decode a caption for each clip.

        Args:
            x_video: Float tensor laid out channels-last as
                ``(batch, frames, height, width, channels)``.
            sentence: Long tensor of target token ids, ``(batch, seq_len)``
                (a singleton second dimension is squeezed away). It supplies
                the first decoder input, the number of decoding steps and,
                with teacher forcing, the inputs of the later steps.

        Returns:
            Logits of shape ``(batch, seq_len, vocab_size)``; the logits of
            step ``t`` are meant to be compared with ``sentence[:, t]``.

        NOTE(review): the teacher-forcing draw does not depend on
        ``self.training``, so ground-truth tokens may also be fed while the
        model is in eval mode - confirm that this is intended.
        """
        # --- video
        batch_size, frames, height, width, channels = x_video.size()
        # The input is channels-last but the CNN expects (N, C, H, W). The
        # channel axis has to be moved with permute(); a plain view() would
        # only reinterpret the memory and scramble the pixels of every frame.
        x_video = x_video.permute(0, 1, 4, 2, 3).reshape(batch_size * frames, channels, height, width)

        # Per-frame CNN features, pooled to one 2048-d vector per frame.
        cnn_features = self.resnet(x_video)
        cnn_features = self.resnet_avgpool(cnn_features)
        cnn_features = cnn_features.view(batch_size, frames, -1)  # (batch, frames, 2048)

        # -- encoder LSTM: only the final states are used; they seed the decoder.
        _, (h_n, c_n) = self.lstm_video(cnn_features)

        # --- sentence
        sentence = sentence.squeeze(1)          # (batch, seq_len)
        embeddings = self.embedding(sentence)   # (batch, seq_len, embedding_dim)

        # -- decoder
        decoder_outputs = []
        # The first decoder input is the embedding of the first token (<SOS>).
        decoder_input = embeddings[:,0,:].unsqueeze(1)  # (batch, 1, embedding_dim)

        # One draw decides the feeding strategy for the whole batch.
        use_teacher_forcing = torch.rand(1).item() < self.teacher_forcing_ratio

        for t in range(sentence.size(1)):
            lstm_text_output, (h_n, c_n) = self.lstm_text(decoder_input, (h_n, c_n))

            output = self.fc(lstm_text_output.squeeze(1))  # (batch, vocab_size)
            decoder_outputs.append(output)

            if use_teacher_forcing:
                # Teacher forcing: the next input is the ground-truth token
                # at position t (the target of the step just computed).
                decoder_input = embeddings[:,t,:].unsqueeze(1)
            else:
                # Free running: feed back the model's own most likely token.
                decoder_input = self.embedding(output.argmax(1)).unsqueeze(1)

        decoder_outputs = torch.stack(decoder_outputs, dim=1)

        return decoder_outputs

### 5. 訓練 Training

In [None]:
# 建立訓練方法

import torch
import torch.nn as nn
from torch.optim import Adam

def train(model, optimizer, mini_trains, device):
    """Train ``model`` for ``NUM_EPOCHS`` epochs and print progress.

    Relies on the module-level ``NUM_EPOCHS``, ``loss_function`` (applied to
    log-probabilities) and ``reversed_indices`` (id -> token, used to print a
    sample prediction).

    Args:
        model: Called as ``model(videos, sentences)``; returns logits of shape
            ``(batch, seq_len, vocab)``.
        optimizer: Optimizer over the model's parameters.
        mini_trains: Iterable of ``(videos, sentences)`` batches.
        device: Device the batches are moved to.

    Every 10th epoch the mean batch loss of that epoch is printed together
    with the words predicted for the first sample of the epoch's last batch.
    At the end the mean loss over all processed batches is printed.
    """
    start = time.time()
    # ****************************

    model.train()
    total_loss = 0
    total_batches = 0

    for epoch in range(NUM_EPOCHS):
        cost = 0
        epoch_batches = 0
        last_predicted_ids = None

        for videos, sentences in mini_trains:

            videos = videos.float().to(device)
            sentences = sentences.long().to(device)   # the loss needs integer class ids

            # Forward pass: logits -> log-probabilities.
            outputs = model(videos, sentences)
            probabilities = nn.functional.log_softmax(outputs, dim = -1)

            # Flatten to (batch * seq_len, vocab) and (batch * seq_len,).
            sentence_flatten = torch.flatten(sentences, end_dim=-1)
            probabilities_flatten = torch.flatten(probabilities, end_dim=1)

            loss = loss_function(probabilities_flatten, sentence_flatten)

            # Clear the previous gradients, back-propagate, update parameters.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            total_loss += batch_loss
            cost += batch_loss
            total_batches += 1
            epoch_batches += 1

            # Keep only the predicted ids of the first sample; they are turned
            # into words when actually printed, which avoids a per-token
            # device synchronisation on every batch.
            last_predicted_ids = outputs[0].detach().argmax(dim=-1)

        if epoch % 10 == 0:
            epoch_cost = cost / epoch_batches if epoch_batches else float('nan')
            print(f'Epoch: {epoch}, Cost: {epoch_cost}')
            if last_predicted_ids is not None:
                predicted_words = [reversed_indices[idx.item()] for idx in last_predicted_ids]
                print(f"'predicted_words: ' {predicted_words}")

    # Mean over every batch of every epoch. Dividing the sum over all epochs
    # by the batches of a single epoch inflates the value by NUM_EPOCHS.
    avg_loss = total_loss / total_batches if total_batches else float('nan')
    print(f'Triaining loss: ', avg_loss)

    # ****************************
    end = time.time()
    print(f'This function took {end-start} second to complete.')

### 5.1 訓練開始

In [None]:
# Start training: print the configuration, build the model and run train()

print(f'詞向量的形狀', embedding_stack_pt.shape)
print(f'輸入層大小:', EMBEDDING_DIM)
print(f'詞彙表大小:', VOCAB_SIZE)
print(f'是否有用到 GPU:', device)
print(f'Teacher Forcing Ratio:', TEACHER_FORCING_RATIO)
# print(f'詞索引表:', word_index)

model = VideoTextModel(embedding_stack_pt, EMBEDDING_DIM, VOCAB_SIZE, TEACHER_FORCING_RATIO).to(device)
optimizer = Adam(model.parameters())
# loss_function = nn.CrossEntropyLoss()
# NLLLoss expects log-probabilities; train() and validate() apply log_softmax
# before calling it. No ignore_index is passed here.
loss_function = nn.NLLLoss()

train(model, optimizer, mini_trains, device)

詞向量的形狀 torch.Size([3238, 300])
輸入層大小: 300
詞彙表大小: 3237
是否有用到 GPU: cuda
Teacher Forcing Ratio: 1.0


Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 224MB/s]


Epoch: 0, Cost: 4.880536295749523
'predicted_words: ' ['<SOS>', 'so', 'you', 'to', 'to', '<PAD>', 'to', '<PAD>', 'to', 'to', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
Epoch: 10, Cost: 2.813847855285362
'predicted_words: ' ['<SOS>', 'so', 'i', 'you', 'way', 'thing', 'is', 'the', 'hair', 'of', 'of', 'can', 'to', 'out', 'and', 'the', 'back', 'of', '<PAD>', '<PAD>', '<PAD>', '<EOS>']
Epoch: 20, Cost: 1.9080039880893849
'predicted_words: ' ['<SOS>', 'so', 'lot', 'is', 'is', 'going', 'that', 'little', 'recording', 'you', 'a', 'little', 'input', 'or', 'the', 'little', 'inch', 'length', 'depending', 'on', 'the', '<EOS>']
Epoch: 30, Cost: 1.1922600975743047
'predicted_words: ' ['<SOS>', 'and', 'then', 'you', 'a', 'lot', 'of', 'people', 'strength', 'involved', 'to', 'stabilize', 'the', 'dumbbell', 'right', 'above', 'his', 'hips', '<PAD>', '<PAD>', '<PAD>', '<EOS>']
Epoch: 40, Cost: 0.7486818002329932
'predicted_words: ' ['<SOS>', 

### 6. 驗證 Validation

In [None]:
# 建立驗證方法

import torch
from torch.utils.data import DataLoader

def validate(model, mini_vals, device):
    """Evaluate ``model`` on the validation batches and print the results.

    Relies on the module-level ``loss_function`` (applied to
    log-probabilities) and ``reversed_indices`` (id -> token).

    Args:
        model: Called as ``model(videos, sentences)``; returns logits of shape
            ``(batch, seq_len, vocab)``.
        mini_vals: Iterable of ``(videos, sentences)`` batches.
        device: Device the batches are moved to.

    For every batch the words predicted for its first sample are printed.
    Afterwards the mean batch loss and the token-level accuracy are printed;
    the accuracy compares every position of every caption with the target.
    """
    model.eval()
    running_loss = 0
    matched_tokens = 0
    seen_tokens = 0

    with torch.no_grad():
        for clips, captions in mini_vals:

            clips = clips.float().to(device)
            captions = captions.long().to(device)

            # Forward pass: logits -> log-probabilities -> most likely ids.
            logits = model(clips, captions)
            log_probs = nn.functional.log_softmax(logits, dim = -1)
            best_ids = torch.argmax(log_probs, dim = -1)

            # Words predicted for the first sample of the batch.
            first_sentence = [reversed_indices[token_id.item()] for token_id in best_ids[0]]

            # Flatten to (batch * seq_len, vocab) and (batch * seq_len,).
            flat_log_probs = torch.flatten(log_probs, end_dim=1)
            flat_targets = torch.flatten(captions, end_dim=-1)

            running_loss += loss_function(flat_log_probs, flat_targets).item()

            # Count the positions where the prediction equals the target.
            matched_tokens += best_ids.eq(captions).sum().item()
            seen_tokens += flat_targets.size(0)

            print(f"'predicted_words: ' {first_sentence}")

    avg_loss = running_loss / len(mini_vals)
    accuracy = matched_tokens / seen_tokens

    print(f'Validation loss: {avg_loss}')
    print(f'Validation accuracy: {accuracy * 100:.2f}%')

#### 6.1 驗證開始

In [None]:
# Run validation with the trained model on the validation DataLoader
validate(model, mini_vals, device)

'predicted_words: ' ['<SOS>', 'you', 'dress', 'for', 'wind', 'chill', 'not', 'for', 'the', 'temperature', 'that', 'you', 'have', 'outside', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<EOS>']
'predicted_words: ' ['<SOS>', 'this', 'will', 'get', 'you', 'through', 'a', 'camping', 'trip', 'but', 'youll', 'probably', 'want', 'to', 'replace', 'your', 'pole', 'after', 'that', '<PAD>', '<PAD>', '<EOS>']
'predicted_words: ' ['<SOS>', 'how', 'does', 'a', 'snake', 'grow', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<EOS>']
'predicted_words: ' ['<SOS>', 'and', 'let', 'me', 'show', 'you', 'with', 'a', 'wall', 'just', 'in', 'case', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<EOS>']
'predicted_words: ' ['<SOS>', 'this', 'brings', 'up', 'the', 'visual', 'basic', 'code', 'editor', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '