In [1]:
import tqdm
import random
import pathlib
import itertools
import collections

import os
import cv2
import numpy as np
import remotezip as rz

import tensorflow as tf


In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # 현재 프로그램이 필요에 따라 점진적으로 메모리를 할당하도록 설정
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # 메모리 증가를 설정하는 중 예외 발생
        print(e)

In [3]:
URL = 'https://storage.googleapis.com/thumos14_files/UCF101_videos.zip'

In [4]:
def list_files_from_zip_url(zip_url):

  files = []
  with rz.RemoteZip(zip_url) as zip:
    for zip_info in zip.infolist():
      files.append(zip_info.filename)
  return files

In [5]:
files = list_files_from_zip_url(URL)
files = [f for f in files if f.endswith('.avi')]
files[:10]

['UCF101/v_ApplyEyeMakeup_g01_c01.avi',
 'UCF101/v_ApplyEyeMakeup_g01_c02.avi',
 'UCF101/v_ApplyEyeMakeup_g01_c03.avi',
 'UCF101/v_ApplyEyeMakeup_g01_c04.avi',
 'UCF101/v_ApplyEyeMakeup_g01_c05.avi',
 'UCF101/v_ApplyEyeMakeup_g01_c06.avi',
 'UCF101/v_ApplyEyeMakeup_g02_c01.avi',
 'UCF101/v_ApplyEyeMakeup_g02_c02.avi',
 'UCF101/v_ApplyEyeMakeup_g02_c03.avi',
 'UCF101/v_ApplyEyeMakeup_g02_c04.avi']

In [6]:
def get_class(fname):

  return fname.split('_')[-3]

In [7]:
def get_files_per_class(files):

  files_for_class = collections.defaultdict(list)
  for fname in files:
    class_name = get_class(fname)
    files_for_class[class_name].append(fname)
  return files_for_class

In [79]:
NUM_CLASSES = 101
FILES_PER_CLASS = 50

In [80]:
files_for_class = get_files_per_class(files)
classes = list(files_for_class.keys())

In [81]:
print('Num classes:', len(classes))
print('Num videos for class[0]:', len(files_for_class[classes[0]]))

Num classes: 101
Num videos for class[0]: 145


In [82]:
def select_subset_of_classes(files_for_class, classes, files_per_class):

  files_subset = dict()

  for class_name in classes:
    class_files = files_for_class[class_name]
    files_subset[class_name] = class_files[:files_per_class]

  return files_subset

In [83]:
files_subset = select_subset_of_classes(files_for_class, classes[:NUM_CLASSES], FILES_PER_CLASS)
list(files_subset.keys())

['ApplyEyeMakeup',
 'ApplyLipstick',
 'Archery',
 'BabyCrawling',
 'BalanceBeam',
 'BandMarching',
 'BaseballPitch',
 'BasketballDunk',
 'Basketball',
 'BenchPress',
 'Biking',
 'Billiards',
 'BlowDryHair',
 'BlowingCandles',
 'BodyWeightSquats',
 'Bowling',
 'BoxingPunchingBag',
 'BoxingSpeedBag',
 'BreastStroke',
 'BrushingTeeth',
 'CleanAndJerk',
 'CliffDiving',
 'CricketBowling',
 'CricketShot',
 'CuttingInKitchen',
 'Diving',
 'Drumming',
 'Fencing',
 'FieldHockeyPenalty',
 'FloorGymnastics',
 'FrisbeeCatch',
 'FrontCrawl',
 'GolfSwing',
 'Haircut',
 'Hammering',
 'HammerThrow',
 'HandstandPushups',
 'HandstandWalking',
 'HeadMassage',
 'HighJump',
 'HorseRace',
 'HorseRiding',
 'HulaHoop',
 'IceDancing',
 'JavelinThrow',
 'JugglingBalls',
 'JumpingJack',
 'JumpRope',
 'Kayaking',
 'Knitting',
 'LongJump',
 'Lunges',
 'MilitaryParade',
 'Mixing',
 'MoppingFloor',
 'Nunchucks',
 'ParallelBars',
 'PizzaTossing',
 'PlayingCello',
 'PlayingDaf',
 'PlayingDhol',
 'PlayingFlute',
 'Play

In [84]:
def download_from_zip(zip_url, to_dir, file_names):

  with rz.RemoteZip(zip_url) as zip:
    for fn in tqdm.tqdm(file_names):
      class_name = get_class(fn)
      zip.extract(fn, str(to_dir / class_name))
      unzipped_file = to_dir / class_name / fn

      fn = pathlib.Path(fn).parts[-1]
      output_file = to_dir / class_name / fn
      unzipped_file.rename(output_file)

In [85]:
def split_class_lists(files_for_class, count):

  split_files = []
  remainder = {}
  for cls in files_for_class:
    split_files.extend(files_for_class[cls][:count])
    remainder[cls] = files_for_class[cls][count:]
  return split_files, remainder

In [86]:
def download_ufc_101_subset(zip_url, num_classes, splits, download_dir):

  files = list_files_from_zip_url(zip_url)
  for f in files:
    path = os.path.normpath(f)
    tokens = path.split(os.sep)
    if len(tokens) <= 2:
      files.remove(f) # Remove that item from the list if it does not have a filename
  
  files_for_class = get_files_per_class(files)

  classes = list(files_for_class.keys())[:num_classes]

  for cls in classes:
    random.shuffle(files_for_class[cls])
    
  # Only use the number of classes you want in the dictionary
  files_for_class = {x: files_for_class[x] for x in classes}

  dirs = {}
  for split_name, split_count in splits.items():
    print(split_name, ":")
    split_dir = download_dir / split_name
    split_files, files_for_class = split_class_lists(files_for_class, split_count)
    download_from_zip(zip_url, split_dir, split_files)
    dirs[split_name] = split_dir

  return dirs

In [None]:
download_dir = pathlib.Path('./UCF101_subset/')
subset_paths = download_ufc_101_subset(URL,
                                       num_classes = NUM_CLASSES,
                                       splits = {"train": 30, "val": 10, "test": 10},
                                       download_dir = download_dir)

In [18]:

donwload_path = pathlib.Path('./UCF101_subset/')

video_count_train = len(list(donwload_path.glob('train/*/*.avi')))
video_count_val = len(list(donwload_path.glob('val/*/*.avi')))
video_count_test = len(list(donwload_path.glob('test/*/*.avi')))
from pathlib import Path

subset_paths = {
    'train': Path('UCF101_subset/train'),
    'val': Path('UCF101_subset/val'),
    'test': Path('UCF101_subset/test')
}


video_total = video_count_train + video_count_val + video_count_test
print(f"Total videos: {video_total}")

Total videos: 5050


In [19]:
video_count_train = len(list(download_dir.glob('train/*/*.avi')))
video_count_val = len(list(download_dir.glob('val/*/*.avi')))
video_count_test = len(list(download_dir.glob('test/*/*.avi')))
video_total = video_count_train + video_count_val + video_count_test
print(f"Total videos: {video_total}")

Total videos: 5050


In [20]:
def format_frames(frame, output_size):
  frame = tf.image.convert_image_dtype(frame, tf.float32)
  frame = tf.image.resize_with_pad(frame, *output_size)
  return frame

In [21]:
def frames_from_video_file(video_path, n_frames, output_size = (224,224), frame_step = 2):

  # Read each video frame by frame
  result = []
  src = cv2.VideoCapture(str(video_path))  

  video_length = src.get(cv2.CAP_PROP_FRAME_COUNT)

  need_length = 1 + (n_frames - 1) * frame_step

  if need_length > video_length:
    start = 0
  else:
    max_start = video_length - need_length
    start = random.randint(0, max_start + 1)

  src.set(cv2.CAP_PROP_POS_FRAMES, start)
  # ret is a boolean indicating whether read was successful, frame is the image itself
  ret, frame = src.read()
  result.append(format_frames(frame, output_size))

  for _ in range(n_frames - 1):
    for _ in range(frame_step):
      ret, frame = src.read()
    if ret:
      frame = format_frames(frame, output_size)
      result.append(frame)
    else:
      result.append(np.zeros_like(result[0]))
  src.release()
  result = np.array(result)[..., [2, 1, 0]]

  return result

In [49]:
def frames_from_video_file(video_path, n_frames, output_size=(224, 224), frame_step=2):


    result = []
    src = cv2.VideoCapture(str(video_path))

    video_length = int(src.get(cv2.CAP_PROP_FRAME_COUNT))

    if video_length < n_frames * frame_step:
        print(f"Video length is too short to extract {n_frames} frames. Skipping video: {video_path}")
        return None

    start = random.randint(0, video_length - n_frames * frame_step + 1)
    src.set(cv2.CAP_PROP_POS_FRAMES, start)

    for _ in range(n_frames):
        ret, frame = src.read()
        if ret:
            frame = cv2.resize(frame, output_size)
            result.append(frame)
        else:
            print(f"Error reading frame. Skipping video: {video_path}")
            return None

        # Move to next frame_step
        for _ in range(frame_step - 1):
            src.read()

    src.release()
    result = np.array(result)
    return result

In [60]:
class FrameGenerator:
  def __init__(self, path, n_frames):

      self.path = path
      self.n_frames = n_frames
      self.class_names = sorted(set(p.name for p in self.path.iterdir() if p.is_dir()))
      self.class_ids_for_name = dict((name, idx) for idx, name in enumerate(self.class_names))

  def get_files_and_class_names(self):
      video_paths = list(self.path.glob('*/*.avi'))
      classes = [p.parent.name for p in video_paths] 
      return video_paths, classes

  def __call__(self):
      video_paths, classes = self.get_files_and_class_names()

      pairs = list(zip(video_paths, classes))

      random.shuffle(pairs)

      for path, name in pairs:
          video_frames = frames_from_video_file(path, self.n_frames) 
          label = self.class_ids_for_name[name]  
          yield video_frames, np.int32(label) 

In [61]:
# Create the training set
output_signature = (tf.TensorSpec(shape = (None, None, None, 3), dtype = tf.float32),
                    tf.TensorSpec(shape = (), dtype = tf.int16))
train_ds = tf.data.Dataset.from_generator(FrameGenerator(subset_paths['train'], 10),
                                          output_signature = output_signature)

In [62]:
# Create the validation set
val_ds = tf.data.Dataset.from_generator(FrameGenerator(subset_paths['val'], 10),
                                        output_signature = output_signature)

In [63]:
# Print the shapes of the data
train_frames, train_labels = next(iter(train_ds))
print(f'Shape of training set of frames: {train_frames.shape}')
print(f'Shape of training labels: {train_labels.shape}')

val_frames, val_labels = next(iter(val_ds))
print(f'Shape of validation set of frames: {val_frames.shape}')
print(f'Shape of validation labels: {val_labels.shape}')

Shape of training set of frames: (10, 224, 224, 3)
Shape of training labels: ()
Shape of validation set of frames: (10, 224, 224, 3)
Shape of validation labels: ()


In [64]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().shuffle(1000).prefetch(buffer_size = AUTOTUNE)
val_ds = val_ds.cache().shuffle(1000).prefetch(buffer_size = AUTOTUNE)

In [65]:
train_ds = train_ds.batch(2)
val_ds = val_ds.batch(2)

train_frames, train_labels = next(iter(train_ds))
print(f'Shape of training set of frames: {train_frames.shape}')
print(f'Shape of training labels: {train_labels.shape}')

val_frames, val_labels = next(iter(val_ds))
print(f'Shape of validation set of frames: {val_frames.shape}')
print(f'Shape of validation labels: {val_labels.shape}')

Shape of training set of frames: (2, 10, 224, 224, 3)
Shape of training labels: (2,)
Shape of validation set of frames: (2, 10, 224, 224, 3)
Shape of validation labels: (2,)


In [117]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import ResNet152
import math

class Encoder(models.Model):
    def __init__(self, latent_dim):
        print('Encoder')
        super(Encoder, self).__init__()
        self.resnet = ResNet152(include_top=False, weights='imagenet', input_shape=(224, 224, 3), pooling='avg')
        self.resnet.trainable = False
        # Wrap the ResNet model with TimeDistributed to handle the frames dimension
        self.time_distributed_resnet = layers.TimeDistributed(self.resnet)
        self.time_distributed_dense = layers.TimeDistributed(layers.Dense(latent_dim))
        self.time_distributed_bn = layers.TimeDistributed(layers.BatchNormalization(momentum=0.01))

    def call(self, inputs):
        x = self.time_distributed_resnet(inputs)
        x = self.time_distributed_dense(x)
        x = self.time_distributed_bn(x)
        return x


class LSTMModel(models.Model):
    def __init__(self, latent_dim, num_layers, hidden_dim, bidirectional=True):
        print('LSTM')
        super(LSTMModel, self).__init__()
        if bidirectional:
            # Bidirectional LSTM layer
            self.lstm = layers.Bidirectional(layers.LSTM(hidden_dim, return_sequences=True))
        else:
            # Unidirectional LSTM layer
            self.lstm = layers.LSTM(hidden_dim, return_sequences=True)

    def call(self, inputs):
        return self.lstm(inputs)


class AttentionModule(models.Model):
    def __init__(self, latent_dim, hidden_dim, attention_dim):
        print('Attention')
        super(AttentionModule, self).__init__()
        self.latent_attention = layers.Dense(attention_dim)
        self.hidden_attention = layers.Dense(attention_dim)
        self.joint_attention = layers.Dense(1)

    
    def call(self, latent_repr, hidden_repr):
        h_t = hidden_repr[:, -1, :]  
        latent_att = self.latent_attention(latent_repr)
        hidden_att = self.hidden_attention(h_t)


        hidden_att = tf.expand_dims(hidden_att, 1)  

        joint_att = self.joint_attention(layers.ReLU()(latent_att + hidden_att))
        attention_w = tf.nn.softmax(joint_att, axis=1)
        return attention_w


class ConvLSTM(models.Model):
    def __init__(self, num_classes, latent_dim=512, lstm_layers=1, hidden_dim=1024, bidirectional=True, attention=True):
        print('ConvLSTM')
        super(ConvLSTM, self).__init__()
        self.encoder = Encoder(latent_dim)
        self.lstm = LSTMModel(latent_dim, lstm_layers, hidden_dim, bidirectional)
        self.output_layers = models.Sequential([
            layers.Dense(hidden_dim if bidirectional else hidden_dim // 2),
            layers.BatchNormalization(momentum=0.01),
            layers.ReLU(),
            layers.Dense(num_classes, activation='softmax'),
        ])
        self.attention = attention
        if attention:
            self.attention_module = AttentionModule(latent_dim, hidden_dim, 1)

    def call(self, inputs):
        x = self.encoder(inputs)
        x = self.lstm(x)
        if self.attention:
            attention_w = self.attention_module(x, x)
            x = tf.reduce_sum(x * attention_w, axis=1)
        else:
            x = x[:, -1, :] 
        return self.output_layers(x)


class ConvClassifier(models.Model):
    def __init__(self, num_classes, latent_dim=512):
        print('ConvClassifier')
        super(ConvClassifier, self).__init__()
        self.encoder = Encoder(latent_dim)
        self.final = models.Sequential([
            layers.Dense(latent_dim),
            layers.BatchNormalization(momentum=0.01),
            layers.ReLU(),
            layers.Dense(num_classes
            , activation='softmax'),
        ])

    def call(self, inputs):
        x = self.encoder(inputs)
        return self.final(x)
    
# 모델 생성 및 컴파일
conv_lstm_model = ConvLSTM(num_classes=101)
conv_lstm_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True), metrics = ['accuracy'])
def scheduler(epoch, lr):
    if epoch > 0 and epoch % 15 == 0:
        return lr * math.exp(-0.1 * (epoch // 15))
    else:
        return lr

# LearningRateScheduler 콜백 생성
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)

# 모델 컴파일
conv_lstm_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.000005),
                        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                        metrics=['accuracy'])

# 모델 학습
history = conv_lstm_model.fit(train_ds, 
                    epochs=100,
                    validation_data=val_ds,
                    callbacks=[tf.keras.callbacks.EarlyStopping(patience=20, monitor='val_loss'), lr_scheduler],
                    batch_size=16)


ConvLSTM
Encoder
LSTM
Attention
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100


In [120]:

test_ds = tf.data.Dataset.from_generator(FrameGenerator(subset_paths['test'], 10),
                                        output_signature = output_signature)
test_frames, test_labels = next(iter(test_ds))
print(f'Shape of validation set of frames: {test_frames.shape}')
print(f'Shape of validation labels: {test_frames.shape}')
test_ds = val_ds.cache().shuffle(1000).prefetch(buffer_size = AUTOTUNE)
test_frames, test_labels = next(iter(test_ds))
test_loss, test_accuracy = conv_lstm_model.evaluate(test_ds)
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_accuracy}')

Shape of validation set of frames: (10, 224, 224, 3)
Shape of validation labels: (10, 224, 224, 3)
Test Loss: 3.072455883026123
Test Accuracy: 0.5653465390205383


In [121]:
conv_lstm_model.save('my_model_path', save_format='tf')




INFO:tensorflow:Assets written to: my_model_path\assets


INFO:tensorflow:Assets written to: my_model_path\assets
