In [None]:
!ls ../yt_label/

## Deep learning models (neural network for computer vision + LSTM) that label YouTube videos by genre. ResNet is used to extract video-level features, and an LSTM/GRU encodes the sequential strings (audio) through word embeddings. The outputs of both models are later concatenated and fed into a fully connected network that outputs the video's genre label (e.g. Games, Arts & Entertainment, etc.).

In [None]:
import io
import os
import gc  
import csv
import time
import random
import operator
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import urllib.request
import seaborn as sns
from numpy import array
import tensorflow as tf
import plotly.plotly as py
import matplotlib.pyplot as plt
from IPython.display import YouTubeVideo

In [None]:
# Load the label-names CSV for the dataset.
url = 'https://raw.githubusercontent.com/rchavezj/Label_YT_Videos/master/v2/label_names_2018.csv'
labels_df = pd.read_csv(url)
# Report the shape that was actually loaded rather than a hard-coded claim
# (the file is expected to have 3862 rows × 2 columns).
print("{} rows × {} columns".format(*labels_df.shape))

# Videos
Below, the downloaded video-level files are located and loaded.
** **

In [None]:
# print(os.listdir("../yt_label/video/"))

In [None]:
# Gather video-level record files. Only the first 100 directory entries are
# inspected (matching or not); an entry is kept when its name contains one of
# the split tags below.
# Alternative location used previously: "../yt_label/video/".
str_set = ["train", "test", "validate"]
inspected_entries = os.listdir("/Users/user/yt8m/v2/video/")[:100]
video_files = [
    "/Users/user/yt8m/v2/video/{}".format(entry)
    for entry in inspected_entries
    if any(tag in format(entry) for tag in str_set)
]
# Number of directory entries that were inspected (at most 100).
batch_itor = len(inspected_entries)

In [None]:
# Display the list of video-level file paths collected above.
video_files

In [None]:
# Parse every record of every video-level file and collect, per video:
# its id, its label ids, and its mean_rgb / mean_audio feature vectors.
vid_ids, labels, mean_rgb, mean_audio = [], [], [], []
for record_path in video_files:
    for serialized in tf.python_io.tf_record_iterator(record_path):
        tf_example = tf.train.Example.FromString(serialized)
        feature_map = tf_example.features.feature
        raw_id = feature_map['id'].bytes_list.value[0]
        vid_ids.append(raw_id.decode(encoding='UTF-8'))
        labels.append(feature_map['labels'].int64_list.value)
        mean_rgb.append(feature_map['mean_rgb'].float_list.value)
        mean_audio.append(feature_map['mean_audio'].float_list.value)
# Stack the per-video feature lists into numpy arrays (one row per video).
mean_rgb = array(mean_rgb)
mean_audio = array(mean_audio)

In [None]:
# Summarise the loaded video-level data, then look at one example in detail.
print("mean_rgb_shape: ", mean_rgb.shape)
print("mean_audio_shape: ", mean_audio.shape)
print('Number of videos in Sample data set: %s' % str(len(vid_ids)))
# Index of the example video inspected below (requires at least 14 videos).
sample_idx = 13
sample_id = vid_ids[sample_idx]
print('Picking a youtube video id: %s' % sample_id)
print('List of label ids for youtube video id %s, are - %s' % (sample_id, str(labels[sample_idx])))
print('First 20 rgb feature of a youtube video (',sample_id,'): \n%s' % str(mean_rgb[sample_idx][:20]))

# Frames
** **

In [None]:
# Gather frame-level record files. Only the first 5 directory entries are
# inspected (matching or not); an entry is kept when its name contains "train".
# Alternative location used previously: "../yt_label/frame".
str_set = ["train"]
inspected_entries = os.listdir("../v2/frame")[:5]
frame_files = [
    "../v2/frame/{}".format(entry)
    for entry in inspected_entries
    if any(tag in format(entry) for tag in str_set)
]
# Number of directory entries that were inspected (at most 5).
batch_itor = len(inspected_entries)

In [None]:
# Display the list of frame-level file paths collected above.
frame_files

In [None]:
# Number of frames kept per video; longer videos are truncated and shorter
# ones are zero-padded so every video yields the same sequence length.
frames_per_video = 120


def _decode_frame_sequence(frames, max_frames):
    """Decode raw uint8 frame features into a list of float32 vectors.

    Args:
        frames: repeated ``Feature`` entries of one feature list; each entry
            holds one frame's feature bytes in ``bytes_list.value[0]``.
        max_frames: exact number of vectors to return.

    Returns:
        A list of ``max_frames`` float32 numpy vectors. Frames beyond
        ``max_frames`` are dropped; missing frames are filled with zero
        vectors of the same length as the first frame.
    """
    n_used = min(len(frames), max_frames)
    decoded = [
        # Same values as tf.cast(tf.decode_raw(raw, tf.uint8), tf.float32),
        # but without building graph ops or needing a session.
        np.frombuffer(frames[idx].bytes_list.value[0], dtype=np.uint8).astype(np.float32)
        for idx in range(n_used)
    ]
    padding = [np.zeros_like(decoded[0]) for _ in range(max_frames - n_used)]
    return decoded + padding


feat_rgb = []
feat_audio = []
for file in frame_files:
    for example in tf.python_io.tf_record_iterator(file):
        tf_seq_example = tf.train.SequenceExample.FromString(example)
        feature_lists = tf_seq_example.feature_lists.feature_list
        rgb_frames = feature_lists['rgb'].feature
        audio_frames = feature_lists['audio'].feature
        n_frames = len(audio_frames)
        if n_frames == 0 or len(rgb_frames) == 0:
            # Nothing to decode (and no vector length to pad with): try the next record.
            continue
        feat_rgb.append(_decode_frame_sequence(rgb_frames, frames_per_video))
        feat_audio.append(_decode_frame_sequence(audio_frames, frames_per_video))
        # Only the first usable record of each file is read.
        break
feat_rgb = array(feat_rgb)
feat_audio = array(feat_audio)

In [None]:
# Report how many videos were decoded and the shapes of the frame-level arrays.
print("No. of videos %d" % len(feat_rgb))
print("feat_rgb_shape: ", feat_rgb.shape)
print("feat_audio_shape: ", feat_audio.shape)
# Indexing feat_rgb[0] requires at least one decoded video.
print('The first video has %d frames' %len(feat_rgb[0]))
print("Max frame length is: %d" % max([len(x) for x in feat_rgb]))

In [None]:
# Sequence lengths and per-frame feature sizes.
max_frame_rgb_sequence_length = 120
frame_rgb_embedding_size = 1024
max_frame_audio_sequence_length = 120
frame_audio_embedding_size = 128

# Network sizes and regularisation settings.
number_dense_units = 1000
number_lstm_units = 100
rate_drop_lstm = 0.2
rate_drop_dense = 0.2
activation_function = 'relu'

# Fraction of samples held out for validation, and number of label classes.
validation_split_ratio = 0.2
label_feature_size = 3862

In [None]:
def create_train_dev_dataset(video_rgb, video_audio, frame_rgb, frame_audio, labels, split_ratio=None):
    """
    Shuffle the samples and split them into training and validation sets.

    One random permutation of ``range(len(labels))`` is applied to all five
    inputs, so row ``i`` of every returned array refers to the same sample.
    Only the first ``len(labels)`` rows of each input can be selected; inputs
    with fewer rows than ``labels`` raise ``IndexError``.

    Args:
        video_rgb, video_audio: video-level features, indexed by sample on axis 0.
        frame_rgb, frame_audio: frame-level features, indexed by sample on axis 0.
        labels: targets, indexed by sample on axis 0.
        split_ratio: fraction of samples placed in the validation set. When
            ``None`` (default) the module-level ``validation_split_ratio`` is used.

    Returns:
        Tuple ``(train_video_rgb, train_video_audio, train_frame_rgb,
        train_frame_audio, train_labels, val_video_rgb, val_video_audio,
        val_frame_rgb, val_frame_audio, val_labels)``. The validation set
        always holds at least one sample.
    """
    if split_ratio is None:
        split_ratio = validation_split_ratio

    n_samples = len(labels)
    shuffle_indices = np.random.permutation(np.arange(n_samples))
    # Hold out at least one sample even for very small inputs.
    dev_idx = max(1, int(n_samples * split_ratio))

    def _shuffle_and_split(data):
        # np.asarray lets plain Python lists be indexed with the permutation too.
        shuffled = np.asarray(data)[shuffle_indices]
        return shuffled[:-dev_idx], shuffled[-dev_idx:]

    train_video_rgb, val_video_rgb = _shuffle_and_split(video_rgb)
    train_video_audio, val_video_audio = _shuffle_and_split(video_audio)
    train_frame_rgb, val_frame_rgb = _shuffle_and_split(frame_rgb)
    train_frame_audio, val_frame_audio = _shuffle_and_split(frame_audio)
    train_labels, val_labels = _shuffle_and_split(labels)

    return (train_video_rgb, train_video_audio, train_frame_rgb, train_frame_audio, train_labels, val_video_rgb, val_video_audio,
            val_frame_rgb, val_frame_audio, val_labels)

In [None]:
# Build placeholder one-hot targets: one row per decoded frame-level video,
# 3862 columns, with a single 1 placed at a random column index in 0..9.
# Note: this replaces the label ids parsed from the records with random targets.
sample_length = len(feat_rgb)
labels = np.zeros((sample_length, 3862))
for label_row in labels:
    # Each row is a view into `labels`, so the assignment updates the matrix.
    label_row[random.randint(0, 9)] = 1

In [None]:
# Stand-alone shuffle/split of the labels, using the same rule as
# create_train_dev_dataset: the last `dev_idx` shuffled rows form the
# validation set (at least one row), the rest form the training set.
shuffle_indices = np.random.permutation(np.arange(len(labels)))
labels_shuffled = labels[shuffle_indices]
dev_idx = max(1, int(len(labels_shuffled) * validation_split_ratio))
train_labels, val_labels = labels_shuffled[:-dev_idx], labels_shuffled[-dev_idx:]

In [None]:
# Shuffle and split all feature arrays and the labels in one call, then
# unpack the ten returned arrays (five training, five validation).
split_arrays = create_train_dev_dataset(mean_rgb, mean_audio, feat_rgb, feat_audio, labels)
(
    train_video_rgb,
    train_video_audio,
    train_frame_rgb,
    train_frame_audio,
    train_labels,
    val_video_rgb,
    val_video_audio,
    val_frame_rgb,
    val_frame_audio,
    val_labels,
) = split_arrays

In [None]:
# Print the shape of every training/validation array produced by the split.
print("train_video_rgb.shape: ", train_video_rgb.shape)
print("val_video_rgb.shape: ", val_video_rgb.shape, "\n")
print("train_video_audio.shape: ", train_video_audio.shape)
print("val_video_audio.shape: ", val_video_audio.shape, "\n")
print("train_frame_rgb.shape: ", train_frame_rgb.shape)
print("val_frame_rgb.shape: ", val_frame_rgb.shape, "\n")
print("train_frame_audio.shape: ", train_frame_audio.shape)
# Report the validation array here (the training shape was printed twice before).
print("val_frame_audio.shape: ", val_frame_audio.shape, "\n")
print("train_labels.shape: ", train_labels.shape)
print("val_labels.shape: ", val_labels.shape)

In [None]:
# Frame-level inputs: join rgb and audio features along axis 2.
train_frame_parts = (train_frame_rgb, train_frame_audio)
val_frame_parts = (val_frame_rgb, val_frame_audio)
x_train_np_frame = np.concatenate(train_frame_parts, axis=2)
x_test_np_frame = np.concatenate(val_frame_parts, axis=2)

# Video-level inputs: join rgb and audio features along axis 1.
train_video_parts = (train_video_rgb, train_video_audio)
val_video_parts = (val_video_rgb, val_video_audio)
x_train_np_video = np.concatenate(train_video_parts, axis=1)
x_test_np_video = np.concatenate(val_video_parts, axis=1)

# Importing keras libraries to perform deep learning algorithms
** **

In [None]:
import keras
from keras.utils import plot_model
from keras.layers.recurrent import LSTM
from keras.callbacks import ModelCheckpoint
from keras.layers.embeddings import Embedding
from keras.layers.merge import dot, concatenate
from keras.models import Sequential, Model, load_model
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import LeakyReLU, PReLU
from keras.layers import Dense, Input, LSTM, Dropout, Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.layers import Input, Dense, Dropout, Bidirectional, Add, GlobalMaxPooling1D

# Multi-Bidirectional LSTM for the Frames
** **

In [None]:
# Frame-level network: input of shape (120, 1152) passed through four
# Dense(2304) -> LSTM(2304) stages, then global max pooling over the sequence,
# a Dense(2048) layer and a 3862-way softmax output.
bi_X1 = Input(shape=(120, 1152), name='frame')

# LSTM direction per stage: stages 2 and 4 read the sequence backwards.
# NOTE(review): these are plain LSTM layers with go_backwards toggled, not the
# keras Bidirectional wrapper — confirm this is the intended design.
stage_directions = (False, True, False, True)
stage_input = bi_X1
for stage, backwards in enumerate(stage_directions, start=1):
    dense_out = Dense(2304, activation='relu', name='fc_{}'.format(stage))(stage_input)
    lstm_out = LSTM(2304, return_sequences=True, go_backwards=backwards,
                    name='lstm_{}'.format(stage))(dense_out)
    if stage < len(stage_directions):
        # "Fast merge" connection: add the stage's Dense output to its LSTM output.
        stage_input = Add(name='merge_{}'.format(stage))([dense_out, lstm_out])

# Pooling over the sequence axis of the last LSTM output.
pool = GlobalMaxPooling1D(name='global_max_pool')(lstm_out)
# Fully connected layer with 2048 units.
fc_2048 = Dense(2048, activation='relu', name='fc_2048')(pool)
# Softmax over the 3862 classes.
output = Dense(3862, activation='softmax', name='output')(fc_2048)

In [None]:
# Assemble the frame-level model from its input and output tensors and
# compile it for training.
frame_model = Model(inputs=[bi_X1], outputs=[output])
frame_model.compile(
    optimizer='sgd',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Checkpoint callback: write the weights to a file named after the epoch and
# the validation accuracy, but only when 'val_acc' improves (mode='max').
filepath = "weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [None]:
# Write a diagram of the frame-level model (with tensor shapes) to bidirectional.png.
plot_model(frame_model,to_file='bidirectional.png',show_shapes=True)

In [None]:
# Print the layer-by-layer summary of the frame-level model.
frame_model.summary()

In [None]:
# Train the frame-level model on the concatenated rgb+audio frame features.
# The inputs are x_train_np_frame / x_test_np_frame (the arrays built by the
# concatenation cell); the names x_train_np / x_test_np are never defined.
frame_model.fit(
    x_train_np_frame, train_labels,
    validation_data=(x_test_np_frame, val_labels),
    epochs=60, batch_size=5,
)

In [None]:
# Train the frame-level model silently (verbose=0) with the checkpoint
# callback saving the best weights. Uses the concatenated frame features
# x_train_np_frame / x_test_np_frame (x_train_np / x_test_np are never defined).
frame_model.fit(
    x_train_np_frame, train_labels,
    validation_data=(x_test_np_frame, val_labels),
    epochs=50, batch_size=5,
    callbacks=callbacks_list, verbose=0,
)

In [None]:
# Recompile the model with a different loss/optimizer and score it.
# NOTE(review): the score is computed on the TRAINING split, although this
# cell was originally described as evaluating "test data"; use
# (x_test_np_frame, val_labels) if a held-out score is intended.
frame_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
# x_train_np_frame is the concatenated frame-feature array (x_train_np is never defined).
_score = frame_model.evaluate(x_train_np_frame, train_labels, verbose=0)
print("%s: %.2f%%" % (frame_model.metrics_names[1], _score[1]*100))

In [None]:
# Save the frame-level model to second_frame_model.h5.
frame_model.save("second_frame_model.h5")

# Two_Stream_LSTM for the frames
** **

In [None]:
# Two model inputs, one per stream: 'audio' with shape (120, 128) and
# 'rgb_video' with shape (120, 1024).
steam_x1 = Input(shape=(120,128), name='audio')
steam_x2 = Input(shape=(120,1024), name='rgb_video')

In [None]:
# First layer of each stream: Dense with 512 tanh units applied to the stream's input.
steam_fc_1_x1 = Dense(512, activation='tanh', name='fc_1_x1')(steam_x1) 
steam_fc_1_x2 = Dense(512, activation='tanh', name='fc_1_x2')(steam_x2) 

In [None]:
# First LSTM per stream (128 units for x1, 1024 units for x2); the full
# output sequence is returned (return_sequences=True).
steam_lstm_1_x1 = LSTM(128, return_sequences=True, go_backwards=False, name='lstm_1_x1')(steam_fc_1_x1)
steam_lstm_1_x2 = LSTM(1024, return_sequences=True, go_backwards=False, name='lstm_1_x2')(steam_fc_1_x2)

In [None]:
# Second LSTM per stream, reading its input sequence backwards
# (go_backwards=True). These are plain LSTM layers stacked on the first
# LSTMs, not the keras Bidirectional wrapper.
steam_lstm_2_x1 = LSTM(128, return_sequences=True, go_backwards=True, name='lstm_2_x1')(steam_lstm_1_x1)
steam_lstm_2_x2 = LSTM(1024, return_sequences=True, go_backwards=True, name='lstm_2_x2')(steam_lstm_1_x2)

In [None]:
# Dropout with rate 0.5 on the output of each stream's second LSTM.
steam_dropout_1_x1 = Dropout(rate=0.5, name="dropout_1_x1")(steam_lstm_2_x1)
steam_dropout_1_x2 = Dropout(rate=0.5, name="dropout_1_x2")(steam_lstm_2_x2)

In [None]:
# Reduce each stream to a single unit per time step.
# Softmax over a single unit always yields 1.0 (it normalises across that one
# unit), which would make both streams constant and cut the inputs off from
# the classifier; sigmoid keeps a (0, 1) output that still depends on the input.
# NOTE(review): confirm sigmoid (rather than e.g. a wider layer) matches the intended design.
steam_fc_2_x1 = Dense(1, activation='sigmoid', name='fc_2_x1')(steam_dropout_1_x1)
steam_fc_2_x2 = Dense(1, activation='sigmoid', name='fc_2_x2')(steam_dropout_1_x2)

In [None]:
# Global max pooling over the sequence axis of each stream.
steam_pool_1_x1 = GlobalMaxPooling1D(name='pool_1_x1')(steam_fc_2_x1)
steam_pool_1_x2 = GlobalMaxPooling1D(name='pool_1_x2')(steam_fc_2_x2)

In [None]:
# Join the pooled outputs of the two streams into one tensor.
steam_merge_1 = concatenate([steam_pool_1_x1, steam_pool_1_x2])

In [None]:
# Fully connected layer with 8192 relu units on the merged streams.
steam_fc_2 = Dense(8192, activation='relu', name='fc_2')(steam_merge_1) 

In [None]:
# Fully connected layer with 4096 relu units.
steam_fc_3 = Dense(4096, activation='relu', name='fc_3')(steam_fc_2) 

In [None]:
# Output layer: softmax over the 3862 classes.
steam_output = Dense(3862, activation='softmax',name='output')(steam_fc_3)

In [None]:
# Assemble the two-input model (audio stream first, rgb stream second) and
# compile it for training.
steam_model = Model(inputs=[steam_x1, steam_x2], outputs=[steam_output])
steam_model.compile(
    optimizer='sgd',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Checkpoint callback for the two-stream model: write the weights to a file
# named after the epoch and validation accuracy whenever 'val_acc' improves.
steam_filepath = "steam-fc-1-x1-weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"
steam_checkpoint = ModelCheckpoint(
    steam_filepath,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)
steam_callbacks_list = [steam_checkpoint]

In [None]:
# Write a diagram of the two-stream model (with tensor shapes) to lstm_steam_model.png.
plot_model(steam_model,to_file='lstm_steam_model.png',show_shapes=True)

In [None]:
# Print the layer-by-layer summary of the two-stream model.
steam_model.summary()

In [None]:
# Display the shape of the validation labels.
val_labels.shape

In [None]:
# Train the two-stream model. Previously this cell re-trained frame_model
# with undefined arrays, so the model built in this section was never fitted.
# The inputs are passed in the order the model declares them: audio frames
# (120, 128) first, rgb frames (120, 1024) second.
steam_model.fit(
    [train_frame_audio, train_frame_rgb], train_labels,
    validation_data=([val_frame_audio, val_frame_rgb], val_labels),
    epochs=60, batch_size=5,
)

# Organizing Video level content for classification

In [None]:
# Build placeholder one-hot targets for the video-level data: one row per
# video in mean_rgb (instead of a hard-coded row count that only fits one
# particular sample of the dataset), 3862 columns, with a single 1 placed at
# a random column index in 0..9.
labels = np.zeros([len(mean_rgb), 3862])
for i in range(len(labels)):
    j = random.randint(0,9)
    labels[i][j] = 1

In [None]:
print(mean_rgb.shape[0])
totalNumberOfExamples = mean_rgb.shape[0]
# Splitting data between training (80%) and test (the remainder).
numberOfTrainingExamples = int(totalNumberOfExamples * 80 / 100)
# Taking the remainder guarantees train + test == total; truncating 20%
# independently can lose one example.
numberOfTestExamples = totalNumberOfExamples - numberOfTrainingExamples

In [None]:
# Display the shape of the video-level rgb feature array.
mean_rgb.shape

In [None]:
# Print the training count, the test count, and their sum.
print(numberOfTrainingExamples)

print(numberOfTestExamples)

print(numberOfTrainingExamples + numberOfTestExamples)

In [None]:
# Training / validation examples for video-level rgb: the first
# numberOfTrainingExamples rows train, the remaining rows validate.
train_video_rgb = mean_rgb[:numberOfTrainingExamples, :]
val_video_rgb = mean_rgb[numberOfTrainingExamples:, :]

# Training / validation examples for video-level audio, split at the same
# boundary as rgb so that row i of both arrays belongs to the same video
# (the slices were previously inverted relative to the rgb split).
train_video_audio = mean_audio[:numberOfTrainingExamples]
val_video_audio = mean_audio[numberOfTrainingExamples:]

# Split the labels at the same boundary so the targets passed to the
# video-level models have the same number of rows as their inputs.
train_labels = labels[:numberOfTrainingExamples]
val_labels = labels[numberOfTrainingExamples:]

In [None]:
# Shape of the video-level rgb training array.
print(train_video_rgb.shape)

In [None]:
# Shape of the video-level rgb validation array.
print(val_video_rgb.shape)

# Autoencoder
** **

In [None]:
# Encoder: 1024-dimensional input narrowed through relu Dense layers of
# width 784, 64 and 32.
input_autoencoder = Input(shape=(1024,))
encoded = input_autoencoder
for width in (784, 64, 32):
    encoded = Dense(width, activation='relu')(encoded)

# Decoder: widened again through relu Dense layers of width 64, 128 and 784,
# ending in a 3862-way softmax layer.
decoded = encoded
for width in (64, 128, 784):
    decoded = Dense(width, activation='relu')(decoded)
decoded = Dense(3862, activation='softmax')(decoded)

In [None]:
# Display the shape of the training labels.
train_labels.shape

In [None]:
# Display the shape of the video-level rgb training array.
train_video_rgb.shape

In [None]:
# Display the shape of the training labels (same check as two cells above).
train_labels.shape

In [None]:
# Display the shape of the video-level rgb validation array.
val_video_rgb.shape

In [None]:
# Display the shape of the validation labels.
val_labels.shape

In [None]:
# Wrap the encoder/decoder stack in a model and compile it.
autoencoder = Model(input_autoencoder, decoded)
autoencoder.compile(loss='binary_crossentropy', optimizer='adadelta')

# Fit on video-level rgb features against the label matrix, validating on
# the held-out rgb features and labels.
autoencoder.fit(
    train_video_rgb,
    train_labels,
    validation_data=(val_video_rgb, val_labels),
    epochs=100,
    batch_size=25,
)

In [None]:
# Shapes of the video-level rgb training and validation arrays.
print(train_video_rgb.shape)
print(val_video_rgb.shape)

# A Neural Net

In [None]:
# Feed-forward classifier over 1024-dimensional inputs: three relu Dense
# layers (512, 128, 512 units) followed by a 3862-way softmax output.
model_nn = Sequential([
    Dense(512, input_shape=(1024,), activation='relu'),
    Dense(128, activation='relu'),
    Dense(512, activation='relu'),
    Dense(3862, activation ='softmax',name='output'),
])

# Compile for training with the adam optimizer.
model_nn.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Display the shape of the validation labels.
val_labels.shape

In [None]:
# Display the shape of the video-level rgb training array.
train_video_rgb.shape

In [None]:
# Train the feed-forward network for a single epoch with batches of 2 samples,
# validating on the held-out video-level rgb features and labels.
model_nn.fit(train_video_rgb, train_labels, validation_data=(val_video_rgb, val_labels), epochs=1, batch_size=2)