Video processing: extracting the video frames and the audio track and converting both into the formats the models expect

In [None]:
import cv2
import numpy as np

def preprocess_video(video_path, target_size=(224, 224), num_frames=16):
    """Read the leading frames of a video into one normalized array.

    Frames are decoded sequentially from the start of the video. Each frame
    is resized with ``cv2.resize`` to ``target_size``, converted to float32
    and divided by 255.0. Decoding stops as soon as ``num_frames`` frames
    have been collected; a shorter video is padded by repeating its last
    decoded frame.

    NOTE(review): OpenCV normally delivers frames in BGR channel order and no
    colour conversion happens here -- confirm this matches what the
    downstream model expects.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        target_size: Size passed to ``cv2.resize`` for every frame.
        num_frames: Number of frames in the returned clip.

    Returns:
        ``np.ndarray`` with the processed frames stacked along axis 0.

    Raises:
        ValueError: If not a single frame could be read (missing file,
            unsupported codec or empty video).
    """
    frames = []
    cap = cv2.VideoCapture(video_path)
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.resize(frame, target_size)
            frames.append(frame.astype(np.float32) / 255.0)
            if len(frames) == num_frames:
                break
    finally:
        # Release the capture even when resizing/conversion raises.
        cap.release()

    if not frames:
        # Without this guard the padding below fails with an opaque
        # IndexError on frames[-1].
        raise ValueError(f"No frames could be read from video: {video_path!r}")

    if len(frames) < num_frames:
        # Pad short videos by repeating the last decoded frame.
        frames += [frames[-1]] * (num_frames - len(frames))
    return np.stack(frames, axis=0)

# Sample clip used throughout the notebook (relative path).
video_path = "test.mp4"
# Decode, resize and normalize the leading frames with the default settings.
preprocessed_video = preprocess_video(video_path)
# Print the array shape as a quick sanity check.
print(preprocessed_video.shape)


(16, 224, 224, 3)


In [None]:
from moviepy.editor import VideoFileClip

def get_audio(video_path, audio_path):
    """Extract the audio track of a video and write it to ``audio_path``.

    Args:
        video_path: Path of the video opened with ``VideoFileClip``.
        audio_path: Destination file handed to ``write_audiofile``.

    Raises:
        ValueError: If the clip has no audio track (``video_clip.audio`` is
            ``None``), instead of failing with an AttributeError.
    """
    video_clip = VideoFileClip(video_path)
    audio_clip = None
    try:
        audio_clip = video_clip.audio
        if audio_clip is None:
            raise ValueError(f"Video has no audio track to extract: {video_path!r}")
        audio_clip.write_audiofile(audio_path)
    finally:
        # Always close both clips, even when writing the audio fails.
        video_clip.close()
        if audio_clip is not None:
            audio_clip.close()

# Destination of the extracted soundtrack (relative path).
audio_path = "audio.wav"
# `video_path` is the sample clip defined in the video-preprocessing cell.
get_audio(video_path, audio_path)

MoviePy - Writing audio in audio.wav


                                                        

MoviePy - Done.




In [None]:
import librosa
import numpy as np

def preprocess_audio(audio_path, sample_rate=44100, n_mels=128, hop_length=512, duration=10):
    """Load an audio file and return its mel spectrogram min-max scaled to [0, 1].

    The file is loaded as mono at ``sample_rate`` and limited to ``duration``
    seconds. Its mel power spectrogram is converted to decibels relative to
    its own maximum and then rescaled so the smallest value becomes 0 and the
    largest 1.

    Args:
        audio_path: Path handed to ``librosa.load``.
        sample_rate: Sampling rate requested from ``librosa.load``.
        n_mels: Number of mel bands.
        hop_length: Hop length passed to ``librosa.feature.melspectrogram``.
        duration: Maximum number of seconds of audio to load.

    Returns:
        The normalized spectrogram array. A spectrogram whose values are all
        identical (e.g. digital silence) yields all zeros rather than NaNs.
    """
    y, sr = librosa.load(audio_path, sr=sample_rate, duration=duration, mono=True)
    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, hop_length=hop_length)
    mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

    lowest = np.min(mel_spectrogram)
    value_range = np.max(mel_spectrogram) - lowest
    if value_range == 0:
        # A constant spectrogram would otherwise cause 0/0 and fill the
        # result with NaNs.
        return np.zeros_like(mel_spectrogram)
    return (mel_spectrogram - lowest) / value_range


# Convert the audio file at `audio_path` into a normalized mel spectrogram.
preprocessed_audio = preprocess_audio(audio_path)
# Print the array shape as a quick sanity check.
print(preprocessed_audio.shape)


(128, 577)


Building the CNN feature extractors: a ResNet50-based model for the video frames and a small custom CNN for the audio spectrograms


In [None]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D

def create_video_feature_extractor(input_shape=(224, 224, 3), base_model='resnet50'):
    """Build a frame-level feature extractor on top of a pretrained backbone.

    The backbone is created without its classification head and with
    ImageNet weights; its output is reduced with global average pooling.

    Args:
        input_shape: Input shape forwarded to the backbone.
        base_model: Backbone identifier; only ``'resnet50'`` is supported.

    Returns:
        A Keras ``Model`` mapping the backbone input to the pooled output.

    Raises:
        ValueError: If ``base_model`` is not ``'resnet50'``.
    """
    # Reject unknown backbones up front.
    if base_model != 'resnet50':
        raise ValueError("Unsupported base model")

    backbone = ResNet50(weights='imagenet', include_top=False, input_shape=input_shape)
    pooled = GlobalAveragePooling2D()(backbone.output)
    return Model(inputs=backbone.input, outputs=pooled)

# Build the video extractor with the defaults (224x224x3 input, ResNet50 backbone).
# The recorded output shows the ImageNet weights being downloaded on this call.
video_feature_extractor = create_video_feature_extractor()


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

def create_audio_feature_extractor(input_shape=(128, 128, 1)):
    """Assemble a small CNN that turns a spectrogram image into a 128-unit vector.

    Two Conv2D + MaxPooling2D stages (32 then 64 filters, 3x3 kernels, 2x2
    pooling) are followed by Flatten and a 128-unit ReLU Dense layer.

    Args:
        input_shape: Input shape declared on the first convolution.

    Returns:
        The ``Sequential`` model (not compiled).
    """
    # Convolutional part: two conv/pool stages.
    conv_stack = [
        Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape),
        MaxPooling2D(pool_size=(2, 2)),
        Conv2D(64, kernel_size=(3, 3), activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
    ]
    # Dense head producing the feature vector.
    head = [
        Flatten(),
        Dense(128, activation='relu'),
    ]
    return Sequential(conv_stack + head)

# Build the audio extractor with its default (128, 128, 1) input shape.
# NOTE(review): the spectrogram printed earlier has shape (128, 577), which does not
# match this input shape -- confirm how spectrograms are cropped/resized before use.
audio_feature_extractor = create_audio_feature_extractor()


Combining the video and audio features (late fusion)

In [None]:
def late_fusion(video_features, audio_features):
    """Project each modality to 64 units and concatenate the two projections.

    Args:
        video_features: Tensor of video features fed to a 64-unit ReLU Dense
            layer (presumably a Keras symbolic tensor -- confirm with caller).
        audio_features: Tensor of audio features fed to its own 64-unit ReLU
            Dense layer.

    Returns:
        The output of ``Concatenate`` applied to both projections.
    """
    # Imported locally because no cell of the notebook imports Concatenate;
    # without it this function fails with a NameError.
    from tensorflow.keras.layers import Concatenate, Dense

    video_output = Dense(64, activation='relu')(video_features)
    audio_output = Dense(64, activation='relu')(audio_features)
    combined_features_late = Concatenate()([video_output, audio_output])
    return combined_features_late

# Dense layers operate on tensors, not on Model objects, so hand over each
# extractor's symbolic output tensor rather than the model itself.
combined_features_late = late_fusion(video_feature_extractor.output, audio_feature_extractor.output)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

def create_fusion_model(input_dim, num_classes=None):
    """Build the classifier that sits on top of the fused feature vector.

    Architecture: Dense(128, relu) -> Dropout(0.5) -> Dense(64, relu) ->
    Dense(num_classes, softmax).

    Args:
        input_dim: Size of the fused feature vector fed to the first layer.
        num_classes: Number of output classes. When omitted, the
            notebook-level global ``num_classes`` is used, which keeps
            existing ``create_fusion_model(input_dim)`` calls working.

    Returns:
        The ``Sequential`` model (not compiled).

    Raises:
        NameError: If ``num_classes`` is neither passed nor defined globally.
    """
    if num_classes is None:
        # The parameter shadows the global of the same name, so the fallback
        # has to be looked up explicitly.
        try:
            num_classes = globals()["num_classes"]
        except KeyError:
            raise NameError(
                "num_classes is not defined: pass num_classes=... or define it before calling"
            ) from None

    model = Sequential([
        Dense(128, activation='relu', input_dim=input_dim),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])
    return model

# `combined_features` is never defined in the notebook; the fused tensor produced by
# late_fusion is `combined_features_late`. Its last axis is the feature dimension.
fusion_model = create_fusion_model(input_dim=combined_features_late.shape[-1])
# Multi-class setup: softmax output trained with categorical cross-entropy.
fusion_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
fusion_model.summary()


Training and evaluation metrics

In [None]:
# Train the fusion classifier for 10 epochs with batches of 32, validating each epoch.
# NOTE(review): X_train, y_train, X_val and y_val are not defined in any visible
# cell -- they must be prepared before this cell can run on a fresh kernel.
history = fusion_model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_val, y_val))

# Final loss/accuracy on the validation split after training.
loss, accuracy = fusion_model.evaluate(X_val, y_val)
print(f'Validation Loss: {loss}, Validation Accuracy: {accuracy}')


In [None]:
# Evaluate the trained model on the test split.
# NOTE(review): X_test and y_test are not defined in any visible cell -- confirm
# where the test split is created.
test_loss, test_accuracy = fusion_model.evaluate(X_test, y_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')
