In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
!pip install -q ffmpeg-python librosa opencv-python-headless tqdm


In [2]:
import cv2
import os
from tqdm import tqdm

# Paths
video_path = "/kaggle/input/dataset23/UPSC_Mains.mp4"
frames_dir = "/kaggle/working/data/frames"

os.makedirs(frames_dir, exist_ok=True)

# Load video
cap = cv2.VideoCapture(video_path)
fps = int(cap.get(cv2.CAP_PROP_FPS))  # Get frame rate

# Extract frames
frame_count = 0
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    
    # Resize the frame to 96x96 (you may need face cropping here)
    frame_resized = cv2.resize(frame, (96, 96))
    
    # Save the frame
    frame_path = os.path.join(frames_dir, f"frame_{frame_count:05d}.jpg")
    cv2.imwrite(frame_path, frame_resized)
    frame_count += 1

cap.release()
print(f"Extracted {frame_count} frames at {fps} FPS")


Extracted 221180 frames at 25 FPS


In [3]:
import ffmpeg

audio_path = "/kaggle/working/data/audio.wav"
os.makedirs("data", exist_ok=True)

# Extract audio from video
ffmpeg.input(video_path).output(audio_path, ac=1, ar=16000).run(overwrite_output=True)
print(f"Extracted audio saved to {audio_path}")


ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

Extracted audio saved to data/audio.wav


size=  276476kB time=02:27:27.21 bitrate= 256.0kbits/s speed= 682x    
video:0kB audio:276476kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.000028%


In [5]:
import librosa
import numpy as np
import os

def audio_to_mel(audio_path, output_dir, hop_length=160, n_fft=400, n_mels=80, chunk_size=128):
    # Load audio
    y, sr = librosa.load(audio_path, sr=16000)
    
    # Compute mel spectrogram
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    mel_db = librosa.power_to_db(mel, ref=np.max)
    
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Split mel spectrogram into chunks
    num_frames = mel_db.shape[1]
    for i in range(0, num_frames, chunk_size):
        mel_chunk = mel_db[:, i:i + chunk_size]
        np.save(os.path.join(output_dir, f"mel_{i // chunk_size:05d}.npy"), mel_chunk)
    
    print(f"Generated {num_frames // chunk_size} mel-spectrogram chunks")

# Example usage
audio_path = '/kaggle/working/data/audio.wav'
mel_dir = '/kaggle/working/data/mels'
audio_to_mel(audio_path, mel_dir)


Generated 6911 mel-spectrogram chunks


In [10]:
%cd /kaggle/working/

/kaggle/working


In [11]:
!pip install -q torch torchvision torchaudio
!pip install -q opencv-python-headless matplotlib tqdm
!git clone https://github.com/Rudrabha/LipGAN.git
%cd LipGAN

fatal: destination path 'LipGAN' already exists and is not an empty directory.
/kaggle/working/LipGAN


In [7]:
!wget https://www.iiitvidya.com/pretrained/LipGAN.pth -P checkpoints/


--2024-11-26 04:18:00--  https://www.iiitvidya.com/pretrained/LipGAN.pth
Resolving www.iiitvidya.com (www.iiitvidya.com)... failed: Name or service not known.
wget: unable to resolve host address 'www.iiitvidya.com'


In [8]:
load_checkpoint = True
checkpoint_path = "checkpoints/LipGAN.pth"

In [9]:
import os
import torch
from torch.utils.data import DataLoader
from model import LipGAN
from datasets.lrs2 import LRS2Dataset
from hparams import hparams as hp
from utils import load_checkpoint, save_checkpoint

def fine_tune():
    # Paths
    data_root = '/kaggle/working/data'  # Path to your fine-tuning dataset
    checkpoint_dir = '/kaggle/working/checkpoints'  # Directory to save checkpoints
    os.makedirs(checkpoint_dir, exist_ok=True)
    
    # Dataset paths for frames and mel spectrograms
    audio_path = os.path.join(data_root, 'audio.wav')
    frames_dir = os.path.join(data_root, 'frames')
    mels_dir = os.path.join(data_root, 'mels')
    
    # Dataset
    fine_tune_dataset = LRS2Dataset(audio_path, frames_dir, mels_dir, split='train', augment=True)
    fine_tune_loader = DataLoader(fine_tune_dataset, batch_size=hp.batch_size, shuffle=True, num_workers=4)
    
    # Model
    model = LipGAN().cuda()  # Ensure GPU is enabled
    optimizer = torch.optim.Adam(model.parameters(), lr=hp.lr)
    
    # Load Pretrained Weights
    start_epoch = 0
    if hp.load_checkpoint:
        start_epoch = load_checkpoint(hp.checkpoint_path, model, optimizer)
        print(f"Loaded pretrained weights from {hp.checkpoint_path}")
    
    # Fine-Tuning Loop
    for epoch in range(start_epoch, hp.epochs):
        model.train()
        for step, (video_frames, audio_mels, labels) in enumerate(fine_tune_loader):
            video_frames = video_frames.cuda()
            audio_mels = audio_mels.cuda()
            labels = labels.cuda()
            
            # Forward pass
            preds = model(video_frames, audio_mels)
            loss = model.loss_function(preds, labels)
            
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # Logging
            if step % hp.log_interval == 0:
                print(f"Epoch [{epoch}/{hp.epochs}], Step [{step}/{len(fine_tune_loader)}], Loss: {loss.item():.4f}")
        
        # Save Checkpoint
        save_checkpoint(os.path.join(checkpoint_dir, f"fine_tuned_epoch_{epoch}.pth"), model, optimizer, epoch)
        print(f"Checkpoint saved at epoch {epoch}.")

if __name__ == "__main__":
    fine_tune()


ModuleNotFoundError: No module named 'model'

In [None]:
!zip -r fine_tuned_model.zip checkpoints/
