In [1]:
!pip install gradio
!pip install soundfile
!pip install modelscope -U
!pip install librosa
!pip install soundfile
!pip install webrtcvad


import os
!pip install git+https://github.com/openai/whisper.git 
!pip install yt-dlp
!pip install moviepy --upgrade
!apt-get update
!apt install imagemagick -y
# remove line 88 of vim ~/../etc/ImageMagick-6/policy.xml to run MoviePy
!sed -i '88d' ~/../etc/ImageMagick-6/policy.xml 
!mkdir experiments
os._exit(00)

Collecting gradio
  Downloading gradio-4.19.2-py3-none-any.whl (16.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.9/16.9 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting httpx>=0.24.1
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting typer[all]<1.0,>=0.9
  Downloading typer-0.9.0-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.9/45.9 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gradio-client==0.10.1
  Downloading gradio_client-0.10.1-py3-none-any.whl (307 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.9/307.9 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00

: 

: 

: 

In [1]:
!rm -rf /notebooks/audio_chunks

In [5]:
import os
import shutil
from typing import List

import librosa
import numpy as np
from scipy.io.wavfile import write
import webrtcvad

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

class AudioSplitter:
	def __init__(self, sample_rate: int = 16000, frame_duration: int = 30, min_sec: int = 5 * 60, top_db: int = 30):
		self.sample_rate = sample_rate
		self.frame_duration = frame_duration
		self.min_sec = min_sec
		self.top_db = top_db
		self.vad = self.initialize_vad()

	def initialize_vad(self, aggressiveness: int = 3) -> webrtcvad.Vad:
		return webrtcvad.Vad(aggressiveness)

	def load_audio(self, audio_file_path: str) -> np.ndarray:
		audio, _ = librosa.load(audio_file_path, sr=self.sample_rate)
		return audio

	def extract_voiced_audio(self, audio: np.ndarray) -> np.ndarray:
		frame_size = int(self.sample_rate * self.frame_duration / 1000)
		voiced_frames = [
			audio[i:i + frame_size]
			for i in range(0, len(audio), frame_size)
			if len(audio[i:i + frame_size]) == frame_size and self.vad.is_speech((audio[i:i + frame_size] * 32767).astype(np.int16).tobytes(), self.sample_rate)
		]
		return np.concatenate(voiced_frames, axis=0) if voiced_frames else np.array([])

	def split_and_save_chunks(self, voiced_audio: np.ndarray, output_path: str) -> List[str]:
		self.prepare_output_directory(output_path)
		non_silent_intervals = librosa.effects.split(voiced_audio, top_db=self.top_db)
		chunk_files = []
		current_start = None
		file_i = 0
		for interval in non_silent_intervals:
			start, end = interval
			if current_start is None:
				current_start = start

			if end - current_start >= self.min_sec * self.sample_rate:
				chunk_file_path = os.path.join(output_path, f"chunk_{file_i:08}.wav")
				chunk = voiced_audio[current_start:end]
				write(chunk_file_path, self.sample_rate, (chunk * 32767).astype(np.int16))
				chunk_files.append(chunk_file_path)
				current_start = None
				file_i += 1
			
			#if file_i > 2:
			#	break

		return chunk_files

	def prepare_output_directory(self, path: str):
		if os.path.exists(path):
			shutil.rmtree(path)
		os.makedirs(path, exist_ok=True)

	def process_audio(self, audio_file_path: str, output_path: str) -> List[str]:
		audio = self.load_audio(audio_file_path)
		voiced_audio = self.extract_voiced_audio(audio * 32767)
		return self.split_and_save_chunks(voiced_audio / 32767, output_path)


class NoiseEliminator:
	def __init__(self):
		self.suppression_pipeline = pipeline(
			Tasks.acoustic_noise_suppression,
			model='damo/speech_frcrn_ans_cirm_16k'
		)

	def noise_suppression(self, audio_file_path: str, output_path: str):
		self.suppression_pipeline(audio_file_path, output_path=output_path)


# Example usage
if __name__ == "__main__":
	splitter = AudioSplitter(
		top_db=60,
	)
	eliminator = NoiseEliminator()
	audio_file_path = "/notebooks/Podcast ep172.mp3"
	output_path = "/notebooks/audio_chunks"
	chunks = splitter.process_audio(audio_file_path, output_path)
	for chunk in chunks:
		eliminator.noise_suppression(chunk, chunk)
	print("Processed audio chunks:", chunks)


2024-02-29 01:56:52,536 - modelscope - INFO - initiate model from /root/.cache/modelscope/hub/damo/speech_frcrn_ans_cirm_16k
2024-02-29 01:56:52,537 - modelscope - INFO - initiate model from location /root/.cache/modelscope/hub/damo/speech_frcrn_ans_cirm_16k.
2024-02-29 01:56:52,539 - modelscope - INFO - initialize model from /root/.cache/modelscope/hub/damo/speech_frcrn_ans_cirm_16k


inputs:(1, 4848128)
padding: 24128
inputs after padding:(1, 4872256)
current_idx: 0
current_idx: 12000
current_idx: 24000
current_idx: 36000
current_idx: 48000
current_idx: 60000
current_idx: 72000
current_idx: 84000
current_idx: 96000
current_idx: 108000
current_idx: 120000
current_idx: 132000
current_idx: 144000
current_idx: 156000
current_idx: 168000
current_idx: 180000
current_idx: 192000
current_idx: 204000
current_idx: 216000
current_idx: 228000
current_idx: 240000
current_idx: 252000
current_idx: 264000
current_idx: 276000
current_idx: 288000
current_idx: 300000
current_idx: 312000
current_idx: 324000
current_idx: 336000
current_idx: 348000
current_idx: 360000
current_idx: 372000
current_idx: 384000
current_idx: 396000
current_idx: 408000
current_idx: 420000
current_idx: 432000
current_idx: 444000
current_idx: 456000
current_idx: 468000
current_idx: 480000
current_idx: 492000
current_idx: 504000
current_idx: 516000
current_idx: 528000
current_idx: 540000
current_idx: 552000
curr

In [19]:
import glob

chunks = sorted(glob.glob("/notebooks/audio_chunks/*.wav"))
chunks

['/notebooks/audio_chunks/chunk_00000182.wav',
 '/notebooks/audio_chunks/chunk_00000377.wav',
 '/notebooks/audio_chunks/chunk_00000557.wav',
 '/notebooks/audio_chunks/chunk_00000765.wav',
 '/notebooks/audio_chunks/chunk_00000953.wav',
 '/notebooks/audio_chunks/chunk_00001139.wav',
 '/notebooks/audio_chunks/chunk_00001342.wav',
 '/notebooks/audio_chunks/chunk_00001533.wav',
 '/notebooks/audio_chunks/chunk_00001730.wav',
 '/notebooks/audio_chunks/chunk_00001924.wav',
 '/notebooks/audio_chunks/chunk_00002125.wav',
 '/notebooks/audio_chunks/chunk_00002306.wav',
 '/notebooks/audio_chunks/chunk_00002492.wav',
 '/notebooks/audio_chunks/chunk_00002689.wav',
 '/notebooks/audio_chunks/chunk_00002889.wav',
 '/notebooks/audio_chunks/chunk_00003079.wav',
 '/notebooks/audio_chunks/chunk_00003290.wav',
 '/notebooks/audio_chunks/chunk_00003461.wav',
 '/notebooks/audio_chunks/chunk_00003650.wav']

In [2]:
import os
import glob
import json
import whisper
import librosa
from tqdm import tqdm

class TranscriptionProcessor:
    def __init__(self, model_type='medium', output_json_path="/notebooks/transcriptions.json"):
        self.model = whisper.load_model(model_type).cuda()
        self.output_json_path = output_json_path
        self.segments = []
        self.cumulative_duration = 0
    
    def transcribe_chunks(self, chunks):
        if os.path.exists(self.output_json_path):
            os.remove(self.output_json_path)
        
        for i, chunk in tqdm(enumerate(chunks), total=len(chunks)):
            transcription = self.model.transcribe(chunk)
            self._update_segments(transcription["segments"])
            self._save_transcription()
            self.cumulative_duration += librosa.get_duration(filename=chunk)
    
    def _update_segments(self, new_segments):
        for segment in new_segments:
            segment["start"] += self.cumulative_duration
            segment["end"] += self.cumulative_duration
        self.segments.extend(new_segments)
    
    def _save_transcription(self):
        with open(self.output_json_path, "w") as file:
            json.dump(self.segments, file, indent=2)

# Example usage
if __name__ == "__main__":
    chunks = sorted(glob.glob("/notebooks/audio_chunks/*.wav"))
    processor = TranscriptionProcessor()
    processor.transcribe_chunks(chunks)


	This alias will be removed in version 1.0.
  self.cumulative_duration += librosa.get_duration(filename=chunk)
	This alias will be removed in version 1.0.
  self.cumulative_duration += librosa.get_duration(filename=chunk)
	This alias will be removed in version 1.0.
  self.cumulative_duration += librosa.get_duration(filename=chunk)
100%|██████████| 3/3 [01:31<00:00, 30.42s/it]


In [3]:
import subprocess
import json
import os
import librosa

def concatenate_audio(chunks, output_file):
	"""Concatenate a list of audio chunks into a single file."""
	with open('concat_list.txt', 'w') as f:
		for chunk in chunks:
			f.write(f"file '{chunk}'\n")
	subprocess.run(['ffmpeg', '-y', '-f', 'concat', '-safe', '0', '-i', 'concat_list.txt', 
					'-c', 'copy', output_file], check=True)
	os.remove('concat_list.txt')

def generate_srt_from_json(segments, srt_file):
	"""Generate an SRT file from the transcription JSON, considering each chunk's cumulative time."""
	with open(srt_file, 'w', encoding='utf-8') as file:
		for entry in segments:
			# Adjust start and end times by adding the cumulative duration
			start = entry['start']
			end = entry['end']
			text = entry['text']
			file.write(f"{entry['id'] + 1}\n")
			file.write(f"{format_time(start)} --> {format_time(end)}\n")
			file.write(f"{text}\n\n")
			
def format_time(seconds):
	"""Convert time in seconds to SRT time format."""
	hours = int(seconds // 3600)
	minutes = int((seconds % 3600) // 60)
	seconds = seconds % 60
	return f"{hours:02}:{minutes:02}:{seconds:06.3f}".replace('.', ',')

def create_video_with_subtitles_from_json(image_path, json_path, chunks, output_video_path):
	# Read JSON to get all audio paths
	with open(json_path, 'r') as file:
		segments = json.load(file)

	combined_audio_path = 'combined_audio.wav'
	concatenate_audio(chunks, combined_audio_path)

	srt_file = 'subtitles.srt'
	generate_srt_from_json(segments, srt_file)

	if os.path.exists(output_video_path):
		os.remove(output_video_path)
	
	# Command to create video using FFmpeg
	command = [
		'ffmpeg',
		'-loop', '1',                 # Loop the image
		'-i', image_path,             # Input image file
		'-i', combined_audio_path,    # Input combined audio file
		'-vf', "scale=1240:814,subtitles=" + srt_file,
		'-r', '1',                    # Set output frame rate to 1 fps
		'-c:v', 'libx264',            # Video codec
		'-pix_fmt', 'yuv420p',        # Pixel format
		'-tune', 'stillimage',        # Tune for still image
		'-c:a', 'aac',                # Audio codec
		'-b:a', '192k',               # Audio bitrate
		'-shortest',                  # Finish encoding when the shortest stream ends
		output_video_path             # Output file
	]

	subprocess.run(command, check=True)
	# Clean up
	os.remove(combined_audio_path)
	os.remove(srt_file)

# Example usage
background_image_path = '/notebooks/Background 1240x815.jpg'
json_path = '/notebooks/transcriptions.json'
chunks = sorted(glob.glob("/notebooks/audio_chunks/*.wav"))
output_video_path = 'output_video.mp4'

create_video_with_subtitles_from_json(background_image_path, json_path, chunks, output_video_path)


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e