In [1]:
# Use the %pip magic so packages are installed into the environment of the
# running kernel; `!pip3` may target a different Python interpreter, which
# leaves the kernel unable to import what was just installed.
# moviepy is pinned to 1.x because `moviepy.editor` is imported below;
# torch and torchaudio are pinned to the same release so they stay compatible.
%pip install -q moviepy==1.0.3 torch==2.2.2 torchaudio==2.2.2 pandas tqdm

Collecting moviepy
  Using cached moviepy-1.0.3.tar.gz (388 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting torch
  Obtaining dependency information for torch from https://files.pythonhosted.org/packages/3f/14/e105b8ef6d324e789c1589e95cb0ab63f3e07c2216d68b1178b7c21b7d2a/torch-2.2.2-cp311-none-macosx_10_9_x86_64.whl.metadata
  Downloading torch-2.2.2-cp311-none-macosx_10_9_x86_64.whl.metadata (25 kB)
Collecting torchaudio
  Obtaining dependency information for torchaudio from https://files.pythonhosted.org/packages/57/c4/80cc3315dd1ca706643b78f894901d4d888ffe376a5e401f73d9db61071e/torchaudio-2.2.2-cp311-cp311-macosx_10_13_x86_64.whl.metadata
  Downloading torchaudio-2.2.2-cp311-cp311-macosx_10_13_x86_64.whl.metadata (6.4 kB)
Collecting decorator<5.0,>=4.0.2 (from moviepy)
  Obtaining dependency information for decorator<5.0,>=4.0.2 from https://files.

Downloading tqdm-4.66.2-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.3/78.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading typing_extensions-4.11.0-py3-none-any.whl (34 kB)
Downloading filelock-3.13.4-py3-none-any.whl (11 kB)
Downloading fsspec-2024.3.1-py3-none-any.whl (171 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.0/172.0 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Jinja2-3.1.3-py3-none-any.whl (133 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.2/133.2 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sympy-1.12-py3-none-any.whl (5.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.7/5.7 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_x86_64.whl (14 kB)
Using cached mpmath-1.3.0-py3-none-any.whl (536 kB)
Building wheels for collected packages: m

In [2]:
import os

import moviepy.editor as mp
import pandas as pd
import torch
import torchaudio
from tqdm import tqdm

# Define the charset and other constants
# CHARSET maps a predicted class index to its output character
# (recognize_speech indexes into it). Index 0 is the space character, and
# recognize_speech skips index 0 when building the transcript.
CHARSET = " abcdefghijklmnopqrstuvwxyz,.'"
# NOTE(review): not referenced by any code visible in this notebook;
# presumably intended as a cache of mel transforms — confirm or remove.
mel_transform = {}

def extract_audio(video_path, audio_path="temp_audio.wav"):
    """Write the audio track of a video file to an audio file.

    Args:
        video_path: Path of the video file to read.
        audio_path: Destination of the extracted audio. Defaults to
            "temp_audio.wav" in the current working directory.

    Returns:
        ``audio_path`` on success, or ``None`` when the video cannot be
        opened, has no audio track, or the audio cannot be written. Errors
        are printed rather than raised.
    """
    video = None
    try:
        video = mp.VideoFileClip(video_path)
        if video.audio is None:
            # Report a readable message instead of the opaque
            # "'NoneType' object has no attribute 'write_audiofile'".
            print("Error extracting audio from video: video has no audio track")
            return None
        video.audio.write_audiofile(audio_path)
        return audio_path
    except Exception as e:
        print(f"Error extracting audio from video: {e}")
        return None
    finally:
        # Always release the clip's readers; they hold open file handles /
        # subprocesses until closed.
        if video is not None:
            try:
                video.close()
            except Exception as e:
                print(f"Error extracting audio from video: {e}")

def load_audio(audio_path):
    """Load an audio file and compute an 80-bin mel spectrogram for it.

    The spectrogram uses a 25 ms window (also used as the FFT size) and a
    10 ms hop, both derived from the file's sample rate. Only the first
    entry along the leading dimension of the transform output is kept, and
    it is transposed before being returned.

    Args:
        audio_path: Path of the audio file to read.

    Returns:
        The transposed mel spectrogram of the first channel, or ``None`` if
        loading or the transform fails (the error is printed).
    """
    try:
        waveform, sample_rate = torchaudio.load(audio_path, normalize=True)

        frame_shift = int(sample_rate / 100.0)  # samples in 10 ms
        frame_size = int(sample_rate / 40.0)  # samples in 25 ms

        to_mel = torchaudio.transforms.MelSpectrogram(
            sample_rate,
            n_fft=frame_size,
            win_length=frame_size,
            hop_length=frame_shift,
            n_mels=80,
        )
        all_channels = to_mel(waveform)
        first_channel = all_channels[0]
        # Swap the two axes so the mel bins end up on the last dimension.
        return first_channel.T
    except Exception as e:
        print(f"Error loading audio file: {e}")
        return None

def load_model(path):
    """Load a fully serialized PyTorch model and switch it to eval mode.

    Args:
        path: Path of a checkpoint that contains a whole ``torch.nn.Module``
            (not just a state dict).

    Returns:
        The loaded module in evaluation mode, or ``None`` if the file cannot
        be loaded or does not contain a ``torch.nn.Module`` (the error is
        printed).
    """
    try:
        # SECURITY: loading a whole pickled module executes arbitrary code
        # embedded in the file. Only load checkpoints from a trusted source.
        # `weights_only=False` is spelled out because a full module cannot be
        # restored in weights-only mode; being explicit keeps the behaviour
        # the same across torch versions.
        # `map_location="cpu"` lets a checkpoint saved on a GPU load on a
        # CPU-only machine and keeps the model on the same device as the CPU
        # tensors produced by the audio pipeline in this notebook.
        model = torch.load(path, map_location="cpu", weights_only=False)
        if not isinstance(model, torch.nn.Module):
            # e.g. a state_dict checkpoint: fail with a clear message instead
            # of "'collections.OrderedDict' object has no attribute 'eval'".
            print(
                "Error loading model: expected a torch.nn.Module, "
                f"got {type(model).__name__}"
            )
            return None
        model.eval()  # Set the model to inference mode
        return model
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

def recognize_speech(mel_spectrogram, model):
    """Run the model on a mel spectrogram and decode the result to text.

    The model is called on the spectrogram with a leading batch dimension
    added. For each step along dimension 1 of the output, the index of the
    largest value along dimension 2 is looked up in ``CHARSET``; index 0 is
    skipped.

    Args:
        mel_spectrogram: 2-D tensor of shape [Time, 80].
        model: Callable that maps the batched input to a 3-D output tensor.

    Returns:
        The decoded string, or ``""`` if the inputs are invalid or inference
        fails (the error is printed).
    """
    try:
        if mel_spectrogram is None or model is None:
            raise ValueError("Invalid input or model not loaded")

        # Ensure input tensor is correctly shaped
        if len(mel_spectrogram.shape) != 2 or mel_spectrogram.shape[1] != 80:
            raise ValueError("Input mel_spectrogram must have shape [Time, 80]")
        input_tensor = mel_spectrogram.unsqueeze(0)  # Add batch dimension

        # Inference only: do not record an autograd graph for the forward pass.
        with torch.no_grad():
            output = model(input_tensor)

        # Plain Python ints are cheaper to iterate than 0-d tensors and are
        # unambiguous as string indices.
        predicted_indices = output.argmax(2).squeeze(0).tolist()

        # NOTE(review): index 0 is the space character in CHARSET, so skipping
        # it means the transcript never contains spaces. If the model instead
        # reserves index 0 for a blank symbol, CHARSET is probably off by one
        # — confirm against the model's label set.
        return ''.join(CHARSET[i] for i in predicted_indices if i != 0)
    except Exception as e:
        print(f"Error during model inference: {e}")
        return ""

def create_csv(data, filename):
    """Write rows to a CSV file with the header "Text", "Video Link".

    Args:
        data: Row data accepted by ``pandas.DataFrame`` with two columns.
        filename: Path of the CSV file to write (no index column is written).

    Any failure is printed rather than raised.
    """
    header = ["Text", "Video Link"]
    try:
        pd.DataFrame(data, columns=header).to_csv(filename, index=False)
    except Exception as e:
        print(f"Error creating CSV file: {e}")

def video_to_text(video_path, video_link, model_path, output_csv="output.csv"):
    """Transcribe a video's audio track and save the result to a CSV file.

    Steps: load the model, extract the audio to a temporary file, compute its
    mel spectrogram, run recognition, print the text, and write one
    (text, video_link) row to ``output_csv``. Each failing step prints a
    message and stops the pipeline.

    Args:
        video_path: Path of the video file to transcribe.
        video_link: Link stored next to the transcript in the CSV.
        model_path: Path of the serialized model to load.
        output_csv: Path of the CSV file to write. Defaults to "output.csv".

    Returns:
        None.
    """
    model = load_model(model_path)
    if model is None:
        print("Model loading failed.")
        return

    audio_path = extract_audio(video_path)
    if not audio_path:
        print("Failed to extract audio from video.")
        return

    try:
        mel_spectrogram = load_audio(audio_path)
        if mel_spectrogram is None:
            print("Failed to process audio for speech recognition.")
            return

        text = recognize_speech(mel_spectrogram, model)
        print(text)
        create_csv([(text, video_link)], output_csv)
    finally:
        # The extracted audio is only an intermediate artifact; remove it so
        # repeated runs do not leave stale files behind. Cleanup is
        # best-effort and must not mask the outcome of the pipeline.
        try:
            os.remove(audio_path)
        except OSError:
            pass

# Inputs for this run: the local video file, the link recorded next to the
# transcript, and the serialized model used for recognition.
video_path = "test.mp4"
video_link = "https://www.youtube.com/watch?v=Hb3v7zcu6UY&ab_channel=thenewboston"
model_path = "voice.pt"

# Run the whole pipeline for the video configured above.
video_to_text(
    video_path=video_path,
    video_link=video_link,
    model_path=model_path,
)

ModuleNotFoundError: No module named 'torchaudio'

In [1]:
# Use %pip (not !pip3) so torchaudio is installed into the running kernel's
# environment; keep the version in step with the installed torch release.
%pip install -q torchaudio==2.2.2


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
