# Convert YouTube video to audio WAV format

In [None]:
import sys
!conda install -y -c conda-forge ffmpeg libsndfile -p {sys.prefix}
!pip install spleeter
!pip install azure-cognitiveservices-speech
!conda install -y -c conda-forge youtube_dl -p {sys.prefix} 
!conda install -y -c conda-forge pydub -p {sys.prefix} 
!conda install -y -c conda-forge python-dotenv -p {sys.prefix}

In [None]:

import spleeter 
from __future__ import unicode_literals
import youtube_dl 
from pydub import AudioSegment
from pydub.silence import split_on_silence
import csv
from pathlib import Path
from termcolor import colored
import os
import azure.cognitiveservices.speech as speechsdk
from dotenv import load_dotenv
load_dotenv()

In [None]:
# Base name (no extension) shared by the downloaded clip and the folder
# that holds the separated stems.
audio_file_name = 'st1'

# Video whose audio track is downloaded and transcribed.
youtube_url = 'https://youtu.be/DtsCgfLZnQQ'

# Azure Speech key taken from the environment; None when the variable
# SPEECH_SUBSCRIPTION is not defined.
speech_subscription = os.environ.get('SPEECH_SUBSCRIPTION')

## Extract and download the audio from YouTube

In [None]:
def download_clip(url, name):
    """Download the audio track of a video and store it as ``output/<name>.wav``.

    Args:
        url: Address of the video to fetch.
        name: Base file name (without extension) for the resulting WAV file.

    Returns:
        True when the download and the WAV conversion succeed, False when
        any step raises; the error is printed so the failure is visible.
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        # Relative to the working directory, so the later cells that read
        # "output/<name>.wav" find the file. The %(ext)s placeholder lets the
        # post-processor replace the downloaded extension with ".wav"; a
        # template that already ends in ".wav" makes it skip the conversion.
        'outtmpl': f'output/{name}.%(ext)s',
        'noplaylist': True,
        'continue_dl': True,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            # 'wav' is the codec name FFmpegExtractAudio accepts for PCM WAV
            # output; 'pcm' is not one of its known codecs. A bitrate setting
            # has no meaning for uncompressed WAV, so none is passed.
            'preferredcodec': 'wav',
        }],
    }
    try:
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.cache.remove()
            ydl.download([url])
        return True
    except Exception as error:
        # Keep the boolean contract for callers, but say what went wrong
        # instead of hiding it.
        print(f'Download of {url} failed: {error}')
        return False

download_clip(youtube_url, audio_file_name)

## Separate the voice from the audio


In [None]:
# Alter the stem value to split music into multiple tracks and files
!spleeter separate -p spleeter:2stems -o output "output/st1.wav"

In [None]:
def convert_audio(audio_file, frame_rate=16000, channels=1, sample_width=2):
    """Rewrite an audio file in place as an uncompressed PCM WAV file.

    The defaults produce 16 kHz, mono, 16-bit audio. The export no longer
    forces 8 kHz mu-law encoding, which used to discard the sample rate and
    sample width set on the segment just before exporting.

    Args:
        audio_file: Path of the file to convert; it is overwritten.
        frame_rate: Target sample rate in Hz.
        channels: Target number of channels (1 = mono).
        sample_width: Target bytes per sample (2 = 16-bit in pydub).
    """
    # from_file loads the whole clip into memory, so writing back to the
    # same path afterwards is safe.
    sound = AudioSegment.from_file(audio_file)
    sound = (
        sound.set_frame_rate(frame_rate)
        .set_channels(channels)
        .set_sample_width(sample_width)
    )
    # export returns the open output file; close it so the handle is not
    # left dangling until garbage collection.
    sound.export(audio_file, format="wav").close()

# adjust the sample rate so that we can extract the text from the audio
convert_audio('output/{}/vocals.wav'.format(audio_file_name))

## Send the audio to the Azure Speech-to-Text service


In [None]:
def from_file(filename=None, region="southcentralus"):
    """Transcribe the first utterance of a WAV file with Azure Speech to Text.

    Args:
        filename: Path of the WAV file to send. Defaults to the vocals track
            produced for ``audio_file_name``.
        region: Azure region of the Speech resource.

    Returns:
        The recognized text, or None when nothing was recognized or the
        request was canceled (the reason is printed in both cases).

    Raises:
        ValueError: If no subscription key was loaded from the environment.
    """
    if filename is None:
        filename = f"output/{audio_file_name}/vocals.wav"
    if not speech_subscription:
        raise ValueError(
            "SPEECH_SUBSCRIPTION is not set; define it in the environment or the .env file"
        )

    speech_config = speechsdk.SpeechConfig(subscription=speech_subscription, region=region)
    audio_input = speechsdk.AudioConfig(filename=filename)
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_input)

    # NOTE: recognize_once stops after the first utterance; transcribing a
    # whole clip needs continuous recognition instead.
    result = speech_recognizer.recognize_once_async().get()

    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        print(result.text)
        return result.text
    if result.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized: {}".format(result.no_match_details))
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation = result.cancellation_details
        print("Speech recognition canceled: {}".format(cancellation.reason))
        if cancellation.reason == speechsdk.CancellationReason.Error:
            # Typically a wrong key/region or an unsupported audio format.
            print("Error details: {}".format(cancellation.error_details))
    return None

transcript = from_file()