In [104]:
from moviepy.editor import VideoFileClip

def extract_audio(video_file, output_audio_file):
    video = VideoFileClip(video_file)
    audio = video.audio
    audio.write_audiofile(output_audio_file)
    video.close()

video_path = "v.mp4"
audio_output = "audio_output.wav"
extract_audio(video_path, audio_output)


MoviePy - Writing audio in audio_output.wav


                                                                     

MoviePy - Done.


In [105]:
import os

def print_file_size(file_path):
    file_size = os.path.getsize(file_path)  # Size in bytes
    file_size_MB = file_size   # Convert to MB
    print(f"File size: {file_size_MB:.2f} bytes")

audio_file = "audio_output.wav"  # Replace with your file path
print_file_size(audio_file)


File size: 9878478.00 bytes


In [106]:
from pydub import AudioSegment

def convert_to_mono(input_audio, output_audio):
    audio = AudioSegment.from_wav(input_audio)
    mono_audio = audio.set_channels(1)
    mono_audio.export(output_audio, format="wav")

input_audio = "audio_output.wav"
output_audio = "mono_audio_output.wav"
convert_to_mono(input_audio, output_audio)


In [117]:
from moviepy.editor import VideoFileClip, AudioFileClip, CompositeAudioClip


In [232]:
import io
import os
from google.cloud import speech

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "credentials.json"

def transcribe_audio_with_timestamps(audio_file):
    """Transcribe audio with timestamps using Google Speech-to-Text."""
    client = speech.SpeechClient()
    resultSpeech = ""
    # Load audio file
    with io.open(audio_file, "rb") as f:
        content = f.read()

    # Set up the RecognitionAudio and RecognitionConfig
    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code="en-US",
        enable_word_time_offsets=True  # Enable word-level timestamps
    )

    # Perform the speech recognition
    response = client.recognize(config=config, audio=audio)
    
    # Extract transcripts and timestamps
    transcript = ""
    last_end = 0
    for result in response.results:
        alternative = result.alternatives[0]
        transcript += alternative.transcript + " "
        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time.total_seconds()
            end_time = word_info.end_time.total_seconds()
            print(f"Word: {word}, Start time: {start_time}, End time: {end_time}")
            resultSpeech += f"(Word: {word}, breakPoint:{start_time-last_end }), "
            last_end = end_time
    return resultSpeech


# Example usage
transcript = transcribe_audio_with_timestamps("mono_audio.wav")


Word: hi, Start time: 3.3, End time: 3.7
Word: hi, Start time: 3.7, End time: 4.9
Word: myself, Start time: 4.9, End time: 5.9
Word: I'm, Start time: 5.9, End time: 8.1
Word: from, Start time: 8.1, End time: 8.3
Word: Kippur, Start time: 8.3, End time: 9.1
Word: this, Start time: 9.1, End time: 10.0
Word: take, Start time: 10.0, End time: 10.2
Word: of, Start time: 10.2, End time: 10.3
Word: Maharashtra, Start time: 10.3, End time: 10.8
Word: I, Start time: 10.8, End time: 11.6
Word: am, Start time: 11.6, End time: 11.9
Word: be, Start time: 11.9, End time: 12.1
Word: a, Start time: 12.1, End time: 12.2
Word: graduate, Start time: 12.2, End time: 12.4
Word: and, Start time: 12.4, End time: 13.3
Word: also, Start time: 13.3, End time: 13.4
Word: doing, Start time: 13.4, End time: 13.6
Word: Ma, Start time: 13.6, End time: 14.4
Word: from, Start time: 14.4, End time: 14.6
Word: she, Start time: 15.1, End time: 15.6
Word: was, Start time: 15.6, End time: 15.8
Word: University, Start time:

In [233]:
print(transcript)

(Word: hi, breakPoint:3.3), (Word: hi, breakPoint:0.0), (Word: myself, breakPoint:0.0), (Word: I'm, breakPoint:0.0), (Word: from, breakPoint:0.0), (Word: Kippur, breakPoint:0.0), (Word: this, breakPoint:0.0), (Word: take, breakPoint:0.0), (Word: of, breakPoint:0.0), (Word: Maharashtra, breakPoint:0.0), (Word: I, breakPoint:0.0), (Word: am, breakPoint:0.0), (Word: be, breakPoint:0.0), (Word: a, breakPoint:0.0), (Word: graduate, breakPoint:0.0), (Word: and, breakPoint:0.0), (Word: also, breakPoint:0.0), (Word: doing, breakPoint:0.0), (Word: Ma, breakPoint:0.0), (Word: from, breakPoint:0.0), (Word: she, breakPoint:0.5), (Word: was, breakPoint:0.0), (Word: University, breakPoint:0.0), (Word: I, breakPoint:0.0), (Word: love, breakPoint:0.0), (Word: cooking, breakPoint:0.0), (Word: and, breakPoint:0.0), (Word: singing, breakPoint:0.0), (Word: and, breakPoint:0.0), (Word: also, breakPoint:0.0), (Word: wrestling, breakPoint:0.0), (Word: uh, breakPoint:0.6000000000000014), (Word: tell, breakPoi

In [121]:
import requests

In [269]:
def correct_grammar(transcribed_text):
    """Correct grammar using Azure OpenAI."""
    azure_openai_key = os.getenv("AZURE_OPENAI_KEY")  # Load from .env
    azure_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")  # Load from .env

    headers = {
        "Content-Type": "application/json",
        "api-key": azure_openai_key
    }

    data = {
        "messages": [{"role": "user", "content": f"""
                    you will get a input of list of pairs in the format "Word: (word), breakPoint: (time)".
                    Using (word) from each line create a grammatically correct paragraph with punctuations then using this new paragraph replace the old word from given list with grammatically correct words(with punctuations) and corresponding breakpoint.
                    Input: '{transcript}'.
                    Only return the final list of pair and nothing else in one line without newline.
                    """}],
        "max_tokens": len(transcript)
    }

    response = requests.post(azure_openai_endpoint, headers=headers, json=data)

    if response.status_code == 200:
        result = response.json()
        return result["choices"][0]["message"]["content"].strip()
    else:
        return f"Error: {response.status_code} - {response.text}"


In [270]:
text = correct_grammar(transcript)
text

"(Word: Hi!, breakPoint:3.3), (Word: Hi., breakPoint:0.0), (Word: Myself, breakPoint:0.0), (Word: I'm, breakPoint:0.0), (Word: from, breakPoint:0.0), (Word: Kippur., breakPoint:0.0), (Word: This, breakPoint:0.0), (Word: take, breakPoint:0.0), (Word: of, breakPoint:0.0), (Word: Maharashtra., breakPoint:0.0), (Word: I, breakPoint:0.0), (Word: am, breakPoint:0.0), (Word: be, breakPoint:0.0), (Word: a, breakPoint:0.0), (Word: graduate, breakPoint:0.0), (Word: and, breakPoint:0.0), (Word: also, breakPoint:0.0), (Word: doing, breakPoint:0.0), (Word: MA, breakPoint:0.0), (Word: from, breakPoint:0.0), (Word: She, breakPoint:0.5), (Word: was, breakPoint:0.0), (Word: University., breakPoint:0.0), (Word: I, breakPoint:0.0), (Word: love, breakPoint:0.0), (Word: cooking, breakPoint:0.0), (Word: and, breakPoint:0.0), (Word: singing, breakPoint:0.0), (Word: and, breakPoint:0.0), (Word: also, breakPoint:0.0), (Word: wrestling., breakPoint:0.0), (Word: Uh, breakPoint:0.6000000000000014), (Word: tell, b

In [271]:
def convert_to_pairs(text):
    pairs = []
    
    # Split the input string into individual entries
    entries = text.split('), (')
    
    # Process each entry to extract word and breakPoint
    for entry in entries:
        # Remove any leading/trailing characters
        entry = entry.strip('() ')
        
        # Split the entry into word and breakpoint components
        components = entry.split(',')
        word = components[0].split(':')[1].strip()
        
        # Clean the breakpoint string before converting to float
        break_point = float(components[1].split(':')[1].strip().rstrip(')'))  # Remove any trailing parenthesis
        
        # Append the (word, breakpoint) pair to the list
        pairs.append((word, break_point))
    
    return pairs

In [272]:
def convert_to_ssml_with_breaks(pairs):
    ssml_output = '<speak>'
    ssml_output += f'<prosody rate="slow">'
    for word, break_point in pairs:
        # Add a break if the breakpoint is greater than 0
        if break_point > 0:
            ssml_output += f'<break time="{break_point}s"/>'
        # Add the word to the SSML string
        ssml_output += f'{word} '

    ssml_output += '</prosody>'
    ssml_output += '</speak>'
    return ssml_output

In [275]:
input_script = convert_to_ssml_with_breaks(convert_to_pairs(transcript))
print(input_script)

<speak><prosody rate="slow"><break time="3.3s"/>hi hi myself I'm from Kippur this take of Maharashtra I am be a graduate and also doing Ma from <break time="0.5s"/>she was University I love cooking and singing and also wrestling <break time="0.6000000000000014s"/>uh tell me a little about your work <break time="2.8999999999999986s"/>I am person of so many creative event like Lonnie and Bhangra <break time="1.0s"/>I have lots of N and also always doing roadside Network my most famous n was <break time="0.6000000000000014s"/>most successful not as in Kapoor <break time="0.5s"/>I always win every year <break time="1.1000000000000014s"/>I even talked to <break time="1.0s"/>Madame uru on full but that film was never happened <break time="0.8999999999999986s"/>um so why do you think you should </prosody></speak>


In [276]:
from google.cloud import texttospeech
import os
import io
import pydub
from pydub import AudioSegment

# Set up your Google Cloud credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "credentials.json"

# Initialize the Google Text-to-Speech client
client = texttospeech.TextToSpeechClient()

input_text = texttospeech.SynthesisInput(ssml=input_script)

# Select the voice and language code
voice = texttospeech.VoiceSelectionParams(
    language_code="en-US",
    ssml_gender=texttospeech.SsmlVoiceGender.MALE
)

# Select the audio format
audio_config = texttospeech.AudioConfig(
    audio_encoding=texttospeech.AudioEncoding.MP3
)

# Synthesize the speech
response = client.synthesize_speech(
    input=input_text, voice=voice, audio_config=audio_config
)

# Save the synthesized audio to a file
with open("output_with_ssml.mp3", "wb") as out:
    out.write(response.audio_content)

print("Audio with SSML saved as 'output_with_ssml.mp3'.")


Audio with SSML saved as 'output_with_ssml.mp3'.
