## Record Audio from Microphone & Speech Recognition

In [None]:
# %%capture
# `wave` is part of the Python standard library and needs no pip install.
# %pip install pyaudio SpeechRecognition

In [1]:
import pyaudio
import wave
import speech_recognition as sr

This code is broken down into two main functions:

1. `record_audio_to_wav()`: This function records audio from the computer's microphone using the `pyaudio` library and saves it as a `.wav` file using the `wave` library.
2. `transcribe_audio_to_text()`: This function takes a `.wav` file as input and transcribes it into text using the `speech_recognition` library, which interfaces with Google's Web Speech API. 


In [104]:
def record_audio_to_wav(filename="microphone_output.wav", record_seconds=5):
    """
    Record audio from the system's audio input and save it as a WAV file.

    Parameters:
    - filename (str, optional): Path of the WAV file to write. Default is 'microphone_output.wav'.
    - record_seconds (int or float, optional): Length of the recording in seconds. Default is 5.

    The input stream and the PyAudio instance are always released, even when
    reading from the device fails part-way through the recording.
    """
    # Audio settings
    FORMAT = pyaudio.paInt16  # 16-bit integer samples
    CHANNELS = 2              # Stereo recording (1 for mono, 2 for stereo)
    RATE = 44100              # Samples per second
    CHUNK = 1024              # Number of audio frames read per buffer

    audio = pyaudio.PyAudio()
    try:
        # Query the sample width while the PyAudio instance is still alive;
        # it is needed later for the WAV header.
        sample_width = audio.get_sample_size(FORMAT)

        stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE,
                            input=True, frames_per_buffer=CHUNK)
        try:
            print("Recording...")
            # Number of CHUNK-sized reads needed to cover the requested duration
            num_chunks = int(RATE / CHUNK * record_seconds)
            frames = [stream.read(CHUNK) for _ in range(num_chunks)]
            print("Finished recording.")
        finally:
            stream.stop_stream()
            stream.close()
    finally:
        audio.terminate()

    # Save the recorded audio to a WAV file
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))

In [107]:
def transcribe_audio_to_text(filename="microphone_output.wav"):
    """
    Transcribe an audio file to text using Google's Web Speech API.

    Parameters:
    - filename (str, optional): Path of the audio file to transcribe.

    Returns the recognised text. When recognition fails, a descriptive
    message string is returned instead of raising.
    """
    speech_recognizer = sr.Recognizer()

    with sr.AudioFile(filename) as audio_source:
        # Load the complete recording into memory
        recording = speech_recognizer.record(audio_source)

        try:
            print("Transcribing...")
            result = speech_recognizer.recognize_google(recording)
        except sr.UnknownValueError:
            # The API could not make sense of the audio
            result = "Google Web Speech API could not understand the audio."
        except sr.RequestError as error:
            # The API could not be reached (e.g. no internet connection)
            result = f"Could not request results from Google Web Speech API; {error}"

    return result

In [108]:
# Record 10 seconds of audio and save it as microphone_output.wav
record_audio_to_wav(filename="microphone_output.wav", record_seconds=10)

Recording...
Finished recording.


In [111]:
from IPython.display import Audio
# Play an audio file
Audio("microphone_output.wav", rate=44100)

In [112]:
# Transcribe the saved audio to text
transcription = transcribe_audio_to_text("microphone_output.wav")
print("Transcription: ", transcription)

Transcribing...
result2:
{   'alternative': [   {   'transcript': 'hello I am Prashant welcome to the '
                                         'natural language processing course I '
                                         'hope you are enjoying thank you'}],
    'final': True}
Transcription:  hello I am Prashant welcome to the natural language processing course I hope you are enjoying thank you


In [113]:
transcription

'hello I am Prashant welcome to the natural language processing course I hope you are enjoying thank you'

## Translate from English to Hindi

In [116]:
# Hand-punctuated version of the recognised text; it replaces the recognizer
# result and is the input for the translation step.
transcription = 'hello! I am Prashant. Welcome to the natural language processing course. \
I hope you are enjoying! Thank you'

In [5]:
# %%capture
# !pip install googletrans==4.0.0-rc1

In [122]:
from googletrans import Translator, LANGUAGES

def translate_text_to_hindi(text):
    """Translate English `text` to Hindi with googletrans and return the translated string."""
    return Translator().translate(text, src='en', dest='hi').text

In [123]:
hindi_translation = translate_text_to_hindi(transcription)
print("Translated to Hindi: ", hindi_translation)

Translated to Hindi:  नमस्ते!मैं प्रशांत हूं।प्राकृतिक भाषा प्रसंस्करण पाठ्यक्रम में आपका स्वागत है।मुझे आशा है कि आप आनंद ले रहे हैं!धन्यवाद


### List of all the languages available through `googletrans` (Google Translate)

In [119]:
from googletrans import LANGUAGES

# Print every supported language as "<code>: <name>"
for lang_code in LANGUAGES:
    print(f"{lang_code}: {LANGUAGES[lang_code]}")


af: afrikaans
sq: albanian
am: amharic
ar: arabic
hy: armenian
az: azerbaijani
eu: basque
be: belarusian
bn: bengali
bs: bosnian
bg: bulgarian
ca: catalan
ceb: cebuano
ny: chichewa
zh-cn: chinese (simplified)
zh-tw: chinese (traditional)
co: corsican
hr: croatian
cs: czech
da: danish
nl: dutch
en: english
eo: esperanto
et: estonian
tl: filipino
fi: finnish
fr: french
fy: frisian
gl: galician
ka: georgian
de: german
el: greek
gu: gujarati
ht: haitian creole
ha: hausa
haw: hawaiian
iw: hebrew
he: hebrew
hi: hindi
hmn: hmong
hu: hungarian
is: icelandic
ig: igbo
id: indonesian
ga: irish
it: italian
ja: japanese
jw: javanese
kn: kannada
kk: kazakh
km: khmer
ko: korean
ku: kurdish (kurmanji)
ky: kyrgyz
lo: lao
la: latin
lv: latvian
lt: lithuanian
lb: luxembourgish
mk: macedonian
mg: malagasy
ms: malay
ml: malayalam
mt: maltese
mi: maori
mr: marathi
mn: mongolian
my: myanmar (burmese)
ne: nepali
no: norwegian
or: odia
ps: pashto
fa: persian
pl: polish
pt: portuguese
pa: punjabi
ro: romanian
r

## Extract Audio from a Video File, and then transcribe it

In [26]:
# !pip install moviepy

In [124]:
from moviepy.editor import *

def extract_audio_from_video(video_path, audio_output_path):
    """
    Extract the audio track of a video file and write it to an audio file.

    Parameters:
    - video_path (str): Path of the video file to read.
    - audio_output_path (str): Path of the audio file to write.

    Raises:
    - ValueError: If the video has no audio track.
    """
    # Load the video
    clip = VideoFileClip(video_path)
    try:
        if clip.audio is None:
            raise ValueError(f"No audio track found in video: {video_path}")

        # Extract audio
        clip.audio.write_audiofile(audio_output_path)
    finally:
        # Closing the clip releases the video reader as well as the audio reader
        clip.close()

In [125]:
video_file = r"What are LLM's or Large Language Models-.mp4"
audio_output = "output_audio.wav"  # or .mp3 or .ogg
extract_audio_from_video(video_file, audio_output)

MoviePy - Writing audio in output_audio.wav


                                                                                                                       

MoviePy - Done.




In [126]:
# Transcribe the saved audio to text
transcription = transcribe_audio_to_text("output_audio.wav")
print("Transcription: ", transcription)

Transcribing...
result2:
{   'alternative': [   {   'transcript': 'you have been using chat GPT to '
                                         'Converse with an AI powered '
                                         'cardboard but do you know what is '
                                         'behind it meet large language models '
                                         'or LLM these models are a '
                                         'breakthrough in the field of Natural '
                                         'Language Processing empowering '
                                         'machines to understand and generate '
                                         'human like language are built using '
                                         'deep learning techniques and trained '
                                         'on vast amounts of text data which '
                                         'allows them to recognise patterns '
                                         'an

In [127]:
transcription

"you have been using chat GPT to Converse with an AI powered cardboard but do you know what is behind it meet large language models or LLM these models are a breakthrough in the field of Natural Language Processing empowering machines to understand and generate human like language are built using deep learning techniques and trained on vast amounts of text data which allows them to recognise patterns and relationships within human language they can then use this learning to perform a variety of tasks such as answering questions generating text and even translation between languages but elements are more than just powerful there a gateway to revolutionize the way we interact with technology to the next time you use a chatbot or any other application remember that it's a result of years of research and development in the field of element"

In [129]:
transcription = """you have been using chat GPT to Converse with an AI powered chatbot but do you know what is behind it. 
meet large language models or LLM. 
these models are a breakthrough in the field of Natural Language Processing empowering machines to understand and generate human like language.
They are built using deep learning techniques and trained on vast amounts of text data which allows them to recognise patterns and relationships within human language.
they can then use this learning to perform a variety of tasks such as answering questions, generating text, and even translation between languages.
but LLMs are more than just powerful there a gateway to revolutionize the way we interact with technology.
to the next time you use a chatbot or any other application remember that it's a result of years of research and development in the field of LLMs"""

In [130]:
hindi_translation = translate_text_to_hindi(transcription)
print("Translated to Hindi: ", hindi_translation)

Translated to Hindi:  आप AI संचालित चैटबॉट के साथ बातचीत करने के लिए चैट GPT का उपयोग कर रहे हैं, लेकिन क्या आप जानते हैं कि इसके पीछे क्या है।
बड़े भाषा मॉडल या एलएलएम से मिलें।
ये मॉडल भाषा की तरह मानव को समझने और उत्पन्न करने के लिए प्राकृतिक भाषा प्रसंस्करण सशक्त मशीनों के क्षेत्र में एक सफलता हैं।
वे गहरी सीखने की तकनीकों का उपयोग करके बनाए गए हैं और बड़ी मात्रा में पाठ डेटा पर प्रशिक्षित हैं जो उन्हें मानव भाषा के भीतर पैटर्न और संबंधों को पहचानने की अनुमति देता है।
फिर वे इस सीखने का उपयोग विभिन्न प्रकार के कार्यों को करने के लिए कर सकते हैं जैसे कि प्रश्नों का उत्तर देना, पाठ उत्पन्न करना और यहां तक कि भाषाओं के बीच अनुवाद भी।
लेकिन एलएलएम सिर्फ शक्तिशाली से अधिक हैं, जिस तरह से हम प्रौद्योगिकी के साथ बातचीत करते हैं।
अगली बार जब आप एक चैटबॉट या किसी अन्य एप्लिकेशन का उपयोग करते हैं, तो याद रखें कि यह एलएलएम के क्षेत्र में अनुसंधान और विकास के वर्षों का परिणाम है


In [131]:
# %%capture
# !pip install gTTS

In [132]:
from gtts import gTTS

def text_to_speech(text, lang='en', output_file='output.mp3'):
    """
    Synthesize speech for the given text with gTTS and write it to an audio file.

    Parameters:
    - text (str): The text to speak.
    - lang (str, optional): Language code of the text; 'en' (English) by default.
      Change it to switch the spoken language, e.g. 'fr' for French or 'es' for Spanish.
    - output_file (str, optional): Destination path of the audio file; 'output.mp3' by default.
    """
    speech = gTTS(text=text, lang=lang, slow=False)
    speech.save(output_file)
    print(f"Audio saved to {output_file}")

In [134]:
text_input = "Hello, this is a test for text to speech conversion."
text_to_speech(text_input)

Audio saved to output.mp3


In [137]:
text_input = "hello! I am Prashant. Welcome to the natural language processing course. I hope you are enjoying! Thank you"
text_to_speech(text_input)
Audio("output.mp3")

Audio saved to output.mp3


# Text to Speech using Amazon's Polly

In [27]:
# %%capture
# !pip install boto3

In [138]:
import boto3
import csv

def load_aws_credentials_from_csv(file_path):
    """
    Read AWS credentials from the first data row of a CSV file.

    Parameters:
    - file_path (str): Path of a CSV file with 'AccessKeyId' and 'SecretAccessKey' columns.

    Returns:
    - dict: {'aws_access_key_id': ..., 'aws_secret_access_key': ...}

    Raises:
    - ValueError: If the file contains no data row.
    - KeyError: If one of the expected columns is missing.
    """
    # 'utf-8-sig' drops a leading byte-order mark so the first header is read cleanly
    with open(file_path, 'r', encoding='utf-8-sig', newline='') as csv_file:
        first_row = next(csv.DictReader(csv_file), None)

    if first_row is None:
        # Fail here rather than returning None and crashing later in the caller
        raise ValueError(f"No credential rows found in {file_path}")

    return {
        'aws_access_key_id': first_row['AccessKeyId'],
        'aws_secret_access_key': first_row['SecretAccessKey'],
    }

In [139]:
# Load AWS credentials from CSV
# NOTE(review): the file name suggests AWS root-account access keys; a dedicated
# least-privilege IAM user is the safer choice — confirm which keys this file holds.
credentials_path = "AWS_Rootkeys.csv"
credentials = load_aws_credentials_from_csv(credentials_path)

In [140]:
# Show only which credential fields were loaded; the values are secrets and
# must never be rendered into the notebook output.
{field: "<redacted>" for field in credentials}

{'aws_access_key_id': '<redacted>',
 'aws_secret_access_key': '<redacted>'}

In [141]:
def synthesize_speech(text, credentials, output_file='output.mp3', OutputFormat='mp3', VoiceId='Joanna',
                      TextType='text', Engine='standard', region_name='ap-south-1'):
    """
    Convert the provided text to speech using Amazon Polly.

    Parameters:
    - text (str): The text to be converted.
    - credentials (dict): AWS credentials with the keys 'aws_access_key_id' and 'aws_secret_access_key'.
    - output_file (str, optional): The path to save the audio file. Default is 'output.mp3'.
    - OutputFormat (str, optional): Audio format requested from Polly. Default is 'mp3'.
    - VoiceId (str, optional): Polly voice to use. Default is 'Joanna'.
    - TextType (str, optional): Type of the input text passed to Polly. Default is 'text'.
    - Engine (str, optional): Polly engine, e.g. 'standard' or 'neural'. Default is 'standard'.
    - region_name (str, optional): AWS region of the Polly endpoint. Default is 'ap-south-1'.
    """
    # Create a Polly client using the provided credentials
    polly_client = boto3.client('polly',
                                aws_access_key_id=credentials['aws_access_key_id'],
                                aws_secret_access_key=credentials['aws_secret_access_key'],
                                region_name=region_name)

    # Request speech synthesis; OutputFormat is forwarded so the caller's choice takes effect
    response = polly_client.synthesize_speech(Text=text, OutputFormat=OutputFormat, VoiceId=VoiceId,
                                              TextType=TextType, Engine=Engine)

    # Save the synthesized speech and always release the HTTP response stream
    audio_stream = response['AudioStream']
    try:
        with open(output_file, 'wb') as audio_file:
            audio_file.write(audio_stream.read())
    finally:
        audio_stream.close()

    print(f"Audio saved to {output_file}")

In [143]:
text_input = "hello! I am Prashant. Welcome to the natural language processing course. I hope you are enjoying! Thank you"

In [144]:
# text_input = transcription
synthesize_speech(text_input, credentials)

Audio saved to output.mp3


In [62]:
# %%capture
# !pip install ipython

In [145]:
from IPython.display import Audio

# Play an audio file
Audio("output.mp3")


In [None]:
# https://docs.aws.amazon.com/polly/latest/dg/voicelist.html

In [146]:
# text_input = "Hello, this is a test for text to speech conversion, haha."
synthesize_speech(text_input, credentials, VoiceId='Kajal', Engine='neural') # Kajal is bilingual
Audio("output.mp3")

Audio saved to output.mp3


In [147]:
synthesize_speech(text_input, credentials, VoiceId='Aditi')
Audio("output.mp3")

Audio saved to output.mp3


In [148]:
synthesize_speech(text_input, credentials, VoiceId='Raveena')
Audio("output.mp3")

Audio saved to output.mp3


Hindi can be used in two different forms:

- Devanagari: "उसने कहाँ, खेल तोह अब शुरू होगा"
- Romanagari (using the Latin alphabet): "Usne kahan, khel toh ab shuru hoga"

Additionally, it's possible to mix English and Hindi of either or both forms within a single sentence:
- Devanagari + English: "This is the song कभी कभी अदिति"
- Romanagari + English: "This is the song from the movie Jaane Tu Ya Jaane Na."
- Devanagari + Romanagari + English: "This is the song कभी कभी अदिति from the movie Jaane Tu Ya Jaane Na."

Because Aditi is a bilingual voice, text in all of these cases will be read correctly, as Amazon Polly can differentiate between the languages and scripts.

In [156]:
# Example sentences for the script combinations listed in the markdown cell;
# only the selected entry is synthesized.
hindi_examples = [
    "उसने कहाँ, खेल तोह अब शुरू होगा",  # Devanagari
    "Usne kaha, khel toh ab shuru hoga",  # Romanagari
    "This is the song कभी कभी अदिति",  # Devanagari + English
    "This is the song from the movie Jaane Tu Ya Jaane Na.",  # Romanagari + English
    "This is the song कभी कभी अदिति from the movie Jaane Tu Ya Jaane Na.",  # Devanagari + Romanagari + English
]
text_input = hindi_examples[-1]  # change the index to try another example

# synthesize_speech(text_input, credentials, VoiceId='Aditi')
synthesize_speech(text_input, credentials, VoiceId='Kajal', Engine='neural')
Audio("output.mp3")

Audio saved to output.mp3
