In [1]:
from google.colab import drive # to use goolge drive as disk

In [2]:
drive.mount('/content/gdrive/') # mounting gdrive

Mounted at /content/gdrive/


In [4]:
%cd /content/gdrive/MyDrive/Web_API/Speech-to-text
!ls 

/content/gdrive/MyDrive/Web_API/Speech-to-text
GCP_speech_to_text_APIs.ipynb  parts		 speechtextcredentals.json
genevieve.wav		       requirements.txt


# Preparing environment for this speech to text api

- Create an environment with python3

In [5]:
!cat requirements.txt # libraries required

google-api-python-client==1.6.4
httplib2==0.10.3
oauth2client==4.1.2
pyasn1==0.4.2
pyasn1-modules==0.2.1
rsa==3.4.2
six==1.12.0
SpeechRecognition==3.8.1
tqdm==4.19.5
uritemplate==3.0.0


In [6]:
!pip install -r requirements.txt # installing libraries as needed.

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting google-api-python-client==1.6.4
  Downloading google_api_python_client-1.6.4-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 978 kB/s 
[?25hCollecting httplib2==0.10.3
  Downloading httplib2-0.10.3.tar.gz (204 kB)
[K     |████████████████████████████████| 204 kB 3.9 MB/s 
[?25hCollecting oauth2client==4.1.2
  Downloading oauth2client-4.1.2-py2.py3-none-any.whl (99 kB)
[K     |████████████████████████████████| 99 kB 8.1 MB/s 
[?25hCollecting pyasn1==0.4.2
  Downloading pyasn1-0.4.2-py2.py3-none-any.whl (71 kB)
[K     |████████████████████████████████| 71 kB 8.6 MB/s 
[?25hCollecting pyasn1-modules==0.2.1
  Downloading pyasn1_modules-0.2.1-py2.py3-none-any.whl (60 kB)
[K     |████████████████████████████████| 60 kB 6.8 MB/s 
[?25hCollecting rsa==3.4.2
  Downloading rsa-3.4.2-py2.py3-none-any.whl (46 kB)
[K     |█████████████████████████████

- Divide the source into multiple parts

```ffmpeg -i genevieve.wav -f segment -segment_time 30 -c copy parts/out%09d.wav```

In [126]:
!pwd

/content/gdrive/MyDrive/Web_API/Speech-to-text


# English Recognition

In [127]:
!ffmpeg -i english.wav -f segment -segment_time 30 -c copy parts/out%09d.wav

ffmpeg version 3.4.11-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 7 (Ubuntu 7.5.0-3ubuntu1~18.04)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --enable-gpl --disable-stripping --enable-avresample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librubberband --enable-librsvg --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-li

In [128]:
# from SpeechRecognition package, import speech_recognition module
# https://github.com/Uberi/speech_recognition#readme
import speech_recognition as sr
sr

<module 'speech_recognition' from '/usr/local/lib/python3.7/dist-packages/speech_recognition/__init__.py'>

In [129]:
import os
from tqdm import tqdm

from multiprocessing.dummy import Pool
pool = Pool(8) # Number of concurrent threads

In [130]:
pool

<multiprocessing.pool.ThreadPool at 0x7fee685bfa10>

- Read the file contents of api-key.json file

In [131]:
with open("api-key.json") as f:
    GOOGLE_CLOUD_SPEECH_CREDENTIALS = f.read()

In [132]:
GOOGLE_CLOUD_SPEECH_CREDENTIALS[:10]+'...."' # cannot show you my key, sorry!

'{\n  "type"...."'

In [133]:
# get the recogniser from the speech_recognition api
r = sr.Recognizer()
r

<speech_recognition.Recognizer at 0x7fee68636e10>

In [134]:
# get the list of all the files that we want to convert
files = sorted(os.listdir('parts/'))
files

['out000000000.wav', 'out000000001.wav', 'out000000002.wav']

- Fucntion to convert the audio to text.

In [90]:
name = 'parts/'+files[0]
from IPython.display import Audio
Audio(name)

Output hidden; open in https://colab.research.google.com to view.

In [91]:
# Load audio file
with sr.AudioFile(name) as source:
    audio = r.record(source)
audio

<speech_recognition.AudioData at 0x7fee63611b10>

In [92]:
# Transcribe audio file
text = r.recognize_google_cloud(
    audio,
    credentials_json=GOOGLE_CLOUD_SPEECH_CREDENTIALS
    )
text

'this Dynamic Workshop aims to provide up-to-date information on pharmacological approaches, issues, and treatment in the geriatric population to assist in preventing medication related problems, appropriately and effectively managing medications and compliance. The concept of polypharmacy parentheses taking multiple types of drugs parentheses will also be discussed, as though '

In [93]:
def transcribe(data):
    idx, file = data
    name = "parts/" + file
    print(name + " started")
    # Load audio file
    with sr.AudioFile(name) as source:
        audio = r.record(source)
    # Transcribe audio file
    text = r.recognize_google_cloud(audio, credentials_json=GOOGLE_CLOUD_SPEECH_CREDENTIALS)
    print(name + " done")
    return {
        "idx": idx,
        "text": text
    }

In [94]:
# running them in parallel to make it faster
all_text = pool.map(transcribe, enumerate(files))
pool.close()
pool.join()


parts/out000000000.wav startedparts/out000000001.wav started

parts/out000000002.wav started
parts/out000000002.wav done
parts/out000000001.wav done
parts/out000000000.wav done


In [95]:
all_text

[{'idx': 0,
  'text': 'this Dynamic Workshop aims to provide up-to-date information on pharmacological approaches, issues, and treatment in the geriatric population to assist in preventing medication related problems, appropriately and effectively managing medications and compliance. The concept of polypharmacy parentheses taking multiple types of drugs parentheses will also be discussed, as though '},
 {'idx': 1,
  'text': 'is a common issue that can impact adverse side effects in the geriatric population. Participants will leave with the knowledge and considerations of common drug interactions and how to minimize the effects that limit function. Summit professional education is approved provider of continuing education. This course is offered for 6. '},
 {'idx': 2,
  'text': '. discourse contains a Content classified under the both the domain of occupational therapy and professional issues. '}]

# Hindi recognition

In [109]:
!ffmpeg -i hindi.wav -f segment -segment_time 30 -c copy partshindi/out%09d.wav

ffmpeg version 3.4.11-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 7 (Ubuntu 7.5.0-3ubuntu1~18.04)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --enable-gpl --disable-stripping --enable-avresample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librubberband --enable-librsvg --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-li

In [114]:
# get the list of all the files that we want to convert
files_hindi = sorted(os.listdir('partshindi/'))
files

['out000000000.wav', 'out000000001.wav', 'out000000002.wav']

In [115]:
name = 'partshindi/'+files_hindi[0]
from IPython.display import Audio
Audio(name)

Output hidden; open in https://colab.research.google.com to view.

In [116]:
def transcribe(data):
    idx, file = data
    name = "partshindi/" + file
    print(name + " started")
    # Load audio file
    with sr.AudioFile(name) as source:
        audio = r.record(source)
    # Transcribe audio file
    text = r.recognize_google_cloud(audio, credentials_json=GOOGLE_CLOUD_SPEECH_CREDENTIALS)
    print(name + " done")
    return {
        "idx": idx,
        "text": text
    }

In [117]:
# running them in parallel to make it faster
pool = Pool(8) # Number of concurrent threads
all_text = pool.map(transcribe, enumerate(files_hindi))
pool.close()
pool.join()



partshindi/out000000000.wav started
partshindi/out000000001.wav started
partshindi/out000000002.wav started
partshindi/out000000000.wav done
partshindi/out000000002.wav done
partshindi/out000000001.wav done


In [118]:
all_text

[{'idx': 0,
  'text': 'Google Plano I love you I love you I love you very very much '},
 {'idx': 1,
  'text': 'I love you and mount pocket blanket better Nana Banana cartoon alcohol bottle pinata '},
 {'idx': 2, 'text': 'Marion Kentucky take me to dental clinic video '}]

# Telugu Recognition

In [119]:
!ffmpeg -i telugu.wav -f segment -segment_time 30 -c copy partstelugu/out%09d.wav

ffmpeg version 3.4.11-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 7 (Ubuntu 7.5.0-3ubuntu1~18.04)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --enable-gpl --disable-stripping --enable-avresample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librubberband --enable-librsvg --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-li

In [135]:
# get the list of all the files that we want to convert
files_telugu= sorted(os.listdir('partstelugu/'))
files

['out000000000.wav', 'out000000001.wav', 'out000000002.wav']

In [136]:
name = 'partstelugu/'+files_telugu[0]
from IPython.display import Audio
Audio(name)

In [137]:
def transcribe(data):
    idx, file = data
    name = "partstelugu/" + file
    print(name + " started")
    # Load audio file
    with sr.AudioFile(name) as source:
        audio = r.record(source)
    # Transcribe audio file
    text = r.recognize_google_cloud(audio, credentials_json=GOOGLE_CLOUD_SPEECH_CREDENTIALS)
    print(name + " done")
    return {
        "idx": idx,
        "text": text
    }

In [138]:
# running them in parallel to make it faster
pool = Pool(8) # Number of concurrent threads
all_text = pool.map(transcribe, enumerate(files_telugu))
pool.close()
pool.join()
all_text

partstelugu/out000000000.wav startedpartstelugu/out000000001.wav started
partstelugu/out000000002.wav started

partstelugu/out000000002.wav done
partstelugu/out000000000.wav done
partstelugu/out000000001.wav done


[{'idx': 0,
  'text': 'give me tomorrow Mutant Ninja Turtle Mera Dil badal De naat '},
 {'idx': 1,
  'text': 'mitigated boots Google since I smoked a real name just like this what are not same '},
 {'idx': 2,
  'text': "Manisha margosa members volunteer team on Trump I don't nobody know your tongue "}]

## Looks like API is woring fine for english but not good for other languages.