In [None]:
# De-biasing Transcribed Text from Automatic Speech Recognition Systems
# Copyright (C) 2021  Rigved Rakshit
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

# This code has been modified from the following Kaggle Jupyter notebook:
# https://www.kaggle.com/huseinzol05/sound-augmentation-librosa

In [None]:
import os
import csv
import wave
import random
import librosa.display
import librosa
import tempfile
import soundfile
import deepspeech
import numpy as np
import language_tool_python
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from IPython.display import Audio

In [None]:
# Randomly select a voice recording from the 100 available instances.
# These 100 voice recordings have been selected
# from the Mozilla CommonVoice v6.1 English dataset.
#
# Defines:
#   selected_filename     -- path of the chosen recording inside ./data
#   ground_truth_sentence -- its reference sentence, stripped and lower-cased

validated_csv_filename = os.path.join('./', 'validated.csv')

random.seed()

# Only regular files are candidates: stray sub-directories (or an empty
# folder) would otherwise only fail later, when the audio is loaded.
candidate_filenames = sorted(
    name for name in os.listdir('./data/')
    if os.path.isfile(os.path.join('./data', name))
)

if not candidate_filenames:
    raise FileNotFoundError('No voice recordings found in ./data/')

# Draw from the `random` module, i.e. the generator that was seeded above.
selected_basename = random.choice(candidate_filenames)
selected_filename = os.path.join('./data', selected_basename)
ground_truth_sentence = None

# newline='' is what the csv module documents for files handed to a reader,
# so that quoted fields containing line breaks are parsed correctly.
with open(validated_csv_filename, 'r', newline='') as validated_csv_file:
    for row in csv.DictReader(validated_csv_file):
        if row['path'] != selected_basename:
            continue

        print('Selected file has following attributes:\n')
        print('client_id: ' + row['client_id'])
        print('path:' + row['path'])
        print('sentence: ' + row['sentence'])
        ground_truth_sentence = row['sentence'].strip().lower()
        print('up_votes:' + row['up_votes'])
        print('down_votes: ' + row['down_votes'])
        print('age: ' + row['age'])
        print('gender: ' + row['gender'])
        print('accent: ' + row['accent'])
        print('locale: ' + row['locale'])
        print('segment: ' + row['segment'])

        break

# Fail here, with a clear message, rather than several cells later with a
# NameError on `ground_truth_sentence`.
if ground_truth_sentence is None:
    raise LookupError(
        'No row with path ' + repr(selected_basename)
        + ' found in ' + validated_csv_filename
    )

In [None]:
# Load the original audio (librosa resamples it to the requested 16 kHz)

y, sr = librosa.load(selected_filename, sr=16000)


def _roundtrip_pcm16(samples, sample_rate):
    """Write `samples` to a 16-bit PCM WAV file and read the frames back.

    Parameters:
        samples: audio samples as accepted by `soundfile.write`.
        sample_rate: sample rate stored in the WAV header.

    Returns:
        (filename, pcm): path of the WAV file that was written, and its
        frames as a NumPy int16 array (the form passed to `ds.stt` below).
    """
    # mkstemp creates the file atomically and leaves it in place. The
    # previous NamedTemporaryFile(...).name pattern deleted the file as soon
    # as the object was collected and then re-used the bare name.
    file_descriptor, filename = tempfile.mkstemp(suffix='.wav')
    os.close(file_descriptor)

    soundfile.write(filename, samples, sample_rate, subtype='PCM_16')

    with wave.open(filename, 'rb') as wav_file:
        pcm = np.frombuffer(
            wav_file.readframes(wav_file.getnframes()), np.int16
        )

    return filename, pcm


# Perform time-stretching on the original audio.
# `rate` is passed by keyword: it is keyword-only in newer librosa releases
# and accepted by keyword in older ones.

tmp_time_stretched = librosa.effects.time_stretch(y, rate=np.random.uniform(0.9, 1.1))

# Keep the result at the original length: zero-pad a shorter clip,
# truncate a longer one.
# NOTE(review): when the clip is slowed down (rate < 1) its tail is cut off,
# which may drop the end of the utterance -- confirm this is intended.
minlen = min(y.shape[0], tmp_time_stretched.shape[0])
y_time_stretched = np.zeros_like(y)
y_time_stretched[0:minlen] = tmp_time_stretched[0:minlen]

# Perform pitch-shifting on the original audio (`sr` by keyword, same reason)

y_pitch_shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=(4 * np.random.uniform()))

# Increase the loudness of the original audio
# NOTE(review): samples may leave [-1, 1] after the gain; confirm how the
# PCM_16 write treats out-of-range values.

y_louder = y * np.random.uniform(low=1.5, high=3.0)

# Save all clips as 16-bit PCM WAV files and reload them as int16 arrays,
# the format expected by the pre-trained Mozilla DeepSpeech binary.
# The original clip takes the same route as the augmented ones, so every
# clip is at `sr`; reading the source file raw would bypass the resampling
# done by librosa.load above. The files are left on disk under the
# *_filename names.

audio_input_filename, audio_input = _roundtrip_pcm16(y, sr)
audio_time_stretched_filename, audio_time_stretched = _roundtrip_pcm16(y_time_stretched, sr)
audio_pitch_shifted_filename, audio_pitch_shifted = _roundtrip_pcm16(y_pitch_shifted, sr)
audio_louder_filename, audio_louder = _roundtrip_pcm16(y_louder, sr)

In [None]:
# Load the pre-built DeepSpeech model and scorer memory-mapped files

deepspeech_model_path = os.path.join('./', 'deepspeech-0.9.3-models.pbmm')
deepspeech_scorer_path = os.path.join('./', 'deepspeech-0.9.3-models.scorer')

ds = deepspeech.Model(deepspeech_model_path)
ds.enableExternalScorer(deepspeech_scorer_path)

In [None]:
# Perform speech-to-text on the 4 audio clips, in the order
# original, time-stretched, pitch-shifted, louder

(
    audio_transcription_hypothesis,
    audio_time_stretched_transcription_hypothesis,
    audio_pitch_shifted_transcription_hypothesis,
    audio_loudness_level_increased_transcription_hypothesis,
) = [
    ds.stt(clip)
    for clip in (
        audio_input,
        audio_time_stretched,
        audio_pitch_shifted,
        audio_louder,
    )
]

In [None]:
# The baseline DeepSpeech model's output on the original audio

print('Ground truth sentence: ' + ground_truth_sentence)
print('DeepSpeech transcription of original audio: ' + audio_transcription_hypothesis)

print('Original audio wave-plot:')

# `waveplot` was deprecated and later removed from librosa in favour of
# `waveshow`; use `waveshow` when present and fall back on older releases.
plot_waveform = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot

plt.figure(figsize=(12, 4))
plot_waveform(audio_input.astype('float'), sr=sr)
plt.show()

# Last expression of the cell: rendered as an inline audio player
Audio(audio_input, rate=sr)

In [None]:
# The baseline DeepSpeech model's output on time-stretched audio

print('Ground truth sentence: ' + ground_truth_sentence)
print('DeepSpeech transcription of time-stretched audio: ' + audio_time_stretched_transcription_hypothesis)

print('Time-stretched audio wave-plot:')

# `waveplot` was deprecated and later removed from librosa in favour of
# `waveshow`; use `waveshow` when present and fall back on older releases.
plot_waveform = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot

plt.figure(figsize=(12, 4))
plot_waveform(audio_time_stretched.astype('float'), sr=sr)
plt.show()

# Last expression of the cell: rendered as an inline audio player
Audio(audio_time_stretched, rate=sr)

In [None]:
# The baseline DeepSpeech model's output on the pitch-shifted audio

print('Ground truth sentence: ' + ground_truth_sentence)
print('DeepSpeech transcription of pitch-shifted audio: ' + audio_pitch_shifted_transcription_hypothesis)

print('Pitch-shifted audio wave-plot:')

# `waveplot` was deprecated and later removed from librosa in favour of
# `waveshow`; use `waveshow` when present and fall back on older releases.
plot_waveform = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot

plt.figure(figsize=(12, 4))
plot_waveform(audio_pitch_shifted.astype('float'), sr=sr)
plt.show()

# Last expression of the cell: rendered as an inline audio player
Audio(audio_pitch_shifted, rate=sr)

In [None]:
# The baseline DeepSpeech model's output on the audio with volume increased

print('Ground truth sentence: ' + ground_truth_sentence)
print('DeepSpeech transcription of louder audio: ' + audio_loudness_level_increased_transcription_hypothesis)

print('Louder audio wave-plot:')

# `waveplot` was deprecated and later removed from librosa in favour of
# `waveshow`; use `waveshow` when present and fall back on older releases.
plot_waveform = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot

plt.figure(figsize=(12, 4))
plot_waveform(audio_louder.astype('float'), sr=sr)
plt.show()

# Last expression of the cell: rendered as an inline audio player
Audio(audio_louder, rate=sr)

In [None]:
# Run the baseline DeepSpeech transcription of the original audio
# through LanguageTool and show the corrected text next to the ground truth

language_tool = language_tool_python.LanguageTool('en-us')

print('Ground truth sentence: ' + ground_truth_sentence)

corrected_transcription = language_tool.correct(audio_transcription_hypothesis)
print('DeepSpeech transcription of after LanguageTool: ' + corrected_transcription)
