In [1]:
# Mount Google Drive at /content/drive so later cells can read from and write to it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Project folder on the mounted Drive. Later cells append names to it with "+",
# so the trailing "/" is required.
project_path = "/content/drive/My Drive/Speech_project/" 

In [0]:
# !wget http://www.openslr.org/resources/12/train-clean-100.tar.gz

In [0]:
! cp "/content/drive/My Drive/Speech_project/train-clean-100.tar.gz" /content

In [0]:
!tar xzvf train-clean-100.tar.gz

In [0]:
# Delete the archive now that it has been extracted.
! rm -rf train-clean-100.tar.gz

In [0]:
# Unpack deepasr.zip into the working directory.
# NOTE(review): no earlier cell creates or copies deepasr.zip; presumably it was
# uploaded by hand. Confirm, or add the copy step, so a fresh run works.
! unzip deepasr.zip     

In [0]:
# ! rm -rf deepasr

In [0]:
# ! pip install --upgrade deepasr

In [0]:
# ! pip uninstall deepasr

In [0]:
# Pin TensorFlow to the version this notebook was run with (see the tf.__version__ cell).
! pip install tensorflow==2.1.0

In [0]:
# Colab magic selecting the TensorFlow 2.x line.
# NOTE(review): the previous cell already pip-installs tensorflow==2.1.0; confirm
# whether both steps are needed.
%tensorflow_version 2.x

# 1. Prepare Dataset

In [0]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import deepasr as asr
import librosa

In [5]:
# Display the TensorFlow version in use.
tf.__version__

'2.1.0'

In [6]:
# Display the deepasr version in use.
asr.__version__

'0.0.9'

In [0]:
# Collect (audio path, transcript) pairs.
# Expected layout: <org_path>/<dir>/<dir>/*.txt, where every non-empty line of a
# .txt file reads "<utterance-id> <word> <word> ..." and the matching audio file
# is stored in the same folder as "<utterance-id>.flac".
org_path = './LibriSpeech/train-clean-100/'
audio_name = []   # path of the .flac file for each transcript line
audio_trans = []  # transcript text, aligned index-by-index with audio_name

# sorted() gives a reproducible row order; os.listdir returns entries in
# arbitrary order.
for dir1 in sorted(os.listdir(org_path)):
    dir2_path = os.path.join(org_path, dir1)
    if not os.path.isdir(dir2_path):
        continue  # stray file at this level; listing it would raise
    for dir2 in sorted(os.listdir(dir2_path)):
        dir3_path = os.path.join(dir2_path, dir2)
        if not os.path.isdir(dir3_path):
            continue
        for file_name in sorted(os.listdir(dir3_path)):
            if not file_name.endswith('.txt'):
                continue
            with open(os.path.join(dir3_path, file_name), encoding='utf-8') as f:
                for line in f:
                    tokens = line.split()
                    if not tokens:
                        continue  # blank line: no utterance id to index
                    # First token is the utterance id, the rest is the text.
                    audio_name.append(os.path.join(dir3_path, tokens[0] + '.flac'))
                    audio_trans.append(' '.join(tokens[1:]))

In [0]:
# Put the collected audio paths and transcripts side by side in one table.
df = pd.DataFrame(dict(path=audio_name, transcripts=audio_trans))

In [9]:
# (rows, columns) of the full table: one row per transcript line.
df.shape

(28539, 2)

In [0]:
# Keep only the rows whose transcript is shorter than 100 characters.
# (Unused alternative from the original: train_df = df.sample(n = 3000))
train_data = df.loc[df['transcripts'].str.len().lt(100)]

In [11]:
# (rows, columns) remaining after the transcript-length filter.
train_data.shape

(3194, 2)

# 2. Prepare DeepAsr CTC Pipeline

In [0]:
# get CTCPipeline
def get_config(feature_type: str = 'spectrogram', multi_gpu: bool = False,
               features_num: int = 161, output_dim: int = 29,
               sample_rate: int = 16000, learning_rate: float = 1e-4):
    """Assemble a DeepAsr CTC pipeline: features, alphabet, model, optimizer, decoder.

    Args:
        feature_type: Feature kind forwarded to ``asr.features.preprocess``.
        multi_gpu: Forwarded unchanged to ``CTCPipeline``.
        features_num: Features per frame. Used for both the feature extractor
            and the model's ``input_dim`` so the two always agree.
        output_dim: Size of the model's output layer.
            NOTE(review): presumably the English alphabet size plus a CTC
            blank; confirm against ``asr.vocab.Alphabet``.
        sample_rate: Audio sample rate, shared by the extractor and the pipeline.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        The configured ``asr.pipeline.ctc_pipeline.CTCPipeline``.
    """
    # audio feature extractor
    # NOTE(review): winstep (0.025) is larger than winlen (0.02); these may be
    # swapped. Left unchanged because saved checkpoints depend on the values.
    features_extractor = asr.features.preprocess(
        feature_type=feature_type,
        features_num=features_num,
        samplerate=sample_rate,
        winlen=0.02,
        winstep=0.025,
        winfunc=np.hanning,
    )

    # input label encoder
    alphabet_en = asr.vocab.Alphabet(lang='en')

    # training model
    model = asr.model.get_deepasrnetwork1(
        input_dim=features_num,
        output_dim=output_dim,
        is_mixed_precision=True,
    )

    # model optimizer; `learning_rate` replaces the deprecated `lr` alias
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=learning_rate,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-8,
    )

    # output label decoder
    decoder = asr.decoder.GreedyDecoder()

    # CTCPipeline
    pipeline = asr.pipeline.ctc_pipeline.CTCPipeline(
        alphabet=alphabet_en,
        features_extractor=features_extractor,
        model=model,
        optimizer=optimizer,
        decoder=decoder,
        sample_rate=sample_rate,
        mono=True,
        multi_gpu=multi_gpu,
    )
    return pipeline

In [0]:
# Build the CTC pipeline for ASR with feature_type 'fbank' and multi_gpu disabled.
# NOTE(review): get_config defaults to 'spectrogram'; confirm the feature count it
# uses also suits 'fbank'.
pipeline = get_config(feature_type = 'fbank', multi_gpu=False)

# 3. Model training

In [21]:
# Train the ASR model on the filtered table; the value returned by fit() is kept
# in `history`.
history = pipeline.fit(train_dataset = train_data, batch_size=128, epochs=500)

# Alternative training entry points, left disabled:
# history = pipeline.fit_iter(train_dataset = train_data, batch_size=32, epochs=3,iter_num=500,checkpoint=project_path+'checkpoints')
# history = pipeline.fit_generator(train_dataset = train_data, batch_size=32, epochs=500)

Model: "DeepAsr"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
the_input (InputLayer)          [(None, None, 161)]  0                                            
__________________________________________________________________________________________________
BN_1 (BatchNormalization)       (None, None, 161)    644         the_input[0][0]                  
__________________________________________________________________________________________________
Conv1D_1 (Conv1D)               (None, None, 220)    177320      BN_1[0][0]                       
__________________________________________________________________________________________________
CNBN_1 (BatchNormalization)     (None, None, 220)    880         Conv1D_1[0][0]                   
____________________________________________________________________________________________

In [0]:
# Persist the trained DeepAsr CTC pipeline under the project's "checkpoints" path.
pipeline.save(f"{project_path}checkpoints")

# 4. Model testing

In [12]:
# Restore the CTC pipeline previously saved under the project's "checkpoints" path.
pipeline1 = asr.pipeline.load(f"{project_path}checkpoints")



In [13]:
# Pick one utterance at random and show its audio path and reference transcript.
# NOTE(review): the sample is drawn from train_data, i.e. data the model was
# fitted on, so this is a sanity check rather than a held-out evaluation.
index = np.random.randint(train_data.shape[0])
data = train_data.iloc[index]
# Read the row by column label. Integer keys on a label-indexed Series rely on
# a positional fallback that newer pandas versions deprecate.
test_file = data['path']
test_transcript = data['transcripts']
# Audio file
print("Audio File:",test_file)
# ground truth
print("Audio Transcription:", test_transcript)
print("Transcript length:",len(test_transcript))

Audio File: ./LibriSpeech/train-clean-100/27/124992/27-124992-0063.flac
Audio Transcription: WENT THROUGH THE PLAINS BUT WHEN THEY CAME NEAR THE MOUNTAINS
Transcript length: 61


In [14]:
# Run the restored pipeline on the selected audio file. The next cell reads
# pred[0] as the predicted text.
pred= pipeline1.predict(test_file)

Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.


In [15]:
# Upper-case the first prediction so it can be compared with the upper-case
# reference transcript printed above.
pred[0].upper()

'WENT THROUGH THE PLAINS BUT WHEN THEY CAME NEAR THE MOUNTAINS'