## Speech to Text

### Download Dataset

Source: Mozilla Common Voice — https://voice.mozilla.org/en/datasets

In [1]:
# Fetch the Common Voice archive. Only the English download is active; the
# Chinese one is kept disabled for reference.
# NOTE(review): the recorded output reports a 28G archive and the transfer was
# interrupted (^C), so en.tar.gz may be incomplete on disk.
#!wget https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-CN.tar.gz # chinese dataset
!wget https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/en.tar.gz # english dataset

--2019-12-10 12:07:39--  https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/en.tar.gz
Resolving voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com (voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com)... 52.218.204.242
Connecting to voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com (voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com)|52.218.204.242|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30159205077 (28G) [application/octet-stream]
Saving to: ‘en.tar.gz’

en.tar.gz             0%[                    ]  57.85M  18.4MB/s    eta 36m 59s^C


In [0]:
# Remove the generated train/test folders so preprocessing can start clean.
# The disabled first line would delete the whole extracted corpus instead.
#!rm -rf /content/English
!rm -rf /content/English/train_data
!rm -rf /content/English/test_data

In [0]:
!mkdir /content/English
!tar -C /content/English -xf /content/en.tar.gz


gzip: stdin: unexpected end of file
tar: Unexpected EOF in archive
tar: Unexpected EOF in archive
tar: Error is not recoverable: exiting now


In [0]:
# Count the entries of the clips folder, leaving out sub-directories
# (`ls -F` marks directories with a trailing '/', which grep filters out).
!ls /content/English/clips/ -F |grep -v / | wc -l

527915


In [3]:
# Install torchaudio, used below to load clips and compute spectrograms.
# NOTE(review): the version is not pinned. In the recorded run this pulled
# torchaudio 0.3.1 and replaced torch 1.3.1 with 1.3.0, which pip reported as
# incompatible with the installed torchvision 0.4.2.
!pip install torchaudio

Collecting torchaudio
[?25l  Downloading https://files.pythonhosted.org/packages/f6/3d/7bcc3476f00d8dd8735d384230cb787d7e91ce6e1b51cef802d6bc5f4ff3/torchaudio-0.3.1-cp36-cp36m-manylinux1_x86_64.whl (2.7MB)
[K     |████████████████████████████████| 2.7MB 6.3MB/s 
[?25hCollecting torch==1.3.0
[?25l  Downloading https://files.pythonhosted.org/packages/ae/05/50a05de5337f7a924bb8bd70c6936230642233e424d6a9747ef1cfbde353/torch-1.3.0-cp36-cp36m-manylinux1_x86_64.whl (773.1MB)
[K     |████████████████████████████████| 773.1MB 21kB/s 
[31mERROR: torchvision 0.4.2 has requirement torch==1.3.1, but you'll have torch 1.3.0 which is incompatible.[0m
Installing collected packages: torch, torchaudio
  Found existing installation: torch 1.3.1
    Uninstalling torch-1.3.1:
      Successfully uninstalled torch-1.3.1
Successfully installed torch-1.3.0 torchaudio-0.3.1


### Preprocessing

In [0]:
# let's listen to a few audios
import os
import re
import string
#import IPython
import numpy as np 
import pandas as pd

from shutil import copyfile

import torch
import torchaudio
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


# import warnings
# warnings.filterwarnings("ignore")

In [0]:
# Process and save Common Voice dataset
# Root folder that holds the per-language Common Voice folders.
cv_dir = os.path.join('/content')
# Output alphabet: '_' at index 0, then the letters a-z, then the space character.
chars = ['_'] + list(string.ascii_lowercase) + [' ']
# One integer id per character, in alphabet order.
tokens = list(range(len(chars)))
# Character -> integer id lookup used to encode target sentences.
tokenize_dict = dict(zip(chars, tokens))


def create_dirs(lang):
    """
        Makes sure the output folders for one language exist.

        Args:
            lang: name of the language folder located under `cv_dir`

        Returns:
            (lang_dir, train_dir, test_dir), where train_dir and test_dir are
            the 'train_data' and 'test_data' sub-folders of lang_dir
    """
    lang_dir = os.path.join(cv_dir, lang)
    train_dir = os.path.join(lang_dir, 'train_data')
    test_dir = os.path.join(lang_dir, 'test_data')
    for out_dir in (train_dir, test_dir):
        # exist_ok makes re-running harmless; makedirs also creates lang_dir
        # itself when it is missing, where os.mkdir raised FileNotFoundError.
        os.makedirs(out_dir, exist_ok=True)
    return lang_dir, train_dir, test_dir


def process_sent(s):
    """
        Encodes a sentence as a comma separated string of token ids.

        The sentence is lower-cased and stripped of ASCII punctuation, then
        every remaining character is looked up in `tokenize_dict`.

        Args:
            s: sentence text

        Returns:
            token ids joined with ',' (empty string for an empty sentence)

        Raises:
            KeyError: if a character is missing from `tokenize_dict`
    """
    s = s.lower()
    s = s.translate(str.maketrans('', '', string.punctuation))
    # The ids are ints and str.join only accepts strings, so convert each one.
    return ','.join(str(tokenize_dict[c]) for c in s)


def parse_df(df, lan, lang_dir, save_dir, limit=1000):
    """
        Extracts audio and sentence from df

        Usable clips are copied from <lang_dir>/clips into save_dir under
        indexed names. A row is skipped when its sentence contains digits, is
        not a string, cannot be encoded by process_sent, or when its audio
        file is missing.

        Args:
            df: dataframe with "path" (clip file name) and "sentence" columns
            lan: prefix of the new clip names (f'{lan}_{i}.mp3')
            lang_dir: folder that contains the 'clips' sub-folder
            save_dir: destination folder for the copied clips
            limit: stop as soon as this many clips have been kept

        Returns new df with columns
            "clip" (indexed names)
            "sentence" (corresponding target sentence, encoded by process_sent)
    """
    data = []
    clips_dir = os.path.join(lang_dir, 'clips')
    i = 0
    for clip, sent in zip(df.path, df.sentence):
        try:
            if re.search(r'\d', sent):  # avoiding numbers from target sentences
                continue
            # Encode BEFORE copying: a sentence that cannot be encoded must not
            # leave a copied clip behind in save_dir.
            target = process_sent(sent)
            clip_name = f'{lan}_{i}.mp3'
            src = os.path.join(clips_dir, clip)
            dst = os.path.join(save_dir, clip_name)
            copyfile(src, dst)
        except FileNotFoundError:  # some audio files may not exist
            continue
        except TypeError:  # sentence (or path) is not a string, e.g. NaN
            continue
        except KeyError:  # for characters such as ú
            continue
        data.append((clip_name, target))
        i += 1  # update counter
        if i == limit:
            break
    return pd.DataFrame(data, columns=['clip', 'sentence'])
        

# def convert_to_wav(clips_dir, clip, save_clip_dir):
#     """
#         Converts and saves mp3 to wav
#     """
#     mp3_dir = os.path.join(clips_dir, clip)
#     mp3_file = AudioSegment.from_mp3(mp3_dir)
#     mp3_file.export(save_clip_dir, format='wav')

  
lang = "English"
lang_dir, train_dir, test_dir = create_dirs(lang)

# Read the three tab-separated split manifests found in the language folder.
train_df, dev_df, test_df = [
    pd.read_csv(os.path.join(lang_dir, tsv_name), delimiter='\t')
    for tsv_name in ('train.tsv', 'dev.tsv', 'test.tsv')
]
# The dev split is folded into the training pool.
train_df = pd.concat([train_df, dev_df])

# Training subset: copy the clips, then store the clip/sentence table next to them.
print('Preparing Train dataset')
train_data_df = parse_df(train_df, 'eng', lang_dir, train_dir, limit=60000)
train_data_df.to_csv(os.path.join(train_dir, 'train_data.csv'))

# Test subset: same procedure with a smaller cap.
print('Preparing test dataset')
test_data_df = parse_df(test_df, 'eng', lang_dir, test_dir, limit=2000)
test_data_df.to_csv(os.path.join(test_dir, 'test_data.csv'))

Preparing Train dataset
Preparing test dataset


In [0]:
# Count the entries (sub-directories excluded) in the generated train/test folders.
# Each folder also holds its CSV table, written by the preprocessing cell.
!ls /content/English/train_data/ -F |grep -v / | wc -l
!ls /content/English/test_data/ -F |grep -v / | wc -l

58103
2001


In [0]:
# creata a file list
!cd /content/English/train_data/ && printf '%s\n' * > /content/train_files.txt
!cd /content/English/test_data/ && printf '%s\n' * > /content/test_files.txt
# compress the dataset
!tar -c -C /content/English/train_data/ -T train_files.txt -f /content/train_data.tar.gz 
!tar -c -C /content/English/test_data/ -T test_files.txt -f /content/test_data.tar.gz 
# delete the extra files
!rm /content/*.txt

#### Upload the created dataset to drive 

In [5]:
# Mount Google Drive at /content/drive (Colab only; asks for an authorization
# code, as the recorded output shows).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Copy both archives into the "Datasets" folder of the mounted Drive.
# NOTE(review): assumes "My Drive/Datasets" already exists — cp does not create it.
!cp /content/train_data.tar.gz /content/drive/My\ Drive/Datasets/train_data.tar.gz
!cp /content/test_data.tar.gz /content/drive/My\ Drive/Datasets/test_data.tar.gz

#### Download dataset from Drive

In [6]:
# # Download dataset from drive
!cp /content/drive/My\ Drive/Datasets/train_data.tar.gz /content/train_data.tar.gz
!cp /content/drive/My\ Drive/Datasets/test_data.tar.gz /content/test_data.tar.gz
# make dirs
train_dir = '/content/train_data'
test_dir = '/content/test_data'
!mkdir /content/train_data
!mkdir /content/test_data
# Extracts the dataset
!tar -C /content/train_data -xf /content/train_data.tar.gz
!tar -C /content/test_data -xf /content/test_data.tar.gz
# confirm the number of audio clips
!ls /content/train_data/ -F |grep -v / | wc -l
!ls /content/test_data/ -F |grep -v / | wc -l
# dataframes
train_data_df = pd.read_csv(os.path.join(train_dir, 'train_data.csv'))
test_data_df = pd.read_csv(os.path.join(test_dir, 'test_data.csv'))

58103
2001


In [0]:
# Clean-up: deletes the extracted train/test folders.
# NOTE(review): the training cell builds its dataset from train_dir
# ('/content/train_data'), so running this cell before training removes the
# clips it reads — confirm this is only meant to be run manually afterwards.
!rm -rf /content/train_data
!rm -rf /content/test_data

#### Dataset

In [0]:
class SpeechDataset(Dataset):
    """
        Serves (spectrogram, encoded sentence) pairs for CTC training.

        Each item is a tuple (x, target, input_length, target_length):
            x: spectrogram of the clip; when max_data_len > 0 its last axis is
               zero-padded or cut to exactly max_data_len frames
            target: int32 tensor of max_target_len token ids, zero-padded
            input_length: number of frames reported to the loss
            target_length: number of token ids in the sentence
    """
    def __init__(self, df, data_dir, max_target_len, max_data_len):
        """
            df: dataframe with columns "clip" (file name inside data_dir) and
                "sentence" (comma separated token ids)
            data_dir: folder holding the audio clips
            max_target_len: fixed length of the returned target tensor
            max_data_len: fixed number of spectrogram frames (<= 0 disables
                padding / cutting)
        """
        self.df = df
        self.data_dir = data_dir
        self.max_data_len = max_data_len
        self.max_target_len = max_target_len
        # Built once here instead of on every __getitem__ call.
        self.spectrogram = torchaudio.transforms.Spectrogram()
    
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        # preparing audio data
        filename = os.path.join(self.data_dir, self.df['clip'].iloc[idx])
        waveform, _ = torchaudio.load(filename)
        x = self.spectrogram(waveform)
        n_frames = x.shape[2]

        if self.max_data_len > 0:
            # NOTE(review): the "-2" margin is kept from the original code. The
            # loss needs input_length <= number of time steps the model emits;
            # a model that shortens the time axis by more than 2 frames would
            # need a larger margin — confirm against the network in use.
            input_length = min(self.max_data_len - 2, n_frames)
            if n_frames < self.max_data_len:
                pad = x.new_zeros((x.shape[0], x.shape[1], self.max_data_len - n_frames))
                x = torch.cat((x, pad), dim=2)
            else:
                x = x[:, :, :self.max_data_len]
        else:
            # Padding/cutting disabled: every frame is valid (the old formula
            # produced a negative length here).
            input_length = n_frames
        
        # preparing target
        sent = self.df['sentence'].iloc[idx]
        # pandas reads an empty CSV field back as NaN, so a sentence that
        # encoded to '' arrives here as a float; treat it as "no tokens".
        if isinstance(sent, str) and sent:
            token_ids = [int(tok) for tok in sent.split(',')]
        else:
            token_ids = []
        target = torch.zeros((self.max_target_len), dtype=torch.int32)
        target_length = len(token_ids)
        if target_length:
            target[:target_length] = torch.tensor(token_ids)
        return (x, target, input_length, target_length)

### Model

In [0]:
class ASRNetwork(nn.Module):
    """
        Conv2d -> LSTM -> Linear network that emits per-frame log-probabilities
        over the output characters, in the layout expected by the CTC loss.
    """
    def __init__(self, in_c, out_c, kernel_size, input_len, hidden_size, num_layers, output_shape, bidirectional):
        """
            in_c, out_c, kernel_size: Conv2d channels and kernel size
            input_len: LSTM feature size; must equal out_c * (rows left after the conv)
            hidden_size, num_layers, bidirectional: LSTM configuration
            output_shape: number of possible characters
        """
        super(ASRNetwork, self).__init__()
        self.conv = nn.Conv2d(in_c, out_c, kernel_size)
        self.lstm = nn.LSTM(input_len, hidden_size, num_layers=num_layers, bidirectional=bidirectional)
        self.linear = nn.Linear(hidden_size*2 if bidirectional else hidden_size, output_shape) # output shape = num possible chracters
        # Normalises over the character axis of the (seq_len, bs, chars) output.
        self.softmax = nn.LogSoftmax(dim=2)
    
    def forward(self, x):
        """
            conv input shape: (bs, in_c, rows, cols)
            conv out shape :  (bs, out_c, rows - k + 1, cols - k + 1) for kernel size k
            lstm input shape: (seq_len, bs, input_len), with seq_len = cols - k + 1
                              and input_len = out_c * (rows - k + 1)
            lstm output shape: (seq_len, bs, hidden_size * num_directions) # hidden state from each timestep
            returns: (seq_len, bs, output_shape) log-probabilities
        """
        x = self.conv(x).flatten(start_dim=1, end_dim=2)
        x = x.permute(2, 0, 1)
        x, _ = self.lstm(x)
        # nn.Linear acts on the last axis, so all timesteps are projected at
        # once. F.ctc_loss expects LOG-probabilities; returning plain softmax
        # probabilities made the loss meaningless. argmax decoding is unchanged.
        return self.softmax(self.linear(x))

### Training

In [0]:
def train(model, device, train_loader, optimizer, epoch, log_interval):
    """
        Runs one pass over train_loader, optimising the CTC loss.

        Every batch is a 4-tuple (inputs, targets, input lengths, target
        lengths); all four are moved to `device` before the forward pass.
        A progress line is printed every `log_interval` batches.
    """
    model.train()
    for step, batch in enumerate(train_loader):
        inputs, labels, input_lengths, label_lengths = (item.to(device) for item in batch)

        optimizer.zero_grad()
        predictions = model(inputs)
        # default blank token : 0
        loss = F.ctc_loss(predictions, labels, input_lengths, label_lengths)
        loss.backward()
        optimizer.step()

        if step % log_interval != 0:
            continue
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, step * len(inputs), len(train_loader.dataset),
            100. * step / len(train_loader), loss.item()))

In [0]:
# Use the GPU when one is attached; otherwise fall back to the CPU instead of
# failing on the first .to(device).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Network configuration
in_c = 1
out_c = 1
kernel_size = 5
# LSTM feature size = out_c * (spectrogram rows left after the conv).
# NOTE(review): 197 presumably comes from 201 spectrogram rows minus
# (kernel_size - 1) — verify against the Spectrogram settings in use.
input_len = 197
hidden_size = 50
num_layers = 3
output_shape = 28  # one output per entry of the character alphabet
bidirectional = True
model = ASRNetwork(in_c, out_c, kernel_size, input_len, hidden_size, num_layers, output_shape, bidirectional).to(device)

# Optimisation settings
optimizer = optim.Adam(model.parameters(), lr=0.0005)
epochs = 10
log_interval = 15

# Fixed sizes every sample is padded / cut to
max_data_len = 5000
max_sent_len = 300

train_dataset = SpeechDataset(train_data_df, train_dir, max_sent_len, max_data_len)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

for epoch in range(epochs):
    train(model, device, train_loader, optimizer, epoch, log_interval)
    print('-'*10)



## Testing

In [0]:
### Testing on personal data
# NOTE(review): `IPython` is never imported in the visible cells (the import in
# the preprocessing cell is commented out), so the last line presumably needs
# `import IPython` first — confirm.
# NOTE(review): the preprocessing step stores clips as '<lan>_<i>.mp3'; confirm
# that this .wav file actually exists.
wav_path = '/content/train_data/eng_5.wav'
IPython.display.Audio(wav_path)

In [0]:
# NOTE(review): AudioSegment, get_wav_info, graph_spectrogram and token_to_char
# are not defined or imported in any visible cell; token_to_char is presumably
# the inverse of tokenize_dict (id -> character) — confirm before running.
wav_file = AudioSegment.from_wav(wav_path)  # Load wav file
rate, data = get_wav_info(wav_path) # Extract sampling rate, and data
audio = np.array(wav_file.get_array_of_samples())
audio_sample = audio.reshape(-1, wav_file.channels)
x = graph_spectrogram(audio_sample, rate, plot=False)
# add batch and channel axes: (rows, cols) -> (1, 1, rows, cols)
x = torch.from_numpy(x[None,None,:,:]).to(device, dtype=torch.float32) # tensor

output = model(x)
print(output.shape)

# Greedy decoding: take the best-scoring character of every timestep and drop
# the '_' entries.
word = []
for t in range(output.shape[0]):
    # Named char_scores so the module-level `chars` alphabet list is not
    # overwritten by a tensor.
    char_scores = output[t,0,:]
    cur_char = token_to_char[torch.argmax(char_scores).item()]
    if cur_char != '_':
        word.append(cur_char)
print(word)

torch.Size([712, 1, 28])
[]
