## Data preprocessing for modelling Amharic data

In [1]:
!pip install python_speech_features

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting python_speech_features
  Downloading python_speech_features-0.6.tar.gz (5.6 kB)
Building wheels for collected packages: python-speech-features
  Building wheel for python-speech-features (setup.py) ... [?25l[?25hdone
  Created wheel for python-speech-features: filename=python_speech_features-0.6-py3-none-any.whl size=5888 sha256=30cf1ad86ef5a85026eddb9f86e5c4094635fec44fdd2582b668906687d70e52
  Stored in directory: /root/.cache/pip/wheels/b0/0e/94/28cd6afa3cd5998a63eef99fe31777acd7d758f59cf24839eb
Successfully built python-speech-features
Installing collected packages: python-speech-features
Successfully installed python-speech-features-0.6


### Preprocessing steps:
1. Load labels
2. Load each audio file at a 44100 Hz sample rate
3. Convert mono to stereo
4. Resize audios to a fixed number of samples
5. Generate a pandas DataFrame
6. Save preprocessed audios and transcriptions to a new folder
7. Split the data into train and valid corpus
8. Save train and valid corpus


In [22]:
import librosa   #for audio processing
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile #for audio processing
import pandas as pd 
import warnings
import json
warnings.filterwarnings("ignore")

In [3]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# force_remount=True asks for a fresh mount even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
import os
# Work from the project folder on the mounted drive so that relative paths used
# later (e.g. "train_corpus.json") resolve inside it.
os.chdir("/content/drive/My Drive/AMHARIC")
# Show the folder contents as the cell output.
os.listdir()

['README.md',
 'data',
 'kaldi-script',
 'lang',
 'lm',
 'models',
 'model_1.png',
 'valid_corpus.json',
 'train_corpus.json',
 '__pycache__',
 'AudioGenerator.py']

In [None]:
import librosa  # for audio processing
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile  # for audio processing
from numpy.lib.stride_tricks import as_strided
from mpl_toolkits.axes_grid1 import make_axes_locatable
from IPython.display import Audio
import sklearn
import pandas as pd
import json
import os
from os.path import exists
import warnings
warnings.filterwarnings("ignore")
# from cleaner import CleanDataFrame

In [None]:
# Root of the dataset on the mounted Google Drive. Every absolute path below is
# derived from it, so relocating the data means editing this one value.
DATA_ROOT = '/content/drive/My Drive/AMHARIC/data/'
TRAIN_DIR = os.path.join(DATA_ROOT, 'train/')

train_dataset_location = 'train/'
# Source recordings, the folder that receives the processed copies, and the
# transcript file for the training set.
train_wav_location = os.path.join(TRAIN_DIR, 'wav/')
train_changed_wav_location = os.path.join(TRAIN_DIR, 'changed_wav/')
train_txt_location = os.path.join(TRAIN_DIR, 'trsTrain.txt')
test_dataset_location = 'test/'
lexicon_location = 'train/'

In [None]:
def convert_to_sterio(audio: np.ndarray) -> np.ndarray:
    """
    Duplicate a single-channel signal into two identical channels.

    Args:
        audio: Audio samples as a numpy array. A 1-D array is treated as mono.

    Returns:
        An array of shape (n_samples, 2) when `audio` is 1-D; otherwise
        `audio` itself, unchanged.
    """
    if audio.ndim != 1:
        # Already has a channel axis: nothing to do.
        return audio
    # Stack along axis 1 so samples stay on axis 0 and channels go on axis 1.
    return np.stack([audio, audio], axis=1)


from scipy.io import wavfile

def resize_audio(audio: np.ndarray, size: int) -> np.ndarray:
    """
    Force an audio array to exactly `size` samples along axis 0.

    This gives every clip the same shape. The work is delegated to
    `librosa.util.fix_length`, which (per the librosa documentation) pads
    shorter input at the end and truncates longer input.

    Args:
        audio: Audio samples as a numpy array, with samples along axis 0.
        size: Target number of samples.

    Returns:
        The padded or truncated array returned by `librosa.util.fix_length`.
    """
    # `size` is passed by keyword: it is keyword-only in librosa >= 0.10, and
    # the keyword form is also accepted by older releases.
    resized = librosa.util.fix_length(audio, size=size, axis=0)
    print(f"Audio resized to {size} samples")
    return resized
# changed = convert_to_sterio(samples)
# changed.shape  
# resized = resize_audio(changed,200000)
# print(resized.shape)
# print(resized.T.shape)
# wavfile.write(os.path.join(train_changed_wav_location, 'trial.wav'), sample_rate, resized)
# ipd.Audio(resized.T, rate=sample_rate)
# # mfcc = librosa.feature.mfcc(y=changed, sr=sample_rate)
# # mfcc.shape
# samples, sample_rate = librosa.load(train_changed_wav_location+'trial.wav' , sr=44100)
# samples.shape
# wav_roll = np.roll(samples, int(sample_rate/10))

def meta_data(trans, path, sample_rate=44100, target_size=200000):
    """
    Extract summary features for each transcribed clip and save a processed copy.

    For every key in `trans` whose recording `path + key + ".wav"` exists, the
    clip is loaded at `sample_rate`, the mean of several librosa features is
    computed on the loaded signal, and a two-channel copy resized to
    `target_size` samples is written into the module-level
    `train_changed_wav_location` folder. Keys without a recording are skipped.

    NOTE(review): `duration` describes the clip as loaded, whereas the copy
    written to disk always has `target_size` samples (about 4.5 s with the
    defaults), so longer recordings are cut short on disk while keeping their
    full transcript. Confirm this is intended.

    NOTE: a later cell in this notebook defines another `meta_data` that
    returns only three lists; running that cell replaces this function.

    Args:
        trans: Mapping from utterance id to transcript text.
        path: Prefix (folder path ending in a separator) of the source wav files.
        sample_rate: Sample rate passed to `librosa.load`.
        target_size: Number of samples every saved copy is resized to.

    Returns:
        A tuple of eleven parallel lists, one entry per processed clip:
        (filenames, target, duration_of_recordings, mode, rmse, spec_cent,
        spec_bw, rolloff, zcr, mfcc, rate).
    """
    filenames, target, duration_of_recordings = [], [], []
    mode, rate = [], []
    rmse, spec_cent, spec_bw, rolloff, zcr, mfcc = [], [], [], [], [], []

    for k in trans:
        filename = path + k + ".wav"
        if not exists(filename):
            # No recording for this transcript entry: skip it.
            continue

        audio, fs = librosa.load(filename, sr=sample_rate)

        # Each feature is reduced to a single scalar (its mean) per clip.
        rmse.append(np.mean(librosa.feature.rms(y=audio)))
        spec_cent.append(
            np.mean(librosa.feature.spectral_centroid(y=audio, sr=fs)))
        spec_bw.append(
            np.mean(librosa.feature.spectral_bandwidth(y=audio, sr=fs)))
        rolloff.append(
            np.mean(librosa.feature.spectral_rolloff(y=audio, sr=fs)))
        zcr.append(np.mean(librosa.feature.zero_crossing_rate(y=audio)))
        mfcc.append(np.mean(librosa.feature.mfcc(y=audio, sr=fs)))
        duration_of_recordings.append(float(len(audio) / fs))
        rate.append(fs)

        # Duplicate the channel, then pad/truncate to the common length.
        processed = resize_audio(convert_to_sterio(audio), target_size)

        base_name = os.path.basename(filename)
        wavfile.write(os.path.join(train_changed_wav_location, base_name), fs, processed)

        # The stored key is relative, not the absolute Drive path.
        filenames.append('../data/train/changed_wav/' + base_name)
        # Always 'mono': librosa.load is called with its default channel handling.
        mode.append('mono')
        target.append(trans[k])

    return filenames, target, duration_of_recordings, mode, rmse, spec_cent, spec_bw, rolloff, zcr, mfcc, rate

In [None]:
def loaderTrans(filename: str):
    """
    Load a transcript file into a dictionary keyed by utterance id.

    Each line is split on the `</s>` marker: the part before it (with `<s>`
    removed) is the text, and the part after it (with parentheses, spaces and
    the newline removed) is the utterance id. Whitespace around the text is
    kept as found in the file. Blank lines are skipped.

    Args:
        filename: Path to the UTF-8 encoded transcript text file.

    Returns:
        A dict mapping utterance id to transcript text.

    Raises:
        ValueError: If a non-blank line contains no `</s>` marker.
    """
    name_to_text = {}
    # Characters removed from the utterance id.
    strip_from_name = str.maketrans("", "", "()\n ")
    with open(filename, encoding="utf-8") as f:
        for line_number, line in enumerate(f, start=1):
            if not line.strip():
                # A blank line (e.g. at the end of the file) carries no entry.
                continue
            parts = line.split("</s>")
            if len(parts) < 2:
                raise ValueError(
                    f"{filename}:{line_number}: expected '</s>' marker in line {line!r}")
            text = parts[0].replace("<s>", "")
            name = parts[1].translate(strip_from_name)
            name_to_text[name] = text
    return name_to_text
# Map every training utterance id to its transcript text.
transcription = loaderTrans(train_txt_location)

In [None]:
# Run the extraction over the training recordings, then gather the parallel
# lists into one DataFrame with one row per processed clip.
(filenames, target, duration_of_recordings, mode, rmse, spec_cent,
 spec_bw, rolloff, zcr, mfcc, rate) = meta_data(transcription, train_wav_location)

data = pd.DataFrame({
    'key': filenames,
    'text': target,
    'duration': duration_of_recordings,
    'mode': mode,
    'rate': rate,
    'rmse': rmse,
    'spec_cent': spec_cent,
    'spec_bw': spec_bw,
    "rolloff": rolloff,
    "zcr": zcr,
    "mfcc": mfcc,
})
data.head()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio r

Unnamed: 0,key,text,duration,mode,rate,rmse,spec_cent,spec_bw,rolloff,zcr,mfcc
0,../data/train/changed_wav/tr_1_tr01001.wav,ያንደኛ ደረጃ ትምህርታቸው ን ጐንደር ተ ም ረዋል,4.608005,mono,44100,0.005634,712.137443,1235.842984,1487.201251,0.004273,-18.118677
1,../data/train/changed_wav/tr_2_tr01002.wav,የተ ለቀቁት ምርኮኞች በ አካባቢያቸው ሰላማዊ ኑሮ እንዲ ኖሩ የ ትራንስ...,16.384014,mono,44100,0.00556,769.146959,1257.024045,1619.168774,0.003095,-17.846657
2,../data/train/changed_wav/tr_3_tr01003.wav,በ አዲስ አበባው ስታዲየም በ ተካሄዱ ት ሁለት ግጥሚያ ዎች በ መጀመሪያ...,14.592018,mono,44100,0.005418,784.268292,1287.589639,1656.634798,0.001926,-18.260798
3,../data/train/changed_wav/tr_4_tr01004.wav,ወሬው ን ወሬ ያደረጉ ምስጢረ ኞች ናቸው,4.736009,mono,44100,0.005537,689.18964,1164.392879,1376.119457,0.004953,-18.800745
4,../data/train/changed_wav/tr_5_tr01005.wav,ኢትዮጵያዊ ቷ በ ብሄራዊ ባህላዊ አለባበስ ከ አለም አንደኝነት ን ተቀዳ...,8.192018,mono,44100,0.005631,781.056528,1256.620487,1685.354398,0.003393,-18.187614


In [None]:
# from pandas.core.algorithms import value_counts
data.shape

(10875, 11)

In [None]:
# Number of leading rows written to the training corpus; the rest is validation.
N_TRAIN_ROWS = 8000

train_json = data[:N_TRAIN_ROWS].to_dict(orient='records')
with open("train_corpus.json", "w", encoding='UTF-8') as export_file:
    json.dump(train_json, export_file, indent=4,
              sort_keys=True, ensure_ascii=False)

# The validation slice starts exactly where the training slice ends, so every
# row lands in one of the two files (starting at 8001 would drop row 8000).
valid_json = data[N_TRAIN_ROWS:].to_dict(orient='records')
with open("valid_corpus.json", "w", encoding='UTF-8') as export_file:
    json.dump(valid_json, export_file, indent=4,
              sort_keys=True, ensure_ascii=False)

## Getting a different version of train and valid corpus

In [None]:
def meta_data(trans, path):
    """
    Collect the wav path, transcript and duration for every entry in `trans`.

    NOTE: this shares its name with the `meta_data` defined earlier in this
    notebook and replaces it once this cell has run; it returns three lists
    instead of eleven.

    Args:
        trans: Mapping from utterance id to transcript text.
        path: Prefix prepended to each utterance id to form the wav file path.

    Returns:
        Three parallel lists: (filenames, target, duration_of_recordings).
    """
    filenames = []
    target = []
    duration_of_recordings = []
    for utt_id, transcript in trans.items():
        wav_path = path + utt_id + ".wav"
        filenames.append(wav_path)
        # sr=None: no target sample rate is requested from librosa.load.
        samples, native_rate = librosa.load(wav_path, sr=None)
        seconds = float(len(samples) / native_rate)
        duration_of_recordings.append(seconds)
        target.append(transcript)
    return filenames, target, duration_of_recordings

In [None]:
# Re-scan the wav files under `train_changed_wav_location` for path, transcript and duration.
filenames, target, duration_of_recordings=meta_data(transcription, train_changed_wav_location)

In [None]:
# Entries before this index form the training set; the remainder is validation.
TRAIN_SPLIT_INDEX = 8699

train_data = pd.DataFrame({
    'key': filenames[:TRAIN_SPLIT_INDEX],
    'duration': duration_of_recordings[:TRAIN_SPLIT_INDEX],
    'text': target[:TRAIN_SPLIT_INDEX],
})
valid_data = pd.DataFrame({
    'key': filenames[TRAIN_SPLIT_INDEX:],
    'duration': duration_of_recordings[TRAIN_SPLIT_INDEX:],
    'text': target[TRAIN_SPLIT_INDEX:],
})


In [None]:
# Convert both frames to record lists, then write each one to its JSON file
# (UTF-8, non-ASCII characters kept readable, keys sorted).
train_json = train_data.to_dict(orient='records')
valid_json = valid_data.to_dict(orient='records')

for corpus_path, corpus_records in (
    ("train_corpus.json", train_json),
    ("valid_corpus.json", valid_json),
):
    with open(corpus_path, "w", encoding='UTF-8') as export_file:
        json.dump(corpus_records, export_file, indent=4,
                  sort_keys=True, ensure_ascii=False)

## Generating a test corpus

In [6]:
import sys

In [9]:
# Add the Drive root to the import search path so that Python modules stored
# there can be imported (presumably the ones imported in the next cell).
py_file_location = "/content/drive/My Drive"
sys.path.append(os.path.abspath(py_file_location))

In [10]:
import prep
import create_desc_json
import AudioGenerator

In [12]:
from create_desc_json import create_desc_json

In [13]:
# Call `create_desc_json()` and keep the returned object; its
# `translation_loader` and `meta_data` methods are used in the cells below.
js_trs = create_desc_json()

In [15]:
# Load the test-set transcript; the displayed result below is a mapping of utterance id -> text.
test_transcription = js_trs.translation_loader("/content/drive/My Drive/AMHARIC/data/test/trsTest.txt")

In [16]:
test_transcription

{'01_d501022': ' በዚህ ም አነስተኛ መስኖ ዎችን በ ማስፋፋት አገሪቱ ን በ ተደጋጋሚ ከሚ ያጠቃ ት ድርቅ ና የ ምግብ እህል እጥረት ለ ማቃለል እንደሚ ቻል ጠቁ መዋል ',
 '01_d501023': ' ይህ እንዳይሆን በ ህይወታቸው መስዋእትነት ያስገኙ ትን የ ኢትዮጵያዊ ነት ክብር አንግ በ ን ለ ቅድስት ሀገራችን አንድነት ና ሉአላዊነት ሰላም ና ብልጽግና በ ህብረት እንቁ ም ',
 '01_d501024': ' የመጀመሪያ ው ነጥብ በ ተቃዋሚ ሀይሎች ሰፈር በ ጋራ አቋም ላይ በ ጋራ ለ ማቆም ያየነው ጽናት አነስተኛ መሆኑ ነው ',
 '01_d501025': ' አቅም በሚ ፈቅደው መሰረት አስቀድሞ መዘጋጀት ስለሚ ፈልጉ የሚ ያጠራጥር አይሆን ም ስትል የ አሜሪካ ድምጽ ዜና ዘጋቢ ገልጻ ለች ',
 '01_d501026': ' ከ ለ ቀስተኞቹ መካከል አብዛኛዎቹ ጸሀዩ ዛሬ ተገለጠ ሀይለስላሴ ማሩን አለማችን አባታችን የ አፍሪካ አባት የ አለም አባት በ ማለት ነበር ሀዘና ቸውን የሚገልጹ ',
 '01_d501027': ' ከዚህ ጋር ለ ሰብአዊ አገልግሎት መሰለ ፏ ም ውጤታማ መሆኗ ም አድናቆት ን አትርፎ ላታል ',
 '01_d501028': ' የ ኮሚሽኑ ውሳኔ ና የቤተ ክህነቱ ተቃውሞ ',
 '01_d501029': ' ይህንን ም ባድመን ና ሽራሮ ን በ መውረር እ ውን አ ርጓል ',
 '01_d501030': ' ጋዜጠኞች ን መለያየታቸው ብዙዎች ን አሳዝኗ ል ',
 '01_d501031': ' ባለፈው ሰኞ እ ለት ደግሞ በ ቴሌቭዥን ሌላ ሽልማት ሲ ሸለም ተ መልከት ኩ ',
 '01_d501032': ' አጠቃላይ ወጪው አስራ ስምንት ሺ ዶላር እንደሆነ ለማወቅ ተችሏል ',
 '01_d501033': ' ሌሎቹ በ ሙሉ ጤነ ኞች ናቸው ',
 '01_d501034': ' መለስ ና ኢሳያስ እርስ በ እ

In [17]:
# Gather three lists for the test recordings (presumably wav path, transcript and
# duration, judging by the names). This rebinds the names used for the training data above.
filenames, target, duration_of_recordings = js_trs.meta_data(test_transcription, '/content/drive/My Drive/AMHARIC/data/test/wav/')

In [18]:
# Assemble the test metadata into a DataFrame with the columns key, duration and text.
test_data = pd.DataFrame({'key':filenames, 'duration':duration_of_recordings, 'text':target})

In [19]:
test_data.head()

Unnamed: 0,key,duration,text
0,/content/drive/My Drive/AMHARIC/data/test/wav/...,12.544,በዚህ ም አነስተኛ መስኖ ዎችን በ ማስፋፋት አገሪቱ ን በ ተደጋጋሚ ከሚ...
1,/content/drive/My Drive/AMHARIC/data/test/wav/...,13.696,ይህ እንዳይሆን በ ህይወታቸው መስዋእትነት ያስገኙ ትን የ ኢትዮጵያዊ ነ...
2,/content/drive/My Drive/AMHARIC/data/test/wav/...,10.752,የመጀመሪያ ው ነጥብ በ ተቃዋሚ ሀይሎች ሰፈር በ ጋራ አቋም ላይ በ ጋራ...
3,/content/drive/My Drive/AMHARIC/data/test/wav/...,11.392,አቅም በሚ ፈቅደው መሰረት አስቀድሞ መዘጋጀት ስለሚ ፈልጉ የሚ ያጠራጥር...
4,/content/drive/My Drive/AMHARIC/data/test/wav/...,16.0,ከ ለ ቀስተኞቹ መካከል አብዛኛዎቹ ጸሀዩ ዛሬ ተገለጠ ሀይለስላሴ ማሩን ...


In [20]:
test_data.shape

(358, 3)

In [23]:
# Serialise the test records to JSON text first, then write that text out as
# UTF-8 (non-ASCII characters kept readable, keys sorted).
test_json = test_data.to_dict(orient='records')
test_corpus_text = json.dumps(test_json, indent=4,
                              sort_keys=True, ensure_ascii=False)
with open("test_corpus.json", "w", encoding='UTF-8') as export_file:
    export_file.write(test_corpus_text)