# N-phone Forced Alignment
An extension of Jian Zhu's [Charsiu forced alignment](https://github.com/lingjzhu/charsiu) that derives alignments for diphones, triphones, and n-phones of any length from Charsiu's phone-level output.

## Setup

In [1]:
from pydub import AudioSegment
from pydub.playback import play

import time
import os
import json

#from Charsiu import charsiu_forced_aligner
#charsiu = charsiu_forced_aligner(aligner="charsiu/en_w2v2_fc_10ms")



### Charsiu installation

Based on Zhu's setup instructions.

In [None]:
!pip install torch torchvision torchaudio
!pip install datasets transformers
!pip install g2p_en praatio librosa

In [None]:
import os
import shutil
import subprocess
from os.path import exists, join, expanduser

# Work from the home directory so the checkout location is predictable.
os.chdir(expanduser("~"))
charsiu_dir = 'charsiu'

# Always start from a fresh checkout. The directory removed is the same one
# that is tested (relative to the home directory), whatever the current user's
# home directory happens to be.
if exists(charsiu_dir):
    shutil.rmtree(charsiu_dir)

# Clone the development branch of Charsiu. check=True makes a failed clone
# stop here instead of surfacing later as a confusing chdir/import error.
subprocess.run(
    ["git", "clone", "-b", "development",
     f"https://github.com/lingjzhu/{charsiu_dir}", charsiu_dir],
    check=True,
)
subprocess.run(["git", "checkout"], cwd=charsiu_dir, check=True)

# Later cells use paths relative to the checkout (e.g. 'src').
os.chdir(charsiu_dir)

In [None]:
import sys
import torch
# Make Charsiu's `src` directory importable. It goes first so it takes
# precedence over anything else on the path, and the membership check keeps
# re-runs of this cell from piling up duplicate entries.
# The path is relative, so it resolves against the current working directory
# (the Charsiu checkout, after the cloning cell has run).
if 'src' not in sys.path:
    sys.path.insert(0, 'src')

In [None]:
# Create the global `charsiu` aligner that `run_alignment` calls.
# The `aligner` argument names the pretrained model to load
# (NOTE(review): presumably an English wav2vec2 model with 10 ms frames,
# judging by the model name — confirm against the Charsiu documentation).
from Charsiu import charsiu_forced_aligner
charsiu = charsiu_forced_aligner(aligner='charsiu/en_w2v2_fc_10ms')

## Phoneme alignment
Collecting the original, monophone alignment data.

In [2]:
def get_audio_16kHz(audio_filename):
    """Resample an audio file to 16 kHz and export it as a WAV file.

    Parameters
    ----------
    audio_filename : str
        Path of the audio file to load.

    Returns
    -------
    str
        Path of the exported file, ``audio_16kHz/<audio_filename>_16kHz.wav``.
        The input path is embedded as given, so ``audio/clip.wav`` is written
        to ``audio_16kHz/audio/clip.wav_16kHz.wav``.
    """
    audio = AudioSegment.from_file(audio_filename)
    resampled_audio = audio.set_frame_rate(16000)

    audio_16kHz_filename = f"audio_16kHz/{audio_filename}_16kHz.wav"
    # The output directory (including any sub-directories carried over from
    # the input path) may not exist yet; the export would fail without it.
    os.makedirs(os.path.dirname(audio_16kHz_filename), exist_ok=True)
    resampled_audio.export(audio_16kHz_filename, format="wav")

    return audio_16kHz_filename

def run_alignment(audio_filename, input_sentence):
    """Run Charsiu forced alignment of a transcript against an audio file.

    Parameters
    ----------
    audio_filename : str
        Path of the audio file; it must already be sampled at 16 kHz.
    input_sentence : str
        Transcript of what is said in the audio.

    Returns
    -------
    Whatever ``charsiu.align`` returns, passed through unchanged.
    """
    # Charsiu's `align` takes the audio first and the text second; keyword
    # arguments keep the two from being swapped.
    return charsiu.align(audio=audio_filename, text=input_sentence)

## N-phoneme alignment
Extending the monophone alignment by computing the start and end times of every n-phone, i.e. every run of n consecutive phones.

In [22]:
def get_nphone_alignment_data(alignment_data, n):
    """Combine a phone-level alignment into overlapping n-phone spans.

    Parameters
    ----------
    alignment_data : sequence
        Phone entries in time order, each indexable as
        ``(start_time, end_time, label)`` with a string label.
    n : int
        Number of consecutive phones per n-phone (2 = diphones, 3 = triphones).

    Returns
    -------
    list
        For ``n == 1`` the input is returned unchanged. Otherwise a list of
        ``(start_time, end_time, label)`` tuples, one per window of ``n``
        consecutive phones: the start time comes from the first phone, the end
        time from the last, and the label is the phone labels joined by
        spaces. The list is empty when there are fewer than ``n`` phones.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    # A 1-phone is just a phone: nothing to combine.
    if n == 1:
        return alignment_data

    nphone_alignment = []
    # Slide a window of n phones over the alignment, one phone at a time.
    for first in range(len(alignment_data) - n + 1):
        window = alignment_data[first:first + n]
        nphone_label = " ".join(phone[2] for phone in window)
        nphone_alignment.append((window[0][0], window[-1][1], nphone_label))
    return nphone_alignment

## Testing/review functions
Two easy ways for a human to verify the results:

1. Reading the printed list of n-phones.
2. Listening to the audio segment cut out for each n-phone.

In [19]:
def review_alignment_data(alignment_data, audio_filename, input_sentence, n):
    """Print a text summary of an n-phone alignment for visual inspection.

    The report names the audio file, the input sentence and the n-phone size,
    followed by the label (third field) of every entry in `alignment_data`.
    """
    found_labels = []
    for entry in alignment_data:
        found_labels.append(entry[2])

    report = f"""Reviewing alignment data...
Audio file: {audio_filename}
Input sentence: {input_sentence}
N-phone type: {n}-phones

N-phones found: {found_labels}
"""
    print(report)

def review_alignment_data_audio(alignment_data, audio_filename, repeats=3, pause=0.5):
    """Play the audio segment of every n-phone for auditory inspection.

    Parameters
    ----------
    alignment_data : iterable of (start_time, end_time, label)
        Alignment entries; the times are multiplied by 1000 before slicing.
    audio_filename : str
        Path of the audio file the alignment refers to.
    repeats : int, optional
        How many times each segment is played (default 3).
    pause : float, optional
        Seconds to wait after each playback (default 0.5).
    """
    # Decode the file once; every segment is sliced from the same audio.
    audio = AudioSegment.from_file(audio_filename)

    for start_time, end_time, nphone in alignment_data:
        print(f"Playing {nphone}...")
        # pydub slices by milliseconds, hence the factor of 1000.
        segment = audio[start_time*1000:end_time*1000]
        for _ in range(repeats):
            play(segment)
            time.sleep(pause)

## Data I/O functions
Reading/writing alignment data from/to a JSON file.

In [32]:
def read_cache(cache_name="nphone_data_cache.json"):
    """Load the n-phone cache from a JSON file.

    Parameters
    ----------
    cache_name : str, optional
        Path of the JSON cache file.

    Returns
    -------
    dict
        The decoded cache, or an empty dict when the file does not exist or
        contains only whitespace.
    """
    if not os.path.exists(cache_name):
        return {}
    with open(cache_name, "r", encoding="utf-8") as cache_file:
        contents = cache_file.read()
    # A freshly created, still empty file is treated like a missing one.
    if not contents.strip():
        return {}
    return json.loads(contents)


def upload_alignment_data(alignment_data, audio_filename, cache_name="nphone_data_cache.json"):
    """Add n-phone alignment entries to the JSON cache file.

    The cache maps each n-phone label to a list of occurrences, each recording
    the audio file and the start/end time of that n-phone. Existing entries in
    the file are kept; the file is created if it does not exist.

    Parameters
    ----------
    alignment_data : iterable of (start_time, end_time, label)
        Alignment entries to store.
    audio_filename : str
        Audio file the entries belong to.
    cache_name : str, optional
        Path of the JSON cache file.
    """
    cache = read_cache(cache_name)
    for start_time, end_time, nphone in alignment_data:
        audio_obj = {
            "audio_filename": audio_filename,
            "start_time": start_time,
            "end_time": end_time,
        }
        # Key on the n-phone label itself so occurrences are grouped by label.
        cache.setdefault(nphone, []).append(audio_obj)

    with open(cache_name, "w", encoding="utf-8") as cache_file:
        json.dump(cache, cache_file, indent=2)

## Example usage
Collecting diphone alignments for an example audio clip.

In [17]:
# Setup
# Path of the example recording to align.
AUDIO_FILENAME = "audio/test_audio.wav"
# Transcript passed to the aligner together with the audio.
INPUT_SENTENCE = """I went to watch a movie instead, and I met my friend there."""
# Number of consecutive phones per n-phone (2 = diphones).
N = 2

In [None]:
# Prepare 16kHz audio and run alignment
# NOTE: `run_alignment` uses the global `charsiu` aligner, which is only
# created in the "Charsiu installation" cells (the import in the first cell is
# commented out), so those cells must be run before this one.
audio_16kHz_filename = get_audio_16kHz(AUDIO_FILENAME)
alignment_data = run_alignment(audio_16kHz_filename, INPUT_SENTENCE)

In [5]:
alignment_data = [[0.0, 0.33, "[SIL]"], [0.33, 0.46, "AY"], [0.46, 0.56, "W"], [0.56, 0.6, "EH"], [0.6, 0.63, "N"], [0.63, 0.72, "T"], [0.72, 0.75, "UW"], [0.75, 0.85, "W"], [0.85, 0.94, "AA"], [0.94, 1.02, "CH"], [1.02, 1.08, "AH"], [1.08, 1.15, "M"], [1.15, 1.23, "UW"], [1.23, 1.3, "V"], [1.3, 1.36, "IY"], [1.36, 1.41, "IH"], [1.41, 1.47, "N"], [1.47, 1.54, "S"], [1.54, 1.61, "T"], [1.61, 1.73, "EH"], [1.73, 1.83, "D"], [1.83, 1.96, "[SIL]"], [1.96, 2.01, "AH"], [2.01, 2.04, "N"], [2.04, 2.07, "D"], [2.07, 2.12, "AY"], [2.12, 2.21, "M"], [2.21, 2.28, "EH"], [2.28, 2.31, "T"], [2.31, 2.36, "M"], [2.36, 2.43, "AY"], [2.43, 2.55, "F"], [2.55, 2.59, "R"], [2.59, 2.64, "EH"], [2.64, 2.7, "N"], [2.7, 2.73, "D"], [2.73, 2.76, "DH"], [2.76, 2.8, "EH"], [2.8, 2.96, "R"], [2.96, 3.12, "[SIL]"]]

In [23]:
# Collect nphone alignment data
# For N > 1 each result is a (start_time, end_time, label) tuple covering
# N consecutive phones; the bare last line displays the list as cell output.
nphone_alignment_data = get_nphone_alignment_data(alignment_data, N)
nphone_alignment_data

[(0.0, 0.46, '[SIL] AY'),
 (0.33, 0.56, 'AY W'),
 (0.46, 0.6, 'W EH'),
 (0.56, 0.63, 'EH N'),
 (0.6, 0.72, 'N T'),
 (0.63, 0.75, 'T UW'),
 (0.72, 0.85, 'UW W'),
 (0.75, 0.94, 'W AA'),
 (0.85, 1.02, 'AA CH'),
 (0.94, 1.08, 'CH AH'),
 (1.02, 1.15, 'AH M'),
 (1.08, 1.23, 'M UW'),
 (1.15, 1.3, 'UW V'),
 (1.23, 1.36, 'V IY'),
 (1.3, 1.41, 'IY IH'),
 (1.36, 1.47, 'IH N'),
 (1.41, 1.54, 'N S'),
 (1.47, 1.61, 'S T'),
 (1.54, 1.73, 'T EH'),
 (1.61, 1.83, 'EH D'),
 (1.73, 1.96, 'D [SIL]'),
 (1.83, 2.01, '[SIL] AH'),
 (1.96, 2.04, 'AH N'),
 (2.01, 2.07, 'N D'),
 (2.04, 2.12, 'D AY'),
 (2.07, 2.21, 'AY M'),
 (2.12, 2.28, 'M EH'),
 (2.21, 2.31, 'EH T'),
 (2.28, 2.36, 'T M'),
 (2.31, 2.43, 'M AY'),
 (2.36, 2.55, 'AY F'),
 (2.43, 2.59, 'F R'),
 (2.55, 2.64, 'R EH'),
 (2.59, 2.7, 'EH N'),
 (2.64, 2.73, 'N D'),
 (2.7, 2.76, 'D DH'),
 (2.73, 2.8, 'DH EH'),
 (2.76, 2.96, 'EH R'),
 (2.8, 3.12, 'R [SIL]')]

In [24]:
# Visual review
# Prints the audio file, the sentence, the n-phone size and the n-phone labels.
review_alignment_data(nphone_alignment_data, AUDIO_FILENAME, INPUT_SENTENCE, N)

Reviewing alignment data...
Audio file: audio/test_audio.wav
Input sentence: I went to watch a movie instead, and I met my friend there.
N-phone type: 2-phones

N-phones found: ['[SIL] AY', 'AY W', 'W EH', 'EH N', 'N T', 'T UW', 'UW W', 'W AA', 'AA CH', 'CH AH', 'AH M', 'M UW', 'UW V', 'V IY', 'IY IH', 'IH N', 'N S', 'S T', 'T EH', 'EH D', 'D [SIL]', '[SIL] AH', 'AH N', 'N D', 'D AY', 'AY M', 'M EH', 'EH T', 'T M', 'M AY', 'AY F', 'F R', 'R EH', 'EH N', 'N D', 'D DH', 'DH EH', 'EH R', 'R [SIL]']



In [25]:
# Auditory review
# Plays the audio segment of every n-phone in turn, cut from the original
# recording (AUDIO_FILENAME), not from the 16 kHz copy used for alignment.
review_alignment_data_audio(nphone_alignment_data, AUDIO_FILENAME)

Playing [SIL] AY...
Playing AY W...
Playing W EH...
Playing EH N...
Playing N T...
Playing T UW...
Playing UW W...
Playing W AA...
Playing AA CH...
Playing CH AH...
Playing AH M...
Playing M UW...
Playing UW V...
Playing V IY...
Playing IY IH...
Playing IH N...
Playing N S...
Playing S T...
Playing T EH...
Playing EH D...
Playing D [SIL]...
Playing [SIL] AH...
Playing AH N...
Playing N D...
Playing D AY...
Playing AY M...
Playing M EH...
Playing EH T...
Playing T M...
Playing M AY...
Playing AY F...
Playing F R...
Playing R EH...
Playing EH N...
Playing N D...
Playing D DH...
Playing DH EH...
Playing EH R...
Playing R [SIL]...
