In [6]:
### Standard Imports
import os
import sys
import torch
import tensorflow as tf
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv

import pronouncing
from nltk.tokenize import SyllableTokenizer
from nltk import word_tokenize

In [7]:
### Custom Imports
# Add the parent directory (relative to the current working directory) to the
# module search path so that the project-local `lib` package can be imported.
sys.path.append('../')
import lib.utilities as utils

In [139]:
### General Parameters
# Seed value for reproducibility.
random_seed = 42
# Folder that receives the files written by this notebook.
model_folder = "../../../syllable/v1"
model_name = "syllable-melody"

### Text Parameters
# Token substituted for every line break in the lyrics.
newline_token = "<new>"
# Token that appears in the separator between two songs in the corpus.
end_token = "<eos>"

In [9]:
# Create the output folder (and any missing parents); exist_ok=True makes
# re-running this cell harmless when the folder is already there.
os.makedirs(model_folder, exist_ok=True)

In [10]:
### Load Data
# The whole lyric corpus as one string (it is sliced and displayed as text below).
corpus = utils.load_corpus()
# Names of every entry in the data directory; used at the end of the notebook
# to name the per-song output files.
# NOTE(review): os.listdir returns entries in arbitrary order and these names are
# later paired with the songs by position — presumably this must match the order
# in which utils.load_corpus() concatenated the files; confirm.
filenames = os.listdir('../../data')

In [129]:
# Display the first 2000 characters of the raw corpus.
corpus[:2000]

"<VERSE>\nStep by step Together we'll build our dreams\nHeart to heart Together we'll stay as one nation, undivided \nBack to back Together we'll brave the heat, the cold, the storms\nHand in hand Together we'll grow this land that we call home\n\n<CHORUS>\nNothing in this world compares\nIt's our Singaporean Life\nEveryone is family, friend and neighbour\nLiving in harmony\nNothing in this world compares\nTo this island where it's home\nWhere we love and know\nwe'll never be alone\nBecause it's Singapore\n\n<VERSE>\nStep by step Together we'll build our dreams\nHeart to heart Together we'll stay as one nation, undivided\nBack to back Together we'll brave the heat, the cold, the storms\nHand in hand Together we'll grow this land that we call home\n\n<CHORUS>\nNothing in this world compares\nIt's our Singaporean Life\nEveryone is family, friend and neighbour\nLiving in harmony\nNothing in this world compares\nTo this island where it's home\nWhere we love and know\nwe'll never be alone\n

In [11]:
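# Earlier tokenisation attempt, kept for reference only; the active version is in
# the next cell. Unlike the active version, this one also stripped punctuation with
# utils.remove_punct and syllabified every token directly with SSP.tokenize.
# words = utils.preprocess_text(corpus, fun_list = [utils.to_lower, utils.remove_punct], keep = '\<|\>')
# words = re.sub(r'\n',f' {newline_token} ', words)
# words = re.split(' +', words)
# syllables = [[''] if token.startswith('<') else SSP.tokenize(token) for token in words]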

In [173]:
# Tokenise the corpus: `words` ends up as the flat list of tokens for the whole
# corpus and `songs` as one list of tokens per song.

# Lower-case the text. `keep` is passed through to utils.preprocess_text
# (presumably it protects the '<' / '>' of the section tags — confirm in
# lib.utilities). It is written as a raw string because '\<' is not a valid
# escape sequence in a normal literal; the pattern value itself is unchanged.
corpus_text = utils.preprocess_text(corpus, fun_list=[utils.to_lower], keep=r'\<|\>')

# Replace each line break with an explicit token so line structure survives
# the whitespace split below.
corpus_text = re.sub(r'\n', f' {newline_token} ', corpus_text)

# Tokenising: split on runs of spaces and , . ? ! and then drop the empty
# strings the split leaves at the edges, plus free-standing hyphens. After this
# split, these two are exactly the tokens the former substring test against
# ' ,.-?!' removed.
words = [word for word in re.split(r'[ ,.?!]+', corpus_text) if word not in ('', '-')]

# Songs are separated in the token stream by: two line breaks, the end token,
# two line breaks.
song_separator = f' {newline_token} {newline_token} {end_token} {newline_token} {newline_token} '
songs = [song.split(' ') for song in ' '.join(words).split(song_separator)]

In [179]:
# Hand-written syllable splits, keyed by the lower-cased word token.
# The syllabification cell looks tokens up here first, so an entry overrides the
# automatic split for that word. Apart from the two flagged entries, joining the
# syllables gives back the key.
syllable_dict = {
    'compares': ['com', 'pares'],
    'everyone': ['ev', 'ery', 'one'],
    'alone': ['a', 'lone'],
    'because': ['be', 'cause'],
    'singapore': ['sing', 'a', 'pore'],
    'riverside': ['ri', 'ver', 'side'],
    'survive': ['sur', 'vive'],
    'welcome': ['wel', 'come'],
    'believe': ['be', 'lieve'],
    'achieve': ['a', 'chieve'],
    'everybody': ['ev', 'ery', 'bo', 'dy'],
    'something': ['some', 'thing'],
    'escape': ['es', 'cape'],
    'unfurled': ['un', 'furled'],
    'courage': ['cour', 'age'],
    'inside': ['in', 'side'],
    'everything': ['ev', 'ery', 'thing'],
    'homeland': ['home', 'land'],
    'precious': ['pre', 'cious'],
    'embrace': ['em', 'brace'],
    'universe': ['u', 'ni', 'verse'],
    'youthful': ['youth', 'ful'],
    'above': ['a', 'bove'],
    'savour': ['sa', 'vour'],
    'amazed': ['a', 'mazed'],
    'surely': ['sure', 'ly'],
    'heartaches': ['heart', 'aches'],
    'uneventfullest': ['un', 'e', 'vent', 'ful', 'lest'],
    'self-explanatory': ['self', 'ex', 'pla', 'na', 'to', 'ry'],
    'fallstars': ['fall', 'stars'],
    'forevermore': ['for', 'e', 'ver', 'more'],
    # NOTE(review): the trailing apostrophe is dropped from the last syllable — confirm intended.
    "beginnin'": ['be', 'gin', 'nin'],
    'wooh': ['wooh'],
    'singapura': ['sing', 'a', 'pu', 'ra'],
    'fervour': ['fer', 'vour'],
    'patronised': ['pat', 'ron', 'ised'],
    'queued': ['queued'],
    'youre': ['youre'],
    'singaporean': ['sing', 'a', 'po', 'rean'],
    'pioneers': ['pio', 'neers'],
    'knowledge': ['know', 'ledge'],
    'before': ['be', 'fore'],
    'become': ['be', 'come'],
    'beautiful': ['beau', 'ti', 'ful'],
    'everywhere': ['ev', 'ery', 'where'],
    'everyday': ['ev', 'ery', 'day'],
    'somewhere': ['some', 'where'],
    'peaceful': ['peace', 'ful'],
    'lively': ['live', 'ly'],
    'elsewhere': ['else', 'where'],
    'beauty': ['beau', 'ty'],
    'twinkling': ['twink', 'ling'],
    'bravely': ['brave', 'ly'],
    'especially': ['es', 'pe', 'cial', 'ly'],
    'society': ['so', 'ci', 'e', 'ty'],
    'reality': ['re', 'al', 'i', 'ty'],
    'evening': ['eve', 'ning'],
    'collyer': ['col', 'ly', 'er'],
    'someday': ['some', 'day'],
    'yourself': ['your', 'self'],
    'grateful': ['grate', 'ful'],
    'sometimes': ['some', 'times'],
    'headlines': ['head', 'lines'],
    'homely': ['home', 'ly'],
    'colleagues': ['col', 'leagues'],
    'creating': ['cre', 'a', 'ting'],
    'yearning': ['year', 'ning'],
    'echoing': ['e', 'cho', 'ing'],
    'singaporeans': ['sing', 'a', 'po', 'reans'],
    "nation's": ['na', "tion's"],
    # NOTE(review): apostrophe dropped here, unlike the other possessives — confirm intended.
    "dream's": ['dreams'],
    "there'll": ["there'll"],
    "children's": ['chil', "dren's"],
    "city's": ['ci', "ty's"],
    "story's": ['sto', "ry's"],
    # Fixed: the value used to carry a stray ']' inside the string ("weren't]").
    "weren't": ["weren't"],
    "ahead's": ['a', "head's"],
    "centre's": ['cen', "tre's"],
    "aren't": ["aren't"],
    "island's": ['is', "land's"],
    "tomorrow's": ['to', 'mor', "row's"],
    "nothing's": ['no', "thing's"],
    "everyday's": ['ev', 'ery', "day's"],
}

In [180]:
# Syllabifier used to propose split points for words without a manual entry.
# It is created here because nothing else in the notebook defines `SSP`, so the
# cell previously only ran when the name was left over in the kernel.
# NOTE(review): default constructor arguments assumed — confirm against the
# settings the original `SSP` object was built with.
SSP = SyllableTokenizer()


def _split_into_syllables(token):
    """Split one word token into syllable strings.

    Resolution order:
      1. a manual entry in `syllable_dict`;
      2. the split proposed by `SSP.tokenize`, accepted only when its length
         agrees with the syllable count that `pronouncing` derives from the
         token's first listed pronunciation;
      3. if the proposed split is exactly one piece too long and its last piece
         looks like a silent ending (one letter followed by 'e', or anything
         ending in 'ed'), the last two pieces are merged.

    Returns the list of syllables, or None when the token cannot be resolved.
    Unresolved tokens are printed with a code so they can be added to
    `syllable_dict`: 'a' = no pronunciation found, 'b' = one piece too many
    with no recognised ending, 'c' = any other mismatch in length.
    """
    if token in syllable_dict:
        return syllable_dict[token]

    try:
        n_syl = pronouncing.syllable_count(pronouncing.phones_for_word(token)[0])
    except IndexError:
        # No pronunciation was returned for this token. Report it and give up on
        # the token instead of carrying on with the previous token's count.
        print(token, 'a')
        return None

    word_syllables = SSP.tokenize(token)

    # Monosyllables (by either measure) are kept whole.
    if n_syl == 1 or len(word_syllables) == 1:
        return [token]
    if len(word_syllables) == n_syl:
        return word_syllables

    if len(word_syllables) - n_syl == 1:
        last_syl = word_syllables[-1]
        if re.search('^[a-z]e$', last_syl) or re.search('ed$', last_syl):
            # Fold the extra trailing piece into the syllable before it.
            return word_syllables[:-2] + [''.join(word_syllables[-2:])]
        print(token, 'b')
        return None

    print(token, 'c')
    return None


# One list of syllables per song, in the same order as `songs`.
all_syllables = []
for song in songs:
    syllables = []
    for token in song:
        # Tokens starting with '<' are markup tokens and are passed through unsplit.
        if token.startswith('<'):
            syllables.append(token)
            continue
        pieces = _split_into_syllables(token)
        if pieces is not None:
            syllables += pieces
    all_syllables.append(syllables)

In [182]:
# Derive a song name from each data file name by stripping a trailing ".txt"
# (names without that suffix are left unchanged).
song_names = [re.sub('^(.+)\\.txt$', '\\1', filename) for filename in filenames]

# zip() pairs names and songs by position and silently stops at the shorter of
# the two, so make a mismatch visible.
# NOTE(review): the pairing is only right if `filenames` is in the same order as
# the songs inside the corpus — confirm against utils.load_corpus().
if len(song_names) != len(all_syllables):
    print(f'warning: {len(song_names)} file names but {len(all_syllables)} songs; '
          'unpaired entries are not written')

# Write one "<song>_syllables.csv" per song with one syllable per line.
for song_name, song in zip(song_names, all_syllables):
    with open(f'{model_folder}/{song_name}_syllables.csv', 'w', newline='') as csvfile:
        # quotechar=None (not '') is the supported way to disable quoting
        # together with QUOTE_NONE; an empty string is rejected by newer Pythons.
        csv_writer = csv.writer(csvfile, delimiter=' ',
                                quotechar=None, quoting=csv.QUOTE_NONE)
        for syllable in song:
            # writerow expects a sequence of fields; passing the bare string
            # would emit every character as its own space-separated field.
            csv_writer.writerow([syllable])