#Setup

In [None]:
import os
import pickle
import shutil
from multiprocessing import Process

import blosc
import nltk
import numpy as np
import pandas as pd
from bark import SAMPLE_RATE, generate_audio, preload_models
from IPython.display import Audio
from scipy.io.wavfile import write
#from tqdm.notebook import tqdm
from tqdm import tqdm

nltk.download('punkt')

#Call Bark's preload_models() once during setup, before any generate_audio call is made
preload_models()

#Confirmation message printed once the preload call has returned
print("Loaded Bark")

#Set Variables

In [None]:
#Upper bound on space-separated tokens per chunk handed to Bark (used by split_sentences)
MAX_TOKENS = 45   #Texts are split by sentence but in case no punctuation is detected or a sentence is too long, we split further by number of tokens

#Value passed as history_prompt to generate_audio
SPEAKER = "v2/en_speaker_6" #Set bark speaker
#Zero-filled padding appended after every generated sentence
#NOTE(review): np.zeros defaults to float64; if Bark returns float32 audio the concatenated result is upcast to float64 - confirm the desired WAV sample format
silence = np.zeros(int(0.25 * SAMPLE_RATE))  # quarter second of silence

#Directory the generated .wav files are written to; created if it does not exist yet
SAVE_PATH = "_OUTPUT/"
os.makedirs(SAVE_PATH, exist_ok=True)

#Load Data

In [None]:
# #Delete
# from DataLoader import DataLoader
# import Utilities
# from Utilities import *

# datasetName = "Twitter"
# data = DataLoader("Twitter")
# texts = data.texts

In [None]:
#Input texts to synthesize. Left empty here: fill this list with strings, otherwise the generation loop has nothing to process.
texts = []

#Load Resources

In [None]:
#Load list of emoticons
#Source: https://c.r74n.com/faces

#Read explicitly as UTF-8 instead of relying on the platform default encoding
#(e.g. cp1252 on Windows), which can garble or fail on non-ASCII emoticons.
#NOTE(review): assumes TextEmoticonList.txt is stored as UTF-8 - confirm.
with open("TextEmoticonList.txt", "r", encoding="utf-8") as file:
  emoticonList = file.read().split("\n")

#Keep only emoticons without spaces in-between and longer than one character
#(this also drops the empty entries produced by blank lines)
emoticonList = [
  emoticon for emoticon in emoticonList
  if " " not in emoticon and len(emoticon) > 1
]

print(len(emoticonList))
print(emoticonList[:10])

In [None]:
#Load list of emojis
#Source: https://www.airtable.com/universe/exphjm5ifnV0bX4Kb/emojis-database?explore=true

#Requires pandas (imported as pd in the setup cell).
#Missing cells are dropped first: a NaN entry is a float and would crash the
#.encode() call below.
emoji_column = pd.read_csv("Emojis-Grid view.csv")["Emoji"].dropna()

#Rows whose "Emoji" value is the literal "C" are excluded
emojiList = emoji_column[emoji_column != "C"].tolist()

#Unicode versions: the escaped ASCII spelling of each emoji (e.g. "\U0001f600")
emojiList_uni = [emoji.encode('unicode-escape').decode('ASCII') for emoji in emojiList]

print(len(emojiList))
print(emojiList[:10])
print(emojiList_uni[:10])

# Preprocess

##Text

In [None]:
#FLAGS
#Module-level switches read by preprocess_str (TOKEN_TYPE is not referenced in the cells visible here)
DEIDENTIFY = True     #Replace urls, emails, and usernames
EMOPRESERVE = True    #Identify emojis/emoticons on text and skip text cleaning on them
TEXTCLEAN = False     #Minimal cleaning of separating certain conjunctions
TOKEN_TYPE = "wp"     #wp: word piece (BERT Tokenizer); ws: word split

In [None]:
import re

#Placeholder tokens substituted for identifying substrings when DEIDENTIFY is on
tokenURL = "_URL_"
tokenEmail = "_EMAIL_"
tokenUsername = "_USER_"
#Reserved placeholders are treated like emoticons by preprocess_str, so they are never lowercased or cleaned
reserveTokens = [tokenURL, tokenEmail, tokenUsername]

#CLEANING PROCESS
#- Include emojis and emoticons
#- Replace url, email, and usernames with tokens
#- Remove non-major punctuation and separate the remaining punctuation from words with whitespace
#- Lowercase
#Ordered (pattern, replacement) rules applied to plain-text tokens when TEXTCLEAN is on
_TEXTCLEAN_RULES = (
  (r"[^A-Za-z0-9(),!?\.\'\`]", " "),
  (r"\'s", " 's"),
  (r"\'ve", " 've"),
  (r"n\'t", " n't"),
  (r"\'re", " 're"),
  (r"\'d", " 'd"),
  (r"\'ll", " 'll"),
  (r",", " , "),
  (r"!", " ! "),
  (r"\(", " ( "),
  (r"\)", " ) "),
  (r"\?", " ? "),
  (r"\.", " . "),
  (r"\s{2,}", " "),
)

def preprocess_str(string):
  """Normalize one text before it is sent to speech generation.

  Steps, each controlled by a module-level flag:
    1. DEIDENTIFY  - URLs, e-mail addresses and @usernames are replaced with
       the reserved tokens tokenURL / tokenEmail / tokenUsername.
    2. EMOPRESERVE - the text is split around every known emoticon, emoji,
       escaped emoji and reserved token so that those pieces are kept verbatim.
    3. TEXTCLEAN   - remaining text pieces are stripped of non-major
       punctuation and contractions/punctuation are separated by spaces.
    4. Remaining text pieces are always lowercased and stripped.

  Args:
    string: raw input text.

  Returns:
    The processed pieces joined with single spaces.
  """

  #Preclean
  if DEIDENTIFY:
    string = re.sub(r"https?://[^\s]+", tokenURL, string)              #Links
    string = re.sub(r"[\w.+-]+@[\w-]+\.[\w.-]+", tokenEmail, string)   #Email
    string = re.sub(r"@[a-zA-Z0-9_]{2,}", tokenUsername, string)       #Usernames

  #Emoticon/Emoji split
  tokens = [string]
  preserved = frozenset()   #Pieces that must be kept verbatim (set => O(1) lookups)
  if EMOPRESERVE:
    allEmo = emoticonList + emojiList + emojiList_uni + reserveTokens
    preserved = frozenset(allEmo)

    #Only candidates that actually occur in the text; dict.fromkeys dedupes while keeping list order
    present = [emo for emo in dict.fromkeys(allEmo) if emo and emo in string]

    #Longest first (stable for ties) so a composite such as ">:)" or a multi-codepoint
    #emoji is isolated before one of its shorter substrings can break it apart
    for emoticon in sorted(present, key=len, reverse=True):
      if emoticon.isalpha():
        #Alphabetic emoticons (e.g. "XD") must stand alone; zero-width lookarounds
        #do not consume the separating whitespace, so "XD XD" matches twice
        regEx = r"(?<!\S)" + re.escape(emoticon) + r"(?!\S)"
      else:
        regEx = re.escape(emoticon)
      pattern = re.compile("(" + regEx + ")")   #Single capture group keeps the match in the split output

      splits = []
      for split in tokens:
        if split in preserved:
          splits.append(split)                  #Already isolated: never re-split
        else:
          splits.extend(pattern.split(split))

      #Strip first, then drop empties, so whitespace-only pieces cannot survive as "" tokens
      tokens = [piece for piece in (part.strip() for part in splits) if piece]

  for idx in range(len(tokens)):
    if tokens[idx] in preserved: #Skip emoticons, emojis and reserved tokens
      continue

    if TEXTCLEAN:
      for pattern, replacement in _TEXTCLEAN_RULES:
        tokens[idx] = re.sub(pattern, replacement, tokens[idx])

    #Lower case and strip by default
    tokens[idx] = tokens[idx].lower().strip()

  #Tokens emptied by cleaning are skipped so the output never contains doubled spaces
  return " ".join(token for token in tokens if token)

#Generate Audio

In [None]:
def split_sentences(text):
  """Break ``text`` into pieces of at most MAX_TOKENS space-separated tokens.

  The text is first tokenized into sentences with nltk.sent_tokenize. A
  sentence within the limit is kept unchanged; a longer one is cut into
  consecutive runs of MAX_TOKENS tokens, the last run holding the remainder.

  Args:
    text: the text to split.

  Returns:
    List of sentence/phrase strings in their original order.
  """
  chunks = []
  for sentence in nltk.sent_tokenize(text):
    words = sentence.split(" ")

    if len(words) <= MAX_TOKENS:
      #Within the limit: keep the sentence exactly as tokenized
      chunks.append(sentence)
      continue

    #Over the limit: consecutive windows of MAX_TOKENS words
    chunks.extend(
      " ".join(words[start:start + MAX_TOKENS])
      for start in range(0, len(words), MAX_TOKENS)
    )

  return chunks

In [None]:
#Clean every input text before synthesis
texts_clean = [preprocess_str(text) for text in texts]

#Number of texts to synthesize in this run (raise it to process more of texts_clean)
NUM_SAMPLES = 3

audio_list = []
for i, text in enumerate(tqdm(texts_clean[:NUM_SAMPLES])):
  sentences = split_sentences(text)

  #Generate each sentence separately and follow it with the padding in `silence`
  pieces = []
  for sentence in sentences:
    audio_array = generate_audio(sentence, history_prompt=SPEAKER, silent = True)
    pieces += [audio_array, silence.copy()]

  #A text that yields no sentences (e.g. an empty string) produces no audio;
  #np.concatenate raises ValueError on an empty list, so skip it instead of crashing
  if not pieces:
    print("Skipping sample %d: no sentences to synthesize" % i)
    continue

  #Save file
  #NOTE(review): `silence` is float64, so the concatenated array (and the WAV written
  #from it) is float64 even if Bark returns float32 - confirm this is the intended format
  write(os.path.join(SAVE_PATH, "Sample_%d.wav" % i), SAMPLE_RATE, np.concatenate(pieces))