In [None]:
from datasets import load_dataset
import networkx as nx
import tqdm
import IPython.display as ipd

# Fetch the "webnlg_challenge_2017" configuration of the "web_nlg" dataset.
webNLG_dataset = load_dataset(path="web_nlg", name="webnlg_challenge_2017")

In [None]:
## Converting a triple set to a graph
def generate_graph_for_tripple_set(tripple_set):
    """Build an undirected graph from triple strings and return it as Cytoscape data.

    Every item of ``tripple_set`` is split on ``" | "``. The first three
    fields each become a node; one edge links the first field to the second
    and another links the second field to the third.

    Args:
        tripple_set: Iterable of ``" | "``-separated triple strings.

    Returns:
        The dictionary produced by ``nx.cytoscape_data`` for the graph.

    Raises:
        IndexError: If a triple splits into fewer than three fields.
    """
    graph = nx.Graph()
    for raw_triple in tripple_set:
        fields = raw_triple.split(" | ")
        head, relation, tail = fields[0], fields[1], fields[2]
        # add_edge registers any endpoint that is not in the graph yet, so
        # the nodes do not have to be added beforehand.
        graph.add_edge(head, relation)
        graph.add_edge(relation, tail)
    return nx.cytoscape_data(graph)

## Print the details of one dataset entry
def print_details(idx):
    """Print the EID and all text options of entry ``idx``, then draw its graph.

    Reads the module-level ``eid_list``, ``text_list`` and ``graph_list``.

    Args:
        idx: Position of the entry in those lists.
    """
    print("EID: ", eid_list[idx])
    # The loop needs its own counter: reusing `idx` would overwrite the entry
    # index, and the graph lookup below would then draw the wrong entry.
    for option_idx, text in enumerate(text_list[idx]):
        print(f"Option {option_idx}: {text}\n")

    print("Graph Visual:")
    nx.draw(nx.cytoscape_graph(graph_list[idx]), with_labels=True, font_size=8)
    
# Try the conversion on the first triple set of the first training entry.
G = generate_graph_for_tripple_set(webNLG_dataset["train"][0]["modified_triple_sets"]["mtriple_set"][0])

## Walk over every training entry and collect its id, texts and graph
eid_list, graph_list, text_list, wav_list = [], [], [], []

for entry in tqdm.tqdm(webNLG_dataset["train"]):
    eid_list.append(entry["eid"])
    text_list.append(entry["lex"]["text"])
    ## WAV creation is disabled below, so wav_list is saved empty:
#    each_wav_list = []
#    for each_text in entry["lex"]["text"]:
#        each_wav_list.append(generate_audio_content(each_text))
#    wav_list.append(each_wav_list)
    # Only the first modified triple set of each entry is turned into a graph.
    first_triple_set = entry["modified_triple_sets"]["mtriple_set"][0]
    G = generate_graph_for_tripple_set(first_triple_set)
    graph_list.append(G)

## Save the data
import pickle

payload = {
    "eid_list": eid_list,
    "graph_list": graph_list,
    "text_list": text_list,
    "wav_list": wav_list,
}
with open("webNLG_data.pickle", "wb") as out_file:
    pickle.dump(payload, out_file)


In [None]:
import pandas as pd

# Start with one row per entry, then explode the "text" lists so that each
# text gets its own row (eid and graph are repeated) with a fresh 0..n-1 index.
df = (
    pd.DataFrame({"eid": eid_list, "graph": graph_list, "text": text_list})
    .explode("text")
    .reset_index(drop=True)
)



In [None]:
from accelerate import PartialState  # Can also be Accelerator or AcceleratorState
from accelerate.utils import gather_object
from transformers import pipeline

distributed_state = PartialState()
pipe = pipeline("text-to-speech", "suno/bark", device=distributed_state.device)

# Inside the context manager each process only sees its own slice of the
# prompts, so `result` covers just that slice.
with distributed_state.split_between_processes(list(df["text"].values)) as prompt:
    result = pipe(prompt)

# Collect the per-process slices back into one list covering every prompt.
# With a single process this returns `result` unchanged.
result = gather_object(result)
if len(result) != len(df):
    raise ValueError(
        f"Expected {len(df)} synthesized clips but got {len(result)}; "
        "cannot align audio with the rows of df."
    )

df["wav"] = result

from datasets import Dataset
processed_dataset = Dataset.from_pandas(df)

In [None]:
# Dataset.add_column returns a new dataset instead of modifying in place, so
# the result has to be re-bound. Skip the call when a "wav" column is already
# present (e.g. carried over from the DataFrame), because adding a column
# name that already exists is rejected.
if "wav" not in processed_dataset.column_names:
    processed_dataset = processed_dataset.add_column("wav", result)
processed_dataset

In [None]:
from transformers import pipeline

# Stand-alone Bark text-to-speech pipeline pinned to the first CUDA device.
synthesizer = pipeline(task="text-to-speech", model="suno/bark", device="cuda:0")

In [None]:
# The prompt is wrapped in a list, i.e. the pipeline is called in batch form.
demo_prompts = ["Look I am generating speech in three lines of code!"]
outputs = synthesizer(demo_prompts)

In [None]:
import IPython.display as ipd

# The synthesizer was called with a list of prompts, so it returns a list of
# result dicts; indexing that list with "audio" would raise a TypeError.
# Play the first clip, and still accept a single dict from a string prompt.
first_clip = outputs[0] if isinstance(outputs, list) else outputs
ipd.Audio(first_clip["audio"], rate=first_clip["sampling_rate"])

In [1]:
## Collect the paths of all files in a folder (recursively)
import os

def load_all_files_in_folder(folder_path):
    """Return the path of every file found under ``folder_path``.

    The folder is walked recursively with ``os.walk``; each file name is
    joined to the directory it was found in.

    Args:
        folder_path: Root directory to walk.

    Returns:
        A list of file paths in ``os.walk`` order.
    """
    return [
        os.path.join(dirpath, filename)
        for dirpath, _subdirs, filenames in os.walk(folder_path)
        for filename in filenames
    ]

tmp_files = load_all_files_in_folder("data/tmp")

## Read every pickle file and pool its "wav" and "text" entries
# NOTE(review): pickle.load can execute arbitrary code; only use it on files
# this project produced itself.
import pickle

audio_list, txt_list = [], []
for shard_path in tmp_files:
    with open(shard_path, "rb") as shard_file:
        shard = pickle.load(shard_file)
    audio_list.extend(shard["wav"])
    txt_list.extend(shard["text"])

In [2]:
# Load the previously pickled bundle and pull out its DataFrame.
# NOTE(review): absolute, machine-specific path; pickle.load should only be
# used on trusted files.
with open("/home/CS546-CLEO/data/processed_dataset.pickle", "rb") as bundle_file:
    processed_data = pickle.load(bundle_file)

df = processed_data["df"]

In [8]:
import pandas as pd

audio_df = pd.DataFrame({"wav": audio_list, "text": txt_list})
# Merge against one clip per text. If a text occurs several times on both
# sides, an inner merge emits a row for every (df row, clip) combination and
# silently inflates df; keeping the first clip per text prevents that.
df = df.merge(
    audio_df.drop_duplicates(subset="text", keep="first"),
    on="text",
    how="inner",
)

In [6]:
# Permanently delete every file that was collected in tmp_files.
for stale_path in tmp_files:
    os.remove(stale_path)

In [10]:
from datasets import Dataset

# Wrap the DataFrame in a Hugging Face Dataset.
processed_dataset = Dataset.from_pandas(df=df)

In [13]:
# Persist the dataset to a directory on disk.
processed_dataset.save_to_disk(dataset_path="/home/CS546-CLEO/data/processed_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/80 [00:00<?, ? examples/s]

In [15]:
from datasets import load_from_disk

# Read the saved dataset directory back into memory.
processed_dataset = load_from_disk(dataset_path="/home/CS546-CLEO/data/processed_dataset")

In [20]:
# Display the reloaded dataset as a DataFrame. The trailing comment is a
# disabled variant that would count the unique values of the "text" column.
processed_dataset.to_pandas()#["text"]#.nunique()

Unnamed: 0,eid,graph,text,wav
0,Id1,"{'data': [], 'directed': False, 'elements': {'...","The Aarhus is the airport of Aarhus, Denmark.","{'audio': [0.0013169994, 0.0008414648, 0.00102..."
1,Id1,"{'data': [], 'directed': False, 'elements': {'...","Aarhus Airport serves the city of Aarhus, Denm...","{'audio': [0.0009158152, 0.00080315955, 0.0007..."
2,Id2,"{'data': [], 'directed': False, 'elements': {'...",Aarhus airport serves the city of Aarhus.,"{'audio': [-0.04144665, -0.044604544, -0.04439..."
3,Id3,"{'data': [], 'directed': False, 'elements': {'...",Aarhus Airport is 25 metres above sea level.,"{'audio': [1.691666e-05, -0.000120812096, 0.00..."
4,Id3,"{'data': [], 'directed': False, 'elements': {'...",Aarhus airport is at an elevation of 25 metres...,"{'audio': [0.008753816, 0.0075816633, 0.006622..."
...,...,...,...,...
195,Id81,"{'data': [], 'directed': False, 'elements': {'...",The Alderney Airport elevation above the sea l...,"{'audio': [0.0007371929, 0.00031506407, 0.0004..."
196,Id82,"{'data': [], 'directed': False, 'elements': {'...",Alderney Airport has a runway length of 497.0.,"{'audio': [0.0004782201, -6.468003e-05, 0.0001..."
197,Id82,"{'data': [], 'directed': False, 'elements': {'...",The runway length of Alderney Airport is 497.0.,"{'audio': [-0.0128557645, -0.012591194, -0.013..."
198,Id83,"{'data': [], 'directed': False, 'elements': {'...",The Alderney Airport runway has a length of 73...,"{'audio': [0.0007543769, 0.00026334112, 0.0002..."
