In [None]:
branch_name = "tts-exp"
repo_url = "https://github.com/rkaushik97/MSGAI-AI-Podcast-Generator.git"

!git clone -b {branch_name} --single-branch {repo_url}

from google.colab import userdata
# setting key in secrets google colab
hf_token = userdata.get('HUGGINGFACE_API_KEY')

!pip install kokoro_onnx piper-tts jiwer

In [None]:
# Prerequisite: the Kokoro model files (.bin and .onnx) must already be placed
# in a `kokoro` folder; the README.md lists where to get them.
# NOTE(review): the original note says the folder lives "in msgai folder" --
# presumably the cloned repository; confirm the exact location in the README.
# The path below is relative, so this cell only works from the directory the
# repository was cloned into (running it a second time will not find the path).
%cd MSGAI-AI-Podcast-Generator/podcast-generator

## Load the results of the experiment on the LLM

In [None]:
import json

# Load the merged results of the LLM experiment. Later cells index this object
# by position and read each record's 'output_text' entry.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale (the notebook writes its own files as UTF-8 too).
with open('../notebooks/llm_exp/full_exp_merged.json', encoding='utf-8') as infile:
    llm_results = json.load(infile)

# Alternative pandas-based loader left by the author, currently disabled:
# import pandas as pd
# llm_results = pd.read_json()

In [None]:
# Peek at the first record. A cell only renders its last expression, so the
# key listing is printed explicitly; otherwise it would be silently discarded.
print(list(llm_results[0].keys()))
llm_results[0]['output_text']

## Pass the generated script of the podcast to the TTS and evaluate WER and Latency

In [None]:
from podcast_pipeline.adaptive_tts_synthesizer import AdaptiveTTSSynthesizer
from podcast_pipeline.audio_quality_analyzer import AudioQualityAnalyzer
import os
import time

# Smoke test on a single generated script: synthesize it with the selected TTS
# backend, time the synthesis, and score the resulting audio.
idx = 160
tts_backend = 'kokoro'
# tts_backend = 'piper'

experiment_dir = 'output/experiment'
# Make sure the target directory exists before anything is written into it.
os.makedirs(experiment_dir, exist_ok=True)

output_filename_full = f'{experiment_dir}/test_{tts_backend}.wav'
script_filename_full = f'{experiment_dir}/test_{tts_backend}.md'

# Optional dump of the script as markdown, currently disabled:
# with open(script_filename_full, 'w', encoding='utf-8') as f:
#             f.write(f"# Podcast Script: {llm_results[idx]['output_text']['topic']}\n\n")
#             f.write(f"Metadata: Host={llm_results[idx]['output_text']['metadata']['HOST_GENDER']}, Guest={llm_results[idx]['output_text']['metadata']['GUEST_GENDER']}\n\n")
#             f.write("--- Dialogue ---\n\n")
#             f.write(llm_results[idx]['output_text']['dialogue'])

adaptive_tts = AdaptiveTTSSynthesizer(tts_backend)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() can jump if the system clock is adjusted.
start_tts = time.perf_counter()
adaptive_tts.synthesize(llm_results[idx]['output_text'], output_filename_full)
end_tts = time.perf_counter()

analyzer = AudioQualityAnalyzer()

# The script is passed in directly, so no transcript markdown file is supplied.
audio_quality_results = analyzer.evaluate(
    audio_path=output_filename_full,
    transcript_md_path=None,
    script=llm_results[idx]['output_text'],
)

# Subset of the evaluation output kept for quick inspection.
audio_quality_scores = {
    "wer": audio_quality_results["wer"],
    "detailed_measures": audio_quality_results["detailed_measures"],
    "audio_metrics": audio_quality_results["audio_metrics"]
}

# Save audio quality report
audio_quality_report = "test_audio_quality.json"
analyzer.save_results(audio_quality_results, os.path.join(experiment_dir, audio_quality_report))

# Print summary, followed by the synthesis time in seconds
analyzer.print_summary(audio_quality_results)
print(end_tts - start_tts)

In [None]:
# Display the script entry selected by `idx`, i.e. the one the single-sample
# test synthesized.
llm_results[idx]['output_text']
# audio_quality_scores

In [None]:
from podcast_pipeline.adaptive_tts_synthesizer import AdaptiveTTSSynthesizer
from podcast_pipeline.audio_quality_analyzer import AudioQualityAnalyzer
import json
import os
from tqdm import tqdm
import time

# Full experiment: synthesize every generated script with the selected TTS
# backend, time each synthesis, evaluate the audio, and store one report per
# script plus an aggregated results file.
tts_backend = 'kokoro'
# tts_backend = 'piper'

audio_quality_scores_all = []
adaptive_tts = AdaptiveTTSSynthesizer(tts_backend)
base_path = f'output/experiment/{tts_backend}'

# Create the output directory once, up front. The warm-up call below already
# writes into it, so it has to exist before the loop starts.
os.makedirs(base_path, exist_ok=True)

###############################################################################
# cold start
# One untimed synthesis before the measured runs -- presumably so one-time
# start-up cost does not end up in the first sample's inference time.
cold_start = {
    "topic": "The cold war never ends",
    "dialogue": "HOST: let's do some warm up exercise before starting with the experiment!",
    "metadata": {
        "HOST_GENDER": "MALE",
        "GUEST_GENDER": "FEMALE"
    }
}
adaptive_tts.synthesize(cold_start, f'{base_path}/cold_start.wav')
###############################################################################

for i, result in tqdm(enumerate(llm_results), total=len(llm_results)):
    script_data = result['output_text']
    output_filename_full = f'{base_path}/gen_{i}_{tts_backend}.wav'
    script_filename_full = f'{base_path}/gen_{i}_{tts_backend}.md'

    # Keep a readable copy of the script next to the audio generated from it.
    with open(script_filename_full, 'w', encoding='utf-8') as f:
        f.write(f"# Podcast Script: {script_data['topic']}\n\n")
        f.write(f"Metadata: Host={script_data['metadata']['HOST_GENDER']}, Guest={script_data['metadata']['GUEST_GENDER']}\n\n")
        f.write("--- Dialogue ---\n\n")
        f.write(script_data['dialogue'])

    # Time only the synthesis call. perf_counter() is a monotonic clock meant
    # for measuring durations.
    start_tts = time.perf_counter()
    adaptive_tts.synthesize(script_data, output_filename_full)
    end_tts = time.perf_counter()

    # NOTE(review): a fresh analyzer is built for every sample, as in the
    # original. If AudioQualityAnalyzer keeps no per-evaluation state, it
    # could be created once before the loop -- confirm against its source.
    analyzer = AudioQualityAnalyzer()

    # The script is passed in directly, so no transcript markdown file is supplied.
    audio_quality_results = analyzer.evaluate(
        audio_path=output_filename_full,
        transcript_md_path=None,
        script=script_data,
    )

    audio_quality_scores = {
        "wer": audio_quality_results["wer"],
        "detailed_measures": audio_quality_results["detailed_measures"],
        "audio_metrics": audio_quality_results["audio_metrics"],
        "inference_time": end_tts - start_tts
    }

    audio_quality_scores_all.append(audio_quality_scores)

    # Save audio quality report
    audio_quality_report = f"gen_{i}_{tts_backend}.json"
    analyzer.save_results(audio_quality_results, os.path.join(base_path, audio_quality_report))

# Aggregated scores for all samples, in the same order as `llm_results`.
with open(f'{base_path}/full_results.json', 'w', encoding='utf-8') as outfile:
    json.dump(audio_quality_scores_all, outfile)



In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; paths under /content/drive/MyDrive
# are used as the destination when the experiment output is copied.
drive.mount('/content/drive')

In [None]:
import shutil

# Backend whose experiment folder is copied to Google Drive. This is its own
# variable, separate from `tts_backend` used by the experiment cells.
tts_backend_save = 'kokoro'
drive_path = f"/content/drive/MyDrive/GenAI/tts_exp_{tts_backend_save}"
exp_path = f"output/experiment/{tts_backend_save}"

# dirs_exist_ok=True lets this cell be re-run after a previous copy: without
# it copytree raises FileExistsError when the destination already exists.
# Files with the same name in the destination are overwritten.
shutil.copytree(exp_path, drive_path, dirs_exist_ok=True)