In [1]:
from datetime import datetime
import json
import csv
import sys
import os

from langchain import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains.llm import LLMChain

# Configuration.
#
# The OpenAI credential is read from the OPENAI_API_KEY environment variable
# (export it before starting Jupyter, or load it from an untracked .env file).
# It must never be written into the notebook: anything typed here ends up in
# version control and in every shared copy of the file.
# NOTE(review): a key was previously committed in this cell; treat it as leaked
# and revoke it in the OpenAI dashboard.
if not os.environ.get("OPENAI_API_KEY"):
    # Warn instead of raising so the cell itself still runs; the OpenAI client
    # reads this variable when it is constructed later in the notebook.
    print(
        "WARNING: OPENAI_API_KEY is not set; export it before running the "
        "evaluation cells.",
        file=sys.stderr,
    )

# Podcast episode numbers whose predicted/baseline summary files are compared.
episodes = [22, 23, 79, 94]

In [2]:

# Prompt used to grade a predicted summary against a baseline summary.
# Placeholders: {predicted_summary} and {baseline_summary}.
# The model is asked for one of four ratings (Very / Mostly / Somewhat / Not)
# followed by " | " and its reasoning, so the rating can be split off the
# front of the response.
eval_prompt_template = """You are comparing predicted_summary and baseline_summary and
trying to determine if the predicted_summary is accurate using the baseline_summary as the source of truth.


Here is the data:
[BEGIN DATA]
************
[predicted_summary]: {predicted_summary}
************
[baseline_summary]: {baseline_summary}
[END DATA]


Your response must begin with exactly one of the ratings Very, Mostly, Somewhat, or Not,
followed by the reasons behind your evaluation.

The string Very means that predicted_summary is very accurate.

The string Mostly means that predicted_summary is mostly accurate.

The string Somewhat means that predicted_summary is somewhat accurate.

The string Not means that predicted_summary is not accurate.

Return your answer in the following format:
  Very/Mostly/Somewhat/Not | reasons...
"""



In [3]:

def load_final_summary(path):
    """Return the 'final_summary' field of the JSON document stored at `path`.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if the document has no 'final_summary' key.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)['final_summary']


# The prompt, the model and the chain do not depend on the episode, so they
# are built once here instead of on every loop iteration.
eval_prompt = PromptTemplate(
    template=eval_prompt_template,
    input_variables=["predicted_summary", "baseline_summary"],
)
# temperature=0 is used for the grading model.
map_llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo-1106')
map_llm_chain = LLMChain(llm=map_llm, prompt=eval_prompt)

for episode in episodes:
    # Summary under evaluation and the baseline it is graded against.
    summarized_content = load_final_summary(
        f"./predicted/podcast_summaries_ollama_gemma_{episode}.json"
    )
    baseline_summary = load_final_summary(
        f"./baseline/podcast_summaries_openai_gpt35turbo_{episode}_v2.json"
    )

    # apply() takes a list of input dicts; one pair is evaluated per episode.
    eval_input_data = [
        {
            'predicted_summary': summarized_content,
            'baseline_summary': baseline_summary,
        }
    ]

    print(f"#######  Episode {episode}")
    print("## Predicted Summary")
    print(summarized_content)
    print("## End of Predicted Summary")

    print()
    print("## Baseline Summary")
    print(baseline_summary)
    print("## End of Baseline Summary")

    # One result dict per input; the model's answer is under the 'text' key.
    map_llm_chain_results = map_llm_chain.apply(eval_input_data)
    print()

    print(f"Truthfulness evaluation score for episode {episode}")
    print(map_llm_chain_results)
    print("##############################################")
    print("##############################################")
    print()

#######  Episode 22
## Predicted Summary
The podcast describes the growth of deep learning and machine learning, highlighting the success of AI projects and the open-source nature of TensorFlow. It also discusses the impact of open-source projects on technology and the overall impact of TensorFlow on the AI community.

The podcast summarizes the key points of various articles about paid services, advertising on the internet, AI, and its potential impact on education and advertising. It highlights the accessibility and power of AI tools like TPUs, cloud services, and TensorFlow, as well as the benefits of platforms like Colab for machine learning beginners. Additionally, it explores the impact of advertising on information accessibility and its potential for connecting users to desired products.

The podcast concludes by discussing the future of advertising and monetization on the internet. It emphasizes the potential of AI to revolutionize the advertising industry and its ability to co

  warn_deprecated(



Truthfulness evaluation score for episode 22
[{'text': 'Somewhat | The predicted_summary covers some key points from the baseline_summary, such as the growth of deep learning and machine learning, the open-source nature of TensorFlow, and the impact of AI on advertising and education. However, it misses important details about the evolution of TensorFlow, the competition with PyTorch, the integration of Keras, and the potential evolution of hardware accelerators. Overall, it captures the general theme but lacks specific details from the baseline_summary.'}]
##############################################
##############################################

#######  Episode 23
## Predicted Summary
The podcast explores various topics related to creativity, technology, and personal growth, highlighting the interconnectedness of various fields and the potential impact of AI on various aspects of human experience. It covers topics such as AI and creativity, poetry and AI, home automation, the in


Truthfulness evaluation score for episode 94
[{'text': 'Somewhat | The predicted_summary covers some similar topics as the baseline_summary, such as neural networks, deep learning, and artificial intelligence. However, the predicted_summary focuses more on the advancements and potential of these technologies, while the baseline_summary delves into a wider range of specific topics and discussions. The predicted_summary also lacks some of the in-depth discussions and details provided in the baseline_summary, making it somewhat accurate but not fully capturing the breadth of information in the baseline_summary.'}]
##############################################
##############################################

