In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
import json
import os
import matplotlib.pyplot as plt
from scipy.spatial.distance import cosine
import networkx as nx
from networkx.algorithms import community

from langchain import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains.llm import LLMChain
from langchain.embeddings import OpenAIEmbeddings
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain

# SECURITY: never commit an API key to the notebook. The key is taken from the
# OPENAI_API_KEY environment variable; when it is not set, it is requested
# interactively without echoing it to the output.
# Any key that was ever hardcoded in this cell should be treated as leaked and
# revoked/rotated with the provider.
import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [25]:
# Hand-written evaluation fixtures. Each entry pairs a source passage
# ('document') with a candidate 'summary' of that passage; the two keys are the
# variables the evaluation prompt templates in this notebook are filled with.
# The passages appear to be excerpts of the State of the Union transcript with
# commas missing (note the double spaces) -- TODO confirm this is intended.
# NOTE(review): this cell's execution count (25) is higher than that of cells
# that read eval_input_data (3, 9), so the notebook was run out of order;
# verify it still works with Restart Kernel -> Run All.
eval_input_data = [
    # Fixture 1: opening greetings and congratulations to congressional leaders.
    {
        'document': '''
Mr.  Speaker.  Madam Vice President.  Our First Lady and Second Gentleman. 

Members of Congress and the Cabinet.  Leaders of our military. 

Mr.  Chief Justice  Associate Justices  and retired Justices of the Supreme Court. 

And you  my fellow Americans. 

I start tonight by congratulating the members of the 118th Congress and the new Speaker of the House  Kevin McCarthy. 

Mr.  Speaker  I look forward to working together. 

I also want to congratulate the new leader of the House Democrats and the first Black House Minority Leader in history  Hakeem Jeffries. 

Congratulations to the longest serving Senate Leader in history  Mitch McConnell. 

And congratulations to Chuck Schumer for another term as Senate Majority Leader  this time with an even bigger majority.        
        ''',
        'summary': '''
The President congratulates the members of Congress, the new Speaker of the House, the new House Minority Leader, and the Senate Majority Leader in his State of the Union Address. He expresses his desire to work together with the new leadership and looks forward to a productive term.        
        ''',        
    },
    # Fixture 2: starts with the last two paragraphs of fixture 1's passage and
    # continues into the progress/resilience and jobs section.
    {
        'document': '''
Congratulations to the longest serving Senate Leader in history  Mitch McConnell. 

And congratulations to Chuck Schumer for another term as Senate Majority Leader  this time with an even bigger majority. 

And I want to give special recognition to someone who I think will be considered the greatest Speaker in the history of this country  Nancy Pelosi. 

The story of America is a story of progress and resilience.  Of always moving forward.  Of never giving up. 

A story that is unique among all nations. 

We are the only country that has emerged from every crisis stronger than when we entered it. 

That is what we are doing again. 

Two years ago  our economy was reeling. 

As I stand here tonight  we have created a record 12 million new jobs  more jobs created in two years than any president has ever created in four years.        
        ''',
        'summary': '''
A message of congratulations to Mitch McConnell, Chuck Schumer, and Nancy Pelosi for their roles in American leadership, highlighting the resilience and progress of the country. The speaker also emphasizes the strength of the American economy, with a record 12 million new jobs created in the past two years.        
        ''',        
    },    
    
]

In [3]:

# Experiment: ask the judge model for a bare good/bad verdict, scoring the
# summary on comprehensiveness, concision, coherence and independence.
eval_prompt_template = """You are comparing the summary text and it's original document and trying to determine if the summary is good. 
Here is the data:
[BEGIN DATA]
************
[Summary]: {summary}
************
[Original Document]: {document}
[END DATA]
Compare the Summary above to the Original Document and determine if the Summary is
comprehensive, concise, coherent, and independent relative to the Original Document.
Your response must be a string, either good or bad, and should not contain any text
or characters aside from that. The string bad means that the Summary is not comprehensive, concise,
coherent, and independent relative to the Original Document. The string good means the Summary
is comprehensive, concise, coherent, and independent relative to the Original Document."""

# Template variables line up with the keys of each input dict.
eval_prompt = PromptTemplate(
    input_variables=["summary", "document"],
    template=eval_prompt_template,
)

# Judge model, with the sampling temperature pinned to 0.
map_llm = ChatOpenAI(model_name="gpt-3.5-turbo-1106", temperature=0)

map_llm_chain = LLMChain(prompt=eval_prompt, llm=map_llm)

# `apply` receives the whole list of input dicts and yields one result per dict.
map_llm_chain_input = eval_input_data
map_llm_chain_results = map_llm_chain.apply(map_llm_chain_input)



In [4]:
# Display the verdicts returned by the chain for the strict good/bad prompt.
map_llm_chain_results

[{'text': 'bad'}, {'text': 'bad'}]

In [9]:
# Experiment: same four criteria as the strict prompt, but the judge must also
# justify the verdict, answering as "good/bad | reasons...".
# NOTE(review): the prompt both forbids any text besides good/bad and asks for
# reasons; consider reconciling the two instructions in a follow-up experiment.
eval_prompt_template = """You are comparing the summary text and it's original document and trying to determine if the summary is good. 
Here is the data:
[BEGIN DATA]
************
[Summary]: {summary}
************
[Original Document]: {document}
[END DATA]

Compare the Summary above to the Original Document and determine if the Summary is
comprehensive, concise, coherent, and independent relative to the Original Document.

Your response will first return a string, either good or bad, and should not contain any text
or characters aside from that. The string bad means that the Summary is not comprehensive, concise,
coherent, and independent relative to the Original Document. The string good means the Summary
is comprehensive, concise, coherent, and independent relative to the Original Document.

You response should also contain reasons behind your evaluation.

Return your answer in the following format:
  good/bad | reasons...
"""

# Template variables line up with the keys of each input dict.
eval_prompt = PromptTemplate(
    input_variables=["summary", "document"],
    template=eval_prompt_template,
)

# Judge model, with the sampling temperature pinned to 0.
map_llm = ChatOpenAI(model_name="gpt-3.5-turbo-1106", temperature=0)

map_llm_chain = LLMChain(prompt=eval_prompt, llm=map_llm)

# `apply` receives the whole list of input dicts and yields one result per dict.
map_llm_chain_input = eval_input_data
map_llm_chain_results = map_llm_chain.apply(map_llm_chain_input)

In [10]:
# Display the "verdict | reasons" strings returned for the two short fixtures.
map_llm_chain_results

[{'text': 'bad | The summary is not comprehensive as it only mentions the President congratulating the members of Congress, the new Speaker of the House, the new House Minority Leader, and the Senate Majority Leader, while the original document contains additional greetings and acknowledgments. It is also not entirely coherent as it does not capture the specific individuals being congratulated and the context of the State of the Union Address. Additionally, it is not independent as it heavily relies on the original document for information.'},
 {'text': 'bad | The summary is not comprehensive as it only focuses on congratulating the leaders and highlighting the strength of the American economy, while the original document also includes a message about the story of America, progress, and resilience. It is also not coherent as it does not capture the full message of the original document.'}]

In [11]:
# Load the full State of the Union transcript that serves as the reference
# document for the whole-speech evaluation.
txt_path = 'stateoftheunion.txt'
# The encoding is pinned so the text decodes the same way on every platform
# instead of depending on the locale's default encoding.
with open(txt_path, 'r', encoding='utf-8') as f:
    txt = f.read()

# Single evaluation fixture for the whole speech: 'document' is the full text
# read from txt_path, and 'summary' is a multi-paragraph candidate summary of it.
# Same {'document', 'summary'} schema as the short fixtures in eval_input_data.
eval_input_data_all = [
    {
        'document': txt,
        'summary': '''
In his State of the Union Address, the President congratulates the new leadership in Congress and expresses his desire to work together for a productive term. He highlights the resilience and progress of the country, emphasizing the strength of the American economy with 12 million new jobs created in the past two years. The President also emphasizes the nation's recovery from the COVID pandemic and the importance of unity and working together for the progress and resilience of the nation. He points out that over 300 bipartisan laws have been signed, proving that both parties can come together for the greater good of the nation. The President stresses the need to move past political conflict and focus on restoring the soul of the nation and rebuilding the middle class.\n\nThe text outlines the speaker's vision to restore the soul of the nation, rebuild the middle class, and unite the country. It emphasizes the need to bring back pride and self-worth, and to fundamentally change the economy to work for everyone, building it from the bottom up and the middle out. The text discusses the benefits of a strong middle class, the increase in manufacturing jobs, and the importance of investing in infrastructure to maintain a strong economy. It also highlights the Bipartisan Infrastructure Law as the largest investment in infrastructure since the Interstate Highway System and the funding of over 20,000 projects across the country.\n\nThe text outlines the Inflation Reduction Act, which aims to lower healthcare costs for Americans by capping the cost of insulin and out-of-pocket drug costs for seniors on Medicare. It also discusses efforts to combat the climate crisis, including building new electric grids and promoting clean energy. The author emphasizes the need for bipartisan action to protect future generations and address corporate tax inequality. 
The proposal includes a minimum 15% tax for billion-dollar companies and measures to crack down on wealthy tax cheats, resulting in a $114 billion reduction in the deficit.        
        ''',        
    }   
]

In [12]:
# Preview only the beginning of the transcript; echoing the whole speech would
# flood the cell output and bloat the saved notebook.
txt[:1000]

'Mr. Speaker. Madam Vice President. Our First Lady and Second Gentleman.\n\nMembers of Congress and the Cabinet. Leaders of our military.\n\nMr. Chief Justice, Associate Justices, and retired Justices of the Supreme Court.\n\nAnd you, my fellow Americans.\n\nI start tonight by congratulating the members of the 118th Congress and the new Speaker of the House, Kevin McCarthy.\n\nMr. Speaker, I look forward to working together.\n\nI also want to congratulate the new leader of the House Democrats and the first Black House Minority Leader in history, Hakeem Jeffries.\n\nCongratulations to the longest serving Senate Leader in history, Mitch McConnell.\n\nAnd congratulations to Chuck Schumer for another term as Senate Majority Leader, this time with an even bigger majority.\n\nAnd I want to give special recognition to someone who I think will be considered the greatest Speaker in the history of this country, Nancy Pelosi.\n\nThe story of America is a story of progress and resilience. Of alway

In [13]:
# Experiment: apply the "good/bad | reasons..." prompt to the whole-speech
# fixture (eval_input_data_all) instead of the short excerpts.
# NOTE(review): the prompt both forbids any text besides good/bad and asks for
# reasons; consider reconciling the two instructions in a follow-up experiment.
eval_prompt_template = """You are comparing the summary text and it's original document and trying to determine if the summary is good. 
Here is the data:
[BEGIN DATA]
************
[Summary]: {summary}
************
[Original Document]: {document}
[END DATA]

Compare the Summary above to the Original Document and determine if the Summary is
comprehensive, concise, coherent, and independent relative to the Original Document.

Your response will first return a string, either good or bad, and should not contain any text
or characters aside from that. The string bad means that the Summary is not comprehensive, concise,
coherent, and independent relative to the Original Document. The string good means the Summary
is comprehensive, concise, coherent, and independent relative to the Original Document.

You response should also contain reasons behind your evaluation.

Return your answer in the following format:
  good/bad | reasons...
"""

# Template variables line up with the keys of each input dict.
eval_prompt = PromptTemplate(
    input_variables=["summary", "document"],
    template=eval_prompt_template,
)

# Judge model, with the sampling temperature pinned to 0.
map_llm = ChatOpenAI(model_name="gpt-3.5-turbo-1106", temperature=0)

map_llm_chain = LLMChain(prompt=eval_prompt, llm=map_llm)

# `apply` receives the whole list of input dicts and yields one result per dict.
map_llm_chain_input = eval_input_data_all
map_llm_chain_results = map_llm_chain.apply(map_llm_chain_input)

In [14]:
# Display the verdict and reasons returned for the whole-speech fixture.
map_llm_chain_results

[{'text': 'bad | The summary does not capture the comprehensive and coherent nature of the original document. It lacks details on key points such as the specific legislative actions, the personal stories shared, and the specific policy proposals outlined by the President. Additionally, it does not provide a concise and independent overview of the original document.'}]

### Try making the prompt a bit less strict(?) so it has a better chance of giving a "good" response.

Results:
The 'bad' responses don't really make sense, because the summary does include the congratulations to those members.

In [16]:
# Re-inspect the short fixtures before evaluating them with the relaxed prompt.
eval_input_data

[{'document': '\nMr.  Speaker.  Madam Vice President.  Our First Lady and Second Gentleman. \n\nMembers of Congress and the Cabinet.  Leaders of our military. \n\nMr.  Chief Justice  Associate Justices  and retired Justices of the Supreme Court. \n\nAnd you  my fellow Americans. \n\nI start tonight by congratulating the members of the 118th Congress and the new Speaker of the House  Kevin McCarthy. \n\nMr.  Speaker  I look forward to working together. \n\nI also want to congratulate the new leader of the House Democrats and the first Black House Minority Leader in history  Hakeem Jeffries. \n\nCongratulations to the longest serving Senate Leader in history  Mitch McConnell. \n\nAnd congratulations to Chuck Schumer for another term as Senate Majority Leader  this time with an even bigger majority.        \n        ',
  'summary': '\nThe President congratulates the members of Congress, the new Speaker of the House, the new House Minority Leader, and the Senate Majority Leader in his Stat

In [26]:
# Experiment: relaxed criterion -- a summary is "good" when it contains most of
# the main points of the original document; reasons are still requested.
# NOTE(review): the prompt both forbids any text besides good/bad and asks for
# reasons; consider reconciling the two instructions in a follow-up experiment.
eval_prompt_template = """You are comparing the summary text and it's original document and trying to determine 
if the summary is good. 

Here is the data:
[BEGIN DATA]
************
[Summary]: {summary}
************
[Original Document]: {document}
[END DATA]

Compare the Summary above to the Original Document and determine if the Summary contains most of the main
points in the original document.

Your response will first return a string, either good or bad, and should not contain any text
or characters aside from that. The string bad means that the Summary miss most of the main points in 
the Original Document. The string good means the Summary contains most of main points in the Original Document.

You response should also contain reasons behind your evaluation.

Return your answer in the following format:
  good/bad | reasons...
"""

# Template variables line up with the keys of each input dict.
eval_prompt = PromptTemplate(
    input_variables=["summary", "document"],
    template=eval_prompt_template,
)

# Judge model, with the sampling temperature pinned to 0.
map_llm = ChatOpenAI(model_name="gpt-3.5-turbo-1106", temperature=0)

map_llm_chain = LLMChain(prompt=eval_prompt, llm=map_llm)

# `apply` receives the whole list of input dicts and yields one result per dict.
map_llm_chain_input = eval_input_data
map_llm_chain_results = map_llm_chain.apply(map_llm_chain_input)

In [27]:
# Display the verdicts produced by the "most of the main points" prompt.
map_llm_chain_results

[{'text': 'good | The summary contains most of the main points in the original document. It includes the President congratulating the members of Congress, the new Speaker of the House, the new House Minority Leader, and the Senate Majority Leader, as well as expressing his desire to work together with the new leadership and looking forward to a productive term.'},
 {'text': 'bad | The summary misses the specific congratulations to Mitch McConnell, Chuck Schumer, and Nancy Pelosi, as well as the emphasis on their roles in American leadership. It also fails to mention the unique story of progress and resilience in America and the specific mention of the record 12 million new jobs created in the past two years.'}]

### Try giving it a percentage threshold to see if it can figure it out.

Results:
The 'bad' response seems better: it lists the points that were not mentioned in the summary. However, the summary might not really deserve to be called 'bad', since a summary can't possibly keep everything.

Try out next:
Give more instructions on what is important to be included in the summary and what is not! Then evaluate based on that. Check out the Spotify paper on what is considered a 'Good Summary'.
https://tsapps.nist.gov/publication/get_pdf.cfm?pub_id=934398

"The stated task was to generate a short, accurate, and grammatically
sound text summary for each podcast episode using the transcripts
of the podcast episodes and/or the original audio"

'NIST assessors judged each summary on a four-point scale (Excellent, Good, Fair, and Bad)'


In [30]:
# Re-inspect the short fixtures that the evaluation prompts are run against.
eval_input_data

[{'document': '\nMr.  Speaker.  Madam Vice President.  Our First Lady and Second Gentleman. \n\nMembers of Congress and the Cabinet.  Leaders of our military. \n\nMr.  Chief Justice  Associate Justices  and retired Justices of the Supreme Court. \n\nAnd you  my fellow Americans. \n\nI start tonight by congratulating the members of the 118th Congress and the new Speaker of the House  Kevin McCarthy. \n\nMr.  Speaker  I look forward to working together. \n\nI also want to congratulate the new leader of the House Democrats and the first Black House Minority Leader in history  Hakeem Jeffries. \n\nCongratulations to the longest serving Senate Leader in history  Mitch McConnell. \n\nAnd congratulations to Chuck Schumer for another term as Senate Majority Leader  this time with an even bigger majority.        \n        ',
  'summary': '\nThe President congratulates the members of Congress, the new Speaker of the House, the new House Minority Leader, and the Senate Majority Leader in his Stat

In [28]:
# Experiment: give the judge an explicit threshold -- "good" means the summary
# covers more than 90% of the key points, "bad" means less than 90%.
# NOTE(review): the prompt both forbids any text besides good/bad and asks for
# reasons; consider reconciling the two instructions in a follow-up experiment.
eval_prompt_template = """You are comparing the summary text and it's original document and trying to determine 
if the summary is good. 

Here is the data:
[BEGIN DATA]
************
[Summary]: {summary}
************
[Original Document]: {document}
[END DATA]

Compare the Summary above to the Original Document and determine if the Summary contains most of the key
points in the original document.

Your response will first return a string, either good or bad, and should not contain any text
or characters aside from that. The string bad means that the Summary contains less than 90% of the key points in 
the Original Document. The string good means the Summary contains more than 90% of the keys points in 
the Original Document.

You response should also contain reasons behind your evaluation.

Return your answer in the following format:
  good/bad | reasons...
"""

# Template variables line up with the keys of each input dict.
eval_prompt = PromptTemplate(
    input_variables=["summary", "document"],
    template=eval_prompt_template,
)

# Judge model, with the sampling temperature pinned to 0.
map_llm = ChatOpenAI(model_name="gpt-3.5-turbo-1106", temperature=0)

map_llm_chain = LLMChain(prompt=eval_prompt, llm=map_llm)

# `apply` receives the whole list of input dicts and yields one result per dict.
map_llm_chain_input = eval_input_data
map_llm_chain_results = map_llm_chain.apply(map_llm_chain_input)

In [29]:
# Display the verdicts produced by the 90%-of-key-points prompt.
map_llm_chain_results

[{'text': "good | The summary contains most of the key points in the original document, including the President congratulating the members of Congress, the new Speaker of the House, the new House Minority Leader, and the Senate Majority Leader. It also mentions the President's desire to work together with the new leadership and looks forward to a productive term."},
 {'text': 'bad | The summary only mentions the congratulations to Mitch McConnell, Chuck Schumer, and Nancy Pelosi, and the strength of the American economy with 12 million new jobs created. It does not include the emphasis on American progress and resilience, the uniqueness of the American story, and the comparison of job creation to previous presidents.'}]

### Idea:

Should we first ask the user what is important to include in the summary, before generating the summaries?


### Try using the Manual Evaluation instructions from the Good Summary paper as the prompt

Results:
Pretty good. Giving more instructions on how to rate the summary returns much better evaluations. The model evidently knows to look for important attributes that are missing from the summary.

In [31]:
# Experiment: replace the binary verdict with a four-point scale
# (Excellent / Good / Fair / Bad), spelling out a rubric for each grade.
# NOTE(review): the rubric wording talks about an "episode" and "deciding
# whether to listen", while the inputs are passed as {document}; confirm the
# wording is intended for these text fixtures.
eval_prompt_template = """You are comparing the summary text and it's original document and evaluate how good 
the summary text is.

Here is the data:
[BEGIN DATA]
************
[Summary]: {summary}
************
[Original Document]: {document}
[END DATA]

Compare the Summary above to the Original Document and determine if the Summary is either Excellent, Good, Fair, 
or Bad.

Your response will first return a string, either Excellent, Good, Fair, or Bad, and should not contain any text
or characters aside from that. 

The string Excellent means the summary accurately conveys all the most important
attributes of the episode, which could include topical content, genre,
and participants. In addition to giving an accurate representation
of the content, it contains almost no redundant material which is
not needed when deciding whether to listen. It is also coherent,
comprehensible, and has no grammatical errors.

The string Good means the summary conveys most of the important attributes and
gives the reader a reasonable sense of what the episode contains
with little redundant material which is not needed when deciding
whether to listen. Occasional grammatical or coherence errors are
acceptable.

The string Fair means the summary conveys some attributes of the content but gives
the reader an imperfect or incomplete sense of what the episode
contains. It may contain redundant material which is not needed
when deciding whether to listen and may contain repetitions or
broken sentences.

The string Bad means the summary does not convey any of the most important
content items of the episode or gives the reader an incorrect or incomprehensible sense of what the episode contains. It may contain
a large amount of redundant information that is not needed.

You response should also contain reasons behind your evaluation.

Return your answer in the following format:
  Excellent/Good/Fair/Bad | reasons...
"""

# Template variables line up with the keys of each input dict.
eval_prompt = PromptTemplate(
    input_variables=["summary", "document"],
    template=eval_prompt_template,
)

# Judge model, with the sampling temperature pinned to 0.
map_llm = ChatOpenAI(model_name="gpt-3.5-turbo-1106", temperature=0)

map_llm_chain = LLMChain(prompt=eval_prompt, llm=map_llm)

# `apply` receives the whole list of input dicts and yields one result per dict.
map_llm_chain_input = eval_input_data
map_llm_chain_results = map_llm_chain.apply(map_llm_chain_input)

In [32]:
# Display the "grade | reasons" strings produced by the four-point rubric prompt.
map_llm_chain_results

[{'text': 'Fair | The summary conveys some important attributes of the original document, such as the President congratulating the members of Congress, the new Speaker of the House, the new House Minority Leader, and the Senate Majority Leader. However, it does not mention the specific names of the leaders being congratulated, which is an important detail. Additionally, the summary could be more coherent and comprehensive in conveying the content of the original document.'},
 {'text': 'Good | The summary accurately captures the main points of the original document, such as congratulating Mitch McConnell, Chuck Schumer, and Nancy Pelosi, and highlighting the resilience and progress of the country. It also mentions the record 12 million new jobs created in the past two years. However, it could have included more details about the specific achievements of each leader and the state of the American economy. Overall, it gives a reasonable sense of the original document with little redundant 