In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
import json
import os
import csv
import sys
import matplotlib.pyplot as plt
from scipy.spatial.distance import cosine
import networkx as nx
from networkx.algorithms import community

from langchain import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains.llm import LLMChain
from langchain.embeddings import OpenAIEmbeddings
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
import torch
import random

os.environ["OPENAI_API_KEY"] = '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'

random.seed(1234)
np.random.seed(1234)
torch.manual_seed(1234)

<torch._C.Generator at 0x7fb031bc6f30>

In [2]:

print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.current_device())
print(torch.cuda.device(0))
print(torch.cuda.get_device_name(0))


True
2
0
<torch.cuda.device object at 0x7fb031aa2d90>
NVIDIA GeForce RTX 2080 Ti


In [3]:
# Load the vtt_data.csv file
# filter only use 'large' files

csv.field_size_limit(sys.maxsize)

podcast_data = []
row_num = 0
with open('vtt_data.csv') as csvfile:
    reader = csv.reader(csvfile, delimiter='|')
    for row in reader:
        row_num += 1
        
        if row_num == 1:
            continue
            
        filename = row[5]
        if not filename.endswith("_large.vtt"):
            continue

        podcast = {    
            "episode_index": row[0],    
            "guest": row[1],
            "episode_name": row[2],
            "host_name": row[3],
            "episode_number": row[4],
            "transcript": row[6],
            "duration": row[7],
        }
        podcast_data.append(podcast)
#         break

print(len(podcast_data))        

319


In [4]:

def parse_title_summary_results(results):
  out = []
  for e in results:
    e = e.replace('\n', '')
    if '|' in e:
      processed = {'title': e.split('|')[0],
                    'summary': e.split('|')[1][1:]
                    }
    elif ':' in e:
      processed = {'title': e.split(':')[0],
                    'summary': e.split(':')[1][1:]
                    }
    elif '-' in e:
      processed = {'title': e.split('-')[0],
                    'summary': e.split('-')[1][1:]
                    }
    else:
      processed = {'title': '',
                    'summary': e
                    }
    out.append(processed)
  return out

In [5]:
def summarize_stage_1(chunks_text):
  
  print(f'Start time: {datetime.now()}')

  # Prompt to get title and summary for each chunk
  map_prompt_template = """Firstly, give the following podcast an informative title. Then, on a new line, write a 75-100 word summary of the following text:
  {text}

  Return your answer in the following format:
  Title | Summary...
  e.g. 
  Why Artificial Intelligence is Good | AI can make humans more productive by automating many repetitive processes.

  TITLE AND CONCISE SUMMARY:"""

  map_prompt = PromptTemplate(template=map_prompt_template, input_variables=["text"])

  # Define the LLMs
  map_llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo-1106')
    
  map_llm_chain = LLMChain(llm = map_llm, prompt = map_prompt)
  map_llm_chain_input = [{'text': t} for t in chunks_text]
  # Run the input through the LLM chain (works in parallel)
  map_llm_chain_results = map_llm_chain.apply(map_llm_chain_input)

  stage_1_outputs = parse_title_summary_results([e['text'] for e in map_llm_chain_results])

  print(f'Stage 1 done time {datetime.now()}')

  return {
    'stage_1_outputs': stage_1_outputs
  }

In [6]:
def generate_embeddings(text_array):
    os.environ["OPENAI_API_KEY"] = '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'
    # Use OpenAI to embed the summaries and titles. Size of _embeds: (num_chunks x 1536)
    openai_embed = OpenAIEmbeddings()

    return np.array(openai_embed.embed_documents(text_array))
    

In [7]:
# Run the community detection algorithm

def get_topics(title_similarity, num_topics = 8, bonus_constant = 0.25, min_size = 3):

  proximity_bonus_arr = np.zeros_like(title_similarity)
  for row in range(proximity_bonus_arr.shape[0]):
    for col in range(proximity_bonus_arr.shape[1]):
      if row == col:
        proximity_bonus_arr[row, col] = 0
      else:
        proximity_bonus_arr[row, col] = 1/(abs(row-col)) * bonus_constant
        
  title_similarity += proximity_bonus_arr

  title_nx_graph = nx.from_numpy_array(title_similarity)

  desired_num_topics = num_topics
    
  # Store the accepted partitionings
  topics_title_accepted = []

  resolution = 0.85
  resolution_step = 0.01
  iterations = 40

  # Find the resolution that gives the desired number of topics
  topics_title = []
  while len(topics_title) not in [desired_num_topics, desired_num_topics + 1, desired_num_topics + 2]:
    topics_title = community.louvain_communities(title_nx_graph, weight = 'weight', resolution = resolution)
    resolution += resolution_step
  topic_sizes = [len(c) for c in topics_title]
  sizes_sd = np.std(topic_sizes)
  modularity = community.modularity(title_nx_graph, topics_title, weight = 'weight', resolution = resolution)

  lowest_sd_iteration = 0
  # Set lowest sd to inf
  lowest_sd = float('inf')

  for i in range(iterations):
    topics_title = community.louvain_communities(title_nx_graph, weight = 'weight', resolution = resolution)
    modularity = community.modularity(title_nx_graph, topics_title, weight = 'weight', resolution = resolution)
    
    # Check SD
    topic_sizes = [len(c) for c in topics_title]
    sizes_sd = np.std(topic_sizes)
    
    topics_title_accepted.append(topics_title)
    
    if sizes_sd < lowest_sd and min(topic_sizes) >= min_size:
      lowest_sd_iteration = i
      lowest_sd = sizes_sd
      
  # Set the chosen partitioning to be the one with highest modularity
  topics_title = topics_title_accepted[lowest_sd_iteration]
  print(f'Best SD: {lowest_sd}, Best iteration: {lowest_sd_iteration}')
  
  topic_id_means = [sum(e)/len(e) for e in topics_title]
  # Arrange title_topics in order of topic_id_means
  topics_title = [list(c) for _, c in sorted(zip(topic_id_means, topics_title), key = lambda pair: pair[0])]
  # Create an array denoting which topic each chunk belongs to
  chunk_topics = [None] * title_similarity.shape[0]
  for i, c in enumerate(topics_title):
    for j in c:
      chunk_topics[j] = i
            
  return {
    'chunk_topics': chunk_topics,
    'topics': topics_title
    }

In [8]:
def rewrite_summary(summary):
    eval_prompt_template = """
    Rewrite the given summary to improve readability.
    Use transitional words or phrases at the beginning of paragraphs if necessary.
    Remove the reference of 'podcast' in the rewritten summary.
    The rewritten summary should have 300-400 words.

    Here is the data:
    {summary}

    Return your answer in the following format:
    REWRITTEN_SUMMARY
    """
    
    eval_prompt = PromptTemplate(template=eval_prompt_template, input_variables=["summary"])

    # Define the LLMs
    map_llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo-1106')

    map_llm_chain = LLMChain(llm = map_llm, prompt = eval_prompt)

    eval_input_data = [
        {
            'summary': summary    
        }
    ]
    
    map_llm_chain_input = eval_input_data
    # Run the input through the LLM chain (works in parallel)
    map_llm_chain_results = map_llm_chain.apply(map_llm_chain_input)
    print()
    print("RRR given summary")
    print(summary)
    print("RRR rewritten summary")
    print(map_llm_chain_results)
    return map_llm_chain_results[0]['text']
    

In [9]:
def summarize_stage_2(stage_1_outputs, topics, summary_num_words = 250):
  print(f'Stage 2 start time {datetime.now()}')
  
  # Prompt that passes in all the titles of a topic, and asks for an overall title of the topic
  title_prompt_template = """Write an informative title that summarizes each of the following groups of titles. Make sure that the titles capture as much information as possible, 
  and are different from each other:
  {text}
  
  Return your answer in a numbered list, with new line separating each title: 
  1. Title 1
  2. Title 2
  3. Title 3

  TITLES:
  """

#   map_prompt_template = """Wite a 75-100 word summary of the following text:
#     {text}

#     CONCISE SUMMARY:"""

# Use less word to try solve the warning/error:
# Token indices sequence length is longer than the specified maximum sequence length for this model (1313 > 1024). 
# Running this sequence through the model will result in indexing errors
# 75-100
  map_prompt_template = """Write a 175-200 word summary of the following topic of a podcast:
      {text}

      CONCISE SUMMARY:"""
    

  print(f"RRRRRR summary_num_words: {summary_num_words}")

  combine_prompt_template = 'Write a ' + str(summary_num_words) + """-word summary of the following podcast, removing irrelevant information. 
  
  Finish your answer:
  {text}
  """ + str(summary_num_words) + """-WORD SUMMARY:"""

  title_prompt = PromptTemplate(template=title_prompt_template, input_variables=["text"])
  map_prompt = PromptTemplate(template=map_prompt_template, input_variables=["text"])
  combine_prompt = PromptTemplate(template=combine_prompt_template, input_variables=["text"])

  topics_data = []
  for c in topics:
    topic_data = {
      'summaries': [stage_1_outputs[chunk_id]['summary'] for chunk_id in c],
      'titles': [stage_1_outputs[chunk_id]['title'] for chunk_id in c]
    }
    topic_data['summaries_concat'] = ' '.join(topic_data['summaries'])
    topic_data['titles_concat'] = ', '.join(topic_data['titles'])
    topics_data.append(topic_data)
    
  # Get a list of each community's summaries (concatenated)
  topics_summary_concat = [c['summaries_concat'] for c in topics_data]
  topics_titles_concat = [c['titles_concat'] for c in topics_data]

  # Concat into one long string to do the topic title creation
  topics_titles_concat_all = ''''''
  for i, c in enumerate(topics_titles_concat):
    topics_titles_concat_all += f'''{i+1}. {c}
    '''
  
  # print('topics_titles_concat_all', topics_titles_concat_all)
  title_llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo-1106')
  title_llm_chain = LLMChain(llm = title_llm, prompt = title_prompt)
  title_llm_chain_input = [{'text': topics_titles_concat_all}]
  title_llm_chain_results = title_llm_chain.apply(title_llm_chain_input)
  
  
  # Split by new line
  titles = title_llm_chain_results[0]['text'].split('\n')
  # Remove any empty titles
  titles = [t for t in titles if t != '']
  # Remove spaces at start or end of each title
  titles = [t.strip() for t in titles]

  map_llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo-1106')
  reduce_llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo-1106')

  # Run the map-reduce chain
  docs = [Document(page_content=t) for t in topics_summary_concat]
  chain = load_summarize_chain(chain_type="map_reduce", map_prompt = map_prompt, combine_prompt = combine_prompt, return_intermediate_steps = True,
                              llm = map_llm, reduce_llm = reduce_llm)

  output = chain({"input_documents": docs}, return_only_outputs = True)
  summaries = output['intermediate_steps']
  stage_2_outputs = [{'title': t, 'summary': s} for t, s in zip(titles, summaries)]
  final_summary = output['output_text']


  final_summary = rewrite_summary(final_summary)

  # Return: stage_1_outputs (title and summary), stage_2_outputs (title and summary), final_summary, chunk_allocations
  out = {
    'stage_2_outputs': stage_2_outputs,
    'final_summary': final_summary
  }
  print(f'Stage 2 done time {datetime.now()}')
  
  return out

In [10]:
# Filter out and keep only techincal podcasts
f = open('./summarized_dataset/check_is_techincal_podcast.json')
 
# returns JSON object as 
# a dictionary
check_is_technical_podcast = json.load(f)
 
is_techincal_episode_numbers = []

for podcast in check_is_technical_podcast:
    is_technical = podcast['is_technical']
    if is_technical == "yes":
        is_techincal_episode_numbers.append(podcast['episode_number'])
        
print(is_techincal_episode_numbers)
print(len(is_techincal_episode_numbers))


['3', '4', '5', '6', '7', '9', '10', '11', '13', '14', '15', '17', '18', '19', '20', '21', '22', '23', '24', '25', '28', '30', '31', '32', '34', '35', '36', '38', '40', '41', '42', '43', '44', '47', '48', '49', '50', '52', '53', '56', '57', '60', '61', '62', '65', '66', '68', '69', '70', '71', '72', '73', '74', '75', '76', '79', '80', '81', '83', '86', '89', '90', '91', '92', '93', '94', '95', '97', '98', '99', '103', '104', '106', '108', '109', '110', '111', '113', '114', '115', '118', '119', '120', '122', '126', '129', '130', '131', '132', '133', '139', '141', '144', '146', '147', '148', '151', '153', '155', '157', '160', '168', '173', '177', '181', '183', '186', '187', '188', '190', '193', '195', '206', '208', '209', '213', '215', '217', '218', '219', '221', '222', '224', '225', '235', '241', '246', '247', '250', '252', '257', '258', '261', '266', '271', '280', '294', '299', '302', '306', '307', '309', '322', '325']
144


In [56]:
def extract_keypoints(chunks_text, show_log=False):
  
  print(f'extract_keypoints start time: {datetime.now()}')

  # Prompt to get title and summary for each chunk
  map_prompt_template = """
  Extract the key points out of the give text:
  {text}

  Return your answer in a list, with new line separating each key point.
  There is no limit on the number of key points in your list
  Each key point starts with '<->' and ends with a '.'
  Here is the format of the list: 
  <-> key point 1
  <-> key point 2
  <-> key point 3
  ...

  KEY_POINTS:
  """

  map_prompt = PromptTemplate(template=map_prompt_template, input_variables=["text"])

  # Define the LLMs
  map_llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo-1106')
    
  map_llm_chain = LLMChain(llm = map_llm, prompt = map_prompt)
  map_llm_chain_input = [{'text': t} for t in chunks_text]
  # Run the input through the LLM chain (works in parallel)
  map_llm_chain_results = map_llm_chain.apply(map_llm_chain_input)

#   if show_log:   
#       print("map_llm_chain_results:")
#       print(map_llm_chain_results)
    
  keypoints = []
  for i, result in enumerate(map_llm_chain_results):
      if show_log:
          print("chunks:")
          print(chunks_text[i])
          print("keypoints:")
          print(result['text'])
          print("-------")
            
      result_keypoints = result['text'].split('<->')
      result_keypoints = [k.strip() for k in result_keypoints if k.strip()]
      keypoints = keypoints + result_keypoints

  # deduplication
  keypoints = list(set(keypoints)) 

  print(f'extract_keypoints done time {datetime.now()}')
  return keypoints

In [45]:
def remove_questions(chunks_text, show_log=False):
  print(f'remove_questions start time: {datetime.now()}')

  map_prompt_template = """
  Your jon is to read through the given text and remove sentences that are asking a question.
  Remove all the sentences that end with a question mark '?'.
  Here is the given text:
  {text}

  Return your answer as text with sentences that are question removed.

  QUESTIONS_REMOVED_TEXT:
  """

  map_prompt = PromptTemplate(template=map_prompt_template, input_variables=["text"])

  # Define the LLMs
  map_llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo-1106')
    
  map_llm_chain = LLMChain(llm = map_llm, prompt = map_prompt)
  map_llm_chain_input = [{'text': t} for t in chunks_text]
  # Run the input through the LLM chain (works in parallel)
  map_llm_chain_results = map_llm_chain.apply(map_llm_chain_input)

  print("remove_questions map_llm_chain_results:")
#   print(map_llm_chain_results)
  print(f'remove_questions done time {datetime.now()}')
 
  processed_chunks = []
  for i, result in enumerate(map_llm_chain_results):
      if show_log: 
          print("chunks:")
          print(chunks_text[i])
          print("question removed chunks:")
          print(result['text'])
          print("-------")
      processed_chunks.append({'text':result['text']})

  return processed_chunks   
    

In [41]:
def create_sentences(segments, MIN_WORDS, MAX_WORDS):

  # Combine the non-sentences together
  sentences = []

  is_new_sentence = True
  sentence_length = 0
  sentence_num = 0
  sentence_segments = []

  for i in range(len(segments)):
    if is_new_sentence == True:
      is_new_sentence = False
    # Append the segment
    sentence_segments.append(segments[i])
    segment_words = segments[i].split(' ')
    sentence_length += len(segment_words)
    
    # If exceed MAX_WORDS, then stop at the end of the segment
    # Only consider it a sentence if the length is at least MIN_WORDS
    if (sentence_length >= MIN_WORDS and segments[i][-1] == '.') or sentence_length >= MAX_WORDS:
      sentence = ' '.join(sentence_segments)
      sentences.append({
        'sentence_num': sentence_num,
        'text': sentence,
        'sentence_length': sentence_length
      })
      # Reset
      is_new_sentence = True
      sentence_length = 0
      sentence_segments = []
      sentence_num += 1

  return sentences

def create_chunks(sentences, CHUNK_LENGTH, STRIDE):

  sentences_df = pd.DataFrame(sentences)
  
  chunks = []
  for i in range(0, len(sentences_df), (CHUNK_LENGTH - STRIDE)):
    chunk = sentences_df.iloc[i:i+CHUNK_LENGTH]
    chunk_text = ' '.join(chunk['text'].tolist())
    
    chunks.append({
      'start_sentence_num': chunk['sentence_num'].iloc[0],
      'end_sentence_num': chunk['sentence_num'].iloc[-1],
      'text': chunk_text,
      'num_words': len(chunk_text.split(' '))
    })
    
  chunks_df = pd.DataFrame(chunks)
  return chunks_df.to_dict('records')

In [57]:

import time
    
podcast_summary = []

for podcast in podcast_data:
    
    if not podcast['episode_number'] in is_techincal_episode_numbers:
        #print(f"episode {podcast['episode_number']} is not technical. skip")
        continue
    
    if int(podcast['episode_number']) != 22:    
        #print(f"episode {podcast['episode_number']} already processed. skip")
        continue
    
    
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=900,
        chunk_overlap=200,
        length_function=len,
        is_separator_regex=False,
    )
    chunks_text = text_splitter.split_text(podcast['transcript'])
    
    
#     segments = podcast['transcript'].split('.')
#     # Put the . back in
#     segments = [segment + '.' for segment in segments]
#     # Further split by comma
#     segments = [segment.split(',') for segment in segments]
#     # Flatten
#     segments = [item for sublist in segments for item in sublist]

#     sentences = create_sentences(segments, MIN_WORDS=20, MAX_WORDS=80)
#     chunks = create_chunks(sentences, CHUNK_LENGTH=5, STRIDE=1)
#     chunks_text = [chunk['text'] for chunk in chunks]
    
    chunks_text = remove_questions(chunks_text)
    
#     continue
    
    print(f"chunks_text len: {len(chunks_text)}")
    keypoints = extract_keypoints(chunks_text)
    
    print("RRR keypoints")
    for keypoint in keypoints:
        print(keypoint)
        
    continue
    
    # Run Stage 1 Summarizing
    stage_1_outputs = summarize_stage_1(chunks_text)['stage_1_outputs']
    # Split the titles and summaries
    stage_1_summaries = [e['summary'] for e in stage_1_outputs]
    stage_1_titles = [e['title'] for e in stage_1_outputs]
    num_1_chunks = len(stage_1_summaries)
    
    # Generate embeddings
    print("generating embeddings...")
    summary_embeds = generate_embeddings(stage_1_summaries)
    #title_embeds = generate_embeddings(stage_1_titles) # not used
    print("done gen embeddings.")
    
    # Get similarity matrix between the embeddings of the chunk summaries
    summary_similarity_matrix = np.zeros((num_1_chunks, num_1_chunks))
    summary_similarity_matrix[:] = np.nan

    for row in range(num_1_chunks):
      for col in range(row, num_1_chunks):
        # Calculate cosine similarity between the two vectors
        similarity = 1- cosine(summary_embeds[row], summary_embeds[col])
        summary_similarity_matrix[row, col] = similarity
        summary_similarity_matrix[col, row] = similarity
        
    time.sleep(10)    
    
    # Set num_topics to be 1/4 of the number of chunks, or 8, which ever is smaller
    num_topics = min(int(num_1_chunks / 4), 8)
    
    print(f"num_topics: {num_topics}")
    print(f"get topics {datetime.now()} ...")
    topics_out = get_topics(summary_similarity_matrix, num_topics = num_topics, bonus_constant = 0.2)
    print(f"done get topics {datetime.now()}.")
    chunk_topics = topics_out['chunk_topics']
    topics = topics_out['topics']
    
    print(f"topics out: {len(topics)}")
    
#     # Plot a heatmap of this array
#     plt.figure(figsize = (10, 4))
#     plt.imshow(np.array(chunk_topics).reshape(1, -1), cmap = 'tab20')
#     # Draw vertical black lines for every 1 of the x-axis 
#     for i in range(1, len(chunk_topics)):
#       plt.axvline(x = i - 0.5, color = 'black', linewidth = 0.5)
    
    # Query LLM to get a summarized title for each topic_data
    out = summarize_stage_2(stage_1_outputs, topics, summary_num_words = 600) #250)
    stage_2_outputs = out['stage_2_outputs']
    stage_2_titles = [e['title'] for e in stage_2_outputs]
    
    print(f"stage_2_titles: len: {len(stage_2_titles)}")
    print(stage_2_titles)
    
    stage_2_summaries = [e['summary'] for e in stage_2_outputs]
    final_summary = out['final_summary']
    
    summarized_podcast = {
        "episode_number": podcast['episode_number'],
        "title_and_summary_array": stage_2_outputs,
        "final_summary": final_summary
    }
    
    with open(f"./summarized_dataset/podcast_summaries_openai_gpt35turbo_{podcast['episode_number']}_v3_stage3_extractkeypoints.json", "w") as outfile: 
        json.dump(summarized_podcast, outfile)

    time.sleep(20)
#     break
    
# print(podcast_summary)    
    

remove_questions start time: 2024-03-17 21:23:09.608663
remove_questions map_llm_chain_results:
remove_questions done time 2024-03-17 21:27:51.814655
chunks_text len: 92
extract_keypoints start time: 2024-03-17 21:27:51.814801
extract_keypoints done time 2024-03-17 21:29:43.430702
RRR keypoints
Rajat Manga is an engineer and director of Google, leading the TensorFlow team.
TensorFlow is an open source library at the center of much of the work in deep learning.
It is now an ecosystem of tools for the deployment of machine learning in various platforms.
There is a big emphasis on growing a passionate community of developers.
TensorFlow 2.0 is now in alpha and is being developed by a large team of engineers at Google Brain.
The decision to open source TensorFlow is a definitive moment in the tech industry.
TensorFlow 2.0 is now in alpha.
The decision to open source TensorFlow is a definitive moment in the tech industry.
Open innovation can be successful and inspire many companies to open 