In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
import json
import os
import csv
import sys
import matplotlib.pyplot as plt
from scipy.spatial.distance import cosine
import networkx as nx
from networkx.algorithms import community

from langchain import OpenAI
from langchain.chat_models import ChatOllama
from langchain.prompts import PromptTemplate
from langchain.chains.llm import LLMChain
from langchain.embeddings import OpenAIEmbeddings
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter

csv.field_size_limit(sys.maxsize)

In [2]:
import torch

print(torch.cuda.is_available())

print(torch.cuda.device_count())
print(torch.cuda.current_device())
print(torch.cuda.device(0))
print(torch.cuda.get_device_name(0))


True
2
0
<torch.cuda.device object at 0x7f9b35453050>
NVIDIA GeForce RTX 2080 Ti


In [3]:
# Load the vtt_data.csv file
# filter only use 'large' files

podcast_data = []
row_num = 0
with open('vtt_data.csv') as csvfile:
    reader = csv.reader(csvfile, delimiter='|')
    for row in reader:
        row_num += 1
        
        if row_num == 1:
            continue
            
        filename = row[5]
        if not filename.endswith("_large.vtt"):
            continue

        podcast = {    
            "episode_index": row[0],    
            "guest": row[1],
            "episode_name": row[2],
            "host_name": row[3],
            "episode_number": row[4],
            "transcript": row[6],
            "duration": row[7],
        }
        podcast_data.append(podcast)
#         break

print(len(podcast_data))        

319


In [None]:
def is_techincal_podcast_title(title):

    eval_prompt_template = """
    Determine if the given title belongs to a technical podcast. 
    
    Here is the Title: {title}
    
    Answer 'yes' if the title belongs to a technical podcast. Otherwise, answer 'no'.
    Also, gives a reason for your answer.
    
    Your answer format should be:
    yes/no | reason
    """
    
    eval_prompt = PromptTemplate(template=eval_prompt_template, input_variables=['title'])

    # Define the LLMs
    map_llm = ChatOllama(model="openchat")

    map_llm_chain = LLMChain(llm = map_llm, prompt = eval_prompt)

    input_data = [
        {
            'title': title
        }
    ]

    # Run the input through the LLM chain (works in parallel)
    map_llm_chain_results = map_llm_chain.apply(input_data)
    
    return map_llm_chain_results
    

is_techincal_answers = []    
for podcast in podcast_data:
    
    # use LLM to determine if this episode_name is considered a 'technical podcast'
    episode_name = podcast['episode_name']   
    episode_number = podcast['episode_number'] 
    
    result = is_techincal_podcast_title(episode_name)
    
#     print(result)
    
    result = result[0]['text'].split("|")
    
    answer = {
        "episode_number": episode_number,
        "is_technical" : result[0].strip(),
        "reason": result[1].strip()
    }
    
#     print(answer)

    print(f"episode_name: {episode_name}, is_technical: {answer['is_technical']}")
    is_techincal_answers.append(answer)    

with open(f"./summarized_dataset/check_is_techincal_podcast_ollama_openchat.json", "w") as outfile: 
    json.dump(is_techincal_answers, outfile)    


episode_name: Life 3.0, is_technical: no
episode_name: Consciousness, is_technical: no
episode_name: AI in the Age of Reason, is_technical: yes
episode_name: Deep Learning, is_technical: yes
episode_name: Statistical Learning, is_technical: Yes
episode_name: Python, is_technical: no
episode_name: Stack Overflow and Coding Horror, is_technical: yes
episode_name: Google, is_technical: no
episode_name: Long-Term Future of Artificial Intelligence, is_technical: no
episode_name: Deep Reinforcement Learning, is_technical: yes
episode_name: Godel Machines, Meta-Learning, and LSTMs, is_technical: Yes
episode_name: Poker and Game Theory, is_technical: no
episode_name: Brains, Minds, and Machines, is_technical: no
episode_name: Cruise Automation, is_technical: no
episode_name: Reinforcement Learning, Planning, and Robotics, is_technical: yes
episode_name: Revolutionary Ideas in Science, Math, and Society, is_technical: no
episode_name: OpenAI and AGI, is_technical: no
episode_name: Tesla Autopil