In [None]:
import os,re
import yt_dlp
import json
import time
import math 
import httplib2
import requests
import pinecone 
import pandas as pd
import numpy as np
import urllib.request
from bs4 import BeautifulSoup
from bs4.element import Comment
import matplotlib.pyplot as plt
from youtubesearchpython import *
from langchain.llms import OpenAIChat
from bs4 import BeautifulSoup, SoupStrainer
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains import VectorDBQAWithSourcesChain
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

## Besties GPT

`Here, we will prepare the VectorDB index for the All-In podcast:`

* Use Whisper to transcribe episodes 
* Chunk data
* Embed it to Pinecone
* Test VectorDBQA chain on it 
 
`1. Get video urls -` 

In [None]:
from youtubesearchpython import ChannelsSearch

# Search channels by name and print the raw result of the lookup.
channelsSearch = ChannelsSearch(
    'All In Podcast',
    limit=10,
    region='US',
)
search_hits = channelsSearch.result()
print(search_hits)


In [None]:
# Library docs: https://pypi.org/project/youtube-search-python/
# The channel ID below was taken from the ChannelsSearch output.
channel_id = "UCESLZhusAkFfsNsApnjF_Cg"
channel_playlist = playlist_from_channel_id(channel_id)
playlist = Playlist(channel_playlist)

# Keep requesting pages until the playlist reports nothing more to fetch.
while True:
    if not playlist.hasMoreVideos:
        break
    print('Getting more videos...')
    playlist.getNextVideos()
    print(f'Videos Retrieved: {len(playlist.videos)}')

In [None]:
# Episode data
# Build one metadata row (number, link, title, thumbnail URL) per playlist video,
# indexed by the video title.

# Titles whose "E<number>:" prefix does not parse as an integer, mapped to the
# episode number to record for them.
# NOTE(review): "E76.5" is recorded as episode 1, as before; confirm this does
# not collide with a real episode 1, since the number is used in output file names.
manual_episode_numbers = {
    "E76.5: Food shortage, China's grand plan, inflation, French election plus an All-In Summit preview": 1,
}
metadata_columns = ['number', 'link', 'title', 'img']

# Keyed by title so a repeated title replaces its earlier row instead of
# producing a duplicate index entry.
episode_rows = {}
for v in playlist.videos:
    title = v['title']

    # Episode number is the integer between the last "E" and the first ":".
    try:
        ep_number = int(title.split(":")[0].split("E")[-1])
    except (ValueError, AttributeError):
        ep_number = manual_episode_numbers.get(title)
    if ep_number is None:
        print("Failed on %s" % title)
        continue

    # Skip the video entirely if the expected thumbnail entry is missing, so no
    # half-filled row reaches the download step.
    try:
        img_url = v['thumbnails'][3]['url']
    except (IndexError, KeyError, TypeError):
        print("Failed on %s" % title)
        continue

    episode_rows[title] = {
        'number': ep_number,
        'link': v['link'],
        'title': title,
        'img': img_url,
    }

# Build the frame once instead of enlarging it cell by cell.
stor_metadata = pd.DataFrame(
    list(episode_rows.values()),
    index=list(episode_rows.keys()),
    columns=metadata_columns,
)

`2. Get audio -` 

In [None]:
# Iterate through episodes: save each thumbnail and download each audio track,
# then write the episode metadata to CSV.
n_recent_episodes = 95  # only the last 95 rows of stor_metadata are processed

# Make sure every output location exists before anything is written to it.
for out_dir in ("../public", "audio", "audio_transcription"):
    os.makedirs(out_dir, exist_ok=True)

for ix in stor_metadata.index[-n_recent_episodes:]:

    ep_number = int(stor_metadata.loc[ix, 'number'])
    print("EPISODE: %s" % ep_number)
    img_url = stor_metadata.loc[ix, 'img']
    ep_link = stor_metadata.loc[ix, 'link']

    # Write img. Fetch first and open the file only on success, so a failed
    # request does not leave an empty or bogus .jpg behind.
    try:
        response = requests.get(img_url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        print("Thumbnail download failed for episode %s: %s" % (ep_number, exc))
    else:
        with open("../public/0%s.jpg" % str(ep_number), 'wb') as f:
            f.write(response.content)

    # Write audio
    ydl_opts = {
        'format': 'm4a/bestaudio/best',
        'outtmpl': 'audio/%s.m4a' % str(ep_number),
        'noplaylist': True,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'm4a',
        }]}
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        # download() takes a list of URLs.
        error_code = ydl.download([ep_link])
    if error_code:
        print("yt_dlp returned error code %s for episode %s" % (error_code, ep_number))

stor_metadata.reset_index().to_csv("audio_transcription/episodes.csv")

`3. Run Whisper -`
 
* On GPU, ideally: 10-20 min / video on 2080Ti with `medium` model
* Run `python run_whisper.py`

If running this step on a remote machine:
* scp the episode metadata file: `audio_transcription/episodes.csv`
* scp the audio files: `audio/*`
* Run `python run_whisper.py`
* Then, scp the `audio_transcription/` directory back to the local machine

In [None]:
# IPython shell escape: runs the run_whisper.py script with `python`.
! python run_whisper.py

`4. Get transcripts -`

In [None]:
# *** Chunk size: key parameter *** 
chunks = 1500
chunk_overlap = 50
splits_new = [ ]
metadatas_new = [ ]

# The splitter does not depend on the episode, so build it once.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunks,
                                               chunk_overlap=chunk_overlap)

# Read the csv file
new_ep = pd.read_csv("audio_transcription/episodes.csv", index_col=None)

for ix in new_ep.index:

    # Get data
    title = new_ep.loc[ix, 'title']
    ep_number = int(new_ep.loc[ix, 'number'])

    # Ep: transcripts are stored as audio_transcription/0<number>.txt (tab-separated)
    episode_id = "0" + str(ep_number)
    file_path = 'audio_transcription/%s.txt' % str(episode_id)
    transcript = pd.read_csv(file_path, sep='\t', header=None)
    transcript.columns = ['links', 'time', 'chunks']

    # Clean text chunks. Missing cells are replaced with '' first; otherwise
    # astype(str) would turn them into the literal text "nan" in the transcript.
    transcript['clean_chunks'] = transcript['chunks'].fillna('').astype(str).str.strip()
    links = list(transcript['links'])
    texts = transcript['clean_chunks'].str.cat(sep=' ')

    # Splits 
    splits = text_splitter.split_text(texts)
    print(len(splits))

    # Metadata: pick one link per split by sampling the row links at evenly
    # spaced positions from the first row to the last.
    N = len(splits)
    bins = np.linspace(0, len(links) - 1, N, dtype=int)
    sampled_links = [links[i] for i in bins]

    # Here we can add "link", "title", etc that can be fetched in the app 
    metadatas = [{"source": title + " " + link, "id": episode_id, "link": link, "title": title}
                 for link in sampled_links]
    print(len(metadatas))

    # Append to output 
    splits_new.append(splits)
    metadatas_new.append(metadatas)

`5. Assemble final list -`

In [None]:
# Join the list of lists 
splits_all = []
for sublist in splits_new:
    splits_all.extend(sublist)

metadatas_all = []
for sublist in metadatas_new:
    metadatas_all.extend(sublist)

`6. Embed full dataset in Pinecone VectorDB -`

In [None]:
# Pinecone: the API key is read from the PINECONE_API_KEY environment variable.
pinecone_api_key = os.environ.get('PINECONE_API_KEY')
pinecone.init(api_key=pinecone_api_key, environment="us-east1-gcp")

# Attach to the existing index with OpenAI embeddings.
index_name = "besties-gpt"
embeddings = OpenAIEmbeddings()
p = Pinecone.from_existing_index(
    embedding=embeddings,
    index_name=index_name,
)

In [None]:
# Upload the splits in fixed-size batches to avoid data ingest errors.
chunk_size = 100
last_chunk = 0  # index of the first batch to upload
total_splits = len(splits_all)
num_chunks = math.ceil(total_splits / chunk_size)

for i in range(last_chunk, num_chunks):

    print(i)
    start_time = time.time()

    # Half-open window [i * chunk_size, (i + 1) * chunk_size), clipped to the data length.
    batch = slice(i * chunk_size, min((i + 1) * chunk_size, total_splits))

    # Send this batch of texts and their metadata to the vector database.
    p.add_texts(texts=splits_all[batch], metadatas=metadatas_all[batch])

    elapsed_time = time.time() - start_time
    print(f"Elapsed time: {elapsed_time} seconds")
    print("--------")

`7. Read in VectorDB for testing` 

In [None]:
# Pinecone: reconnect to the index so the testing cells can run on their own.
pinecone.init(
    environment="us-east1-gcp",
    api_key=os.environ.get('PINECONE_API_KEY'),
)
embeddings = OpenAIEmbeddings()
index_name = "besties-gpt"
p = Pinecone.from_existing_index(embedding=embeddings, index_name=index_name)

In [None]:
def run_retrievalQA_sources_chain(llm,query,docstore,k=3):
    """Run a "stuff" RetrievalQAWithSourcesChain on `query` and print the outcome.

    Args:
        llm: language model passed to the chain.
        query: question string, sent to the chain under the "question" key.
        docstore: vector store; its `as_retriever` supplies the retriever.
        k: number of documents to retrieve per query (default 3).

    Prints the answer, the sources, and the elapsed wall-clock time. Returns None.
    """
    start_time = time.time()
    # The retrieval depth must be passed via search_kwargs; a bare `k=` keyword
    # on as_retriever is not forwarded to the similarity search.
    retriever = docstore.as_retriever(search_kwargs={"k": k})
    chain = RetrievalQAWithSourcesChain.from_chain_type(llm,chain_type="stuff",retriever=retriever)
    a = chain({"question": query},return_only_outputs=True)
    print(a["answer"])
    print(a["sources"])
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Elapsed time: {elapsed_time} seconds")
    print("--------")

def run_vectorDBQA_sources_chain(llm,query,docstore,k):
    """Run a "stuff" VectorDBQAWithSourcesChain on `query` and print the outcome.

    `docstore` is passed as the chain's vectorstore and `k` as its `k` argument.
    Prints the answer, the sources, and the elapsed wall-clock time. Returns None.
    """
    t0 = time.time()
    qa_chain = VectorDBQAWithSourcesChain.from_chain_type(
        llm,
        chain_type="stuff",
        vectorstore=docstore,
        k=k,
    )
    response = qa_chain({"question": query}, return_only_outputs=True)
    for field in ("answer", "sources"):
        print(response[field])
    print(f"Elapsed time: {time.time() - t0} seconds")
    print("--------")

# Smoke test: OpenAIChat with no model_name given, k=4.
q = "What is the cause of the SVB crisis?"
llm = OpenAIChat(temperature=0)
run_vectorDBQA_sources_chain(llm=llm, query=q, docstore=p, k=4)

In [None]:
# Same question with model_name="gpt-4" and k=8.
q = "What is the cause of the SVB crisis?"
llm = OpenAIChat(temperature=0, model_name="gpt-4")
run_vectorDBQA_sources_chain(llm=llm, query=q, docstore=p, k=8)

`8. Evaluation` 

In [None]:
# Load the evaluation set. `json` is already imported in the notebook's import cell.
# The encoding is given explicitly so the file is not decoded with the
# platform's locale encoding.
with open('eval/final_eval.json', 'r', encoding='utf-8') as f:
    eval_set = json.load(f)

In [None]:
from langchain.chains import VectorDBQA
from langchain.evaluation.qa import QAEvalChain
from langchain.chat_models import ChatOpenAI 


def _stuff_qa_chain(model, k):
    """Build a "stuff" VectorDBQA chain over `p` that reads its input from "question"."""
    return VectorDBQA.from_chain_type(
        model,
        chain_type="stuff",
        vectorstore=p,
        k=k,
        input_key="question",
    )


# OpenAIChat with no model_name given, k=4.
llm = OpenAIChat(temperature=0)
chain_gpt3_k_4 = _stuff_qa_chain(llm, 4)

# gpt-4 with k=4 and k=8.
llm = OpenAIChat(model_name="gpt-4",temperature=0)
chain_gpt4_k_4 = _stuff_qa_chain(llm, 4)
chain_gpt4_k_8 = _stuff_qa_chain(llm, 8)

# Chain used to grade predictions.
eval_chain = QAEvalChain.from_llm(llm=ChatOpenAI(temperature=0))

In [None]:
def run_eval(chain, dataset=None):
    """Run `chain` on every example of an evaluation set and time each call.

    Args:
        chain: callable taking a dict with "question" and "answer" keys and
            returning the prediction for that example.
        dataset: iterable of dicts with "question" and "answer" keys. Defaults
            to the notebook-level `eval_set` when omitted.

    Returns:
        (predictions, predicted_dataset, latency): the chain outputs, the
        question/answer dicts that were fed to the chain, and the per-example
        call durations in seconds, all in dataset order.
    """
    if dataset is None:
        dataset = eval_set

    predictions = []
    predicted_dataset = []
    latency = []

    for data in dataset:

        new_data = {"question": data["question"],"answer": data["answer"]}
        # perf_counter is the clock meant for measuring short durations; only
        # the chain call itself is timed.
        start_time = time.perf_counter()
        predictions.append(chain(new_data))
        latency.append(time.perf_counter() - start_time)
        predicted_dataset.append(new_data)

    return predictions,predicted_dataset,latency

# One entry per evaluated chain, in the order the chains are listed below.
predictions_list, scores_list, latency_list = [], [], []

# Eval on chains 
chains_under_test = (chain_gpt3_k_4, chain_gpt4_k_4, chain_gpt4_k_8)
for position, chain in enumerate(chains_under_test, start=1):
    print(f"Evaluating chain {position}")
    predictions, predicted_dataset, latency = run_eval(chain)
    predictions_list.append(predictions)
    # Grade the predictions against the question/answer pairs.
    graded_outputs = eval_chain.evaluate(
        predicted_dataset,
        predictions,
        question_key="question",
        prediction_key="result",
    )
    scores_list.append(graded_outputs)
    latency_list.append(latency)

In [None]:
# Results: share of answers graded correct, per chain.
# Labels follow the order in which the chains were evaluated.
chain_labels = ["GPT3.5_k_4","GPT4_k_4","GPT4_k_8"]

# Count the 'INCORRECT' grades for each chain. The loop name is kept distinct
# from `chunk_size`, which the ingestion cell uses for its batch size.
num_incorrect = {
    label: sum(1 for graded in scores_list[i] if graded['text'] == 'INCORRECT')
    for i, label in enumerate(chain_labels)
}

# Build the frame in one step instead of enlarging it row by row.
stor = pd.DataFrame({'num_incorrect': pd.Series(num_incorrect, dtype=float)})
stor['pct_incorrect'] = stor['num_incorrect']  / len(eval_set)
stor['pct_correct'] = 1 - stor['pct_incorrect']

fig, ax = plt.subplots()
stor['pct_correct'].plot(kind='bar', ax=ax)
ax.set_title('Percentage of Correct Answers')
ax.set_xlabel('Chain')
ax.set_ylabel('Percentage')
plt.show()

In [None]:
# Per-query latency: one column per chain, one row per evaluated query.
latency_labels = ["GPT3.5_k_4","GPT4_k_4","GPT4_k_8"]
latency = pd.DataFrame(latency_list).T
latency.columns = latency_labels

os.makedirs("results", exist_ok=True)  # to_csv does not create missing directories
latency.to_csv("results/latency.csv")

ax = latency.boxplot()
ax.set_xlabel("Model")
ax.set_ylabel("Latency per query (seconds)")

# Take the means and N from the data so the title cannot go stale, and escape
# the backslash of the mathtext \mu ("\m" is not a valid string escape).
mean_text = ",".join(f"{mean:.1f}s" for mean in latency.mean())
ax.set_title(
    "Latency for QA comparing ChatGPT vs GPT4 \n "
    f"$\\mu$ per model = {mean_text}, $N={len(latency)}$"
);

In [None]:
def eval_summary(i):
    """Return chain `i`'s predictions as a DataFrame with a 'score' column.

    The 'score' column holds the "text" field of each graded output for that chain.
    """
    summary = pd.DataFrame(predictions_list[i])
    summary['score'] = [graded["text"] for graded in scores_list[i]]
    return summary


# Same order as the chains were evaluated.
GPT35_k_4_result, GPT4_k_4_result, GPT4_k_8_result = (
    eval_summary(chain_idx) for chain_idx in range(3)
)

In [None]:
# Write each chain's summary table to its own CSV file.
for csv_path, result_frame in (
    ("results/GPT35_k_4_result.csv", GPT35_k_4_result),
    ("results/GPT4_k_4_result.csv", GPT4_k_4_result),
    ("results/GPT4_k_8_result.csv", GPT4_k_8_result),
):
    result_frame.to_csv(csv_path)

In [None]:
# Rows whose score is anything other than "CORRECT".
not_correct_3_5 = GPT35_k_4_result["score"].ne("CORRECT")
wrong3_5 = GPT35_k_4_result.loc[not_correct_3_5]
wrong3_5.to_csv("results/wrong3_5.csv")

In [None]:
# Rows of the GPT-4, k=4 run whose score is anything other than "CORRECT".
not_correct_4 = GPT4_k_4_result["score"].ne("CORRECT")
wrong4 = GPT4_k_4_result.loc[not_correct_4]
wrong4.to_csv("results/wrong4_k4.csv")

In [None]:
# Rows of the GPT-4, k=8 run whose score is anything other than "CORRECT".
# This previously filtered GPT4_k_4_result, so wrong4_k8.csv was a copy of
# wrong4_k4.csv.
wrong4_k8 = GPT4_k_8_result[GPT4_k_8_result.score != "CORRECT"]
wrong4_k8.to_csv("results/wrong4_k8.csv")