<a href="https://colab.research.google.com/github/samtru99/Youtube-Chatbot/blob/main/YT_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation

In [None]:
!pip install -qU pytube moviepy pydub langchain pinecone-client tiktoken openai

In [None]:
#Youtube Extraction
from pytube import YouTube
from moviepy.editor import *
from pydub import AudioSegment
import os

#OpenAI
import openai
from openai import OpenAI

#Langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.chains import LLMChain, HypotheticalDocumentEmbedder

from langchain.schema import (
    SystemMessage,
    HumanMessage,
    AIMessage
)
from langchain.chat_models import ChatOpenAI
import langchain

#others
import tiktoken
import re
import pinecone


# Phase 1 - Podcast Extraction

Enter the URL of the YouTube video

ex - https://www.youtube.com/watch?v=MVYrJJNdrEg&t=80s

In [None]:
# Ask for the video URL and wrap it in a pytube YouTube object; the next cell
# reads yt.streams to find and download the audio.
user_input = input("Enter in the link to the podcast")
yt = YouTube(user_input)

Enter in the link to the podcasthttps://www.youtube.com/watch?v=MVYrJJNdrEg&t=80s


Extract all MP4 audio files

In [None]:
# Download the podcast audio exactly once, as podcast.mp4.
# All audio/mp4 candidates would be written to the same filename, so only one
# of them can survive on disk: pick the highest-bitrate stream and fetch that.
audio = yt.streams.filter(only_audio=True)
mp4_streams = [file for file in audio if file.mime_type == "audio/mp4"]
if not mp4_streams:
    raise RuntimeError("No audio/mp4 stream is available for this video")


def _abr_kbps(candidate):
    """Return the numeric part of a stream's ``abr`` (e.g. "128kbps" -> 128)."""
    digits = re.sub(r'\D', '', candidate.abr or '')
    return int(digits) if digits else 0


stream = max(mp4_streams, key=_abr_kbps)
print(f"file - {stream}")
stream.download(filename='podcast.mp4')


file - <Stream: itag="139" mime_type="audio/mp4" abr="48kbps" acodec="mp4a.40.5" progressive="False" type="audio">
file - <Stream: itag="140" mime_type="audio/mp4" abr="128kbps" acodec="mp4a.40.2" progressive="False" type="audio">


Convert the MP4 to MP3

In [None]:
def MP4ToMP3(mp4, mp3):
    """Convert the audio file at path ``mp4`` to an MP3 written at path ``mp3``.

    Args:
        mp4: Path of the source audio file to read.
        mp3: Path of the MP3 file to create.
    """
    clip = AudioFileClip(mp4)
    try:
        clip.write_audiofile(mp3)
    finally:
        # Release the clip even if the export fails part-way through.
        clip.close()


VIDEO_FILE_PATH = "podcast.mp4"
AUDIO_FILE_PATH = "new_podcast.mp3"
MP4ToMP3(VIDEO_FILE_PATH, AUDIO_FILE_PATH)

MoviePy - Writing audio in new_podcast.mp3


                                                                       

MoviePy - Done.




# Phase 2 - Audio Processing

Slice the MP3 audio file into 1 minute clips

In [None]:
from pydub import AudioSegment

# Cut new_podcast.mp3 into consecutive one-minute clips named
# 1_minute.mp3, 2_minute.mp3, ... and count them in amt_of_clips.
audio = AudioSegment.from_mp3("new_podcast.mp3")
one_minute = 1 * 60 * 1000  # clip length in milliseconds

# Must be the same directory the transcription cell reads from.
out_path = './audio_clips/'
# exist_ok lets the cell be re-run without failing on the existing folder.
os.makedirs(out_path, exist_ok=True)

amt_of_clips = 0
total_length = len(audio)
# Stepping by clip start (rather than testing the clip end) keeps the final
# clip, whether it is a partial minute or ends exactly at the end of the audio.
for start in range(0, total_length, one_minute):
    end = min(start + one_minute, total_length)
    amt_of_clips += 1
    new_chunk = audio[start:end]
    new_chunk.export(out_path + f"{amt_of_clips}_minute.mp3", format="mp3")


In [None]:
amt_of_clips

64

# Phase 3 - Transcribe audio files

Initialize the OpenAI client (used for Whisper transcription)

In [None]:
# OpenAI API client; the transcription cell calls Whisper through it.
# The key is read from the OPENAI_API_KEY environment variable when it is set,
# so a real key never has to be saved in the notebook. The placeholder remains
# the fallback for anyone who prefers to paste a key in directly.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY", "ENTER YOUR KEY")
)


Tokenizer to calculate amount of tokens needed

In [None]:
# Encoder used to measure text length in tokens.
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Return the number of tokens ``tokenizer`` produces for ``text``.

    The encoder is called with an empty ``disallowed_special`` tuple.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

Text Splitter function for chunking

In [None]:
# Recursive splitter whose chunk size and overlap are measured in tokens
# (via tiktoken_len) instead of characters. It tries the separators in the
# listed order: paragraph break, line break, space, then any position.
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '\n', ' ', ''],
    length_function=tiktoken_len,
    chunk_size=400,
    chunk_overlap=20,
)

Function to clean up text to reduce token usage

In [None]:
def clean_text_func(text):
    """Flatten ``text`` onto one line with single spaces between words.

    Every run of line breaks (``\\r``, ``\\n`` or a mix) becomes one space, and
    every run of spaces/tabs is collapsed to one space. Line breaks are
    replaced rather than deleted so the words on either side stay separate.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text. Leading/trailing spaces are collapsed, not stripped.
    """
    # Turn line breaks into spaces first so they take part in the collapse below.
    clean_text = re.sub(r'[\r\n]+', ' ', text)
    clean_text = re.sub(r'[ \t]+', ' ', clean_text)
    return clean_text

Transcribe all audio files

In [None]:
# Transcribe every clip with Whisper and collect [cleaned_text, time_stamp]
# pairs in clip order.
list_of_chunks = []
for i in range(amt_of_clips):
    # Clip files are numbered from 1; the stored time stamp is the zero-based
    # clip index.
    time_stamp = i
    # The context manager closes each audio file once its upload is finished.
    with open("./audio_clips/" + f"{i+1}" + "_minute.mp3", "rb") as a_f:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=a_f
        )
    clean_transcript = clean_text_func(transcript.text)
    list_of_chunks.append([clean_transcript, time_stamp])

list_of_chunks

# Phase 4 - Embed Text and Store Vectors in Pinecone

Utilize OpenAI Embedding

In [None]:
# Embedding model used for the transcript chunks and for the HyDE queries.
# The key is read from the OPENAI_API_KEY environment variable when it is set,
# so a real key never has to be saved in the notebook; the placeholder remains
# the fallback.
embed_model = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key=os.environ.get("OPENAI_API_KEY", "ENTER YOUR KEY")
)

Create Pinecone Database

In [None]:
# Connect to Pinecone. The key is read from the PINECONE_API_KEY environment
# variable when it is set, so a real key never has to be saved in the
# notebook; the placeholder remains the fallback.
pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY', 'ENTER YOUR KEY'),
    environment='gcp-starter'
)

# Create the index on first use only. Its dimension must equal the length of
# the vectors produced by embed_model (1536 for text-embedding-ada-002).
index_name = 'yt-db'
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=1536)

index = pinecone.Index(index_name)


Embed and Store Vectors

In [None]:
# Embed the transcripts and upsert them into Pinecone in batches.
batch_size = 20

# Bound the loop by the transcripts actually collected so a short
# list_of_chunks can never be indexed out of range.
for i in range(0, len(list_of_chunks), batch_size):
    batch = list_of_chunks[i:i + batch_size]
    # IDs: the position of each chunk in list_of_chunks, as a string.
    ids = [str(n) for n in range(i, i + len(batch))]
    # Metadata stored next to each vector and read back at query time.
    meta = [
        {'text': text, 'timestamp': time_stamp}
        for text, time_stamp in batch
    ]
    # embed_documents expects a LIST of texts and returns one vector per item.
    # Handing it a bare string makes it treat every character as its own
    # document, so the whole batch is passed as a list in a single call.
    texts = [text for text, _ in batch]
    embeddings = embed_model.embed_documents(texts)
    index.upsert(vectors=list(zip(ids, embeddings, meta)))

amn =  64


# Phase 5 - Querying

Initialize ChatGPT and Pinecone

In [None]:
# LangChain vector-store wrapper over the Pinecone index; "text" is the
# metadata field under which each transcript was stored.
text_field = "text"
vectorstore = Pinecone(
    index, embed_model, text_field
)

# Chat model that answers the questions. The key is read from the
# OPENAI_API_KEY environment variable when it is set, so a real key never has
# to be saved in the notebook; the placeholder remains the fallback.
chat = ChatOpenAI(
    openai_api_key=os.environ.get("OPENAI_API_KEY", "ENTER YOUR KEY"),
    model="gpt-3.5-turbo"
)

Initialize HyDE to be used in semantic searches

In [None]:
# Build a HypotheticalDocumentEmbedder from the chat model and the embedding
# model; continue_convo calls its embed_query() to get the vector it sends to
# Pinecone.
# NOTE(review): "web_search" presumably selects one of LangChain's built-in
# HyDE prompt templates — confirm against the installed LangChain version.
hyDE_embedding = HypotheticalDocumentEmbedder.from_llm(
    chat, embed_model, prompt_key="web_search"
)

Ask Away

In [None]:
def end_convo():
    """Menu handler for 'E': return False so the conversation loop stops."""
    return False


def continue_convo(messages):
    """Run one question/answer turn; this is the main portion of the program.

    Reads a question from the user, embeds it with HyDE, fetches the three
    closest matches from the Pinecone index, and asks the chat model to answer
    using the matched transcript text as context. The reply is printed, and
    both the augmented prompt and the reply are appended to ``messages``.

    Args:
        messages: The running list of chat messages; mutated in place.

    Returns:
        Always True, so the caller's loop keeps going.
    """
    # TODO (in progress): collect each match's metadata 'timestamp' and print
    # the values under "Timestamp references(min)" after the answer.

    def build_prompt(query: str):
        # Embed the question with HyDE and look up the nearest stored chunks.
        query_vector = hyDE_embedding.embed_query(query)
        response = index.query(top_k=3, vector=query_vector, include_metadata=True)
        contexts = [match['metadata']['text'] for match in response['matches']]
        source_knowledge = "\n".join(contexts)
        return (
            "Using the contexts below, answer the query.\n"
            "\n"
            "    Contexts:\n"
            f"    {source_knowledge}\n"
            "\n"
            f"    Query: {query}"
        )

    question = input("Enter in Question")
    messages.append(HumanMessage(content=build_prompt(question)))

    ai_response = chat(messages)
    print(f"AI: \n{ai_response.content}")
    messages.append(ai_response)
    return True

def new_convo():
    """Menu handler for 'N': reset the global ``messages`` to the system prompt.

    The list is replaced in place, so every reference to it sees the reset.
    Returns True so the conversation loop keeps going.
    """
    messages[:] = [SystemMessage(content="You are a helpful assistant")]
    return True

def default_case():
    """Fallback menu handler: report the bad choice and keep the loop going."""
    print("Invalid Input")
    return True


# Interactive chat loop. `convo` stays True until a handler returns False.
convo = True

# Menu letter -> handler. Anything else falls back to default_case.
switch_dict = {
    'E': end_convo,
    'C': continue_convo,
    'N': new_convo
}

messages = [
    SystemMessage(content="You are a helpful assistant")
]
while convo:
    if len(messages) == 1:
        # Only the system message is present (fresh or just-reset chat), so go
        # straight to asking for a question instead of showing the menu.
        print("Please ask a question to begin the conversation")
        continue_convo(messages)
    else:
        user_input = input("End Conversation (E)\nContinue Conversation(C)\nNew Conversation(N)")
        # Accept the choice in either case and ignore stray whitespace.
        action = switch_dict.get(user_input.strip().upper(), default_case)

        # continue_convo is the only handler that takes the message history.
        if action is continue_convo:
            convo = action(messages)
        else:
            convo = action()


print("convo over")