In [47]:
from pinecone import Pinecone, ServerlessSpec

import os
from dotenv import load_dotenv

load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY=os.getenv("PINECONE_API_KEY")

# I am using Santiago's video instead
# YOUTUBE_VIDEO="https://www.youtube.com/watch?v=BrsocJb-fAo&t=114s"
YOUTUBE_VIDEO = "https://www.youtube.com/watch?v=cdiD-9MMpb0"


In [3]:
from langchain_openai.chat_models import ChatOpenAI

model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model="gpt-3.5-turbo")
model.invoke("name all the MLB team won the World Series during the COVID-19 pandemic?")

AIMessage(content='1. Los Angeles Dodgers (2020)\n2. Atlanta Braves (2021)', response_metadata={'token_usage': {'completion_tokens': 17, 'prompt_tokens': 23, 'total_tokens': 40}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': 'fp_b28b39ffa8', 'finish_reason': 'stop', 'logprobs': None})

In [4]:
from langchain_core.output_parsers import StrOutputParser

parser = StrOutputParser()

chain = model | parser
chain.invoke("name all the MLB team won the World Series during the COVID-19 pandemic?")

'- Los Angeles Dodgers (2020)\n- Atlanta Braves (2021)'

In [5]:
from langchain.prompts import ChatPromptTemplate

template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

prompt.format(context="Mary's sister is Susana", question="Who is Mary's sister?")


'Human: \nAnswer the question based on the context below. If you can\'t \nanswer the question, reply "I don\'t know".\n\nContext: Mary\'s sister is Susana\n\nQuestion: Who is Mary\'s sister?\n'

In [6]:
chain = prompt | model | parser
chain.invoke({
    "context": "Mary's sister is Susana",
    "question": "Who is Mary's sister?"
})

'Susana'

In [7]:
translation_prompt = ChatPromptTemplate.from_template(
    "Translate {answer} to {language}"
)

In [8]:
from operator import itemgetter

translation_chain = (
    {"answer": chain, "language": itemgetter("language")} | translation_prompt | model | parser
)

translation_chain.invoke(
    {
        "context": "Mary's sister is Susana. She doesn't have any more siblings.",
        "question": "How many sisters does Mary have?",
        "language": "Farsi",
    }
)

'مری یک خواهر دارد.'

In [9]:
import tempfile
import whisper
from pytube import YouTube


# Let's do this only if we haven't created the transcription file yet.
if not os.path.exists("transcription.txt"):
    youtube = YouTube(YOUTUBE_VIDEO)
    audio = youtube.streams.filter(only_audio=True).first()

    # Let's load the base model. This is not the most accurate
    # model but it's fast.
    whisper_model = whisper.load_model("base")

    with tempfile.TemporaryDirectory() as tmpdir:
        file = audio.download(output_path=tmpdir)
        transcription = whisper_model.transcribe(file, fp16=False)["text"].strip()

        with open("transcription.txt", "w") as file:
            file.write(transcription)

In [10]:
with open("transcription.txt") as file:
    transcription = file.read()

transcription[:100]

"I think it's possible that physics has exploits and we should be trying to find them. arranging some"

In [11]:
try:
    result = chain.invoke({
        "context": transcription,
        "question": "what is pinecone?"
    })
    print(result)
except Exception as e:
    print(e)

Error code: 400 - {'error': {'message': "This model's maximum context length is 16385 tokens. However, your messages resulted in 47045 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}


In [29]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("transcription.txt")
text_documents = loader.load()
text_documents
print(text_documents)

[Document(page_content="I think it's possible that physics has exploits and we should be trying to find them. arranging some kind of a crazy quantum mechanical system that somehow gives you buffer overflow, somehow gives you a rounding error in the floating point. Synthetic intelligences are kind of like the next stage of development. And I don't know where it leads to. Like at some point, I suspect the universe is some kind of a puzzle. These synthetic AIs will uncover that puzzle and solve it. The following is a conversation with Andre Kappathi, previously the director of AI at Tesla. And before that, at OpenAI and Stanford, he is one of the greatest scientist engineers and educators in the history of artificial intelligence. This is the Lex Friedman podcast to support it. Please check out our sponsors and now to your friends. Here's Andre Kappathi. What is a neural network? And what does it seem to do such a surprisingly good job of learning? What is a neural network? It's a mathema

In [32]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)
text_splitter.split_documents(text_documents)

[Document(page_content="I think it's possible that physics has exploits and we should be trying to find them. arranging some kind of a crazy quantum mechanical system that somehow gives you buffer overflow, somehow gives you", metadata={'source': 'transcription.txt'}),
 Document(page_content="somehow gives you a rounding error in the floating point. Synthetic intelligences are kind of like the next stage of development. And I don't know where it leads to. Like at some point, I suspect the", metadata={'source': 'transcription.txt'}),
 Document(page_content='I suspect the universe is some kind of a puzzle. These synthetic AIs will uncover that puzzle and solve it. The following is a conversation with Andre Kappathi, previously the director of AI at', metadata={'source': 'transcription.txt'}),
 Document(page_content='director of AI at Tesla. And before that, at OpenAI and Stanford, he is one of the greatest scientist engineers and educators in the history of artificial intelligence. This 

In [33]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
documents = text_splitter.split_documents(text_documents)

In [34]:
try:
    result = chain.invoke({
        "context": text_splitter,
        "question": "what exploits exist?"
    })
    print(result)
except Exception as e:
    print(e)

expected string or buffer


In [15]:
from langchain_openai.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
embedded_query = embeddings.embed_query("Who is Mary's sister?")

print(f"Embedding length: {len(embedded_query)}")
print(embedded_query[:10])

Embedding length: 1536
[-0.0013711900817658918, -0.03434698236453121, -0.011476094990116795, 0.0012773800454156583, -0.026166747008526305, 0.00923090794939205, -0.015660022937300146, 0.0017948988196774909, -0.01185133513551773, -0.03324627818637451]


In [16]:
sentence1 = embeddings.embed_query("Mary's sister is Susana")
sentence2 = embeddings.embed_query("Pedro's mother is a teacher")

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

query_sentence1_similarity = cosine_similarity([embedded_query], [sentence1])[0][0]
query_sentence2_similarity = cosine_similarity([embedded_query], [sentence2])[0][0]

query_sentence1_similarity, query_sentence2_similarity

(0.9173518138025496, 0.7680495517171395)

In [18]:
from langchain_community.vectorstores import DocArrayInMemorySearch

vectorstore1 = DocArrayInMemorySearch.from_texts(
    [
        "Mary's sister is Susana",
        "John and Tommy are brothers",
        "Patricia likes white cars",
        "Pedro's mother is a teacher",
        "Lucia drives an Audi",
        "Mary has two siblings",
    ],
    embedding=embeddings,
)

In [19]:
vectorstore1.similarity_search_with_score(query="Who is Mary's sister?", k=3)

[(Document(page_content="Mary's sister is Susana"), 0.9174549036927804),
 (Document(page_content='Mary has two siblings'), 0.9045440036524317),
 (Document(page_content='John and Tommy are brothers'), 0.801535744115216)]

In [21]:
retriever1 = vectorstore1.as_retriever()
retriever1.invoke("Who is Mary's sister?")

[Document(page_content="Mary's sister is Susana"),
 Document(page_content='Mary has two siblings'),
 Document(page_content='John and Tommy are brothers'),
 Document(page_content="Pedro's mother is a teacher")]

In [23]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

setup = RunnableParallel(context=retriever1, question=RunnablePassthrough())
setup.invoke("What color is Patricia's car?")

{'context': [Document(page_content='Patricia likes white cars'),
  Document(page_content='Lucia drives an Audi'),
  Document(page_content="Pedro's mother is a teacher"),
  Document(page_content="Mary's sister is Susana")],
 'question': "What color is Patricia's car?"}

In [24]:
chain = setup | prompt | model | parser
chain.invoke("What color is Patricia's car?")

'White'

In [25]:
chain.invoke("What car does Lucia drive?")

'Lucia drives an Audi.'

In [40]:
vectorstore2 = DocArrayInMemorySearch.from_documents(documents, embeddings)
len(documents)

221

In [44]:
chain = (
    {"context": vectorstore2.as_retriever(), "question": RunnablePassthrough()}
    | prompt
    | model
    | parser
)
chain.invoke("What is AGI, synth intelligence?")

'AGI refers to Artificial General Intelligence, which is the concept of a machine that has the ability to understand, learn, and apply knowledge across a wide range of tasks similar to human intelligence. Synthetic intelligence, on the other hand, refers to artificially created intelligent systems that may surpass human intelligence in certain aspects.'

In [54]:
    import os
    from pinecone import Pinecone, ServerlessSpec

    pc = Pinecone(
        api_key=os.environ.get("PINECONE_API_KEY")
    )

    # Now do stuff
    if 'my_index' not in pc.list_indexes().names():
        pc.create_index(
            name='youtube-index', 
            dimension=1536, 
            metric='euclidean',
            spec=ServerlessSpec(
                cloud='aws',
                region='us-east-1'
            )
        )



In [56]:
from langchain_pinecone import PineconeVectorStore

index_name = "youtube-index"

pinecone = PineconeVectorStore.from_documents(
    documents, embeddings, index_name=index_name
)

In [57]:
pinecone.similarity_search("What is Hollywood going to start doing?")[:3]

[Document(page_content="It's like high quality audio and you're speaking usually pretty clearly. I don't know what open AI's plans are either. Yeah, there's always fun projects basically. And stable diffusion also is opening up a huge amount of experimentation. I would say in the visual realm and generating images and videos and movies. I'll think like videos now. And so that's going to be pretty crazy. That's going to almost certainly work and it's going to be really interesting when the cost of content creation is going to fall to zero. You used to need a painter for a few months to paint a thing and now it's going to be speak to your phone to get your video. So Hollywood will start using it to generate scenes, which completely opens up. Yeah, so you can make a movie like Avatar eventually for under a million dollars. Much less. Maybe just by talking to your phone. I mean, I know it sounds kind of crazy. And then there'd be some voting mechanism. Like how do you have a, like, would t

In [59]:
chain = (
    {"context": pinecone.as_retriever(), "question": RunnablePassthrough()}
    | prompt
    | model
    | parser
)

chain.invoke("What is Hollywood going to start doing?")

'Hollywood is going to start using AI to generate scenes for movies.'

In [64]:
import ebooklib
from ebooklib import epub

book = epub.read_epub('/Users/quentin/src/ask-a-dead-author-app/twain-books/mark-twain_a-connecticut-yankee-in-king-arthurs-court.epub')

# Access book metadata
print("Title:", book.title)
print("Author:", book.get_metadata('DC', 'creator')[0][0])
print("Language:", book.language)

Title: A Connecticut Yankee in King Arthur’s Court
Author: Mark Twain
Language: en


  for root_file in tree.findall('//xmlns:rootfile[@media-type]', namespaces={'xmlns': NAMESPACES['CONTAINERNS']}):


In [65]:
def extract_body(epub_path):
    book = epub.read_epub(epub_path)
    body_content = ""

    for item in book.get_items():
        if item.get_type() == ebooklib.ITEM_DOCUMENT:
            body_content += item.get_body_content().decode('utf-8')

    return body_content

In [66]:
epub_file_path = '/Users/quentin/src/ask-a-dead-author-app/twain-books/mark-twain_a-connecticut-yankee-in-king-arthurs-court.epub'
extracted_body = extract_body(epub_file_path)
print(extracted_body)



<body epub:type="bodymatter z3998:fiction">
		<section id="chapter-1" role="doc-chapter" epub:type="chapter">
			<hgroup>
				<h2 class="first-child" epub:type="ordinal z3998:roman">I</h2>
				<p epub:type="title">Camelot</p>
			</hgroup>
			<p>“Camelot﻿—Camelot,” said I to myself. “I don’t seem to remember hearing of it before. Name of the asylum, likely.”</p>
			<p>It was a soft, reposeful summer landscape, as lovely as a dream, and as lonesome as Sunday. The air was full of the smell of flowers, and the buzzing of insects, and the twittering of birds, and there were no people, no wagons, there was no stir of life, nothing going on. The road was mainly a winding path with hoof-prints in it, and now and then a faint trace of wheels on either side in the grass﻿—wheels that apparently had a tire as broad as one’s hand.</p>
			<p>Presently a fair slip of a girl, about ten years old, with a cataract of golden hair streaming down over her shoulders, came along. Around her head she wore a h