In [122]:
from google.colab import userdata
import os

# Video whose audio is downloaded and transcribed later in the notebook.
YOUTUBE_VIDEO = "https://youtu.be/oO8w6XcXJUs"

# Read the OpenAI key through Colab's `userdata` helper so it is never
# written into the notebook itself.
OPENAI_API_KEY = userdata.get("OPENAI_API_KEY")

In [123]:
# Setting up the model

from langchain_openai.chat_models import ChatOpenAI

# Single chat model instance shared by the chains built below.
model = ChatOpenAI(
    model="gpt-3.5-turbo",
    openai_api_key=OPENAI_API_KEY,
)

In [124]:
# Smoke test: query the bare model; the cell shows the raw message object it returns.
nba_question = "Which NBA team won the NBA Finals in the 2019?"
model.invoke(nba_question)

AIMessage(content='The Toronto Raptors won the NBA Finals in 2019.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 12, 'prompt_tokens': 20, 'total_tokens': 32, 'completion_tokens_details': {'reasoning_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-d91d7896-2f26-4ecf-b2a9-277b43cce9a2-0', usage_metadata={'input_tokens': 20, 'output_tokens': 12, 'total_tokens': 32})

In [125]:
# A second direct call to the model, again without any prompt template or parser.
oscars_question = "Who won the Oscars for Best Actor in 2018?"
model.invoke(oscars_question)

AIMessage(content='Gary Oldman won the Oscar for Best Actor in 2018 for his role as Winston Churchill in the film "Darkest Hour."', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 27, 'prompt_tokens': 19, 'total_tokens': 46, 'completion_tokens_details': {'reasoning_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-5274c721-10b3-401b-9718-4a3b74b7f140-0', usage_metadata={'input_tokens': 19, 'output_tokens': 27, 'total_tokens': 46})

In [126]:
from langchain_core.output_parsers import StrOutputParser

# Piping the model into a string parser makes the chain return plain text
# instead of a message object.
parser = StrOutputParser()
chain = model | parser

league_question = "Who won the Preimer League in 2019?"
chain.invoke(league_question)

'Manchester City won the Premier League in 2019.'

In [127]:
# Introducing prompt templates

from langchain.prompts import ChatPromptTemplate

# The template has two placeholders, {context} and {question}, and tells the
# model which fixed reply to give when it cannot answer.
template = """
Answer the question based on the context below. If you can't
answer the question, reply "I don't know the answer to that question."

Context: {context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# Render the template once with sample values to inspect the final prompt text.
sample_values = {
    "context": "Aneta's brother is John",
    "question": "Who is Aneta's brother?",
}
prompt.format(**sample_values)


'Human: \nAnswer the question based on the context below. If you can\'t\nanswer the question, reply "I don\'t know the answer to that question."\n\nContext: Aneta\'s brother is John\n\nQuestion: Who is Aneta\'s brother?\n'

In [128]:
# Full pipeline: fill in the prompt, call the model, parse the reply to a string.
chain = prompt | model | parser

sibling_inputs = {
    "context": "Aneta's brother is John",
    "question": "Whi is Aneta's brother?",
}
chain.invoke(sibling_inputs)

"Aneta's brother is John."

In [129]:
# Combining chains

# Second prompt with two placeholders: the text to translate and the target language.
translation_template = "Translate {answer} to {language}"
translation_prompt = ChatPromptTemplate.from_template(translation_template)


In [130]:
from operator import itemgetter

# "answer" is produced by running the question-answering chain on the input dict;
# "language" is copied straight from that same input dict.
translation_inputs = {"answer": chain, "language": itemgetter("language")}
translation_chain = translation_inputs | translation_prompt | model | parser

sibling_count_request = {
    "context": "Aneta's brother is John. She doesn't have any more siblings.",
    "question": "How many siblings does Aneta have?",
    "language": "French",
}
translation_chain.invoke(sibling_count_request)

'Aneta a un frère, son frère John.'

In [131]:
# Transcribing the YouTube Video

import tempfile
import whisper
from pytubefix import YouTube
from pytubefix.cli import on_progress

# Transcription is the expensive step, so it only runs when no cached
# transcript exists on disk yet.
if not os.path.exists("transcription.txt"):
  youtube = YouTube(YOUTUBE_VIDEO)
  audio = youtube.streams.filter(only_audio=True).first()
  # .first() gives None when no stream matches; fail with a clear message
  # instead of an AttributeError on the download call below.
  if audio is None:
    raise RuntimeError(f"No audio-only stream found for {YOUTUBE_VIDEO}")

  whisper_model = whisper.load_model("base")

  # The audio file is only needed while transcribing, so it lives in a
  # temporary directory that is removed when the block exits.
  with tempfile.TemporaryDirectory() as tmpdir:
    audio_path = audio.download(output_path=tmpdir)
    transcription = whisper_model.transcribe(audio_path, fp16=False)["text"].strip()

  # Cache the text so later runs skip the download and transcription. A distinct
  # handle name avoids rebinding the variable that held the audio path.
  with open("transcription.txt", "w") as transcript_file:
    transcript_file.write(transcription)

In [132]:
# Load the cached transcript from disk and preview its first 100 characters.
with open("transcription.txt") as transcript_file:
  transcription = transcript_file.read()

transcription[:100]

'My first guest is the man who made electric cars a thing, and is currently working on perfecting reu'

In [133]:
# Using transcription as context

# The entire transcript is passed as the context. Any exception raised by the
# call is printed rather than left to stop the notebook.
try:
  transcript_inputs = {
      "context": transcription,
      "question": "is making electric cars a good idea?",
  }
  chain.invoke(transcript_inputs)
except Exception as error:
  print(error)

In [134]:
# Splitting the transcription

from langchain_community.document_loaders import TextLoader

# Load the transcript file as LangChain documents and display them.
transcript_path = "transcription.txt"
loader = TextLoader(transcript_path)
text_documents = loader.load()
text_documents


[Document(metadata={'source': 'transcription.txt'}, page_content="My first guest is the man who made electric cars a thing, and is currently working on perfecting reusable rockets, space travel, connecting the human brain directly to computers, connecting cities with electromagnetic bullet trains, the Starling satellite system, that's so important to the war in Ukraine, and then on Tuesday. LAUGHTER It's going to work on that tunnel thing on traffic. He also tweets a lot. Elon Musk, ladies and gentlemen. CHEERING AND APPLAUSE Look at you. LAUGHTER Come on. Come on. CHEERING AND APPLAUSE What did I get the full order of things that you do in a day there when I was reading there? I left out the tunnel thing at the end. Do you work on all these... A lot of jobs. Do you do all these things every day? Do you work on all of them in a single day? No. But I do have a long work day. Yeah, so I work a lot. LAUGHTER Well, I'm so thrilled you're here, because, you know, we do a show where we talk 

In [135]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the transcript into chunks (chunk_size=1000, chunk_overlap=20) so that
# individual passages can be embedded and retrieved.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)

# Keep every chunk. Slicing here would permanently drop most of the transcript
# from anything built on `split_docs` (such as a vector index); slice only
# when displaying a preview.
split_docs = text_splitter.split_documents(text_documents)

In [136]:
# Display up to the first five chunks as a quick check of the split.
split_docs[:5]

[Document(metadata={'source': 'transcription.txt'}, page_content="My first guest is the man who made electric cars a thing, and is currently working on perfecting reusable rockets, space travel, connecting the human brain directly to computers, connecting cities with electromagnetic bullet trains, the Starling satellite system, that's so important to the war in Ukraine, and then on Tuesday. LAUGHTER It's going to work on that tunnel thing on traffic. He also tweets a lot. Elon Musk, ladies and gentlemen. CHEERING AND APPLAUSE Look at you. LAUGHTER Come on. Come on. CHEERING AND APPLAUSE What did I get the full order of things that you do in a day there when I was reading there? I left out the tunnel thing at the end. Do you work on all these... A lot of jobs. Do you do all these things every day? Do you work on all of them in a single day? No. But I do have a long work day. Yeah, so I work a lot. LAUGHTER Well, I'm so thrilled you're here, because, you know, we do a show where we talk 

In [138]:
from langchain_openai.embeddings import OpenAIEmbeddings

# Embed one sample question to see what an embedding vector looks like.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
embedded_query = embeddings.embed_query("Who is Aneta's brother?")

# Print the vector's length followed by its first ten components.
print(
    f"Embedding length: {len(embedded_query)}",
    embedded_query[:10],
    sep="\n",
)

Embedding length: 1536
[0.00965595431625843, -0.02455728128552437, 2.9322402042453177e-05, -0.023637039586901665, -0.014947338029742241, 0.016722088679671288, -0.025201449170708656, 0.00012776556832250208, -0.0065895807929337025, 0.0010451305424794555]


In [139]:
# Embed two statements to compare against the query embedding: one about
# Aneta's brother and one on a different subject.
sentence1, sentence2 = (
    embeddings.embed_query(statement)
    for statement in (
        "Aneta's brother is John",
        "Sophia's mother is a teacher",
    )
)

In [140]:
from sklearn.metrics.pairwise import cosine_similarity

# Each vector is wrapped in a list to form a one-row input, and the single
# score is read back from position [0][0] of the result.
query_row = [embedded_query]
query_sentence1_similarity = cosine_similarity(query_row, [sentence1])[0][0]
query_sentence2_similarity = cosine_similarity(query_row, [sentence2])[0][0]

(query_sentence1_similarity, query_sentence2_similarity)

(0.9508115269637826, 0.7688260016651269)

In [141]:
from langchain_community.vectorstores import DocArrayInMemorySearch

# Small hand-written corpus used to try out similarity search.
sample_sentences = [
    "Aneta's brother is John",
    "Joe and Steve are borthers",
    "Joanna likes cars",
    "Sophia's mother is a teacher",
    "Lucia drives an Audi",
    "Aneta has two siblings",
    "Mercedes is an amazing car",
]

vectorstore1 = DocArrayInMemorySearch.from_texts(
    sample_sentences,
    embedding=embeddings,
)

In [142]:
# Ask the store for the three best matches to the question, with their scores.
vectorstore1.similarity_search_with_score(
    query="Who is Aneta's brother?",
    k=3,
)

[(Document(page_content="Aneta's brother is John"), 0.950811531882629),
 (Document(page_content='Aneta has two siblings'), 0.9092610198366385),
 (Document(page_content='Joe and Steve are borthers'), 0.7867379526754881)]

In [143]:
# Expose the vector store as a retriever so it can be used as a step in a chain,
# then query it directly to see the documents it returns.
retriever1 = vectorstore1.as_retriever()
retriever1.invoke(
    "Who is Aneta's brother?"
)

[Document(page_content="Aneta's brother is John"),
 Document(page_content='Aneta has two siblings'),
 Document(page_content='Joe and Steve are borthers'),
 Document(page_content='Joanna likes cars')]

In [144]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

# Build the prompt's two inputs from one question string: "context" comes from
# the retriever, while "question" is the input passed through as-is.
setup = RunnableParallel(
    context=retriever1,
    question=RunnablePassthrough(),
)
setup.invoke(
    "What's the great car?"
)

{'context': [Document(page_content='Mercedes is an amazing car'),
  Document(page_content='Joanna likes cars'),
  Document(page_content='Lucia drives an Audi'),
  Document(page_content='Joe and Steve are borthers')],
 'question': "What's the great car?"}

In [145]:
# Retrieval-augmented chain over the toy corpus:
# retrieve context -> fill the prompt -> call the model -> parse to a string.
chain = (
    setup
    | prompt
    | model
    | parser
)
chain.invoke(
    "What does Joanna like?"
)

'Joanna likes cars.'

In [146]:
# Loading documents into the vector store

# Index the transcript chunk by chunk rather than as one whole-file document,
# so the retriever can return only the passages relevant to a question
# instead of the entire transcript every time.
transcript_chunks = text_splitter.split_documents(text_documents)
vectorstore2 = DocArrayInMemorySearch.from_documents(transcript_chunks, embeddings)

In [147]:
# Same question-answering pipeline, now fed by the transcript vector store.
transcript_retriever = vectorstore2.as_retriever()
rag_inputs = {
    "context": transcript_retriever,
    "question": RunnablePassthrough(),
}
chain = rag_inputs | prompt | model | parser

chain.invoke("What is a First Amendment?")

'The First Amendment is the part of the United States Constitution that guarantees freedom of speech.'

In [167]:
from langchain_pinecone import PineconeVectorStore
import os

# Copy the secrets into environment variables. Each pair is
# (environment variable name, Colab secret name).
for env_name, secret_name in (
    ("OPENAI_API_KEY", "OPENAI_API_KEY"),
    ("PINECONE_API_KEY", "DB_API_KEY"),
):
    os.environ[env_name] = userdata.get(secret_name)

# Rebuild the embeddings client without passing a key explicitly.
embeddings = OpenAIEmbeddings()

# Name of the Pinecone index used by the cells that follow.
index_name = "rag-index"


In [168]:
# Embed the transcript chunks and store them in the Pinecone index.
vectorstore = PineconeVectorStore.from_documents(
    documents=split_docs,
    embedding=embeddings,
    index_name=index_name,
)

In [169]:
# Query the Pinecone-backed store and show at most the top three hits.
tesla_hits = vectorstore.similarity_search("What is Tesla going to start doing?")
tesla_hits[:3]

[Document(metadata={'source': 'transcription.txt'}, page_content="My first guest is the man who made electric cars a thing, and is currently working on perfecting reusable rockets, space travel, connecting the human brain directly to computers, connecting cities with electromagnetic bullet trains, the Starling satellite system, that's so important to the war in Ukraine, and then on Tuesday. LAUGHTER It's going to work on that tunnel thing on traffic. He also tweets a lot. Elon Musk, ladies and gentlemen. CHEERING AND APPLAUSE Look at you. LAUGHTER Come on. Come on. CHEERING AND APPLAUSE What did I get the full order of things that you do in a day there when I was reading there? I left out the tunnel thing at the end. Do you work on all these... A lot of jobs. Do you do all these things every day? Do you work on all of them in a single day? No. But I do have a long work day. Yeah, so I work a lot. LAUGHTER Well, I'm so thrilled you're here, because, you know, we do a show where we talk 

In [171]:
# Final pipeline: context is retrieved from the Pinecone-backed store.
pinecone_retriever = vectorstore.as_retriever()
pinecone_inputs = {
    "context": pinecone_retriever,
    "question": RunnablePassthrough(),
}
chain = pinecone_inputs | prompt | model | parser

chain.invoke("What is Tesla going to do?")

'Tesla is going to work on perfecting reusable rockets, space travel, connecting the human brain directly to computers, connecting cities with electromagnetic bullet trains, and the Starling satellite system.'