In [14]:
pip install requests numpy pandas transformers langchain llama-index


Note: you may need to restart the kernel to use updated packages.


In [172]:
import requests
import pandas as pd
from transformers import pipeline


In [173]:
# CS324 (Winter 2022) lecture pages that make up the lecture part of the corpus.
CS324_LECTURES_ROOT = "https://stanford-cs324.github.io/winter2022/lectures"
lecture_urls = [
    f"{CS324_LECTURES_ROOT}/{slug}/"
    for slug in ("introduction", "harms-1", "modeling", "data")
]

In [174]:
from bs4 import BeautifulSoup

# Download each lecture page and reduce it to plain text (one string per URL).
lectures = []
for url in lecture_urls:
    # A timeout keeps a stalled connection from hanging the cell indefinitely.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of indexing an error page as lecture text.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Join the text of every paragraph, heading and bulleted list on the page.
    text = ' '.join(element.text for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul']))
    lectures.append(text)


In [175]:

# Download the Awesome-LLM README page and keep only the text of its tables.
architecture_table_url = "https://github.com/Hannibal046/Awesome-LLM?tab=readme-ov-file#milestone-papers"
# A timeout keeps a stalled connection from hanging the cell indefinitely.
response = requests.get(architecture_table_url, timeout=30)
# Fail loudly on 4xx/5xx instead of indexing an error page as table text.
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

# All <table> elements on the page are flattened into a single string.
table = ' '.join(element.text for element in soup.find_all(['table']))

# print(table)


In [176]:


import google.generativeai as genai
import os

import getpass  # stdlib; imported here so this cell stays self-contained

# Never paste an API key into the notebook. Reuse GOOGLE_API_KEY when it is
# already set in the environment; otherwise prompt for it without echoing.
# The value is kept in os.environ because the GoogleGenerativeAIEmbeddings
# call further down passes no key explicitly (presumably it reads this
# variable -- confirm against the installed langchain-google-genai version).
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Google API key: ")
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [177]:
# Generative model that handle_query calls to produce the final answer text.
model = genai.GenerativeModel(model_name='gemini-1.0-pro-latest')
# response = model.generate_content("The opposite of hot is")
# print(response.text)

In [178]:
# Sanity check: the scraping loop appends one text per entry in lecture_urls.
len(lectures)

4

In [179]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings


# Embedding client shared by the document-embedding cells, the FAISS index
# construction and handle_query.
gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [181]:
# One embedding vector per lecture text.
lecture_embeddings = gemini_embeddings.embed_documents(lectures)
# embed_documents expects a list of texts. `table` is a single string, so it
# is wrapped in a list; a bare string would be treated as a sequence of
# one-character "documents" instead of being embedded as one text.
architecture_embeddings = gemini_embeddings.embed_documents([table])


In [182]:
# Corpus order: the lecture texts first, then the scraped table text.
documents = [*lectures, table]
# Embedding vectors concatenated in the same lectures-then-table order.
embeddings = [*lecture_embeddings, *architecture_embeddings]


In [183]:
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader


In [184]:
from langchain.schema import Document

# Wrap each raw text in a LangChain Document so it can be indexed.
document_objects = [
    Document(page_content=raw_text)
    for raw_text in documents
]


In [185]:
# Build the FAISS index. The store is handed the embedding client itself
# rather than the precomputed `embeddings` list.
vector_store = FAISS.from_documents(
    documents=document_objects,
    embedding=gemini_embeddings,
)


In [214]:
# Questions to run through the retrieval + generation pipeline.
queries = ["What is the problem with LAMBADA?"]

In [215]:
def handle_query(query, threshold=0.7, k=5):
    """Answer ``query`` with the generative model, grounded in indexed documents.

    The query is embedded with ``gemini_embeddings``, the ``k`` nearest
    documents are fetched from ``vector_store``, and only the ones close
    enough to the query are placed in the prompt sent to ``model``.

    Args:
        query: Natural-language question to answer.
        threshold: Maximum score a retrieved document may have and still be
            used as context. The LangChain FAISS store reports a distance
            (L2 with the default index), so a lower score is a closer match.
        k: Number of nearest neighbours to retrieve before filtering.

    Returns:
        The model's answer text, or a fixed "not found" message when no
        retrieved document scores within ``threshold``.
    """
    # Embed the query with the same model that embedded the documents.
    query_embedding = gemini_embeddings.embed_query(query)

    relevant_docs_with_scores = vector_store.similarity_search_with_score_by_vector(query_embedding, k=k)

    # Scores are distances: keep documents at or below the threshold.
    # Keeping `score >= threshold` would select the least similar documents.
    relevant_docs = [doc for doc, score in relevant_docs_with_scores if score <= threshold]

    if not relevant_docs:
        return "The answer to your query could not be found in the provided documents."

    combined_content = ' '.join(doc.page_content for doc in relevant_docs)

    prompt = f"Based on the following documents:\n{combined_content}\n\nAnswer the following query:\n{query}"

    response = model.generate_content(prompt)

    return response.text



In [218]:
# Answer every question in `queries`, framing each result with blank lines.
# There is deliberately no `break`: a break here would silently skip every
# query after the first.
for query in queries:
    print("\n")
    print(f"Query: {query}")
    print(f"Response: {handle_query(query)}")
    print("\n")



Query: What is the problem with LAMBADA?
Response: The document does not mention LAMBADA. So I cannot answer this question.


