In [None]:
%pip install -U -q langchain langchain-community langchain-core sentence_transformers langchain-openai python-dotenv beautifulsoup4 langchain-chroma langchain-groq langchain-ollama FlagEmbedding peft gradio

In [None]:

# https://mer.vin/2024/02/ollama-embedding/ - original source code
import sys
print(f"Python interpreter: {sys.executable}") # getting python interpreter

from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_chroma import Chroma
from langchain_community import embeddings
from langchain.embeddings import HuggingFaceBgeEmbeddings


# from langchain_community.chat_models import ChatOllama
from langchain_ollama import ChatOllama
from langchain_groq import ChatGroq
from langchain_openai import ChatOpenAI, OpenAIEmbeddings # change model and embedding #c1
from langchain_community.embeddings import OllamaEmbeddings

from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

from urllib.parse import urlparse, unquote
from pathlib import Path  

import shutil
import requests
import re

from langchain.docstore.document import Document 

from FlagEmbedding import FlagReranker


In [None]:


def extract_page_name(docs):
    """Derive a readable page name from each document's source URL.

    For every document, the last non-empty path segment of
    ``metadata["source"]`` is percent-decoded, stored under
    ``metadata["id"]`` and printed. The documents are modified in place.

    Args:
        docs: Iterable of objects exposing a ``metadata`` dict.

    Returns:
        None.
    """
    for item in docs:
        # A missing source becomes "" so urlparse never receives None
        # (which would produce bytes components and break the str split).
        full_url = item.metadata.get("source") or ""

        # Drop trailing slashes so ".../wiki/Page/" still yields "Page"
        # rather than an empty id.
        path = urlparse(full_url).path.rstrip("/")
        page_name = unquote(path.rsplit("/", 1)[-1])

        item.metadata["id"] = page_name
        print(page_name)
        

def scrape_jina_ai_2(url: str, timeout: float = 30.0) -> Document:
    """Fetch a page through Jina AI's reader endpoint and wrap it in a Document.

    Args:
        url: Address of the page to scrape; it is appended to the
            ``https://r.jina.ai/`` reader prefix.
        timeout: Seconds to wait for the reader before giving up.

    Returns:
        A Document whose ``page_content`` is the response text with
        http(s) links removed and whose ``metadata["source"]`` is ``url``.

    Raises:
        requests.HTTPError: If the reader answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    # Without an explicit timeout a stalled connection blocks the cell forever.
    response = requests.get("https://r.jina.ai/" + url, timeout=timeout)
    # Fail loudly instead of indexing an error page as if it were wiki content.
    response.raise_for_status()
    content = response.text

    # remove urls
    content = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', content)

    return Document(page_content=content, metadata={"source": url})

    

In [None]:
# Knowledge-base construction: pick an embedding model, scrape the wiki pages,
# split them into chunks, then embed the chunks into a Chroma vector store.

# Alternative embedding backend (OpenAI), kept for quick switching:
# embedding = OpenAIEmbeddings( model = "text-embedding-3-small") # using openAI embedding

model_name = "BAAI/bge-large-en-v1.5"           # using bge as embedding
encode_kwargs = {'normalize_embeddings': True}  # set True to compute cosine similarity
embedding = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    encode_kwargs=encode_kwargs
) 

# from LLM for devs: https://github.com/trancethehuman/ai-workshop-code/blob/main/Web_scraping_for_LLM_in_2024.ipynb
print("Initializing vector store...")

# Pages of the Frackin' Universe wiki that make up the knowledge base.
urls = [
    "https://frackinuniverse.miraheze.org/wiki/Main_Page",
    "https://frackinuniverse.miraheze.org/wiki/Getting_Started", 
    "https://frackinuniverse.miraheze.org/wiki/Personal_Tricorder",
    "https://frackinuniverse.miraheze.org/wiki/The_Player",

    "https://frackinuniverse.miraheze.org/wiki/Stars",
    "https://frackinuniverse.miraheze.org/wiki/Crafting",
    "https://frackinuniverse.miraheze.org/wiki/Combat",
    "https://frackinuniverse.miraheze.org/wiki/Weapons",
    "https://frackinuniverse.miraheze.org/wiki/Planets",
    "https://frackinuniverse.miraheze.org/wiki/Biomes"
]

# Previous loader (WebBaseLoader), superseded by the Jina reader call below:
# docs = [WebBaseLoader(url).load() for url in urls]
# docs_list = [item for sublist in docs for item in sublist]

# One Document per URL, in the same order as `urls` (one HTTP request each).
docs_list = [scrape_jina_ai_2(url) for url in urls] # scrape using Jina AI's reader

# Sanity check: dump the full text of the last scraped page (the Biomes URL).
print("BIOMES PAGE >>> ")
print(docs_list[-1].page_content+"\n")

# set page name in metadata (mutates docs_list in place: adds metadata["id"] and prints it)
extract_page_name(docs_list)

# split document into chunks
# TODO: experiment with chunk size
# NOTE(review): built via from_tiktoken_encoder, so chunk_size / chunk_overlap
# are presumably counted in tokens rather than characters -- confirm.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap=200)
doc_splits = text_splitter.split_documents(docs_list)

print("\n--- Document Chunks Information ---")
print(f"Number of document chunks: {len(doc_splits)}")
print(f"Sample chunk\n{doc_splits[0]}\n")


# Convert documents to Embeddings and store them
# NOTE(review): no collection name or persist directory is passed, so this
# presumably uses Chroma's default in-memory collection -- confirm, and check
# whether re-running this cell in the same kernel adds the chunks a second time.
print("\n--- Creating vector store ---")
vectorstore = Chroma.from_documents(
    documents=doc_splits,
    embedding = embedding,
)


In [None]:
# Retriever over the vector store. search_kwargs requests k=6 candidates per
# query; rerank_and_select later keeps only its top_k (default 3) of them.
retriever = vectorstore.as_retriever(search_kwargs={"k":6})  # initialize retriever


# Reranker instances keyed by model name. Constructing a FlagReranker loads the
# model, so it is done once and then reused by every rerank_and_select call.
_RERANKER_CACHE = {}


def _get_reranker(model_name='BAAI/bge-reranker-large'):
    """Return the cached FlagReranker for ``model_name``, creating it on first use."""
    if model_name not in _RERANKER_CACHE:
        _RERANKER_CACHE[model_name] = FlagReranker(model_name, use_fp16=True)
    return _RERANKER_CACHE[model_name]


def rerank_and_select(query, documents, top_k=3):
    """Re-score retrieved documents against the query and keep the best ones.

    Args:
        query: The user's question.
        documents: Candidate documents (objects with ``page_content``),
            typically the retriever's output.
        top_k: Number of highest-scoring documents to keep.

    Returns:
        A list of at most ``top_k`` of the given documents, ordered from
        highest to lowest reranker score. Empty when ``documents`` is empty.
    """
    documents = list(documents)
    if not documents:
        # Nothing to score; also avoids loading the model for no reason.
        return []

    reranker = _get_reranker()

    # One [query, passage] pair per candidate document.
    pairs = [[query, doc.page_content] for doc in documents]
    rerank_scores = reranker.compute_score(pairs)

    # Some FlagEmbedding releases return a bare number instead of a list when
    # a single pair is scored; normalise so zip() below always works.
    if not hasattr(rerank_scores, "__len__"):
        rerank_scores = [rerank_scores]

    # Sort (score, document) tuples by score only, largest first. The key
    # avoids ever comparing Document objects when two scores tie.
    ranked_results = sorted(zip(rerank_scores, documents), key=lambda x: x[0], reverse=True)

    # Keep the documents of the top_k best-scoring pairs.
    return [doc for _, doc in ranked_results[:top_k]]

def retrieve_and_format(query):
    """Build the context string for ``query``.

    Retrieves candidates with the module-level ``retriever``, narrows them
    with ``rerank_and_select``, prints the chosen sources and their text,
    and returns the chosen page contents joined by blank lines.
    """
    candidates = retriever.invoke(query)
    best_docs = rerank_and_select(query, candidates)

    # Show which pages the final context comes from, then their full text.
    print('\n>>>> SOURCES <<<<< :')
    source_ids = [doc.metadata.get("id") for doc in best_docs]
    print(source_ids)
    print_page_contents(best_docs)

    context_parts = [doc.page_content for doc in best_docs]
    return "\n\n".join(context_parts)

def print_page_contents(docs):
    """Print each document's text under a numbered banner (numbering starts at 1)."""
    for position, doc in enumerate(docs, start=1):
        print(f"======= Doc {position} =======")
        print(doc.page_content)
        

import os
from dotenv import load_dotenv

# Local runs: load variables from a .env file, then read the OpenAI key
# from the process environment.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

# Google Colab alternative: read the key through Colab's secret keys instead.
# from google.colab import userdata
# api_key = userdata.get("OPENAI_API_KEY")

# Chat model that generates the final answers.
model_local = ChatOpenAI(model="gpt-4o-mini", temperature=0.1, api_key=api_key)

# api_key = os.getenv("GROQ_API_KEY")

# model_local = ChatGroq(
#         api_key=os.getenv("GROQ_API_KEY"),
#         model="llama3-8b-8192", # this > llama 3.1 8b-instant (performance)
#         # model="llama-3.1-8b-instant",
#         # model="llama-3.1-70b-versatile",
#         temperature=0,
#         )



In [None]:

# question = "List out all one-handed melee weapons"
# Sample question for the first end-to-end run of the chain.
question = "what are the features of the personal tricorder?"
exit_keyword = "exit"  # NOTE(review): not referenced by any other visible cell

# Prompt text with {context} and {question} placeholders, wrapped in
# Llama-3-style header markers.
# NOTE(review): the chain below sends this to `model_local`, which this notebook
# configures as an OpenAI chat model, so the <|...|> markers reach it as plain
# text -- confirm that is intended.
llama_template = """
                <|begin_of_text|><|start_header_id|>system<|end_header_id|>
                You are a pedantic but knowledgeable, efficient and direct AI assistant for Frackin' Universe Website. Provide concise answers focusing on key information. Offer tactful suggestions to solve the user's question. 
                Answer the *question* based only on the following *context*. If the information is not in the *context*, say that you don't have the informmation.
                Focus on the MAIN contents of the given webpage in the *context*, ignoring the periphery of the website i.e., navigation bars, headers, footers, social media etc. 
                It is **CRITICAL** that you thoroughly digest the given *context* to answer the user's *question*.
                Context: {context}
                <|eot_id|>
                <|start_header_id|>user<|end_header_id|>
                {question}
                <|eot_id|>
                <|start_header_id|>assistant<|end_header_id|>
                """
# rag_prompt = ChatPromptTemplate.from_template(rag_template)
rag_prompt = ChatPromptTemplate.from_template(llama_template)

# The chain takes the raw question string as input and fans it out:
#   - "context":  retrieve_and_format(question) -> retrieved + reranked page text
#   - "question": RunnablePassthrough() forwards the question unchanged
# The filled prompt goes to the chat model and the reply is parsed to a plain str.
rag_chain = (
    {"context": RunnableLambda(retrieve_and_format), "question": RunnablePassthrough()}
    | rag_prompt
    | model_local
    | StrOutputParser()
)
result = rag_chain.invoke(question)

print("********ANSWER********")
print(result)



In [None]:
# Tricorder question 2: where to get the item again after losing it.
# NOTE(review): rag_chain.invoke receives only this string, so "this tricorder"
# is not resolved from any earlier question -- confirm that is acceptable.
tricorder_2 = "where can i find this tricorder if i happen to lose it?"
result_2 = rag_chain.invoke(tricorder_2)
print(result_2)

In [None]:
# Tricorder question 3: crafting the item without spending pixels.
tricorder_3 = "Can i craft a personal tricorder without having to use my pixels?"
result_3 = rag_chain.invoke(tricorder_3)
print(result_3)

In [None]:
# Tricorder question 4: items craftable with the tricorder and their materials.
tricorder_4 = "can you tell me a few things that i can craft (along with its  required crafting amterials) with the tricorder?"
result_4 = rag_chain.invoke(tricorder_4)
print(result_4)

In [None]:
# Stars
# Runs the RAG chain on a question about star types and prints the answer.
q5 = "what kinds of planets are in gentle stars?"
r5 = rag_chain.invoke(q5)
print(r5)

In [None]:
# Planets
# Runs the RAG chain on a question about planet locations and prints the answer.
q6 = "what locations can i find gelatinous planets?" # only takes sources from the Stars page (in Planets, it says Gentle Stars, Temperate Stars etc.)
r6 = rag_chain.invoke(q6) 
print(r6)

In [None]:
# Planets
# Runs the RAG chain on a question about planet tiers and prints the answer.
# Alternative question kept for reference:
# q7 = "what's the fauna threat for gelatinous planets?"
q7 = "what's the highest tier for a normal volcaninc planet?"
r7 = rag_chain.invoke(q7)
print(r7)

In [None]:
# Biomes
# Runs the RAG chain on a question about a specific biome and prints the answer.
q8 = "what's the reason to visit for Bog biomes?"
r8 = rag_chain.invoke(q8)
print(r8)

#### UI

In [30]:
import gradio as gr
from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community import embeddings
from langchain.llms import OpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.text_splitter import CharacterTextSplitter

# Module-level aliases for objects created in earlier cells, so the UI code
# below has its own names for them. This cell fails on a fresh kernel unless
# the cells defining `embedding`, `doc_splits` and `model_local` ran first.
gr_embedding = embedding    # embedding model
gr_documents = doc_splits   # pre-split wiki document chunks
gr_llm = model_local        # chat model

def process_input(urls, question):
    """Answer ``question`` from the pages listed in ``urls`` (Gradio callback).

    Args:
        urls: Newline-separated URLs typed into the UI. Blank lines and
            surrounding whitespace are ignored. When no URL is given, the
            pre-split documents in ``gr_documents`` are used instead.
        question: The user's question.

    Returns:
        The model's answer as a string, or a short explanatory message when
        the question is empty or no content could be loaded.
    """
    import uuid  # local import: only needed to name the per-request collection

    question = (question or "").strip()
    if not question:
        return "Please enter a question."

    # Convert string of URLs to list, dropping blank lines and stray whitespace
    # so an empty line is never sent to the scraper as a URL.
    urls_list = [line.strip() for line in (urls or "").splitlines() if line.strip()]

    if urls_list:
        docs_list = [scrape_jina_ai_2(url) for url in urls_list]  # scrape using Jina AI's reader
        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap=200)
        # Index what the user asked for: the freshly scraped pages.
        documents = text_splitter.split_documents(docs_list)
    else:
        # No URLs entered: fall back to the documents prepared earlier.
        documents = gr_documents

    if not documents:
        return "No content could be loaded from the given URLs."

    llama_template = """
                <|begin_of_text|><|start_header_id|>system<|end_header_id|>
                You are a pedantic but knowledgeable, efficient and direct AI assistant for Frackin' Universe Website. Provide concise answers focusing on key information. Offer tactful suggestions to solve the user's question. 
                Answer the *question* based only on the following *context*. If the information is not in the *context*, say that you don't have the informmation.
                Focus on the MAIN contents of the given webpage in the *context*, ignoring the periphery of the website i.e., navigation bars, headers, footers, social media etc. 
                It is **CRITICAL** that you thoroughly digest the given *context* to answer the user's *question*.
                Context: {context}
                <|eot_id|>
                <|start_header_id|>user<|end_header_id|>
                {question}
                <|eot_id|>
                <|start_header_id|>assistant<|end_header_id|>
                """
    after_rag_prompt = ChatPromptTemplate.from_template(llama_template)

    # Use a unique collection per request. Reusing one fixed collection name
    # would keep adding chunks to the same in-process collection, so earlier
    # requests' pages would pile up and leak into later answers.
    vectorstore = Chroma.from_documents(
        documents=documents,
        collection_name=f"rag-chroma-{uuid.uuid4().hex}",
        embedding=gr_embedding,
    )
    try:
        retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

        def join_page_contents(docs):
            # Put only the page text into the prompt, not Document reprs.
            return "\n\n".join(doc.page_content for doc in docs)

        after_rag_chain = (
            {
                "context": retriever | RunnableLambda(join_page_contents),
                "question": RunnablePassthrough(),
            }
            | after_rag_prompt
            | gr_llm
            | StrOutputParser()
        )
        return after_rag_chain.invoke(question)
    finally:
        # Drop the temporary collection so requests do not accumulate in memory.
        vectorstore.delete_collection()

# Define Gradio interface: two text inputs (URL list, question) -> text answer
url_input = gr.Textbox(label="Enter URLs separated by new lines")
question_input = gr.Textbox(label="Question")

iface = gr.Interface(
    fn=process_input,
    inputs=[url_input, question_input],
    outputs="text",
    title="Webpage Query",
    description="Enter URLs and a question to query the documents.",
)

# share=True requests a public Gradio link in addition to the local URL.
iface.launch(share=True)

Running on local URL:  http://127.0.0.1:7862
Running on public URL: https://bb95cb682c80dcb8c7.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


