In [22]:
%pip install langchain langchain-community langchain-core

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [27]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community import embeddings

# from langchain_community.chat_models import ChatOllama
from langchain_openai import ChatOpenAI, OpenAIEmbeddings # change model and embedding #c1
from langchain_community.embeddings import OllamaEmbeddings

from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.text_splitter import CharacterTextSplitter

from urllib.parse import urlparse, unquote

import os
from dotenv import load_dotenv
load_dotenv()


# Chat model that writes the final answers. The hosted OpenAI model is used
# here; ChatOllama(model="mistral") is the local alternative (#c1).
model_local = ChatOpenAI(temperature=1, model_name="gpt-4o-mini")

# 1. Load the wiki pages that form the knowledge base
urls = [
    "https://frackinuniverse.miraheze.org/wiki/Main_Page",
    "https://frackinuniverse.miraheze.org/wiki/Getting_Started",
    "https://frackinuniverse.miraheze.org/wiki/Personal_Tricorder",
    "https://frackinuniverse.miraheze.org/wiki/The_Player",
    "https://frackinuniverse.miraheze.org/wiki/Stars",
    "https://frackinuniverse.miraheze.org/wiki/Crafting",
    "https://frackinuniverse.miraheze.org/wiki/Combat",
    "https://frackinuniverse.miraheze.org/wiki/Weapons",
]

# One loader per URL; each load() call returns a list of documents.
docs = []
for url in urls:
    docs.append(WebBaseLoader(url).load())

# Flatten the per-URL lists into a single list of documents.
docs_list = []
for page_docs in docs:
    docs_list.extend(page_docs)

# Tag every document with a short, human-readable id derived from its URL so
# retrieval results can be traced back to the page they came from.
for item in docs_list:
    # A missing "source" would otherwise reach urlparse as None.
    full_url = item.metadata.get("source") or ""
    parsed_url = urlparse(full_url)
    # Strip trailing slashes first: ".../wiki/Stars/" must still yield "Stars",
    # not the empty segment after the final "/".
    page_name = unquote(parsed_url.path.rstrip("/").split("/")[-1])
    if not page_name:
        # Bare-domain URLs have no path segment; fall back to the host name.
        page_name = parsed_url.netloc

    item.metadata["id"] = page_name
    print(item.metadata["id"])

# TODO: experiment with chunk size
# (a previous attempt used chunk_size=7500, chunk_overlap=100)
splitter_options = {"chunk_size": 1000, "chunk_overlap": 200}

# Sizes are measured in tiktoken tokens rather than characters.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(**splitter_options)
doc_splits = text_splitter.split_documents(docs_list)



Main_Page
Getting_Started
Personal_Tricorder
The_Player
Stars
Crafting
Combat
Weapons


In [28]:
# 2. Convert documents to embeddings and store them
def _fresh_vectorstore(collection_name, embedding):
    """Build a Chroma store that holds exactly the chunks in `doc_splits`.

    Args:
        collection_name: Name of the Chroma collection to (re)create.
        embedding: Embedding model used to vectorise the chunks.

    Returns:
        A Chroma vector store populated from `doc_splits`.

    The collection is dropped before it is rebuilt. Without that,
    `Chroma.from_documents` adds to a collection of the same name that is
    already present in the in-process client, so re-running this cell indexes
    every chunk again and the retriever starts returning duplicate hits.
    """
    # Opening the store creates the collection if needed, so deleting is safe
    # on the first run as well.
    Chroma(collection_name=collection_name, embedding_function=embedding).delete_collection()
    return Chroma.from_documents(
        documents=doc_splits,
        collection_name=collection_name,
        embedding=embedding,
    )


# Local embeddings served by Ollama
vectorstore = _fresh_vectorstore("rag-chroma", OllamaEmbeddings(model='nomic-embed-text'))

# using openAI embeddings
vectorstore_openai = _fresh_vectorstore("rag-openai", OpenAIEmbeddings())

retriever = vectorstore.as_retriever()
retriever_openai = vectorstore_openai.as_retriever()

def retrieve_and_format(query):
    """Look up the chunks matching `query` and merge them into one context string.

    Prints a separator line followed by the list of "id" metadata values of
    the retrieved documents, so each answer can be traced to its source pages.

    Args:
        query: Text passed to the module-level `retriever`.

    Returns:
        The page contents of the retrieved documents, separated by blank lines.
    """
    hits = retriever.invoke(query)
    print('\n\n*********************')
    source_ids = [hit.metadata.get("id") for hit in hits]
    print(source_ids)
    return "\n\n".join(hit.page_content for hit in hits)


# Sample query. Another one to try: "List out all one-handed melee weapons"
question = "what are the features of the personal tricorder?"
exit_keyword = "exit"
print(f"Enter your question. Type '{exit_keyword}' to end the program.")

# 4. After RAG
print("\n########\nembed: nomic embed text\n")
rag_template = """Answer the question based only on the following context. If the information is not in the context, say you don't have that information.
:
{context}
Question: {question}
"""
rag_prompt = ChatPromptTemplate.from_template(rag_template)

# The incoming question feeds both branches: it is passed through unchanged
# and also used to look up the context chunks.
chain_inputs = {
    "context": RunnableLambda(retrieve_and_format),
    "question": RunnablePassthrough(),
}
rag_chain = chain_inputs | rag_prompt | model_local | StrOutputParser()

answer = rag_chain.invoke(question)
print(answer)

# print("\n########\nembed: open ai\n")
# rag_chain_openai = (
#     {"context": retriever_openai, "question": RunnablePassthrough()}
#     | rag_prompt
#     | model_local
#     | StrOutputParser()
# )
# print(rag_chain_openai.invoke(question))

# while True:
    # user_question = input("\nHuman: ").strip()

    # if user_question.lower() == exit_keyword.lower():
    #     print("Exiting the program. Goodbye!")
    #     break

    # if user_question:
    #     print("\nProcessing your question...\n")
    #     nomic_response = rag_chain.invoke(user_question)
    #     openai_response = rag_chain_openai.invoke(user_question)
    #     print("====== Nomic Answer======:\n ",nomic_response)
    #     print("\n====== OpenAI Answer ======\n",openai_response)
    # else:
    #     print("Please enter a valid question")

# loader = PyPDFLoader("Ollama.pdf")
# doc_splits = loader.load_and_split()


Enter your question. Type 'exit' to end the program.

########
embed: nomic embed text



*********************
['Personal_Tricorder', 'Personal_Tricorder', 'Personal_Tricorder', 'Personal_Tricorder']
The features of the Personal Tricorder include:

1. **Shift + left-click**: Opens the Research interface to unlock recipes for crafting new items.

2. **Left-click**: Opens a menu with:
   - Information about the player, including bonuses/penalties by race, resistances to damage types, and temporary immunities.
   - GPS information about the current planet, such as weather, gravity, biomes, and surface dungeons.
   - Tool to create mobility Techs like Microsphere or Phase Sprint, and equip them.
   - Tool to refuel your Mech.
   - Tool to replace weapons, thrusters, etc. on your Mech.
   - Tool to upgrade all weapons/armor and tools up to their maximum supported tier.
   - Codex reader that is more convenient than the usual codex reader in Starbound.

3. **Right-click**: Opens the list of

#### UI

In [25]:
import gradio as gr
from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community import embeddings
# from langchain_community.chat_models import ChatOllama
from langchain.llms import OpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.text_splitter import CharacterTextSplitter

def process_input(urls, question):
    """Answer `question` using only the content of the web pages listed in `urls`.

    Args:
        urls: Newline-separated URLs, as typed into the Gradio textbox.
            Blank lines and surrounding whitespace are ignored.
        question: The question to answer.

    Returns:
        The model's answer as a string, or a short message asking for the
        missing input when no URL or no question was supplied.
    """
    # Validate first so empty submissions never reach the loaders or models.
    urls_list = [line.strip() for line in (urls or "").splitlines() if line.strip()]
    if not urls_list:
        return "Please enter at least one URL."
    question = (question or "").strip()
    if not question:
        return "Please enter a question."

    # Imported here because this cell's import block does not bring these
    # names into scope (the ChatOllama import there is commented out).
    import uuid
    from langchain_community.chat_models import ChatOllama

    model_local = ChatOllama(model="mistral")

    docs = [WebBaseLoader(url).load() for url in urls_list]
    docs_list = [item for sublist in docs for item in sublist]

    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap=200)
    doc_splits = text_splitter.split_documents(docs_list)

    after_rag_template = """Answer the question based only on the following context (If the information is not in the context, say you don't have that information):
    {context}
    Question: {question}
    """
    after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template)

    # Use a throw-away collection per request. A fixed name would be shared by
    # every call (and by other cells using "rag-chroma"), so pages indexed for
    # an earlier request would leak into the context of later ones.
    vectorstore = Chroma.from_documents(
        documents=doc_splits,
        collection_name=f"rag-chroma-ui-{uuid.uuid4().hex}",
        embedding=embeddings.ollama.OllamaEmbeddings(model='nomic-embed-text'),
    )
    try:
        retriever = vectorstore.as_retriever()

        def format_docs(retrieved):
            # Give the prompt plain page text rather than Document reprs.
            return "\n\n".join(doc.page_content for doc in retrieved)

        after_rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | after_rag_prompt
            | model_local
            | StrOutputParser()
        )
        return after_rag_chain.invoke(question)
    finally:
        # Drop the per-request collection so repeated queries do not pile up.
        vectorstore.delete_collection()

# Build the Gradio front end: two text inputs wired to process_input, with a
# plain-text answer as output.
url_box = gr.Textbox(label="Enter URLs separated by new lines")
question_box = gr.Textbox(label="Question")

iface = gr.Interface(
    fn=process_input,
    inputs=[url_box, question_box],
    outputs="text",
    title="Document Query with Ollama",
    description="Enter URLs and a question to query the documents.",
)
iface.launch()

  from .autonotebook import tqdm as notebook_tqdm


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


