In [81]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader
import openai
import os
import json
from redisvl.utils.vectorize import HFTextVectorizer
from dotenv import load_dotenv
from redisvl.redis.utils import array_to_buffer


env_path = "../"
load_dotenv(env_path)


class Settings:
    BASE_PATH = "/data"
    CHAT_MODEL = "gpt-3.5-turbo-0125"
    CHUNK_SIZE = 2500
    CHUNK_OVERLAP = 10
    VECTORIZER = "sentence-transformers/all-MiniLM-L6-v2"
    REDIS_INDEX_NAME = "at_tool_redis"

settings = Settings()

class Context:
    def __init__(self, docs):
        self.docs: list[str] = docs
        self.chunks: list[str] = self.get_chunks()
        self.metadata: list[str] = self.create_metadata()
        self.propositions: list[str] = self.create_propositions()
        self.embeddings = self.create_embeddings()

        if not (len(self.chunks) == len(self.propositions) == len(self.embeddings)):
            raise ValueError("Chunk, embedding, and proposition lengths do not match")

    def get_chunks(self):
        """Load and split data from docs into chunks"""

        """
        Note: right now for simplicity all chunks of documents go into one list 
        but this could be extended to different data structure. For example, could store 
        in dict where doc_name is the key to chunks etc
        """

        chunks = []

        for doc in self.docs:
            chunks.extend(self.load_and_split(doc))

        return chunks

    def create_metadata(self):
        """TODO: store metadata of chunks page number etc."""

    def create_propositions(self):
        """this takes the chunks and makes them better for later use"""
        return [self.create_proposition(chunk) for chunk in self.chunks]

    @staticmethod
    def create_proposition(chunk):
        """this takes the chunks and makes them better for later use"""

        SYSTEM_PROMPT = """
          You are a trail guide tool. Segments of a raw pdf containing tables, information text, and maps of the Appalachian Trail.

          Create a clear proposition from the data which includes all potentially important information on the location of shelters, mile markers, etc.

          Return the proposition as a single string with key proposition in a json like so: {"proposition": "single string"}
        """

        response = openai.OpenAI().chat.completions.create(
            model=settings.CHAT_MODEL,
            response_format={"type": "json_object"},
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {
                    "role": "user",
                    "content": f"Decompose this raw content using the rules above:\n {chunk} ",
                },
            ],
        )

        res = response.choices[0].message.content
        return json.loads(res)["proposition"]

    def create_embeddings(self):
        """this converts the propositions into vectors"""

        hf = HFTextVectorizer(settings.VECTORIZER)

        return hf.embed_many([proposition for proposition in self.propositions])

    def get_redis_data(self):
        """this takes the data and makes it ready for inserting into DB"""

        return [
            {
                "chunk_id": f"{i}",
                "raw_content": chunk.page_content,
                "proposition": self.propositions[i],
                # convert embeddings to bytes for hash
                "text_embedding": array_to_buffer(self.embeddings[i]),
            }
            for i, chunk in enumerate(self.chunks)
        ]

    @staticmethod
    def load_and_split(
        doc, chunk_size=settings.CHUNK_SIZE, chunk_overlap=settings.CHUNK_OVERLAP
    ):
        loader = UnstructuredFileLoader(doc, mode="single", strategy="fast")

        # providing some chunk_overlap for help with summarization
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )

        return loader.load_and_split(text_splitter)

In [71]:
docs = [f"../{settings.BASE_PATH}/docs/white_blaze_sample_quick_info.pdf"]
c = Context(docs)

assert len(c.embeddings) == len(c.chunks) == len(c.propositions)

In [74]:
from redisvl.redis.utils import array_to_buffer

redis_data = [
  {
      "chunk_id": f"{i}",
      "raw_content": chunk,
      "proposition": c.propositions[i],
      # convert embeddings to bytes for hash
      "text_embedding": array_to_buffer(c.embeddings[i]),
  }
  for i, chunk in enumerate(c.chunks)
]

In [75]:
redis_data

[{'chunk_id': '0',
  'raw_content': Document(page_content='Resupply\n\nResupply locations along the Appalachian Trail ATTENTION: For more detailed information read write-up under mileage in book and see maps. Shaded entries are 1.0 miles or less from the Appalachian Trail that are full resupplies or PO’s.\n\n~Designates map available = e Location Suches, GA~e Neel Gap, GA Blairsville, GA Dahlonega, GA Helen, GA~e Hiawassee, GA~e Franklin, NC~e NOC, NC~e Stecoah Gap, NC (NC. 143)~e Robbinsville, NC~e Fontana Village, NC~e Gatlinburg, TN~e Cherokee, NC Davenport Gap, TN~e Green Corner Road~e Hot Springs, NC~e Log Cabin Rd~e Sams Gap, TN~e Uncle Johnny’s Nolichucky Hostel~e Erwin, TN~e Elk Park, NC~e Roan Mountain, TN~e Scotty’s Budget Hostel Dennis Cove, TN~e Shook Branch Road~e Hampton, TN~e Shady Valley, TN~e Damascus, VA~e Troutdale, VA~e Sugar Grove, VA\n\nNOBO Mile 20.5 31.3 31.3 31.3 52.5 69.2 109.4 136.7 150.5 150.5 165.9 207.7 207.7 239.2 241.5 274.9 291.2 319.7 344.3 344.3 395.3

: 

In [68]:
hf = HFTextVectorizer(settings.VECTORIZER)
embeddings = hf.embed_many(
            [proposition for proposition in c.propositions]
        )


In [69]:
assert len(embeddings) == len(c.propositions)

In [1]:
t = ["one", "two", "three", "four", "five"]

"\n".join(t)

'one\ntwo\nthree\nfour\nfive'

In [12]:
def define_schema() -> IndexSchema:
    return IndexSchema.from_dict(
        {
            "index": {"name": settings.REDIS_INDEX_NAME, "prefix": "chunk"},
            "fields": [
                {"name": "chunk_id", "type": "tag", "attrs": {"sortable": True}},
                {"name": "raw_content", "type": "text"},
                {"name": "proposition", "type": "text"},
                {
                    "name": "text_embedding",
                    "type": "vector",
                    "attrs": {
                        "dims": hf.dims,
                        "distance_metric": "cosine",
                        "algorithm": "hnsw",
                        "datatype": "float32",
                    },
                },
            ],
        }
    )


In [30]:
from redisvl.query import VectorQuery
from redisvl.utils.vectorize import HFTextVectorizer


hf = HFTextVectorizer(settings.VECTORIZER)


class Chat:
    def __init__(self, index):
        self.index = index
        self.last_message: str = ""
        self.chat_messages: list[str] = []
        # might have other structures for holding data but none that I can think of right now

    def add_message(self, msg):
        self.chat_messages.append(msg)

    def embed_query(self, query):
        return hf.embed(query)

    # this is when we want to respond to a message we first need to get the most relevant data to answer the question from the db
    def retrieve_context(self, query):

        query_embedding = self.embed_query(query)

        vector_query = VectorQuery(
            vector=query_embedding,
            vector_field_name="text_embedding",
            num_results=3,
            return_fields=["label", "propositions"],
            return_score=True,
        )

        # we would need reference to the index to query it that makes sense
        res = self.index.query(vector_query)

        return "\n".join(
            [r["propositions"] for r in res]
        )  # just take the first one for now.

    def promptify(self, query: str, context: str) -> str:
        """promptify takes the user's question and the context from the db and ask the generative AI to make an answer based in that world"""

        return f"""Use the provided context below derived from a Applachian trail guide pdf to answer the user's question.
        If you can't answer the user's question, based on the context; do not guess. If there is no context at all,
        respond with "I don't know".

        User's previous questions:

        {" ".join(self.chat_messages)}

        User question:

        {self.last_message}

        Helpful context:

        {context}

        Answer:
    """

    async def answer_question(self, query):
        context = self.retrieve_context(query)

        SYSTEM_PROMPT = "You are a tool assiting hikers find simple trail info."

        response = await openai.AsyncClient().chat.completions.create(
            model=settings.CHAT_MODEL,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": self.promptify(query, context)},
            ],
            temperature=0.1,
            seed=42,
        )

        # Response provided by LLM
        res = response.choices[0].message.content
        self.add_message(res)
        return res


In [22]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader
import openai
import os
from dotenv import load_dotenv
# from at_tool.context import Context
# from at_tool.chat import Chat
# from utilities.redis import define_schema
from redis import Redis
from redisvl.schema import IndexSchema
from redisvl.index import SearchIndex
import asyncio



def init_db():
    # define docs to be loaded into context
    cwd = os.getcwd()
    docs = [os.path.join(cwd, "data/docs/white_blaze_sample_quick_info.pdf")]

    # init redis client
    client = Redis.from_url(os.environ["REDIS_URL"])
    schema = define_schema()

    # create an index from schema and the client
    index = SearchIndex(schema, client)
    print(f"{client.keys=}")

    if not client.keys:
        index.create(overwrite=True, drop=True)

        # create context and load into database
        context = Context(docs)
        redis_data = context.get_redis_data()
        print(redis_data)
        index.load(redis_data, id_field="chunk_id")

    return index

In [37]:

index = init_db()
c = Chat(index)

client.keys=<bound method BasicKeyCommands.keys of Redis<ConnectionPool<Connection<host=redis-15067.c274.us-east-1-3.ec2.cloud.redislabs.com,port=15067,db=0>>>>


In [38]:
c.add_message("hello")
c.chat_messages

hello
None


['hello', 'hello']

In [14]:
async def start_chat(chat):
  # move these to settings
  stopterms = ["exit", "quit", "end", "cancel"]

  # Simple Chat
  while True:
      most_recent_question = input()
      if most_recent_question.lower() in stopterms:
          break

      answer = await chat.answer_question(most_recent_question)
      print(answer, flush=True)

In [15]:
start_chat(c)

<coroutine object start_chat at 0x000001AD2D447C40>

In [25]:
c.chat_messages

In [34]:
class SomeClass:
  def __init__(self):
    self.some_list: list[str] = []
  
  def update_list(self, msg):
    self.some_list.append(msg)

In [35]:
s = SomeClass()
s.update_list("hello")
s.some_list

['hello']

In [54]:
def retrieve_context(index, query):
  query_embedding = hf.embed(query)

  vector_query = VectorQuery(
      vector=query_embedding,
      vector_field_name="text_embedding",
      num_results=3,
      return_fields=["raw_content", "proposition"],
      return_score=True,
  )

  # we would need reference to the index to query it that makes sense
  res = index.query(vector_query)
  print(f"hello: {res}")

  return "\n".join(
      [r["proposition"] for r in res]
  )  # just take the first one for now.

In [64]:
def init_db():
    # define docs to be loaded into context
    # cwd = os.getcwd()
    docs = ["../data/docs/white_blaze_sample_quick_info.pdf"]

    # init redis client
    client = Redis.from_url(os.environ["REDIS_URL"])
    schema = define_schema()

    # create an index from schema and the client
    index = SearchIndex(schema, client)
    print(f"{client.keys=}")

    if not client.keys():
        print("No data found creating loading index")
        index.create(overwrite=True, drop=True)
        print("index created")

        # create context and load into database
        context = Context(docs)
        redis_data = context.get_redis_data()
        print(redis_data)
        index.load(redis_data, id_field="chunk_id")

    return index

In [82]:
# define docs to be loaded into context
# cwd = os.getcwd()
docs = ["../data/docs/white_blaze_sample_quick_info.pdf"]

# init redis client
client = Redis.from_url(os.environ["REDIS_URL"])
schema = define_schema()

# create an index from schema and the client
index = SearchIndex(schema, client)
print(f"{client.keys=}")

if not client.keys():
    print("No data found creating loading index")
    index.create(overwrite=True, drop=True)
    print("index created")

    # create context and load into database
    context = Context(docs)
    redis_data = context.get_redis_data()

client.keys=<bound method BasicKeyCommands.keys of Redis<ConnectionPool<Connection<host=redis-15067.c274.us-east-1-3.ec2.cloud.redislabs.com,port=15067,db=0>>>>
No data found creating loading index
[32m12:12:01[0m [34mredisvl.index.index[0m [1;30mINFO[0m   Index already exists, overwriting.
index created


In [83]:
redis_data[0]

{'chunk_id': '0',
 'raw_content': 'Resupply\n\nResupply locations along the Appalachian Trail ATTENTION: For more detailed information read write-up under mileage in book and see maps. Shaded entries are 1.0 miles or less from the Appalachian Trail that are full resupplies or PO’s.\n\n~Designates map available = e Location Suches, GA~e Neel Gap, GA Blairsville, GA Dahlonega, GA Helen, GA~e Hiawassee, GA~e Franklin, NC~e NOC, NC~e Stecoah Gap, NC (NC. 143)~e Robbinsville, NC~e Fontana Village, NC~e Gatlinburg, TN~e Cherokee, NC Davenport Gap, TN~e Green Corner Road~e Hot Springs, NC~e Log Cabin Rd~e Sams Gap, TN~e Uncle Johnny’s Nolichucky Hostel~e Erwin, TN~e Elk Park, NC~e Roan Mountain, TN~e Scotty’s Budget Hostel Dennis Cove, TN~e Shook Branch Road~e Hampton, TN~e Shady Valley, TN~e Damascus, VA~e Troutdale, VA~e Sugar Grove, VA\n\nNOBO Mile 20.5 31.3 31.3 31.3 52.5 69.2 109.4 136.7 150.5 150.5 165.9 207.7 207.7 239.2 241.5 274.9 291.2 319.7 344.3 344.3 395.3 395.3 407.4 420.0 428.5

In [84]:
index.load(redis_data, id_field="chunk_id")

['chunk:0',
 'chunk:1',
 'chunk:2',
 'chunk:3',
 'chunk:4',
 'chunk:5',
 'chunk:6',
 'chunk:7']

In [85]:
client = Redis.from_url(os.environ["REDIS_URL"])

In [86]:
client.keys()

[b'chunk:3',
 b'chunk:0',
 b'chunk:5',
 b'chunk:7',
 b'chunk:1',
 b'chunk:4',
 b'chunk:6',
 b'chunk:2']

In [87]:
client.hgetall('chunk:3')

{b'raw_content': b'(0.2W) 7.9<<5.3<Hawk Mountain Shelter>7.6>>19.6>>>20.8 w Water is located 400 yards on a blue blazed trail behind the shelter, t tent pads, p privy, J bear cables. (0.1W) 15.5<<<12.9<<7.6<Gooch Mountain Shelter>12>>13.2>>>22.3 w Water (spring) is located 100 yards behind the shelter, t tent pads that can accommo- date two tents each, p privy, J bear box.\n\n+[34.66467,-84.13702] s[34.66608,-84.13638]\n\n+[34.65492,-84.04948] s[34.65573,-84.04998] w[34.65652,-84.04954]\n\n3194\n\n3000\n\n2193.7\n\n2188.4 2180.8\n\n27.7\n\n28.9\n\n38.0\n\n42.8\n\nW+t s{7} EpJ`\n\ns{8} pZv`\n\nw+Z t{3}s{7} EpJv `\n\nws{7} t{4} EpJ`\n\nBird Gap (0.4W) 24.9<<<19.6<<12<Woods Hole Shelter>1.2>>10.3>>>15.1 W Water (stream) on trail to shelter is unreliable in dry months, p privy, t tenting, J bear cables. Bird Gap, Freeman Trail just east bypasses Blood Mtn. and rejoins AT at Flatrock Gap. Blood Mountain, open rocky summit. 20.8<<<13.2<<1.2<Blood Mountain Shelter>9.1>>13.9>>>21.2 Shelter is 

In [42]:
index

<redisvl.index.index.SearchIndex at 0x1ad2d240050>

In [88]:
query = 'How many miles are between Stover Creek Shelter and Hawk Mountain shelter?'

context = retrieve_context(index, query)


hello: [{'id': 'chunk:3', 'vector_distance': '0.287704229355', 'raw_content': '(0.2W) 7.9<<5.3<Hawk Mountain Shelter>7.6>>19.6>>>20.8 w Water is located 400 yards on a blue blazed trail behind the shelter, t tent pads, p privy, J bear cables. (0.1W) 15.5<<<12.9<<7.6<Gooch Mountain Shelter>12>>13.2>>>22.3 w Water (spring) is located 100 yards behind the shelter, t tent pads that can accommo- date two tents each, p privy, J bear box.\n\n+[34.66467,-84.13702] s[34.66608,-84.13638]\n\n+[34.65492,-84.04948] s[34.65573,-84.04998] w[34.65652,-84.04954]\n\n3194\n\n3000\n\n2193.7\n\n2188.4 2180.8\n\n27.7\n\n28.9\n\n38.0\n\n42.8\n\nW+t s{7} EpJ`\n\ns{8} pZv`\n\nw+Z t{3}s{7} EpJv `\n\nws{7} t{4} EpJ`\n\nBird Gap (0.4W) 24.9<<<19.6<<12<Woods Hole Shelter>1.2>>10.3>>>15.1 W Water (stream) on trail to shelter is unreliable in dry months, p privy, t tenting, J bear cables. Bird Gap, Freeman Trail just east bypasses Blood Mtn. and rejoins AT at Flatrock Gap. Blood Mountain, open rocky summit. 20.8<<<1

''

In [102]:
from at_tool.chat import Chat
chat = Chat(index)
query = 'How many miles are between Stover Creek Shelter and Hawk Mountain shelter?'
chat.answer_question(query)

KeyError: 'propositions'

In [78]:
SYSTEM_PROMPT = """
    You are a trail guide tool. Segments of a raw pdf containing tables, information text, and maps of the Appalachian Trail.

    Create a clear proposition from the data which includes all potentially important information on the location of shelters, mile markers, etc.

    Return the proposition as a single string with key proposition in a json like so: {"proposition": "single string"}
"""

response = openai.OpenAI().chat.completions.create(
    model=settings.CHAT_MODEL,
    response_format={"type": "json_object"},
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": f"Decompose this raw content using the rules above:\n {chunk} ",
        },
    ],
)

res = response.choices[0].message.content

In [80]:
import json
json.loads(res)["proposition"]

'The resupply locations along the Appalachian Trail include: Suches, GA (20.5 miles NOBO & 2.0 miles SOBO), Neel Gap, GA, Blairsville, GA, Dahlonega, GA, Helen, GA (9.0 miles SOBO), Hiawassee, GA, Franklin, NC, NOC, NC, Stecoah Gap, NC, Robbinsville, NC, Fontana Village, NC, Gatlinburg, TN, Cherokee, NC, Davenport Gap, TN, Green Corner Road, Hot Springs, NC, Log Cabin Rd, Sams Gap, TN, Uncle Johnny’s Nolichucky Hostel, Erwin, TN, Elk Park, NC, Roan Mountain, TN, Scotty’s Budget Hostel Dennis Cove, TN, Shook Branch Road, Hampton, TN, Shady Valley, TN, Damascus, VA, Troutdale, VA, Sugar Grove, VA, Marion, VA, Atkins, VA, Quarter Way Inn, VA. 42/W Blue Grass Trail, Saint Luke’s Hostel, Bland, VA, Bastian, VA, Sugar Run Gap, VA, Narrows, VA, Pearisburg, VA, Newport, VA, Catawba, VA, Daleville, VA, Troutville, VA, Buchanan, VA, Jennings Creek Road, Big Island, VA. Additionally, there are Post Office options located in Suches, GA and Helen, GA.'

In [91]:
indexes = client.execute_command('FT._LIST')

In [99]:
name = "at_tool_redis"
if f"b'{name}'" in [str(i) for i in indexes]:
  print("yay")

yay


In [98]:

str(indexes[0])

"b'at_tool_redis'"