In [1]:
import os

current_path = os.getcwd()
parent_path = os.path.dirname(current_path)
mem_file = os.path.join(parent_path, "memory\memory.csv")
print(mem_file)

c:\Users\Auggie\Projects\stems-mini-project\memory\memory.csv


In [2]:
import sys
sys.path.append("..")

In [3]:
from src.time_weighted_retriever import ModTimeWeightedVectorStoreRetriever
from src.chroma import EnhancedChroma

In [4]:
from langchain.embeddings import VertexAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DataFrameLoader, CSVLoader
from langchain.llms import VertexAI
from langchain.agents import initialize_agent, Tool, AgentType
from langchain.vectorstores import Chroma
from langchain.docstore import InMemoryDocstore
from langchain.vectorstores import FAISS
from langchain.retrievers import TimeWeightedVectorStoreRetriever
import faiss
import pandas as pd
from datetime import datetime

In [5]:
import os
from vertexai.preview.language_models import TextGenerationModel

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "../credentials/aiap-14-ds-llm-topic-01-sa.json"

model = TextGenerationModel.from_pretrained("text-bison@001")
response = model.predict("Is the earth round?")

print(f"Response from Model: {response.text}")

Response from Model: The earth is round.

The earth is a sphere.


In [6]:
# Instantiating model
llm = VertexAI(model_name="text-bison@001", 
               max_output_tokens=256, 
               temperature=0.2)

In [7]:
# Document loader
loader = CSVLoader(mem_file, metadata_columns=["last_accessed_at", "created_at"])
docs = loader.load()

In [8]:
docs

[Document(page_content='observations: A saw B walking by', metadata={'source': 'c:\\Users\\Auggie\\Projects\\stems-mini-project\\memory\\memory.csv', 'row': 0, 'last_accessed_at': '2023-10-11 22:33', 'created_at': '2023-03-03 10:33'}),
 Document(page_content='observations: B saw A fell down', metadata={'source': 'c:\\Users\\Auggie\\Projects\\stems-mini-project\\memory\\memory.csv', 'row': 1, 'last_accessed_at': '2023-09-11 20:33', 'created_at': '2023-04-03 10:33'}),
 Document(page_content='observations: A had lunch', metadata={'source': 'c:\\Users\\Auggie\\Projects\\stems-mini-project\\memory\\memory.csv', 'row': 2, 'last_accessed_at': '2023-10-09 14:15', 'created_at': '2023-03-03 12:33'}),
 Document(page_content='observations: B read a book', metadata={'source': 'c:\\Users\\Auggie\\Projects\\stems-mini-project\\memory\\memory.csv', 'row': 3, 'last_accessed_at': '2023-10-07 16:50', 'created_at': '2023-03-03 13:33'}),
 Document(page_content='observations: B noticed A was talking on the 

In [9]:
# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts = text_splitter.split_documents(docs)

In [10]:
# Instantiating embedder
vertex_embeddings = VertexAIEmbeddings()

In [11]:
# Store embeddings in Chromadb
vectorstore = EnhancedChroma(embedding_function=vertex_embeddings)

In [15]:
# Instantiating time-weighted retriever
retriever = ModTimeWeightedVectorStoreRetriever(
                                            vectorstore=vectorstore, decay_rate=0.99, k=1
                                        )

In [16]:
# Add documents to retriever
retriever.add_documents(docs)

['b4f5cce8-6968-11ee-b195-8cf8c502c1d1',
 'b4f5cce9-6968-11ee-acbf-8cf8c502c1d1',
 'b4f5ccea-6968-11ee-88cb-8cf8c502c1d1',
 'b4f5cceb-6968-11ee-ac34-8cf8c502c1d1',
 'b4f5ccec-6968-11ee-a5d5-8cf8c502c1d1',
 'b4f5cced-6968-11ee-8746-8cf8c502c1d1',
 'b4f5ccee-6968-11ee-a46b-8cf8c502c1d1',
 'b4f5ccef-6968-11ee-a44a-8cf8c502c1d1',
 'b4f5ccf0-6968-11ee-8088-8cf8c502c1d1',
 'b4f5ccf1-6968-11ee-a229-8cf8c502c1d1',
 'b4f5ccf2-6968-11ee-ba5c-8cf8c502c1d1']

In [17]:
# Get relevant documents, relevancy is determined by recency and semantic similarity
retriever.get_relevant_documents("A saw B")

Number of requested results 100 is greater than number of elements in index 22, updating n_results = 22


[Document(page_content='observations: A jogged', metadata={'source': 'c:\\Users\\Auggie\\Projects\\stems-mini-project\\memory\\memory.csv', 'row': 5, 'last_accessed_at': datetime.datetime(2023, 10, 13, 9, 34, 58, 271544), 'created_at': '2023-03-03 14:33', 'buffer_idx': 5})]

FINALLY GOT IT WORKING!!!

In [None]:
df = pd.read_csv(mem_file)
df['created_at'] = df['created_at'].apply(lambda x: datetime.strptime(x, '%d/%m/%Y %H:%M'))
df['last_accessed_at'] = df['last_accessed_at'].apply(lambda x: datetime.strptime(x, '%d/%m/%Y %H:%M'))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   created_at        11 non-null     datetime64[ns]
 1   last_accessed_at  11 non-null     datetime64[ns]
 2   observations      11 non-null     object        
dtypes: datetime64[ns](2), object(1)
memory usage: 392.0+ bytes


In [None]:
df.head()

Unnamed: 0,created_at,last_accessed_at,observations
0,2023-03-03 10:33:00,2023-10-11 22:33:00,A saw B walking by
1,2023-04-03 10:33:00,2023-09-11 20:33:00,B saw A fell down
2,2023-03-03 12:33:00,2023-10-09 14:15:00,A had lunch
3,2023-03-03 13:33:00,2023-10-07 16:50:00,B read a book
4,2023-03-03 13:53:00,2023-05-08 17:30:00,B noticed A was talking on the phone


In [None]:
#loader
loader = DataFrameLoader(df, page_content_column="observations")
docs = loader.load()

In [None]:
docs

[Document(page_content='observations: A saw B walking by', metadata={'source': 'c:\\Users\\Auggie\\Projects\\stems-mini-project\\memory\\memory.csv', 'row': 0, 'last_accessed_at': '0.001', 'created_at': '3/3/2023 10:33'}),
 Document(page_content='observations: B saw A fell down', metadata={'source': 'c:\\Users\\Auggie\\Projects\\stems-mini-project\\memory\\memory.csv', 'row': 1, 'last_accessed_at': '0.001', 'created_at': '3/4/2023 10:33'}),
 Document(page_content='observations: A had lunch', metadata={'source': 'c:\\Users\\Auggie\\Projects\\stems-mini-project\\memory\\memory.csv', 'row': 2, 'last_accessed_at': '0.001', 'created_at': '3/3/2023 12:33'}),
 Document(page_content='observations: B read a book', metadata={'source': 'c:\\Users\\Auggie\\Projects\\stems-mini-project\\memory\\memory.csv', 'row': 3, 'last_accessed_at': '0.001', 'created_at': '3/3/2023 13:33'}),
 Document(page_content='observations: B noticed A was talking on the phone', metadata={'source': 'c:\\Users\\Auggie\\Proj

In [None]:
vectorstore = Chroma.from_documents(texts, vertex_embeddings)

**CHROMA don't work with datetime**

In [None]:
retriever = TimeWeightedVectorStoreRetriever(vectorstore=vectorstore, decay_rate=0.0000001, k=5)

In [None]:
print(retriever.get_relevant_documents("A saw B"))

Number of requested results 100 is greater than number of elements in index 22, updating n_results = 22


[]


**With CSVloader - Returned no results because last_accessed_at is a string instead of a datetime object.**

In [None]:
docs

[Document(page_content='observations: A saw B walking by', metadata={'source': 'c:\\Users\\Auggie\\Projects\\stems-mini-project\\memory\\memory.csv', 'row': 0, 'last_accessed_at': '11/10/2023 22:33', 'created_at': '3/3/2023 10:33'}),
 Document(page_content='observations: B saw A fell down', metadata={'source': 'c:\\Users\\Auggie\\Projects\\stems-mini-project\\memory\\memory.csv', 'row': 1, 'last_accessed_at': '11/9/2023 20:33', 'created_at': '3/4/2023 10:33'}),
 Document(page_content='observations: A had lunch', metadata={'source': 'c:\\Users\\Auggie\\Projects\\stems-mini-project\\memory\\memory.csv', 'row': 2, 'last_accessed_at': '9/10/2023 14:15', 'created_at': '3/3/2023 12:33'}),
 Document(page_content='observations: B read a book', metadata={'source': 'c:\\Users\\Auggie\\Projects\\stems-mini-project\\memory\\memory.csv', 'row': 3, 'last_accessed_at': '7/10/2023 16:50', 'created_at': '3/3/2023 13:33'}),
 Document(page_content='observations: B noticed A was talking on the phone', met

In [None]:
# Initialize the vectorstore as empty
embedding_size = 1536
index = faiss.IndexFlatL2(embedding_size)
vectorstore = FAISS(embeddings.embed_query, index, InMemoryDocstore({}), {})
retriever = TimeWeightedVectorStoreRetriever(vectorstore=vectorstore, decay_rate=.1, k=1)

In [None]:
retriever.add_documents(docs)

AssertionError: 

In [None]:
index = Chroma(embedding_function=vertex_embeddings, persist_directory='db')