In [32]:
import os
from typing import Tuple

from datetime import datetime, timedelta

from langchain.docstore.document import Document
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders.json_loader import JSONLoader
from langchain_community.vectorstores.timescalevector import TimescaleVector
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms.huggingface_endpoint import HuggingFaceEndpoint as HuggingFaceHub
from langchain_text_splitters import RecursiveCharacterTextSplitter

from dotenv import load_dotenv
load_dotenv()

True

In [33]:
# Read the Hugging Face API token from the environment; load_dotenv() in the imports cell may have
# populated it from a local .env file. Indexing os.environ raises KeyError if the variable is unset.
# NOTE(review): the token is not referenced in the cells visible here — presumably a later LLM cell uses it; confirm.
huggingfacehub_api_token = os.environ['HUGGINGFACE_API_KEY']

In [41]:
# Input transcript and chunking parameters, named so they are easy to find and tune.
TRANSCRIPT_PATH = "../data/reformatted_transcripts/2021-05-28 09.14.38_rf.txt"
CHUNK_SIZE = 100    # upper bound on chunk length passed to the splitter
CHUNK_OVERLAP = 5   # overlap between neighbouring chunks passed to the splitter

# Read the transcript file into LangChain documents.
loader = TextLoader(TRANSCRIPT_PATH)
documents = loader.load()

# Break the loaded documents into small chunks; `docs` is what gets embedded and stored later.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
docs = text_splitter.split_documents(documents)

# Embedding model built with the constructor defaults (no model name is passed).
embeddings = HuggingFaceEmbeddings()

In [24]:
# Timescale Vector needs the service URL of your cloud database; it is shown as soon as you create the
# service. Here it is read from the TIMESCALE_URL environment variable (load_dotenv() was called in the
# imports cell, so the value may come from a local .env file). A missing variable raises KeyError.
SERVICE_URL = os.environ["TIMESCALE_URL"]

## 1. Similarity Search with Euclidean Distance (Default)

In [35]:
# Collection to write to; the TimescaleVector module creates a table named after it.
COLLECTION_NAME = "testing"

# Build the vector store from the chunked transcript: the chunks in `docs` are embedded with
# `embeddings` and stored in the database reachable at SERVICE_URL.
db = TimescaleVector.from_documents(
    documents=docs,
    embedding=embeddings,
    service_url=SERVICE_URL,
    collection_name=COLLECTION_NAME,
)

In [36]:
# Natural-language question to look up in the stored transcript chunks.
query = "What is microphenomenology?"
# Returns an iterable of (document, score) pairs, consumed by the printing cell below.
# NOTE(review): scores look like distances (the printed results are in ascending score order) — confirm.
docs_with_score = db.similarity_search_with_score(query)

In [37]:
# Print each hit framed by a separator line: the score first, then the chunk text.
separator = "-" * 80
for doc, score in docs_with_score:
    print(separator)
    print("Score: ", score)
    print(doc.page_content)
    print(separator)

--------------------------------------------------------------------------------
Score:  0.36237484216690063
is, of course, the visual, right. But even in the experience of microphenomenology, as we used to
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Score:  0.3876003623008728
the space, for instance, through microphenomenology. So there was some really interesting findings
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Score:  0.3964991625993671
this microphenomenological, international group. And one of the research projects that we are right
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Score:  0.41277581453323364
through microphenomenologi

In [38]:
# Expose the vector store through the retriever interface; no arguments are passed, so the
# library's default search settings apply.
retriever = db.as_retriever()

In [39]:
# Sanity check: show the retriever's string form (its tags and the vector store it wraps).
print(retriever)

tags=['TimescaleVector', 'HuggingFaceEmbeddings'] vectorstore=<langchain_community.vectorstores.timescalevector.TimescaleVector object at 0x000002A66964F620>


## 3. Using ANN Search Indexes to Speed Up Queries

In [44]:
# Point at an existing TimescaleVector collection instead of the one built from the transcript.
# This rebinds both COLLECTION_NAME and db, so every cell below works on this collection.
# NOTE(review): "timescale_commits" is not created in the cells visible here — confirm it exists in the database.
COLLECTION_NAME = "timescale_commits"

# Plain constructor (not from_documents): no documents are passed in this cell.
db = TimescaleVector(
    embedding=embeddings,
    service_url=SERVICE_URL,
    collection_name=COLLECTION_NAME,
)

In [45]:
# Build a search index on the current collection. No arguments are given, so the index type
# and its parameters are whatever the library uses by default.
db.create_index()

In [48]:
# Remove the index created in the previous cell so a different index type can take its place.
db.drop_index()

# Build an HNSW index. Per the library's guidance, m and ef_construction are optional because
# defaults are provided; they are spelled out here so the values in use are explicit.
hnsw_options = {"m": 16, "ef_construction": 64}
db.create_index(index_type="hnsw", **hnsw_options)