In [39]:
import os, pandas as pd
from sentence_transformers import SentenceTransformer
from sqlalchemy import create_engine, text
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.fastembed import FastEmbedEmbeddings

from langchain_iris import IRISVector




In [40]:
import getpass
from dotenv import load_dotenv

# Load variables from a local .env file; override=True lets the file's values
# replace variables that are already set in the process environment.
load_dotenv(override=True)

# Keep an existing non-empty key; otherwise prompt for one without echoing it.
# `or` short-circuits, so the prompt only appears when the key is missing or empty.
os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API Key:")

In [21]:
# IRIS connection settings. Each value can be overridden through an environment
# variable, matching how the hostname was already handled; the fallbacks are the
# original demo defaults, so behaviour is unchanged when nothing is set.
username = os.getenv('IRIS_USERNAME', 'demo')
password = os.getenv('IRIS_PASSWORD', 'demo')
hostname = os.getenv('IRIS_HOSTNAME', 'localhost')
port = os.getenv('IRIS_PORT', '1972')
namespace = os.getenv('IRIS_NAMESPACE', 'USER')

# NOTE(review): the values are interpolated verbatim; credentials containing
# URL-reserved characters (e.g. '@', '/', ':') would need percent-encoding first.
CONNECTION_STRING = f"iris://{username}:{password}@{hostname}:{port}/{namespace}"

In [22]:
# Create the SQLAlchemy engine for the IRIS connection string; the SQL cells
# below open their connections from this shared `engine`.
engine = create_engine(CONNECTION_STRING)

In [42]:
# Load a pre-trained sentence transformer model. This model's output vectors are of size 384,
# which is the dimension declared for the VECTOR(DOUBLE, 384) columns of the UserReviews4 table.
model = SentenceTransformer('all-MiniLM-L6-v2') 



In [32]:
# with engine.connect() as conn:
#     with conn.begin():# Load 
#         sql = f"""
#                 CREATE TABLE UserReviews3 (
#     id INT PRIMARY KEY AUTO_INCREMENT,
#     description TEXT,
#     video TEXT
# )
#                 """
#         result = conn.execute(text(sql))


In [37]:
# Create the UserReviews4 table: a text description, the video reference, and
# two 384-dimensional vector columns (one for the detail text, one for the
# description).
# NOTE(review): the statement is a plain CREATE TABLE, so re-running this cell
# against a database that already has the table is expected to fail.
sql = """
    CREATE TABLE UserReviews4 (
        id INT PRIMARY KEY AUTO_INCREMENT,
        description TEXT,
        video TEXT,
        detail VECTOR(DOUBLE, 384),
        description_vector VECTOR(DOUBLE, 384)
    )
"""

# engine.begin() opens a connection and a transaction together and commits
# when the block exits without an error.
with engine.begin() as conn:
    conn.execute(text(sql))

In [43]:
# Free-text caption for the sample clip; this is the text that gets embedded.
description = "a video taken at hackMIT for fun"

# Relative location of the sample clip.
video_path = "../data/test.mov"

# The value stored for the video is the path string itself; the file is not read here.
video_data = video_path


In [44]:
# Embed the video description and convert the result to a plain Python list.
# normalize_embeddings=True is passed so the model returns a normalized vector.
single_embedding =model.encode(description, normalize_embeddings=True).tolist()

In [46]:
# Chunking configuration: a chunk size of 400 characters with 20 characters of overlap.
text_splitter = CharacterTextSplitter(chunk_size=400, chunk_overlap=20)

# Read the project write-up as UTF-8, then cut the loaded document into chunks.
loader = TextLoader("../data/hackMIT_details.txt", encoding='utf-8')
documents = loader.load()
docs = text_splitter.split_documents(documents)
# print(documents)

[Document(metadata={'source': '../data/hackMIT_details.txt'}, page_content="Our journey in developing Rewind has been a fascinating exploration of cutting-edge technology and complex problem-solving. Rewind is an advanced memory system designed to harness the power of Retrieval-Augmented Generation (RAG) combined with intersystem vector search to enhance information retrieval and memory management.\n\nConceptualization and Design\n\nThe project began with a clear vision: to create a memory system that seamlessly integrates retrieval mechanisms with generative capabilities, allowing for more accurate and contextually relevant information retrieval. Our primary goal was to build a system that not only stores vast amounts of data but also intelligently retrieves and utilizes this information to enhance decision-making and knowledge retention.\n\nTechnical Challenges and Innovations\n\nOne of the core challenges was implementing an effective RAG-based approach. This involved integrating a 

In [36]:
# Insert the sample row into UserReviews4, the table that actually declares the
# `detail` and `description_vector` columns and that the search cell queries.
#
# `detail` is a VECTOR(DOUBLE, 384) column, so the chunked write-up in `docs`
# is turned into one embedding rather than being bound as Document objects.
# NOTE(review): the chunks are joined and embedded in a single call; the model
# may truncate long inputs — confirm whether one row per chunk would be preferable.
detail_text = " ".join(chunk.page_content for chunk in docs)
detail_embedding = model.encode(detail_text, normalize_embeddings=True).tolist()

with engine.connect() as conn:
    with conn.begin():
        # Vectors are sent as their string form and converted with TO_VECTOR,
        # the same convention the similarity query uses for its search vector.
        # Every named placeholder below has a matching key in the parameter dict.
        sql = """
                INSERT INTO UserReviews4 (description, video, detail, description_vector)
                VALUES (:description, :video, TO_VECTOR(:detail), TO_VECTOR(:description_vector))
            """
        conn.execute(
            text(sql),
            {
                "description": description,
                "video": video_data,
                "detail": str(detail_embedding),
                "description_vector": str(single_embedding),
            },
        )
        print("Row inserted successfully.")

Row inserted successfully.


In [None]:
# Phrase to look up with vector similarity.
# NOTE(review): this phrase looks unrelated to the hackMIT sample row inserted
# earlier — confirm it is the intended query.
description_search = "earthy and creamy taste"
search_vector = model.encode(description_search, normalize_embeddings=True).tolist() # Convert search phrase into a vector


In [None]:
# Return the single UserReviews4 row whose description vector has the largest
# dot product with the search vector.
sql = text("""
            SELECT TOP 1 * FROM UserReviews4
            ORDER BY VECTOR_DOT_PRODUCT(description_vector, TO_VECTOR(:search_vector)) DESC
        """)

# The vector is passed as its string form; TO_VECTOR converts it inside the query.
search_params = {'search_vector': str(search_vector)}

# engine.begin() yields a connection with a transaction already started.
with engine.begin() as conn:
    results = conn.execute(sql, search_params).fetchall()


In [None]:
# Show the search hits as a DataFrame without the raw embedding columns.
#
# Column names are taken from the result rows themselves, because no `df` frame
# is defined in this notebook.
# NOTE(review): `_fields` is the row attribute exposed by SQLAlchemy 1.4+ —
# confirm the installed version.
VECTOR_COLUMNS = {"detail", "description_vector"}  # embedding columns of UserReviews4

column_names = list(results[0]._fields) if results else None

results_df = (
    pd.DataFrame([tuple(row) for row in results], columns=column_names)
    # Drop both vector columns by name (case-insensitively) instead of relying
    # on the vector being the single last column.
    .loc[:, lambda frame: [str(name).lower() not in VECTOR_COLUMNS for name in frame.columns]]
)

pd.set_option('display.max_colwidth', None)  # Easier to read description
results_df.head()

In [None]:
# Collection that the document chunks are stored under.
COLLECTION_NAME = "state_of_the_union_test"

# OpenAI embedding client used to vectorise the chunks.
# NOTE(review): presumably relies on OPENAI_API_KEY being set in the environment.
embeddings = OpenAIEmbeddings()

# Build the IRIS-backed vector store from the chunked documents.
db = IRISVector.from_documents(
    documents=docs,
    embedding=embeddings,
    connection_string=CONNECTION_STRING,
    collection_name=COLLECTION_NAME,
)


In [None]:

# Report how many entries the vector store holds.
stored_ids = db.get()['ids']
print(f"Number of docs in vector store: {len(stored_ids)}")

# Run a similarity search and print each hit with its score between separator lines.
query = "Joint patrols to catch traffickers"
docs_with_score = db.similarity_search_with_score(query)

separator = "-" * 80
for doc, score in docs_with_score:
    print(separator)
    print("Score: ", score)
    print(doc.page_content)
    print(separator)