In [1]:
from IPython.display import clear_output

%pip install langchain-huggingface sentence-transformers ipywidgets
%pip install -e ..[all]
clear_output()

from langchain_huggingface import HuggingFaceEmbeddings
from pathlib import Path
from typing import Set
from shutil import rmtree

class GarbageCollector:
    def __init__(self):
        self.paths_to_cleanup: Set[Path] = set()

    def add(self, path: Path):
        self.paths_to_cleanup.add(path)

    def cleanup(self):
        for path in self.paths_to_cleanup:
            if path.is_file():
                path.unlink()
            else:
                rmtree(path)

gc = GarbageCollector()

from langchain_memvid import IndexManager, IndexConfig
from langchain_memvid import VideoProcessor
from langchain_memvid import VideoConfig, QRCodeConfig
from langchain_memvid import Encoder
from langchain_memvid import VectorStoreConfig
from langchain_memvid import Retriever

ImportError: cannot import name 'VideoProcessor' from 'langchain_memvid' (/home/dawid/github/sarumaj/langchain_memvid/src/langchain_memvid/__init__.py)

In [2]:
# Create index configuration
config = IndexConfig(
    index_type="faiss",
    metric="cosine",
    nlist=6  # Number of clusters for IVF index
)

# Initialize Embeddings
embeddings = HuggingFaceEmbeddings()

# Create index manager
index_manager = IndexManager(config=config, embeddings=embeddings)

# Example text chunks
texts = [
    "The quick brown fox jumps over the lazy dog",
    "A fast orange fox leaps across a sleepy canine",
    "The weather is beautiful today",
    "It's raining cats and dogs outside",
    "Python is a popular programming language",
    "JavaScript is widely used for web development"
]

# Example metadata for each text
metadata = [
    {"id": 0, "source": "example1.txt", "category": "animals"},
    {"id": 1, "source": "example1.txt", "category": "animals"},
    {"id": 2, "source": "example2.txt", "category": "weather"},
    {"id": 3, "source": "example2.txt", "category": "weather"},
    {"id": 4, "source": "example3.txt", "category": "programming"},
    {"id": 5, "source": "example3.txt", "category": "programming"}
]

# Add texts with metadata
# The index will be created automatically with the correct dimension
# and trained if using an IVF index
index_manager.add_texts(texts, metadata)

In [None]:
# Example searches
queries = [
    "Tell me about foxes",
    "What's the weather like?",
    "What programming languages are mentioned?"
]

print("\nSearching the index:")
print("-" * 50)

for query in queries:
    print(f"\nQuery: {query}")
    results = index_manager.search_text(query, k=1)

    print("Results:")
    for result in results:
        print(f"- Text: {result.text}")
        print(f"  Source: {result.source}")
        print(f"  Category: {result.category}")
        print(f"  Similarity: {result.similarity:.4f}")


Searching the index:
--------------------------------------------------

Query: Tell me about foxes
Results:
- Text: The quick brown fox jumps over the lazy dog
  Source: example1.txt
  Category: animals
  Similarity: 0.5380

Query: What's the weather like?
Results:
- Text: The weather is beautiful today
  Source: example2.txt
  Category: weather
  Similarity: 0.4703

Query: What programming languages are mentioned?
Results:
- Text: Python is a popular programming language
  Source: example3.txt
  Category: programming
  Similarity: 0.5955


In [None]:
video_config = VideoConfig(
    fps=30,
    resolution=(1920, 1080),
    codec="mp4v",
)

qrcode_config = QRCodeConfig(
    error_correction="H",
    box_size=10,
    border=4
)

video_processor = VideoProcessor(
    video_config=video_config,
    qrcode_config=qrcode_config
)

# Create a test video
data = [
    "The quick brown fox jumps over the lazy dog",
    "A fast orange fox leaps across a sleepy canine",
    "The weather is beautiful today",
    "It's raining cats and dogs outside",
    "Python is a popular programming language",
    "JavaScript is widely used for web development"
]
images = [video_processor.create_qr_code(d) for d in data]
output_path = Path("test_video.mp4")

# Encode the image into a video
video_processor.encode_video(
    frames=images,
    output_path=output_path
)

frames = video_processor.decode_video(Path("test_video.mp4"))

decoded_data = []
for frame in frames:
    decoded_data.extend(video_processor.extract_qr_codes(frame))

gc.add(output_path)
decoded_data

['The quick brown fox jumps over the lazy dog',
 'A fast orange fox leaps across a sleepy canine',
 'The weather is beautiful today',
 "It's raining cats and dogs outside",
 'Python is a popular programming language']

In [7]:
cfg = VectorStoreConfig(
    video=video_config,
    qrcode=qrcode_config
)

encoder = Encoder(config=cfg, index_manager=index_manager)
encoder.add_chunks(texts, metadata)

stats = encoder.build_video(Path("test_video.mp4"), Path("test_index.d"))

gc.add(stats.video_path)
gc.add(stats.index_path)
stats

BuildStats(total_chunks=6, video_size_mb=1.0306396484375, encoding_time=0.19210100173950195, index_path=PosixPath('test_index.d'), video_path=PosixPath('test_video.mp4'))

In [8]:
retriever = Retriever(
    video_file="test_video.mp4",
    index_file="test_index.d",
    config=cfg,
    index_manager=index_manager,
    k=2,
)

# Example searches
queries = [
    "Tell me about foxes",
    "What's the weather like?",
    "What programming languages are mentioned?"
]

print("\nRetrieving documents:")
print("-" * 50)

for query in queries:
    print(f"\nQuery: {query}")
    result = retriever.retrieve(query)

    print("Results:")
    for doc in result:
        print(f"- Text: {doc.page_content}")
        print(f"  Metadata: {doc.metadata}")


Retrieving documents:
--------------------------------------------------

Query: Tell me about foxes
Results:
- Text: The quick brown fox jumps over the lazy dog
  Metadata: {'source': 'example1.txt', 'category': 'animals', 'similarity': 0.5380151271820068, 'id': 0, 'text': 'The quick brown fox jumps over the lazy dog'}
- Text: A fast orange fox leaps across a sleepy canine
  Metadata: {'source': 'example1.txt', 'category': 'animals', 'similarity': 0.5364233255386353, 'id': 1, 'text': 'A fast orange fox leaps across a sleepy canine'}

Query: What's the weather like?
Results:
- Text: The weather is beautiful today
  Metadata: {'source': 'example2.txt', 'category': 'weather', 'similarity': 0.47028934955596924, 'id': 2, 'text': 'The weather is beautiful today'}
- Text: It's raining cats and dogs outside
  Metadata: {'source': 'example2.txt', 'category': 'weather', 'similarity': 0.27837008237838745, 'id': 3, 'text': "It's raining cats and dogs outside"}

Query: What programming languages 

In [9]:
gc.cleanup()