# Vector Databases and Document Processing with LanceDB

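This notebook splits plain-text documents into overlapping chunks, embeds them with Sentence Transformers, and stores and searches the resulting vectors in LanceDB. It runs both locally and in Google Colab, where the data can optionally be persisted to Google Drive.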

## References
- [LanceDB Documentation](https://lancedb.github.io/lancedb/)
- [Sentence Transformers](https://www.sbert.net/)
- [Google Drive Integration](https://developers.google.com/drive/api/quickstart/python)

In [1]:
# Install required packages.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q sentence-transformers lancedb pandas numpy

In [2]:
import os
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import lancedb
import textwrap
from pathlib import Path
from typing import List, Optional, Dict
import json

# Check if running in Colab.
# get_ipython() only exists inside an IPython kernel, so treat a NameError
# (plain Python, e.g. the notebook exported to a script) as "not Colab".
try:
    IN_COLAB = 'google.colab' in str(get_ipython())
except NameError:
    IN_COLAB = False

if IN_COLAB:
    # Only importable inside Colab; used by Config to mount Google Drive.
    from google.colab import drive

  from tqdm.autonotebook import tqdm, trange


## 1. Environment Setup

Choose where documents and the database are stored: a local `vector_store/` directory (the default), or Google Drive when running in Colab (`use_drive=True`).

In [3]:
class Config:
    """Storage locations for the notebook's documents and LanceDB database.

    Everything lives under one base directory:

    * ``<base>/documents`` - the ``.txt`` files to index
    * ``<base>/db``        - the LanceDB database files

    Unless ``base_path`` is given, the base directory is the Google Drive
    folder when ``use_drive`` is set and the notebook runs in Colab, and the
    local ``vector_store`` directory otherwise. Both sub-directories are
    created when the object is constructed.
    """

    # Default base directories for the two storage locations.
    DRIVE_BASE_PATH = '/content/drive/MyDrive/vector_db'
    LOCAL_BASE_PATH = 'vector_store'

    def __init__(self, use_drive: bool = False, base_path: Optional[str] = None):
        """Resolve the storage paths and create the directories.

        Args:
            use_drive: Store data on Google Drive. Only takes effect in Colab;
                elsewhere the local default is used instead.
            base_path: Optional explicit base directory. When omitted, the
                default for the selected storage location is used.
        """
        self.use_drive = use_drive

        # Pick the default base directory for the current environment
        if use_drive and IN_COLAB:
            print("Mounting Google Drive...")
            drive.mount('/content/drive')
            default_base = self.DRIVE_BASE_PATH
        else:
            if use_drive:
                # Make the fallback visible instead of silently ignoring the request
                print("Google Drive storage is only available in Colab; using local storage instead.")
            default_base = self.LOCAL_BASE_PATH

        self.base_path = Path(default_base if base_path is None else base_path)
        self.docs_path = self.base_path / 'documents'
        self.db_path = self.base_path / 'db'

        # Create directories
        self.docs_path.mkdir(parents=True, exist_ok=True)
        self.db_path.mkdir(parents=True, exist_ok=True)

        print(f"Documents path: {self.docs_path}")
        print(f"Database path: {self.db_path}")

    def save(self):
        """Write the configuration to ``<base_path>/config.json``."""
        config_data = {
            'use_drive': self.use_drive,
            'base_path': str(self.base_path),
            'docs_path': str(self.docs_path),
            'db_path': str(self.db_path)
        }
        config_path = self.base_path / 'config.json'
        with open(config_path, 'w', encoding='utf-8') as f:
            json.dump(config_data, f, indent=2)

    @classmethod
    def load(cls, base_path: Optional[str] = None) -> 'Config':
        """Build a Config from a previously saved ``config.json``.

        Args:
            base_path: Directory that holds ``config.json``. When omitted, the
                Drive default is searched in Colab and the local default
                elsewhere.

        Returns:
            A Config using the saved ``use_drive`` flag, or ``use_drive=False``
            when no config file is found. An explicitly passed ``base_path``
            is also used as the base directory of the returned Config.
        """
        lookup_path = base_path
        if lookup_path is None:
            lookup_path = cls.DRIVE_BASE_PATH if IN_COLAB else cls.LOCAL_BASE_PATH

        use_drive = False
        config_path = Path(lookup_path) / 'config.json'
        if config_path.exists():
            with open(config_path, encoding='utf-8') as f:
                config_data = json.load(f)
            use_drive = bool(config_data.get('use_drive', False))

        # Pass the caller's base_path through (None keeps the environment default)
        return cls(use_drive=use_drive, base_path=base_path)

# Initialize configuration
# Storage switch: False keeps everything in local storage,
# True selects Google Drive storage in Colab.
USE_DRIVE = False  # Change to True for Google Drive storage

config = Config(use_drive=USE_DRIVE)
config.save()

Documents path: vector_store/documents
Database path: vector_store/db


## 2. Document Processing

In [4]:
class DocumentProcessor:
    """Split text files into overlapping word chunks, embed them and store them in LanceDB."""

    def __init__(self, config: 'Config', chunk_size: int = 500, chunk_overlap: int = 50,
                 model_name: str = 'all-MiniLM-L6-v2'):
        """Set up chunking parameters and load the embedding model.

        Args:
            config: Provides ``docs_path`` (input files) and ``db_path`` (database).
            chunk_size: Maximum number of words per chunk.
            chunk_overlap: Number of words shared by consecutive chunks.
            model_name: Sentence Transformers model used for the embeddings.

        Raises:
            ValueError: If the chunking parameters are inconsistent.
        """
        self.config = config
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self._chunk_step()  # fail fast, before the model is loaded
        self.model = SentenceTransformer(model_name)

    def _chunk_step(self) -> int:
        """Return the number of words between chunk starts, validating the settings."""
        if self.chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {self.chunk_size}")
        if not 0 <= self.chunk_overlap < self.chunk_size:
            raise ValueError(
                f"chunk_overlap must be in [0, chunk_size), got {self.chunk_overlap} "
                f"with chunk_size={self.chunk_size}"
            )
        return self.chunk_size - self.chunk_overlap

    def process_text(self, text: str) -> List[str]:
        """Split text into overlapping chunks.

        The text is split on whitespace; each chunk holds up to ``chunk_size``
        words and starts ``chunk_size - chunk_overlap`` words after the
        previous one. Empty or whitespace-only text yields an empty list.

        Raises:
            ValueError: If the chunking parameters are inconsistent.
        """
        step = self._chunk_step()
        words = text.split()
        chunks = []

        for start in range(0, len(words), step):
            chunks.append(' '.join(words[start:start + self.chunk_size]))
            if start + self.chunk_size >= len(words):
                # This window already reached the last word; any further chunk
                # would only repeat words contained in this one.
                break

        return chunks

    def process_file(self, file_path: str) -> Dict:
        """Process a single file.

        Reads the file as UTF-8, chunks it and embeds every chunk.

        Returns:
            A dict with ``'chunks'`` (list of chunk strings), ``'embeddings'``
            (one embedding per chunk, as returned by the model; an empty list
            when the file has no words) and ``'source'`` (the file path as a
            string).
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        chunks = self.process_text(text)
        # Nothing to embed for an empty file, so skip the model call
        embeddings = self.model.encode(chunks) if chunks else []

        return {
            'chunks': chunks,
            'embeddings': embeddings,
            'source': str(file_path)
        }

    def process_directory(self, table_name: str = 'documents', overwrite: bool = False) -> None:
        """Process all text files in the documents directory.

        Every ``.txt`` file below ``config.docs_path`` is chunked and embedded,
        and the rows (``text``, ``vector``, ``source``) are written to
        ``table_name``.

        Args:
            table_name: Target table in the database.
            overwrite: When False (default), rows are appended to an existing
                table, so re-running on the same files stores them again.
                When True, the table is replaced with the newly processed rows.
        """
        db = lancedb.connect(str(self.config.db_path))

        all_data = []
        # Sorted so the processing order does not depend on the file system
        for file_path in sorted(self.config.docs_path.glob('**/*.txt')):
            print(f"Processing {file_path}...")
            result = self.process_file(str(file_path))

            all_data.extend(
                {
                    'text': chunk,
                    'vector': embedding.tolist(),
                    'source': result['source']
                }
                for chunk, embedding in zip(result['chunks'], result['embeddings'])
            )

        if not all_data:
            print("No documents found to process")
            return

        df = pd.DataFrame(all_data)
        if overwrite:
            db.create_table(table_name, df, mode='overwrite')
        elif table_name in db.table_names():
            db.open_table(table_name).add(df)
        else:
            db.create_table(table_name, df)

        print(f"Added {len(all_data)} chunks to the database")

## 3. Vector Database Operations

In [5]:
class VectorDB:
    """Convenience wrapper around the LanceDB database at ``config.db_path``."""

    def __init__(self, config: 'Config', model_name: str = 'all-MiniLM-L6-v2'):
        """Connect to the database and load the query embedding model.

        Args:
            config: Provides ``db_path``, the database location.
            model_name: Sentence Transformers model used to embed queries.
                It should match the model that embedded the stored chunks.
        """
        self.config = config
        self.db = lancedb.connect(str(config.db_path))
        self.model = SentenceTransformer(model_name)

    def list_tables(self) -> List[str]:
        """List all tables in the database."""
        return list(self.db.table_names())

    def get_table_info(self, table_name: str) -> Dict:
        """Get information about a table.

        Returns:
            A dict with ``'num_chunks'`` (row count), ``'sources'`` (distinct
            values of the ``source`` column) and ``'avg_chunk_length'`` (mean
            character length of the ``text`` column).
        """
        table = self.db.open_table(table_name)
        # to_pandas() is the LanceDB call that materialises a table as a DataFrame
        df = table.to_pandas()
        return {
            'num_chunks': len(df),
            'sources': df['source'].unique().tolist(),
            'avg_chunk_length': df['text'].str.len().mean()
        }

    def semantic_search(self, query: str, table_name: str, k: int = 3) -> pd.DataFrame:
        """Perform semantic search.

        Embeds ``query`` and returns up to ``k`` nearest rows of ``table_name``.

        Returns:
            A DataFrame with the columns ``text``, ``source`` and ``_distance``,
            sorted by ascending ``_distance``.
        """
        # encode() takes a batch; we pass one query and take its single vector
        query_embedding = self.model.encode([query])[0]
        table = self.db.open_table(table_name)

        results = table.search(query_embedding).limit(k).to_pandas()
        return results[['text', 'source', '_distance']].sort_values('_distance')

    def delete_table(self, table_name: str) -> None:
        """Delete a table from the database, reporting whether it existed."""
        if table_name in self.list_tables():
            self.db.drop_table(table_name)
            print(f"Table '{table_name}' deleted")
        else:
            print(f"Table '{table_name}' not found")

## 4. Example Usage

### 4.1 Process Documents

In [6]:
# Build the processor with its chunking settings spelled out, then index
# every .txt file found under config.docs_path into the 'documents' table.
processor = DocumentProcessor(config, chunk_size=500, chunk_overlap=50)
processor.process_directory(table_name='documents')



No documents found to process


### 4.2 Database Operations

In [7]:
# Connect to the vector database
db = VectorDB(config)

# Print a short summary for every table in the store
print("Available tables:")
for table_name in db.list_tables():
    info = db.get_table_info(table_name)
    summary_lines = (
        f"\nTable: {table_name}",
        f"Number of chunks: {info['num_chunks']}",
        f"Average chunk length: {info['avg_chunk_length']:.0f} characters",
        f"Sources: {info['sources']}",
    )
    print(*summary_lines, sep="\n")

KeyboardInterrupt: 

### 4.3 Semantic Search

In [None]:
# Example search
query = "your search query here"
search_table = 'documents'

# The table only exists once at least one document has been processed, so
# check first instead of letting the lookup fail on a fresh, empty store.
if search_table in db.list_tables():
    results = db.semantic_search(query, search_table, k=3)

    print(f"Query: {query}\n")
    # Rows are sorted by ascending distance, so the closest match comes first.
    # Iterating over records avoids rebinding IPython's `_` output variable.
    for row in results.to_dict('records'):
        print(f"Distance: {row['_distance']:.4f}")
        print(f"Source: {row['source']}")
        print(f"Text: {textwrap.fill(row['text'], width=80)}\n")
else:
    print(
        f"Table '{search_table}' not found. "
        f"Add .txt files to {config.docs_path} and run the processing step first."
    )

## 5. Cleanup (Optional)

In [None]:
# Uncomment the line below to drop the 'documents' table (this removes all of its stored chunks)
# db.delete_table('documents')