diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 6385da6..814d1e4 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -45,3 +45,4 @@ jobs: target/scanoss-folder-hashing-import-linux-arm64 scanoss-folder-hashing-api_linux-amd64_${{ github.ref_name }}-1.tgz scanoss-folder-hashing-api_linux-arm64_${{ github.ref_name }}-1.tgz + docker-compose.qdrant.yml diff --git a/cmd/import/main.go b/cmd/import/main.go index edb74c0..c474fa8 100644 --- a/cmd/import/main.go +++ b/cmd/import/main.go @@ -238,6 +238,18 @@ func main() { for _, collectionName := range collections { showCollectionStats(ctx, client, collectionName) } + + // Re-enable production HNSW indexing for all collections + log.Println("\n=== Enabling Production HNSW Indexing ===") + log.Println("Re-enabling HNSW indexing (M=48) for production queries...") + for _, collectionName := range collections { + if err := enableProductionIndexing(ctx, client, collectionName); err != nil { + log.Printf("WARNING: Failed to enable production indexing for %s: %v", collectionName, err) + } + } + log.Println("\nāœ“ Production indexing enabled for all collections.") + log.Println("The Qdrant optimizer will build HNSW indexes in the background.") + log.Println("Monitor collection stats to track indexing progress.") } // Create a language-based collection with named vectors (dirs, names, contents). 
@@ -245,32 +257,39 @@ func createCollection(ctx context.Context, client *qdrant.Client, collectionName log.Printf("Creating language-based collection with named vectors: %s", collectionName) // Create named vectors configuration for dirs, names, and contents + // Optimized for bulk import: vectors on disk, HNSW disabled (M=0) namedVectors := map[string]*qdrant.VectorParams{ "dirs": { Size: VectorDim, Distance: qdrant.Distance_Manhattan, + OnDisk: qdrant.PtrOf(true), // Store vectors on disk to reduce RAM during import HnswConfig: &qdrant.HnswConfigDiff{ - M: qdrant.PtrOf(uint64(48)), + M: qdrant.PtrOf(uint64(0)), // Disable HNSW during import, re-enabled after EfConstruct: qdrant.PtrOf(uint64(500)), FullScanThreshold: qdrant.PtrOf(uint64(100000)), + OnDisk: qdrant.PtrOf(true), // Store HNSW index on disk }, }, "names": { Size: VectorDim, Distance: qdrant.Distance_Manhattan, + OnDisk: qdrant.PtrOf(true), // Store vectors on disk to reduce RAM during import HnswConfig: &qdrant.HnswConfigDiff{ - M: qdrant.PtrOf(uint64(48)), + M: qdrant.PtrOf(uint64(0)), // Disable HNSW during import, re-enabled after EfConstruct: qdrant.PtrOf(uint64(500)), FullScanThreshold: qdrant.PtrOf(uint64(100000)), + OnDisk: qdrant.PtrOf(true), // Store HNSW index on disk }, }, "contents": { Size: VectorDim, Distance: qdrant.Distance_Manhattan, + OnDisk: qdrant.PtrOf(true), // Store vectors on disk to reduce RAM during import HnswConfig: &qdrant.HnswConfigDiff{ - M: qdrant.PtrOf(uint64(48)), + M: qdrant.PtrOf(uint64(0)), // Disable HNSW during import, re-enabled after EfConstruct: qdrant.PtrOf(uint64(500)), FullScanThreshold: qdrant.PtrOf(uint64(100000)), + OnDisk: qdrant.PtrOf(true), // Store HNSW index on disk }, }, } @@ -288,7 +307,7 @@ func createCollection(ctx context.Context, client *qdrant.Client, collectionName QuantizationConfig: &qdrant.QuantizationConfig{ Quantization: &qdrant.QuantizationConfig_Binary{ Binary: &qdrant.BinaryQuantization{ - AlwaysRam: qdrant.PtrOf(true), // Keep 
quantized vectors in RAM
+					AlwaysRam: qdrant.PtrOf(false), // Allow quantized vectors on disk to reduce RAM
 				},
 			},
 		},
@@ -329,6 +348,65 @@ func createCollection(ctx context.Context, client *qdrant.Client, collectionName
 	}
 }
 
+// enableProductionIndexing switches a collection from its bulk-import settings
+// to settings intended for serving queries. It is meant to be called once all
+// data has been imported.
+//
+// In a single UpdateCollection call it:
+//   - sets HNSW M=48 for the "dirs", "names" and "contents" named vectors and
+//     keeps the HNSW graph in RAM while the vectors themselves stay on disk;
+//   - sets the optimizer indexing threshold to a non-zero value so the
+//     optimizer is allowed to build HNSW indexes (Qdrant treats 0 as
+//     "indexing disabled");
+//   - keeps binary-quantized vectors in RAM.
+//
+// It returns a wrapped error if the update call fails.
+func enableProductionIndexing(ctx context.Context, client *qdrant.Client, collectionName string) error {
+	log.Printf("Enabling production HNSW indexing for collection: %s", collectionName)
+
+	// Build the per-vector diff; every named vector gets the same settings.
+	namedVectorsConfig := make(map[string]*qdrant.VectorParamsDiff)
+	for _, vectorName := range []string{"dirs", "names", "contents"} {
+		namedVectorsConfig[vectorName] = &qdrant.VectorParamsDiff{
+			HnswConfig: &qdrant.HnswConfigDiff{
+				M:      qdrant.PtrOf(uint64(48)),
+				OnDisk: qdrant.PtrOf(false),
+			},
+			OnDisk: qdrant.PtrOf(true), // Keep vectors on disk, only HNSW in RAM
+		}
+	}
+
+	// Update all named vectors and collection settings in a single call
+	err := client.UpdateCollection(ctx, &qdrant.UpdateCollection{
+		CollectionName: collectionName,
+		VectorsConfig: &qdrant.VectorsConfigDiff{
+			Config: &qdrant.VectorsConfigDiff_ParamsMap{
+				ParamsMap: &qdrant.VectorParamsDiffMap{
+					Map: namedVectorsConfig,
+				},
+			},
+		},
+		OptimizersConfig: &qdrant.OptimizersConfigDiff{
+			// Must be non-zero: Qdrant treats an indexing threshold of 0 as
+			// "never build a vector index". 20000 (kB) is Qdrant's default.
+			IndexingThreshold: qdrant.PtrOf(uint64(20000)),
+		},
+		QuantizationConfig: &qdrant.QuantizationConfigDiff{
+			Quantization: &qdrant.QuantizationConfigDiff_Binary{
+				Binary: &qdrant.BinaryQuantization{
+					AlwaysRam: qdrant.PtrOf(true),
+				},
+			},
+		},
+	})
+	if err != nil {
+		return fmt.Errorf("failed to update HNSW config: %w", err)
+	}
+
+	log.Printf("āœ“ HNSW indexing enabled for %s. Optimizer will build indexes in background.", collectionName)
+	return nil
+}
+
 // Import data from a CSV file to separate collections.
func importCSVFileWithProgress(ctx context.Context, client *qdrant.Client, filePath string, batchSize int, progress *progresstracker.ProgressTracker) (int, error) { file, err := os.Open(filePath) diff --git a/docker-compose.qdrant.yml b/docker-compose.qdrant.yml index 34f72df..7596230 100644 --- a/docker-compose.qdrant.yml +++ b/docker-compose.qdrant.yml @@ -7,6 +7,12 @@ services: - 6334:6334 # gRPC API port (used by our Go client) volumes: - ./qdrant_data:/qdrant/storage + environment: + # Optimize storage performance for large-scale imports + - QDRANT__STORAGE__OPTIMIZERS__OVERWRITE__MAX_SEGMENT_SIZE=500000 + - QDRANT__STORAGE__PERFORMANCE__MAX_OPTIMIZATION_THREADS=4 + # Size of a single WAL segment in MB (tunes the write-ahead log; it does not switch it on) + - QDRANT__STORAGE__WAL__WAL_CAPACITY_MB=32 expose: - 6333 - 6334