Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,4 @@ jobs:
target/scanoss-folder-hashing-import-linux-arm64
scanoss-folder-hashing-api_linux-amd64_${{ github.ref_name }}-1.tgz
scanoss-folder-hashing-api_linux-arm64_${{ github.ref_name }}-1.tgz
docker-compose.qdrant.yml
73 changes: 69 additions & 4 deletions cmd/import/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -238,39 +238,58 @@ func main() {
for _, collectionName := range collections {
showCollectionStats(ctx, client, collectionName)
}

// Re-enable production HNSW indexing for all collections
log.Println("\n=== Enabling Production HNSW Indexing ===")
log.Println("Re-enabling HNSW indexing (M=48) for production queries...")
for _, collectionName := range collections {
if err := enableProductionIndexing(ctx, client, collectionName); err != nil {
log.Printf("WARNING: Failed to enable production indexing for %s: %v", collectionName, err)
}
}
log.Println("\n✓ Production indexing enabled for all collections.")
log.Println("The Qdrant optimizer will build HNSW indexes in the background.")
log.Println("Monitor collection stats to track indexing progress.")
}

// Create a language-based collection with named vectors (dirs, names, contents).
func createCollection(ctx context.Context, client *qdrant.Client, collectionName string) {
log.Printf("Creating language-based collection with named vectors: %s", collectionName)

// Create named vectors configuration for dirs, names, and contents
// Optimized for bulk import: vectors on disk, HNSW disabled (M=0)
namedVectors := map[string]*qdrant.VectorParams{
"dirs": {
Size: VectorDim,
Distance: qdrant.Distance_Manhattan,
OnDisk: qdrant.PtrOf(true), // Store vectors on disk to reduce RAM during import
HnswConfig: &qdrant.HnswConfigDiff{
M: qdrant.PtrOf(uint64(48)),
M: qdrant.PtrOf(uint64(0)), // Disable HNSW during import, re-enabled after
EfConstruct: qdrant.PtrOf(uint64(500)),
FullScanThreshold: qdrant.PtrOf(uint64(100000)),
OnDisk: qdrant.PtrOf(true), // Store HNSW index on disk
},
},
"names": {
Size: VectorDim,
Distance: qdrant.Distance_Manhattan,
OnDisk: qdrant.PtrOf(true), // Store vectors on disk to reduce RAM during import
HnswConfig: &qdrant.HnswConfigDiff{
M: qdrant.PtrOf(uint64(48)),
M: qdrant.PtrOf(uint64(0)), // Disable HNSW during import, re-enabled after
EfConstruct: qdrant.PtrOf(uint64(500)),
FullScanThreshold: qdrant.PtrOf(uint64(100000)),
OnDisk: qdrant.PtrOf(true), // Store HNSW index on disk
},
},
"contents": {
Size: VectorDim,
Distance: qdrant.Distance_Manhattan,
OnDisk: qdrant.PtrOf(true), // Store vectors on disk to reduce RAM during import
HnswConfig: &qdrant.HnswConfigDiff{
M: qdrant.PtrOf(uint64(48)),
M: qdrant.PtrOf(uint64(0)), // Disable HNSW during import, re-enabled after
EfConstruct: qdrant.PtrOf(uint64(500)),
FullScanThreshold: qdrant.PtrOf(uint64(100000)),
OnDisk: qdrant.PtrOf(true), // Store HNSW index on disk
},
},
}
Expand All @@ -288,7 +307,7 @@ func createCollection(ctx context.Context, client *qdrant.Client, collectionName
QuantizationConfig: &qdrant.QuantizationConfig{
Quantization: &qdrant.QuantizationConfig_Binary{
Binary: &qdrant.BinaryQuantization{
AlwaysRam: qdrant.PtrOf(true), // Keep quantized vectors in RAM
AlwaysRam: qdrant.PtrOf(false), // Allow quantized vectors on disk to reduce RAM
},
},
},
Expand Down Expand Up @@ -329,6 +348,52 @@ func createCollection(ctx context.Context, client *qdrant.Client, collectionName
}
}

// enableProductionIndexing switches a collection from its bulk-import settings
// to production query settings. Call it once all data has been imported.
//
// For each of the named vectors ("dirs", "names", "contents") it sets the HNSW
// parameter M to 48 and requests that the HNSW index is not stored on disk,
// while the vectors themselves stay on disk. In the same UpdateCollection call
// it restores the optimizer's indexing threshold and asks for binary-quantized
// vectors to always be kept in RAM.
//
// Parameters:
//   - ctx: context passed through to the Qdrant client call.
//   - client: Qdrant client used to issue the update.
//   - collectionName: name of the collection to update.
//
// It returns a wrapped error if the update request fails, and nil otherwise.
func enableProductionIndexing(ctx context.Context, client *qdrant.Client, collectionName string) error {
	// Qdrant interprets an indexing threshold of 0 as "vector indexing
	// disabled", which would prevent the optimizer from ever building the HNSW
	// graphs configured below. Use Qdrant's documented default (in KB) instead
	// so that indexing is actually turned on.
	const productionIndexingThresholdKB = 20000

	log.Printf("Enabling production HNSW indexing for collection: %s", collectionName)

	// Build the per-vector diff: every named vector gets the same settings.
	vectorNames := []string{"dirs", "names", "contents"}
	namedVectorsConfig := make(map[string]*qdrant.VectorParamsDiff, len(vectorNames))
	for _, vectorName := range vectorNames {
		namedVectorsConfig[vectorName] = &qdrant.VectorParamsDiff{
			HnswConfig: &qdrant.HnswConfigDiff{
				M:      qdrant.PtrOf(uint64(48)),
				OnDisk: qdrant.PtrOf(false),
			},
			OnDisk: qdrant.PtrOf(true), // Keep vectors on disk, only HNSW in RAM
		}
	}

	// Update all named vectors and collection settings in a single call.
	err := client.UpdateCollection(ctx, &qdrant.UpdateCollection{
		CollectionName: collectionName,
		VectorsConfig: &qdrant.VectorsConfigDiff{
			Config: &qdrant.VectorsConfigDiff_ParamsMap{
				ParamsMap: &qdrant.VectorParamsDiffMap{
					Map: namedVectorsConfig,
				},
			},
		},
		OptimizersConfig: &qdrant.OptimizersConfigDiff{
			IndexingThreshold: qdrant.PtrOf(uint64(productionIndexingThresholdKB)),
		},
		QuantizationConfig: &qdrant.QuantizationConfigDiff{
			Quantization: &qdrant.QuantizationConfigDiff_Binary{
				Binary: &qdrant.BinaryQuantization{
					AlwaysRam: qdrant.PtrOf(true),
				},
			},
		},
	})
	if err != nil {
		return fmt.Errorf("failed to update HNSW config: %w", err)
	}

	log.Printf("✓ HNSW indexing enabled for %s. Optimizer will build indexes in background.", collectionName)
	return nil
}

// Import data from a CSV file to separate collections.
func importCSVFileWithProgress(ctx context.Context, client *qdrant.Client, filePath string, batchSize int, progress *progresstracker.ProgressTracker) (int, error) {
file, err := os.Open(filePath)
Expand Down
6 changes: 6 additions & 0 deletions docker-compose.qdrant.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@ services:
- 6334:6334 # gRPC API port (used by our Go client)
volumes:
- ./qdrant_data:/qdrant/storage
environment:
# Optimize storage performance for large-scale imports
- QDRANT__STORAGE__OPTIMIZERS__OVERWRITE__MAX_SEGMENT_SIZE=500000
- QDRANT__STORAGE__PERFORMANCE__MAX_OPTIMIZATION_THREADS=4
# Set the write-ahead log (WAL) segment capacity, in MB, for bulk imports
- QDRANT__STORAGE__WAL__WAL_CAPACITY_MB=32
expose:
- 6333
- 6334
Expand Down
Loading