From faa7203a322e0194dbc1873f77d5f7fb83c1b5ae Mon Sep 17 00:00:00 2001 From: Matias Daloia Date: Mon, 13 Oct 2025 18:06:14 +0200 Subject: [PATCH 1/5] fix: import script resource overhead --- .gitignore | 3 +- cmd/import/main.go | 125 +++++++++++++++++++++++++++----------- docker-compose.qdrant.yml | 25 +++----- qdrant-config.yaml | 77 +++++++++++++++++++++++ 4 files changed, 175 insertions(+), 55 deletions(-) create mode 100644 qdrant-config.yaml diff --git a/.gitignore b/.gitignore index 3aca5f7..a45d7ab 100644 --- a/.gitignore +++ b/.gitignore @@ -43,4 +43,5 @@ qdrant_import server hfh-cli -papi/ \ No newline at end of file +papi/ +target diff --git a/cmd/import/main.go b/cmd/import/main.go index fc4cd19..0573037 100644 --- a/cmd/import/main.go +++ b/cmd/import/main.go @@ -44,11 +44,13 @@ const ( // QdrantPort is the default Qdrant server port. QdrantPort = 6334 // BatchSize is the number of records to process in each batch. - BatchSize = 2000 // Larger batch size for better performance + BatchSize = 2000 // Large batches are safe when indexing is disabled // MaxWorkers is the number of parallel workers for file processing. - MaxWorkers = 12 // More workers for parallel processing + MaxWorkers = 4 // Reduced workers to prevent overwhelming Qdrant // VectorDim is the dimensionality of the hash vectors. VectorDim = 64 // Single 64-bit hash per collection + // BatchInsertDelay is the delay between batch insertions to rate limit requests. 
+ BatchInsertDelay = 100 * time.Millisecond ) var rankMap map[string]int @@ -71,6 +73,8 @@ func main() { log.Fatal("Error: You must specify a file with the -top-purls option") } + log.Println("FAST IMPORT MODE: Importing data without HNSW indexing, will enable indexing after import completes.") + // Verify that the directory exists if _, err := os.Stat(*csvDir); os.IsNotExist(err) { log.Fatalf("Error: Directory %s does not exist", *csvDir) @@ -104,6 +108,11 @@ func main() { }() log.Println("Connected to Qdrant server successfully") + // Verify connection health before starting + if err := verifyQdrantHealth(ctx, client); err != nil { + log.Fatalf("Qdrant health check failed: %v", err) + } + collections := entities.GetAllSupportedCollections() log.Printf("Will create/check %d language-based collections: %v", len(collections), collections) @@ -210,24 +219,54 @@ func main() { elapsed := time.Since(startTime) log.Printf("Import process completed. Total time: %s", elapsed) - // Show collection statistics if possible + // Enable HNSW indexing on all collections after import + log.Println("\n========================================") + log.Println("Enabling HNSW indexing on all collections...") + log.Println("========================================") + for _, collectionName := range collections { + if err := updateCollectionIndexing(ctx, client, collectionName); err != nil { + log.Printf("ERROR: Failed to enable indexing for %s: %v", collectionName, err) + } + } + log.Println("Indexing enabled for all collections. Qdrant will build indexes in the background.") + + // Show collection statistics + log.Println("\n========================================") + log.Println("Collection Statistics") + log.Println("========================================") for _, collectionName := range collections { showCollectionStats(ctx, client, collectionName) } } +// verifyQdrantHealth checks if Qdrant is healthy and responsive. 
+func verifyQdrantHealth(ctx context.Context, client *qdrant.Client) error { + log.Println("Performing Qdrant health check...") + + // Try to list collections as a basic health check + collections, err := client.ListCollections(ctx) + if err != nil { + return fmt.Errorf("failed to list collections: %w", err) + } + + log.Printf("Qdrant is healthy. Found %d existing collections", len(collections)) + return nil +} + // Create a language-based collection with named vectors (dirs, names, contents). +// Always creates collections with HNSW indexing disabled for fast import. func createCollection(ctx context.Context, client *qdrant.Client, collectionName string) { log.Printf("Creating language-based collection with named vectors: %s", collectionName) + log.Printf("Collection %s: HNSW indexing DISABLED for fast import (m=0)", collectionName) - // Create named vectors configuration for dirs, names, and contents + // Create named vectors configuration with indexing disabled namedVectors := map[string]*qdrant.VectorParams{ "dirs": { Size: VectorDim, Distance: qdrant.Distance_Manhattan, HnswConfig: &qdrant.HnswConfigDiff{ - M: qdrant.PtrOf(uint64(48)), - EfConstruct: qdrant.PtrOf(uint64(500)), + M: qdrant.PtrOf(uint64(0)), // m=0 disables HNSW index building + EfConstruct: qdrant.PtrOf(uint64(100)), FullScanThreshold: qdrant.PtrOf(uint64(100000)), }, }, @@ -235,8 +274,8 @@ func createCollection(ctx context.Context, client *qdrant.Client, collectionName Size: VectorDim, Distance: qdrant.Distance_Manhattan, HnswConfig: &qdrant.HnswConfigDiff{ - M: qdrant.PtrOf(uint64(48)), - EfConstruct: qdrant.PtrOf(uint64(500)), + M: qdrant.PtrOf(uint64(0)), + EfConstruct: qdrant.PtrOf(uint64(100)), FullScanThreshold: qdrant.PtrOf(uint64(100000)), }, }, @@ -244,8 +283,8 @@ func createCollection(ctx context.Context, client *qdrant.Client, collectionName Size: VectorDim, Distance: qdrant.Distance_Manhattan, HnswConfig: &qdrant.HnswConfigDiff{ - M: qdrant.PtrOf(uint64(48)), - EfConstruct: 
qdrant.PtrOf(uint64(500)), + M: qdrant.PtrOf(uint64(0)), + EfConstruct: qdrant.PtrOf(uint64(100)), FullScanThreshold: qdrant.PtrOf(uint64(100000)), }, }, @@ -255,11 +294,12 @@ func createCollection(ctx context.Context, client *qdrant.Client, collectionName err := client.CreateCollection(ctx, &qdrant.CreateCollection{ CollectionName: collectionName, VectorsConfig: qdrant.NewVectorsConfigMap(namedVectors), - // Aggressive optimization for large collections + ShardNumber: qdrant.PtrOf(uint32(4)), // 4 shards for parallel WAL processing + // Optimize for fast import with indexing disabled OptimizersConfig: &qdrant.OptimizersConfigDiff{ DefaultSegmentNumber: qdrant.PtrOf(uint64(32)), // Many segments for parallelism MaxSegmentSize: qdrant.PtrOf(uint64(500000)), // Large segments for efficiency - IndexingThreshold: qdrant.PtrOf(uint64(100000)), // High threshold for performance + IndexingThreshold: qdrant.PtrOf(uint64(0)), // Disable indexing during import }, // Binary quantization for memory efficiency QuantizationConfig: &qdrant.QuantizationConfig{ @@ -365,7 +405,7 @@ func importCSVFile(ctx context.Context, client *qdrant.Client, filePath, sectorN log.Printf("Processing batch %d/%d (%d records) for sector %s", batchNum, (totalRecords+BatchSize-1)/BatchSize, len(batch), sectorName) - // Insert to all three collections in parallel + // Insert batch to collections err := insertBatchToSeparateCollections(ctx, client, batch) if err != nil { log.Printf("WARNING: Error inserting batch %d: %v. 
Continuing with next batch.", batchNum, err) @@ -373,6 +413,9 @@ func importCSVFile(ctx context.Context, client *qdrant.Client, filePath, sectorN } batchesProcessed++ + + // Rate limit to avoid overwhelming Qdrant + time.Sleep(BatchInsertDelay) } log.Printf("All %d batches for sector %s imported successfully", batchesProcessed, sectorName) @@ -566,38 +609,46 @@ func insertBatchToSeparateCollections(ctx context.Context, client *qdrant.Client return nil } - // Insert to language-based collections in parallel using goroutines - var wg sync.WaitGroup - errChan := make(chan error, len(collectionPoints)) - + // Insert to language-based collections sequentially to avoid connection storms + // Qdrant with 4 shards will parallelize internally via separate WAL processing for collectionName, points := range collectionPoints { if len(points) > 0 { - wg.Add(1) - go func(colName string, pts []*qdrant.PointStruct) { - defer wg.Done() - _, err := client.Upsert(ctx, &qdrant.UpsertPoints{ - CollectionName: colName, - Points: pts, - }) - if err != nil { - errChan <- fmt.Errorf("error inserting to %s collection: %w", colName, err) - } else { - log.Printf("Successfully inserted %d points to %s", len(pts), colName) - } - }(collectionName, points) + _, err := client.Upsert(ctx, &qdrant.UpsertPoints{ + CollectionName: collectionName, + Points: points, + }) + if err != nil { + return fmt.Errorf("error inserting to %s collection: %w", collectionName, err) + } + log.Printf("Successfully inserted %d points to %s", len(points), collectionName) } } - wg.Wait() - close(errChan) + return nil +} - // Check for errors - for err := range errChan { - if err != nil { - return err - } +// updateCollectionIndexing enables HNSW indexing on an existing collection. +// This is called automatically after import completes to build indexes in the background. 
+func updateCollectionIndexing(ctx context.Context, client *qdrant.Client, collectionName string) error { + log.Printf("Updating collection %s to enable HNSW indexing...", collectionName) + + // Update indexing threshold + err := client.UpdateCollection(ctx, &qdrant.UpdateCollection{ + CollectionName: collectionName, + HnswConfig: &qdrant.HnswConfigDiff{ + M: qdrant.PtrOf(uint64(48)), + EfConstruct: qdrant.PtrOf(uint64(500)), + FullScanThreshold: qdrant.PtrOf(uint64(100000)), + }, + OptimizersConfig: &qdrant.OptimizersConfigDiff{ + IndexingThreshold: qdrant.PtrOf(uint64(100000)), + }, + }) + if err != nil { + return fmt.Errorf("error updating indexing threshold for %s: %w", collectionName, err) } + log.Printf("Successfully enabled indexing for collection: %s. Qdrant will build indexes in the background.", collectionName) return nil } diff --git a/docker-compose.qdrant.yml b/docker-compose.qdrant.yml index a89129b..aaddd47 100644 --- a/docker-compose.qdrant.yml +++ b/docker-compose.qdrant.yml @@ -3,22 +3,15 @@ services: image: qdrant/qdrant:latest container_name: scanoss-qdrant ports: - - "6333:6333" # HTTP API port - - "6334:6334" # gRPC API port (used by our Go client) + - 6333:6333 # HTTP API port + - 6334:6334 # gRPC API port (used by our Go client) volumes: - - qdrant_data:/qdrant/storage - - qdrant_snapshots:/qdrant/snapshots - environment: - # Enable all CORS origins for development - - QDRANT__SERVICE__HTTP_CORS_ENABLED=true - # Log level - - QDRANT__LOG_LEVEL=INFO - # Performance optimizations - - QDRANT__SERVICE__MAX_REQUEST_SIZE_MB=32 - - QDRANT__SERVICE__GRPC_TIMEOUT_MS=60000 - # Storage paths - - QDRANT__STORAGE__STORAGE_PATH=/qdrant/storage - - QDRANT__STORAGE__SNAPSHOTS_PATH=/qdrant/snapshots + - ./qdrant_data:/qdrant/storage + - ./qdrant-config.yaml:/qdrant/config/production.yaml:ro + expose: + - 6333 + - 6334 + - 6335 restart: unless-stopped networks: - scanoss-network @@ -26,8 +19,6 @@ services: volumes: qdrant_data: driver: local - 
qdrant_snapshots: - driver: local networks: scanoss-network: diff --git a/qdrant-config.yaml b/qdrant-config.yaml new file mode 100644 index 0000000..02d4c02 --- /dev/null +++ b/qdrant-config.yaml @@ -0,0 +1,77 @@ +# Qdrant Configuration for Bulk Import Optimization +# Place this file at: /path/to/qdrant/config/config.yaml +# Or use with Docker: docker run -v $(pwd)/qdrant-config.yaml:/qdrant/config/production.yaml:ro qdrant/qdrant + +log_level: INFO + +storage: + # Storage path for Qdrant data + storage_path: /qdrant/storage + + # Performance optimization for bulk imports + performance: + # Maximum number of concurrent API workers (default: number of CPU cores) + # Increase for better handling of concurrent requests + max_workers: 16 + + # Maximum concurrent updates to shard replicas + # Higher values allow more parallel writes + update_concurrency: 10 + + # Optimizer CPU budget - threads allocated for optimization jobs + # Set to number of cores or less to avoid overloading + optimizer_cpu_budget: 8 + + # Write-Ahead Log (WAL) configuration for better write throughput + wal: + # Size of each WAL segment in MB (default: 32) + # Larger segments can improve write performance + wal_capacity_mb: 64 + + # Number of WAL segments to create in advance (default: 0) + # Pre-creating segments can reduce latency during writes + wal_segments_ahead: 1 + + # Optimizer configuration + optimizers: + # Number of segments created initially (default: auto) + # More segments allow better parallelism during indexing + default_segment_number: 32 + + # Maximum points per segment (default: 100000) + # Larger segments reduce overhead but may slow down updates + max_segment_size: 500000 + + # Vacuum optimizer - controls when to merge small segments + vacuum_min_vector_number: 10000 + + # Memmap threshold - vectors larger than this use disk storage + # Set higher to keep more data in RAM (if you have enough memory) + memmap_threshold_kb: 50000 + +service: + # gRPC port (default: 6334) + 
grpc_port: 6334 + + # HTTP port (default: 6333) + http_port: 6333 + + # Maximum size of POST request in MB (default: 32) + # Increase for larger batch uploads + max_request_size_mb: 128 + + # Maximum allowed search/upsert timeout in seconds + max_timeout_sec: 120 + +# Cluster configuration (optional - only if running Qdrant cluster) +# cluster: +# enabled: false + +# TLS configuration (optional) +# service: +# enable_tls: false +# tls_cert_file: /path/to/cert.pem +# tls_key_file: /path/to/key.pem + +# Telemetry (optional - set to true to disable) +telemetry_disabled: false From c407b7673ffd4e42470ed960ac260770091ccaaf Mon Sep 17 00:00:00 2001 From: Matias Daloia Date: Mon, 13 Oct 2025 18:16:06 +0200 Subject: [PATCH 2/5] fix: lint errors --- cmd/import/main.go | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/cmd/import/main.go b/cmd/import/main.go index 0573037..8c7c206 100644 --- a/cmd/import/main.go +++ b/cmd/import/main.go @@ -100,6 +100,12 @@ func main() { if err != nil { log.Fatalf("Error connecting to Qdrant: %v", err) } + + // Verify connection health before starting + if err := verifyQdrantHealth(ctx, client); err != nil { + log.Fatalf("Qdrant health check failed: %v", err) + } + defer func() { log.Println("Closing connection to Qdrant") if err := client.Close(); err != nil { @@ -108,11 +114,6 @@ func main() { }() log.Println("Connected to Qdrant server successfully") - // Verify connection health before starting - if err := verifyQdrantHealth(ctx, client); err != nil { - log.Fatalf("Qdrant health check failed: %v", err) - } - collections := entities.GetAllSupportedCollections() log.Printf("Will create/check %d language-based collections: %v", len(collections), collections) @@ -130,7 +131,6 @@ func main() { log.Printf("Collection %s exists and overwrite flag is set. 
Dropping collection...", collectionName) err = client.DeleteCollection(ctx, collectionName) if err != nil { - //nolint:gocritic // Error is fatal, defer will not help here log.Fatalf("Error dropping collection %s: %v", collectionName, err) } log.Printf("Collection '%s' dropped successfully", collectionName) @@ -174,7 +174,7 @@ func main() { // Start workers to process files in parallel log.Printf("Starting %d worker(s) to process CSV files...", MaxWorkers) - for i := 0; i < MaxWorkers; i++ { + for workerId := range MaxWorkers { wg.Add(1) go func(workerId int) { defer wg.Done() @@ -191,7 +191,7 @@ func main() { log.Printf("Worker %d: Successfully processed sector %s", workerId, sectorName) } } - }(i) + }(workerId) } // Send files to workers @@ -397,9 +397,7 @@ func importCSVFile(ctx context.Context, client *qdrant.Client, filePath, sectorN batchesProcessed := 0 for i := 0; i < totalRecords; i += BatchSize { end := i + BatchSize - if end > totalRecords { - end = totalRecords - } + end = min(end, totalRecords) batch := validRecords[i:end] batchNum := i/BatchSize + 1 log.Printf("Processing batch %d/%d (%d records) for sector %s", From 86d7379cd224176baac12f87942a4c29e984a3be Mon Sep 17 00:00:00 2001 From: Matias Daloia Date: Mon, 13 Oct 2025 18:24:00 +0200 Subject: [PATCH 3/5] fix: lint errors --- .golangci.yml | 378 ++++++++++++-------------- cmd/import/main.go | 29 +- internal/handler/scan_handler.go | 4 +- internal/protocol/grpc/server.go | 18 +- internal/protocol/rest/server.go | 12 +- internal/service/scan_service_impl.go | 2 +- 6 files changed, 215 insertions(+), 228 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index 8e3c082..d325332 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -1,235 +1,211 @@ # golangci-lint configuration # https://golangci-lint.run/usage/configuration/ +version: "2" run: timeout: 5m tests: true -output: - formats: - - format: colored-line-number - print-issued-lines: true - print-linter-name: true - sort-results: true 
+formatters: + enable: + - goimports + - gofmt + - gofumpt + + settings: + goimports: + local-prefixes: [github.com/scanoss/folder-hashing-api] linters: - disable-all: true enable: # Enabled by default - - errcheck # Checks for unchecked errors - - gosimple # Simplifies code - - govet # Reports suspicious constructs - - ineffassign # Detects ineffectual assignments - - staticcheck # Comprehensive static analysis - - unused # Checks for unused code + - errcheck # Checks for unchecked errors + - govet # Reports suspicious constructs + - ineffassign # Detects ineffectual assignments + - staticcheck # Comprehensive static analysis + - unused # Checks for unused code # Error handling & bugs - - errorlint # Find code that will cause problems with error wrapping - - nilerr # Finds code that returns nil even if error is not nil + - errorlint # Find code that will cause problems with error wrapping + - nilerr # Finds code that returns nil even if error is not nil # Performance - - prealloc # Finds slice declarations that could potentially be pre-allocated - - bodyclose # Checks whether HTTP response body is closed successfully + - prealloc # Finds slice declarations that could potentially be pre-allocated + - bodyclose # Checks whether HTTP response body is closed successfully # Style & code quality - - gofmt # Checks whether code is formatted - - gofumpt # Stricter gofmt - - goimports # Checks import formatting - - revive # Fast, configurable, extensible, flexible linter - - stylecheck # Stylecheck is a replacement for golint - - unconvert # Removes unnecessary type conversions - - unparam # Reports unused function parameters - - whitespace # Checks for unnecessary newlines - - misspell # Finds commonly misspelled English words + - revive # Fast, configurable, extensible, flexible linter + - unconvert # Removes unnecessary type conversions + - unparam # Reports unused function parameters + - whitespace # Checks for unnecessary newlines + - misspell # Finds commonly 
misspelled English words # Code complexity - - gocognit # Computes and checks cognitive complexity - - gocyclo # Computes and checks cyclomatic complexity - - funlen # Checks function length - - nestif # Reports deeply nested if statements + - gocognit # Computes and checks cognitive complexity + - gocyclo # Computes and checks cyclomatic complexity + - funlen # Checks function length + - nestif # Reports deeply nested if statements # Potential bugs & best practices - - goconst # Finds repeated strings that could be constants - - gocritic # Comprehensive and opinionated linter - - gosec # Security-focused linter - - noctx # Finds HTTP requests without context.Context + - goconst # Finds repeated strings that could be constants + - gocritic # Comprehensive and opinionated linter + - gosec # Security-focused linter + - noctx # Finds HTTP requests without context.Context - sqlclosecheck # Checks sql.Rows and sql.Stmt are closed # Code correctness - - exhaustive # Checks exhaustiveness of enum switch statements - - makezero # Finds slice declarations with non-zero initial length - - predeclared # Finds code that shadows predeclared identifiers - - reassign # Checks that package variables are not reassigned + - exhaustive # Checks exhaustiveness of enum switch statements + - makezero # Finds slice declarations with non-zero initial length + - predeclared # Finds code that shadows predeclared identifiers + - reassign # Checks that package variables are not reassigned - usestdlibvars # Detects the possibility to use standard lib variables # Comments & documentation - - godot # Checks if comments end in a period + - godot # Checks if comments end in a period # Linter management - - nolintlint # Reports ill-formed or insufficient nolint directives - -linters-settings: - errcheck: - check-type-assertions: true - check-blank: true - exclude-functions: - - (*database/sql.Rows).Close - - (*database/sql.Stmt).Close - - errorlint: - errorf: true - asserts: true - comparison: true 
- - exhaustive: - check: - - switch - - map - default-signifies-exhaustive: false - - funlen: - lines: 120 - statements: 60 - ignore-comments: true - - gocognit: - min-complexity: 20 - - gocyclo: - min-complexity: 15 - - goconst: - min-len: 3 - min-occurrences: 3 - ignore-tests: true - - gocritic: - enabled-tags: - - diagnostic - - style - - performance - - experimental - - opinionated - disabled-checks: - - whyNoLint - - unnamedResult - settings: - hugeParam: - sizeThreshold: 256 - - gofumpt: - extra-rules: true - - goimports: - local-prefixes: github.com/scanoss/folder-hashing-api - - gosec: - severity: medium - confidence: medium - excludes: - - G204 # Audit use of command execution - we'll check these manually - - G304 # File path provided as taint input - check manually - - govet: - enable-all: true - disable: - - fieldalignment # Can be too noisy - - shadow # Often produces false positives - - misspell: - locale: US - ignore-words: - - scanoss - - nestif: - min-complexity: 5 - - nolintlint: - allow-unused: false - allow-no-explanation: [] - require-explanation: true - require-specific: true - - prealloc: - simple: true - range-loops: true - for-loops: true - - revive: + - nolintlint # Reports ill-formed or insufficient nolint directives + + settings: + errcheck: + check-type-assertions: true + check-blank: true + exclude-functions: + - (*database/sql.Rows).Close + - (*database/sql.Stmt).Close + + errorlint: + errorf: true + asserts: true + comparison: true + + exhaustive: + check: + - switch + - map + default-signifies-exhaustive: false + + funlen: + lines: 120 + statements: 60 + ignore-comments: true + + gocognit: + min-complexity: 20 + + gocyclo: + min-complexity: 15 + + goconst: + min-len: 3 + min-occurrences: 3 + + gocritic: + enabled-tags: + - diagnostic + - style + - performance + - experimental + - opinionated + disabled-checks: + - whyNoLint + - unnamedResult + settings: + hugeParam: + sizeThreshold: 256 + + gosec: + severity: medium + confidence: 
medium + excludes: + - G204 # Audit use of command execution - we'll check these manually + - G304 # File path provided as taint input - check manually + + govet: + enable-all: true + disable: + - fieldalignment # Can be too noisy + - shadow # Often produces false positives + + misspell: + locale: US + + nestif: + min-complexity: 5 + + nolintlint: + allow-unused: false + allow-no-explanation: [] + require-explanation: true + require-specific: true + + prealloc: + simple: true + range-loops: true + for-loops: true + + revive: + rules: + - name: blank-imports + - name: context-as-argument + - name: context-keys-type + - name: dot-imports + - name: error-return + - name: error-strings + - name: error-naming + - name: exported + - name: increment-decrement + - name: var-naming + - name: var-declaration + - name: package-comments + - name: range + - name: receiver-naming + - name: time-naming + - name: unexported-return + - name: indent-error-flow + - name: errorf + - name: empty-block + - name: superfluous-else + - name: unused-parameter + - name: unreachable-code + - name: redefines-builtin-id + + staticcheck: + checks: ["all"] + + unparam: + check-exported: false + + exclusions: + warn-unused: false + rules: - - name: blank-imports - - name: context-as-argument - - name: context-keys-type - - name: dot-imports - - name: error-return - - name: error-strings - - name: error-naming - - name: exported - - name: increment-decrement - - name: var-naming - - name: var-declaration - - name: package-comments - - name: range - - name: receiver-naming - - name: time-naming - - name: unexported-return - - name: indent-error-flow - - name: errorf - - name: empty-block - - name: superfluous-else - - name: unused-parameter - - name: unreachable-code - - name: redefines-builtin-id - - staticcheck: - checks: ["all"] - - stylecheck: - checks: ["all", "-ST1000", "-ST1003"] - dot-import-whitelist: [] - http-status-code-whitelist: [] - - unparam: - check-exported: false + - path: 
_test\.go + linters: + - gocognit + - gocyclo + - funlen + - goconst + - gosec + - errcheck + + # Exclude godot for TODO comments + - source: "// TODO" + linters: + - godot + + # Allow complex main functions + - path: cmd/ + linters: + - funlen + - gocognit + + paths: + - vendor + - third_party + - ".*\\.pb\\.go$" + - ".*_gen\\.go$" issues: max-issues-per-linter: 0 max-same-issues: 0 - - exclude-dirs: - - vendor - - third_party - - exclude-files: - - ".*\\.pb\\.go$" - - ".*_gen\\.go$" - - exclude-rules: - # Exclude some linters from running on tests files - - path: _test\.go - linters: - - gocognit - - gocyclo - - funlen - - goconst - - gosec - - errcheck - - # Exclude godot for TODO comments - - source: "// TODO" - linters: - - godot - - # Allow complex main functions - - path: cmd/ - linters: - - funlen - - gocognit - - # Exclude ineffassign for err variables that are shadowed - - linters: - - ineffassign - text: "ineffectual assignment to err" - - exclude-use-default: false - exclude-case-sensitive: false diff --git a/cmd/import/main.go b/cmd/import/main.go index 8c7c206..5746c38 100644 --- a/cmd/import/main.go +++ b/cmd/import/main.go @@ -131,7 +131,7 @@ func main() { log.Printf("Collection %s exists and overwrite flag is set. 
Dropping collection...", collectionName) err = client.DeleteCollection(ctx, collectionName) if err != nil { - log.Fatalf("Error dropping collection %s: %v", collectionName, err) + cleanupAndExit(client, "Error dropping collection %s: %v", collectionName, err) } log.Printf("Collection '%s' dropped successfully", collectionName) collectionExists = false @@ -149,7 +149,7 @@ func main() { log.Printf("Reading directory '%s' for CSV files...", *csvDir) files, err := os.ReadDir(*csvDir) if err != nil { - log.Fatalf("Error reading directory: %v", err) + cleanupAndExit(client, "Error reading directory: %v", err) } var csvFiles []string @@ -174,24 +174,24 @@ func main() { // Start workers to process files in parallel log.Printf("Starting %d worker(s) to process CSV files...", MaxWorkers) - for workerId := range MaxWorkers { + for workerID := range MaxWorkers { wg.Add(1) - go func(workerId int) { + go func(workerID int) { defer wg.Done() for file := range filesChan { sectorName := filepath.Base(file) sectorName = strings.TrimSuffix(sectorName, ".csv") - log.Printf("Worker %d: Processing sector %s", workerId, sectorName) + log.Printf("Worker %d: Processing sector %s", workerID, sectorName) err := importCSVFile(ctx, client, file, sectorName) if err != nil { - log.Printf("Worker %d: Error importing file %s: %v", workerId, file, err) + log.Printf("Worker %d: Error importing file %s: %v", workerID, file, err) errorsChan <- fmt.Errorf("error importing file %s: %w", file, err) } else { - log.Printf("Worker %d: Successfully processed sector %s", workerId, sectorName) + log.Printf("Worker %d: Successfully processed sector %s", workerID, sectorName) } } - }(workerId) + }(workerID) } // Send files to workers @@ -253,6 +253,17 @@ func verifyQdrantHealth(ctx context.Context, client *qdrant.Client) error { return nil } +// Gracefully terminate qdrant client. +func cleanupAndExit(client *qdrant.Client, format string, args ...any) { + log.Printf(format, args...) 
+ if client != nil { + if err := client.Close(); err != nil { + log.Printf("Failed to close client: %v", err) + } + } + os.Exit(1) +} + // Create a language-based collection with named vectors (dirs, names, contents). // Always creates collections with HNSW indexing disabled for fast import. func createCollection(ctx context.Context, client *qdrant.Client, collectionName string) { @@ -311,7 +322,7 @@ func createCollection(ctx context.Context, client *qdrant.Client, collectionName }, }) if err != nil { - log.Fatalf("Error creating collection %s: %v", collectionName, err) + cleanupAndExit(client, "Error creating collection %s: %v", collectionName, err) } log.Printf("Collection '%s' with named vectors created successfully", collectionName) diff --git a/internal/handler/scan_handler.go b/internal/handler/scan_handler.go index c030f0c..5d4a5ce 100644 --- a/internal/handler/scan_handler.go +++ b/internal/handler/scan_handler.go @@ -39,10 +39,10 @@ type ScanHandler struct { } // NewScanHandler creates a new scan handler. -func NewScanHandler(scanService service.ScanService, mapper mapper.ScanMapper) *ScanHandler { +func NewScanHandler(scanService service.ScanService, scanMapper mapper.ScanMapper) *ScanHandler { return &ScanHandler{ scanService: scanService, - mapper: mapper, + mapper: scanMapper, } } diff --git a/internal/protocol/grpc/server.go b/internal/protocol/grpc/server.go index 11498d4..77e9dc2 100644 --- a/internal/protocol/grpc/server.go +++ b/internal/protocol/grpc/server.go @@ -28,13 +28,13 @@ import ( ) // RunServer runs gRPC service to publish. 
-func RunServer(config *config.Config, handler pb.ScanningServer, port string, allowedIPs, deniedIPs []string, startTLS bool, version string) (*grpc.Server, error) { +func RunServer(cfg *config.Config, handler pb.ScanningServer, port string, allowedIPs, deniedIPs []string, startTLS bool, version string) (*grpc.Server, error) { // Start up Open Telemetry is requested oltpShutdown := func() {} - if config.Telemetry.Enabled { + if cfg.Telemetry.Enabled { var err error - oltpShutdown, err = otel.InitTelemetryProviders(config.App.Name, "scanoss-hfh", version, - config.Telemetry.OltpExporter, otel.GetTraceSampler(config.App.Mode), false) + oltpShutdown, err = otel.InitTelemetryProviders(cfg.App.Name, "scanoss-hfh", version, + cfg.Telemetry.OltpExporter, otel.GetTraceSampler(cfg.App.Mode), false) if err != nil { return nil, err } @@ -43,14 +43,14 @@ func RunServer(config *config.Config, handler pb.ScanningServer, port string, al // Configure the port, interceptors, TLS and register the service listen, server, err := gs.SetupGrpcServer( port, - config.TLS.CertFile, - config.TLS.KeyFile, + cfg.TLS.CertFile, + cfg.TLS.KeyFile, allowedIPs, deniedIPs, startTLS, - config.Filtering.BlockByDefault, - config.Filtering.TrustProxy, - config.Telemetry.Enabled, + cfg.Filtering.BlockByDefault, + cfg.Filtering.TrustProxy, + cfg.Telemetry.Enabled, ) if err != nil { oltpShutdown() diff --git a/internal/protocol/rest/server.go b/internal/protocol/rest/server.go index f567c2c..dde1e02 100644 --- a/internal/protocol/rest/server.go +++ b/internal/protocol/rest/server.go @@ -32,17 +32,17 @@ import ( // RunServer runs REST grpc gateway to forward requests onto the gRPC server. 
// //nolint:revive // context position is determined by existing API design -func RunServer(config *config.Config, ctx context.Context, grpcPort, httpPort string, allowedIPs, deniedIPs []string, startTLS bool) (*http.Server, error) { +func RunServer(cfg *config.Config, ctx context.Context, grpcPort, httpPort string, allowedIPs, deniedIPs []string, startTLS bool) (*http.Server, error) { // configure the gateway for forwarding to gRPC srv, mux, grpcGateway, opts, err := gw.SetupGateway( grpcPort, httpPort, - config.TLS.CertFile, - config.TLS.CN, + cfg.TLS.CertFile, + cfg.TLS.CN, allowedIPs, deniedIPs, - config.Filtering.BlockByDefault, - config.Filtering.TrustProxy, + cfg.Filtering.BlockByDefault, + cfg.Filtering.TrustProxy, startTLS, ) if err != nil { @@ -56,7 +56,7 @@ func RunServer(config *config.Config, ctx context.Context, grpcPort, httpPort st if err := pb.RegisterScanningHandlerFromEndpoint(ctx2, mux, grpcGateway, opts); err != nil { zlog.S.Panicf("Failed to start HTTP gateway %v", err) } - gw.StartGateway(srv, config.TLS.CertFile, config.TLS.KeyFile, startTLS) + gw.StartGateway(srv, cfg.TLS.CertFile, cfg.TLS.KeyFile, startTLS) }() return srv, nil diff --git a/internal/service/scan_service_impl.go b/internal/service/scan_service_impl.go index c2abb2e..bd8d8b7 100644 --- a/internal/service/scan_service_impl.go +++ b/internal/service/scan_service_impl.go @@ -120,7 +120,7 @@ func (s *ScanServiceImpl) scanNode(ctx context.Context, node *entities.FolderNod logger.Debugf("SearchByHashes returned %d component groups for node %s", len(componentGroups), node.PathID) // Skip recursive threshold check for root node when depth is enabled (has children) - shouldCheckThreshold := !(isRoot && len(node.Children) > 0) + shouldCheckThreshold := !isRoot || len(node.Children) == 0 // Check if any component group has a version with score >= recursiveThreshold if shouldCheckThreshold && recursiveThreshold > 0 && s.hasHighScoreMatch(componentGroups, recursiveThreshold) { From 
6af826692df4ed662770d785efd60fadb699103a Mon Sep 17 00:00:00 2001 From: Matias Daloia Date: Mon, 13 Oct 2025 18:26:25 +0200 Subject: [PATCH 4/5] fix: update lint workflow --- .github/workflows/golangci-lint.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/golangci-lint.yml b/.github/workflows/golangci-lint.yml index ccd19bb..b5965db 100644 --- a/.github/workflows/golangci-lint.yml +++ b/.github/workflows/golangci-lint.yml @@ -4,9 +4,9 @@ name: Golang CI Lint on: workflow_dispatch: push: - branches: [ "main" ] + branches: ["main"] pull_request: - branches: [ "main" ] + branches: ["main"] jobs: build: @@ -14,7 +14,7 @@ jobs: steps: - uses: actions/checkout@v4 with: - fetch-depth: 0 # Get tags to allow build script to get build version + fetch-depth: 0 # Get tags to allow build script to get build version - name: Set up Go uses: actions/setup-go@v5 @@ -25,6 +25,6 @@ jobs: run: make version - name: golangci-lint - uses: golangci/golangci-lint-action@v6 + uses: golangci/golangci-lint-action@v8 with: args: --timeout=5m From 85e020de2527a29950e5a5dcb7250a148d94eb65 Mon Sep 17 00:00:00 2001 From: Matias Daloia Date: Mon, 13 Oct 2025 18:27:49 +0200 Subject: [PATCH 5/5] fix: pr comment --- cmd/import/main.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cmd/import/main.go b/cmd/import/main.go index 5746c38..2db18b3 100644 --- a/cmd/import/main.go +++ b/cmd/import/main.go @@ -408,7 +408,9 @@ func importCSVFile(ctx context.Context, client *qdrant.Client, filePath, sectorN batchesProcessed := 0 for i := 0; i < totalRecords; i += BatchSize { end := i + BatchSize - end = min(end, totalRecords) + if end > totalRecords { + end = totalRecords + } batch := validRecords[i:end] batchNum := i/BatchSize + 1 log.Printf("Processing batch %d/%d (%d records) for sector %s",