diff --git a/CHANGELOG.md b/CHANGELOG.md index 94576247a..0942241f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Added config option `settings.maxFileSize` to control the maximum file size zoekt will index. ([#118](https://github.com/sourcebot-dev/sourcebot/pull/118)) + ## [2.6.0] - 2024-12-02 ### Added diff --git a/packages/backend/src/constants.ts b/packages/backend/src/constants.ts index 6db4289e6..579b0b169 100644 --- a/packages/backend/src/constants.ts +++ b/packages/backend/src/constants.ts @@ -1,3 +1,4 @@ +import { Settings } from "./types.js"; /** * The interval to reindex a given repository. @@ -7,4 +8,11 @@ export const REINDEX_INTERVAL_MS = 1000 * 60 * 60; /** * The interval to re-sync the config. */ -export const RESYNC_CONFIG_INTERVAL_MS = 1000 * 60 * 60 * 24; \ No newline at end of file +export const RESYNC_CONFIG_INTERVAL_MS = 1000 * 60 * 60 * 24; + +/** + * Default settings. + */ +export const DEFAULT_SETTINGS: Settings = { + maxFileSize: 2 * 1024 * 1024, // 2MB in bytes +} \ No newline at end of file diff --git a/packages/backend/src/db.test.ts b/packages/backend/src/db.test.ts new file mode 100644 index 000000000..fc03eccf2 --- /dev/null +++ b/packages/backend/src/db.test.ts @@ -0,0 +1,32 @@ +import { expect, test } from 'vitest'; +import { migration_addMaxFileSize, migration_addSettings, Schema } from './db'; +import { DEFAULT_SETTINGS } from './constants'; +import { DeepPartial } from './types'; + + +test('migration_addSettings adds the `settings` field with defaults if it does not exist', () => { + const schema: DeepPartial = {}; + + const migratedSchema = migration_addSettings(schema as Schema); + expect(migratedSchema).toStrictEqual({ + settings: DEFAULT_SETTINGS, + }); +}); + +test('migration_addMaxFileSize adds the `maxFileSize` field with the default value if it does not exist', () => { + const schema: DeepPartial = { + settings: {}, + } + + const migratedSchema = migration_addMaxFileSize(schema as Schema); + expect(migratedSchema).toStrictEqual({ + settings: { + maxFileSize: DEFAULT_SETTINGS.maxFileSize, + } + }); +}); + +test('migration_addMaxFileSize will throw if `settings` is not defined', () => { + const schema: DeepPartial = {}; + expect(() => migration_addMaxFileSize(schema as Schema)).toThrow(); +}); \ No newline at end of file diff --git a/packages/backend/src/db.ts b/packages/backend/src/db.ts index 6cc59cd89..a9e603640 100644 --- a/packages/backend/src/db.ts +++ b/packages/backend/src/db.ts @@ -1,8 +1,13 @@ import { JSONFilePreset } from "lowdb/node"; import { type Low } from "lowdb"; -import { AppContext, Repository } from "./types.js"; +import { AppContext, Repository, Settings } from "./types.js"; +import { DEFAULT_SETTINGS } from "./constants.js"; +import { createLogger } from "./logger.js"; -type Schema = { +const logger = createLogger('db'); + +export type Schema = { + settings: Settings, repos: { [key: string]: Repository; } @@ -11,9 +16,16 @@ type Schema = { export type Database = Low; export const loadDB = async (ctx: AppContext): Promise => { - const db = await JSONFilePreset(`${ctx.cachePath}/db.json`, { repos: {} }); + const db = await JSONFilePreset(`${ctx.cachePath}/db.json`, { + repos: {}, + settings: DEFAULT_SETTINGS, + }); + + await applyMigrations(db); + return db; } + export const updateRepository = async (repoId: string, data: Repository, db: Database) => { db.data.repos[repoId] = { ...db.data.repos[repoId], @@ -22,7 +34,49 @@ export const updateRepository = async (repoId: string, data: Repository, db: Dat await db.write(); } +export const updateSettings = async (settings: Settings, db: Database) => { + db.data.settings = settings; + await db.write(); +} + export const createRepository = async (repo: Repository, db: Database) => { db.data.repos[repo.id] = repo; await db.write(); +} + +export const applyMigrations = async (db: Database) => { + const log = (name: string) => { + logger.info(`Applying migration '${name}'`); + } + + await db.update((schema) => { + // @NOTE: please ensure new migrations are added after older ones! + schema = migration_addSettings(schema, log); + schema = migration_addMaxFileSize(schema, log); + return schema; + }); +} + +/** + * @see: https://github.com/sourcebot-dev/sourcebot/pull/118 + */ +export const migration_addSettings = (schema: Schema, log?: (name: string) => void) => { + if (!schema.settings) { + log?.("addSettings"); + schema.settings = DEFAULT_SETTINGS; + } + + return schema; +} + +/** + * @see: https://github.com/sourcebot-dev/sourcebot/pull/118 + */ +export const migration_addMaxFileSize = (schema: Schema, log?: (name: string) => void) => { + if (!schema.settings.maxFileSize) { + log?.("addMaxFileSize"); + schema.settings.maxFileSize = DEFAULT_SETTINGS.maxFileSize; + } + + return schema; } \ No newline at end of file diff --git a/packages/backend/src/main.test.ts b/packages/backend/src/main.test.ts index 2fc6946c9..37e009a08 100644 --- a/packages/backend/src/main.test.ts +++ b/packages/backend/src/main.test.ts @@ -1,6 +1,6 @@ import { expect, test } from 'vitest'; -import { isRepoReindxingRequired } from './main'; -import { Repository } from './types'; +import { isAllRepoReindexingRequired, isRepoReindexingRequired } from './main'; +import { Repository, Settings } from './types'; test('isRepoReindexingRequired should return false when no changes are made', () => { const previous: Repository = { @@ -15,7 +15,7 @@ test('isRepoReindexingRequired should return false when no changes are made', () }; const current = previous; - expect(isRepoReindxingRequired(previous, current)).toBe(false); + expect(isRepoReindexingRequired(previous, current)).toBe(false); }) test('isRepoReindexingRequired should return true when git branches change', () => { @@ -35,7 +35,7 @@ test('isRepoReindexingRequired should return true when git branches change', () branches: ['main', 'feature'] }; - expect(isRepoReindxingRequired(previous, current)).toBe(true); + expect(isRepoReindexingRequired(previous, current)).toBe(true); }); test('isRepoReindexingRequired should return true when git tags change', () => { @@ -55,7 +55,7 @@ test('isRepoReindexingRequired should return true when git tags change', () => { tags: ['v1.0', 'v2.0'] }; - expect(isRepoReindxingRequired(previous, current)).toBe(true); + expect(isRepoReindexingRequired(previous, current)).toBe(true); }); test('isRepoReindexingRequired should return true when local excludedPaths change', () => { @@ -74,5 +74,26 @@ test('isRepoReindexingRequired should return true when local excludedPaths chang excludedPaths: ['node_modules', 'dist'] }; - expect(isRepoReindxingRequired(previous, current)).toBe(true); + expect(isRepoReindexingRequired(previous, current)).toBe(true); }); + +test('isAllRepoReindexingRequired should return false when fileLimitSize has not changed', () => { + const previous: Settings = { + maxFileSize: 1000, + } + const current: Settings = { + ...previous, + } + expect(isAllRepoReindexingRequired(previous, current)).toBe(false); +}); + +test('isAllRepoReindexingRequired should return true when fileLimitSize has changed', () => { + const previous: Settings = { + maxFileSize: 1000, + } + const current: Settings = { + ...previous, + maxFileSize: 2000, + } + expect(isAllRepoReindexingRequired(previous, current)).toBe(true); +}); \ No newline at end of file diff --git a/packages/backend/src/main.ts b/packages/backend/src/main.ts index 8459a3558..50064d92c 100644 --- a/packages/backend/src/main.ts +++ b/packages/backend/src/main.ts @@ -5,12 +5,12 @@ import { getGitHubReposFromConfig } from "./github.js"; import { getGitLabReposFromConfig } from "./gitlab.js"; import { getGiteaReposFromConfig } from "./gitea.js"; import { getGerritReposFromConfig } from "./gerrit.js"; -import { AppContext, LocalRepository, GitRepository, Repository } from "./types.js"; +import { AppContext, LocalRepository, GitRepository, Repository, Settings } from "./types.js"; import { cloneRepository, fetchRepository } from "./git.js"; import { createLogger } from "./logger.js"; -import { createRepository, Database, loadDB, updateRepository } from './db.js'; +import { createRepository, Database, loadDB, updateRepository, updateSettings } from './db.js'; import { arraysEqualShallow, isRemotePath, measure } from "./utils.js"; -import { REINDEX_INTERVAL_MS, RESYNC_CONFIG_INTERVAL_MS } from "./constants.js"; +import { DEFAULT_SETTINGS, REINDEX_INTERVAL_MS, RESYNC_CONFIG_INTERVAL_MS } from "./constants.js"; import stripJsonComments from 'strip-json-comments'; import { indexGitRepository, indexLocalRepository } from "./zoekt.js"; import { getLocalRepoFromConfig, initLocalRepoFileWatchers } from "./local.js"; @@ -18,7 +18,7 @@ import { captureEvent } from "./posthog.js"; const logger = createLogger('main'); -const syncGitRepository = async (repo: GitRepository, ctx: AppContext) => { +const syncGitRepository = async (repo: GitRepository, settings: Settings, ctx: AppContext) => { let fetchDuration_s: number | undefined = undefined; let cloneDuration_s: number | undefined = undefined; @@ -46,7 +46,7 @@ const syncGitRepository = async (repo: GitRepository, ctx: AppContext) => { } logger.info(`Indexing ${repo.id}...`); - const { durationMs } = await measure(() => indexGitRepository(repo, ctx)); + const { durationMs } = await measure(() => indexGitRepository(repo, settings, ctx)); const indexDuration_s = durationMs / 1000; logger.info(`Indexed ${repo.id} in ${indexDuration_s}s`); @@ -57,9 +57,9 @@ const syncGitRepository = async (repo: GitRepository, ctx: AppContext) => { } } -const syncLocalRepository = async (repo: LocalRepository, ctx: AppContext, signal?: AbortSignal) => { +const syncLocalRepository = async (repo: LocalRepository, settings: Settings, ctx: AppContext, signal?: AbortSignal) => { logger.info(`Indexing ${repo.id}...`); - const { durationMs } = await measure(() => indexLocalRepository(repo, ctx, signal)); + const { durationMs } = await measure(() => indexLocalRepository(repo, settings, ctx, signal)); const indexDuration_s = durationMs / 1000; logger.info(`Indexed ${repo.id} in ${indexDuration_s}s`); return { @@ -67,8 +67,11 @@ const syncLocalRepository = async (repo: LocalRepository, ctx: AppContext, signa } } -export const isRepoReindxingRequired = (previous: Repository, current: Repository) => { - +/** + * Certain configuration changes (e.g., a branch is added) require + * a reindexing of the repository. + */ +export const isRepoReindexingRequired = (previous: Repository, current: Repository) => { /** * Checks if the any of the `revisions` properties have changed. */ @@ -100,6 +103,16 @@ export const isRepoReindxingRequired = (previous: Repository, current: Repositor ) } +/** + * Certain settings changes (e.g., the file limit size is changed) require + * a reindexing of _all_ repositories. + */ +export const isAllRepoReindexingRequired = (previous: Settings, current: Settings) => { + return ( + previous?.maxFileSize !== current?.maxFileSize + ) +} + const syncConfig = async (configPath: string, db: Database, signal: AbortSignal, ctx: AppContext) => { const configContent = await (async () => { if (isRemotePath(configPath)) { @@ -121,6 +134,13 @@ const syncConfig = async (configPath: string, db: Database, signal: AbortSignal, // @todo: we should validate the configuration file's structure here. const config = JSON.parse(stripJsonComments(configContent)) as SourcebotConfigurationSchema; + // Update the settings + const updatedSettings: Settings = { + maxFileSize: config.settings?.maxFileSize ?? DEFAULT_SETTINGS.maxFileSize, + } + const _isAllRepoReindexingRequired = isAllRepoReindexingRequired(db.data.settings, updatedSettings); + await updateSettings(updatedSettings, db); + // Fetch all repositories from the config file let configRepos: Repository[] = []; for (const repoConfig of config.repos ?? []) { @@ -172,7 +192,7 @@ const syncConfig = async (configPath: string, db: Database, signal: AbortSignal, for (const newRepo of configRepos) { if (newRepo.id in db.data.repos) { const existingRepo = db.data.repos[newRepo.id]; - const isReindexingRequired = isRepoReindxingRequired(existingRepo, newRepo); + const isReindexingRequired = _isAllRepoReindexingRequired || isRepoReindexingRequired(existingRepo, newRepo); if (isReindexingRequired) { logger.info(`Marking ${newRepo.id} for reindexing due to configuration change.`); } @@ -244,7 +264,7 @@ export const main = async (context: AppContext) => { const localRepos = Object.values(db.data.repos).filter(repo => repo.vcs === 'local'); initLocalRepoFileWatchers(localRepos, async (repo, signal) => { logger.info(`Change detected to local repository ${repo.id}. Re-syncing...`); - await syncLocalRepository(repo, context, signal); + await syncLocalRepository(repo, db.data.settings, context, signal); await db.update(({ repos }) => repos[repo.id].lastIndexedDate = new Date().toUTCString()); }); } @@ -285,12 +305,12 @@ export const main = async (context: AppContext) => { let cloneDuration_s: number | undefined; if (repo.vcs === 'git') { - const stats = await syncGitRepository(repo, context); + const stats = await syncGitRepository(repo, db.data.settings, context); indexDuration_s = stats.indexDuration_s; fetchDuration_s = stats.fetchDuration_s; cloneDuration_s = stats.cloneDuration_s; } else if (repo.vcs === 'local') { - const stats = await syncLocalRepository(repo, context); + const stats = await syncLocalRepository(repo, db.data.settings, context); indexDuration_s = stats.indexDuration_s; } diff --git a/packages/backend/src/schemas/v2.ts b/packages/backend/src/schemas/v2.ts index 446eea8f3..9e95a264d 100644 --- a/packages/backend/src/schemas/v2.ts +++ b/packages/backend/src/schemas/v2.ts @@ -7,11 +7,21 @@ export type Repos = GitHubConfig | GitLabConfig | GiteaConfig | GerritConfig | L */ export interface SourcebotConfigurationSchema { $schema?: string; + settings?: Settings; /** * Defines a collection of repositories from varying code hosts that Sourcebot should sync with. */ repos?: Repos[]; } +/** + * Global settings. These settings are applied to all repositories. + */ +export interface Settings { + /** + * The maximum size of a file (in bytes) to be indexed. Files that exceed this maximum will not be inexed. Defaults to 2MB (2097152 bytes). + */ + maxFileSize?: number; +} export interface GitHubConfig { /** * GitHub Configuration diff --git a/packages/backend/src/types.ts b/packages/backend/src/types.ts index 7d699790d..b6893518b 100644 --- a/packages/backend/src/types.ts +++ b/packages/backend/src/types.ts @@ -1,4 +1,3 @@ - interface BaseRepository { vcs: 'git' | 'local'; id: string; @@ -42,3 +41,12 @@ export type AppContext = { configPath: string; } + +export type Settings = { + maxFileSize: number; +} + +// @see : https://stackoverflow.com/a/61132308 +export type DeepPartial = T extends object ? { + [P in keyof T]?: DeepPartial; +} : T; \ No newline at end of file diff --git a/packages/backend/src/zoekt.ts b/packages/backend/src/zoekt.ts index 04947b167..ffeadf7e9 100644 --- a/packages/backend/src/zoekt.ts +++ b/packages/backend/src/zoekt.ts @@ -1,16 +1,16 @@ import { exec } from "child_process"; -import { AppContext, GitRepository, LocalRepository } from "./types.js"; +import { AppContext, GitRepository, LocalRepository, Settings } from "./types.js"; const ALWAYS_EXCLUDED_DIRS = ['.git', '.hg', '.svn']; -export const indexGitRepository = async (repo: GitRepository, ctx: AppContext) => { +export const indexGitRepository = async (repo: GitRepository, settings: Settings, ctx: AppContext) => { const revisions = [ 'HEAD', ...repo.branches ?? [], ...repo.tags ?? [], ]; - const command = `zoekt-git-index -allow_missing_branches -index ${ctx.indexPath} -branches ${revisions.join(',')} ${repo.path}`; + const command = `zoekt-git-index -allow_missing_branches -index ${ctx.indexPath} -file_limit ${settings.maxFileSize} -branches ${revisions.join(',')} ${repo.path}`; return new Promise<{ stdout: string, stderr: string }>((resolve, reject) => { exec(command, (error, stdout, stderr) => { @@ -26,9 +26,9 @@ export const indexGitRepository = async (repo: GitRepository, ctx: AppContext) = }); } -export const indexLocalRepository = async (repo: LocalRepository, ctx: AppContext, signal?: AbortSignal) => { +export const indexLocalRepository = async (repo: LocalRepository, settings: Settings, ctx: AppContext, signal?: AbortSignal) => { const excludedDirs = [...ALWAYS_EXCLUDED_DIRS, repo.excludedPaths]; - const command = `zoekt-index -index ${ctx.indexPath} -ignore_dirs ${excludedDirs.join(',')} ${repo.path}`; + const command = `zoekt-index -index ${ctx.indexPath} -file_limit ${settings.maxFileSize} -ignore_dirs ${excludedDirs.join(',')} ${repo.path}`; return new Promise<{ stdout: string, stderr: string }>((resolve, reject) => { exec(command, { signal }, (error, stdout, stderr) => { diff --git a/schemas/v2/index.json b/schemas/v2/index.json index 899676687..90e86042f 100644 --- a/schemas/v2/index.json +++ b/schemas/v2/index.json @@ -477,12 +477,28 @@ "$ref": "#/definitions/LocalConfig" } ] + }, + "Settings": { + "type": "object", + "description": "Global settings. These settings are applied to all repositories.", + "properties": { + "maxFileSize": { + "type": "integer", + "description": "The maximum size of a file (in bytes) to be indexed. Files that exceed this maximum will not be inexed. Defaults to 2MB (2097152 bytes).", + "default": 2097152, + "minimum": 1 + } + }, + "additionalProperties": false } }, "properties": { "$schema": { "type": "string" }, + "settings": { + "$ref": "#/definitions/Settings" + }, "repos": { "type": "array", "description": "Defines a collection of repositories from varying code hosts that Sourcebot should sync with.",