diff --git a/apps/docs/content/docs/en/tools/firecrawl.mdx b/apps/docs/content/docs/en/tools/firecrawl.mdx index ee346b8e464..6c582bfc76d 100644 --- a/apps/docs/content/docs/en/tools/firecrawl.mdx +++ b/apps/docs/content/docs/en/tools/firecrawl.mdx @@ -234,4 +234,48 @@ Autonomous web data extraction agent. Searches and gathers information based on | `expiresAt` | string | Timestamp when the results expire \(24 hours\) | | `sources` | object | Array of source URLs used by the agent | +### `firecrawl_parse` + +Parse uploaded documents (PDF, DOCX, HTML, etc.) into clean markdown using Firecrawl. Supports .html, .htm, .pdf, .docx, .doc, .odt, .rtf, .xlsx, .xls. + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `file` | file | Yes | Document file to be parsed | +| `formats` | array | No | Output formats to return \(e.g., \["markdown"\]\). Defaults to markdown. | +| `onlyMainContent` | boolean | No | Exclude headers, navs, footers. Defaults to true. | +| `includeTags` | array | No | HTML tags to include | +| `excludeTags` | array | No | HTML tags to exclude | +| `timeout` | number | No | Timeout in milliseconds \(max 300000\). Defaults to 30000. | +| `parsers` | array | No | Parser configuration \(e.g., \[\{ "type": "pdf" \}\]\) | +| `removeBase64Images` | boolean | No | Remove base64 images, keep alt text. Defaults to true. | +| `blockAds` | boolean | No | Block ads and popups. Defaults to true. | +| `proxy` | string | No | Proxy mode: "basic" or "auto" | +| `zeroDataRetention` | boolean | No | Enable zero data retention. Defaults to false. | +| `apiKey` | string | Yes | Firecrawl API key | +| `rateLimit` | string | No | No description | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `markdown` | string | Parsed document content in markdown format | +| `summary` | string | Generated summary of the document | +| `html` | string | Processed HTML content | +| `rawHtml` | string | Unprocessed raw HTML content | +| `screenshot` | string | Screenshot URL or base64 \(when requested\) | +| `links` | array | URLs discovered in the document | +| `metadata` | object | Document metadata | +| ↳ `title` | string | Document title | +| ↳ `description` | string | Document description | +| ↳ `language` | string | Document language code | +| ↳ `sourceURL` | string | Source URL | +| ↳ `url` | string | Final URL | +| ↳ `keywords` | string | Document keywords | +| ↳ `statusCode` | number | HTTP status code | +| ↳ `contentType` | string | Document content type | +| ↳ `error` | string | Error message if parse failed | +| `warning` | string | Warning message from the parse operation | + diff --git a/apps/docs/content/docs/en/tools/notion.mdx b/apps/docs/content/docs/en/tools/notion.mdx index 9a7ac971572..392a52d685b 100644 --- a/apps/docs/content/docs/en/tools/notion.mdx +++ b/apps/docs/content/docs/en/tools/notion.mdx @@ -256,8 +256,6 @@ Create a new database in Notion with custom properties ### `notion_add_database_row` -Add a new row to a Notion database with specified properties - #### Input | Parameter | Type | Required | Description | diff --git a/apps/sim/app/(landing)/integrations/data/integrations.json b/apps/sim/app/(landing)/integrations/data/integrations.json index b11c74c5f47..4239620a845 100644 --- a/apps/sim/app/(landing)/integrations/data/integrations.json +++ b/apps/sim/app/(landing)/integrations/data/integrations.json @@ -4020,9 +4020,13 @@ { "name": "Agent", "description": "Autonomous web data extraction agent. Searches and gathers information based on natural language prompts without requiring specific URLs." + }, + { + "name": "Parse Document", + "description": "Parse uploaded documents (PDF, DOCX, HTML, etc.) into clean markdown using Firecrawl. Supports .html, .htm, .pdf, .docx, .doc, .odt, .rtf, .xlsx, .xls." } ], - "operationCount": 6, + "operationCount": 7, "triggers": [], "triggerCount": 0, "authType": "api-key", diff --git a/apps/sim/app/api/tools/firecrawl/parse/route.ts b/apps/sim/app/api/tools/firecrawl/parse/route.ts new file mode 100644 index 00000000000..36bf99204f5 --- /dev/null +++ b/apps/sim/app/api/tools/firecrawl/parse/route.ts @@ -0,0 +1,104 @@ +import { createLogger } from '@sim/logger' +import { toError } from '@sim/utils/errors' +import { type NextRequest, NextResponse } from 'next/server' +import { z } from 'zod' +import { checkInternalAuth } from '@/lib/auth/hybrid' +import { generateRequestId } from '@/lib/core/utils/request' +import { withRouteHandler } from '@/lib/core/utils/with-route-handler' +import { RawFileInputSchema } from '@/lib/uploads/utils/file-schemas' +import { processFilesToUserFiles } from '@/lib/uploads/utils/file-utils' +import { downloadFileFromStorage } from '@/lib/uploads/utils/file-utils.server' + +export const dynamic = 'force-dynamic' + +const logger = createLogger('FirecrawlParseAPI') + +const FirecrawlParseSchema = z.object({ + apiKey: z.string().min(1, 'API key is required'), + file: RawFileInputSchema, + options: z.record(z.unknown()).optional(), +}) + +export const POST = withRouteHandler(async (request: NextRequest) => { + const requestId = generateRequestId() + + try { + const authResult = await checkInternalAuth(request, { requireWorkflowId: false }) + + if (!authResult.success || !authResult.userId) { + logger.warn(`[${requestId}] Unauthorized Firecrawl parse attempt`, { + error: authResult.error || 'Missing userId', + }) + return NextResponse.json( + { success: false, error: authResult.error || 'Unauthorized' }, + { status: 401 } + ) + } + + const body = await request.json() + const validatedData = FirecrawlParseSchema.parse(body) + + const [userFile] = processFilesToUserFiles([validatedData.file], requestId, logger) + if (!userFile) { + return NextResponse.json({ success: false, error: 'File input is required' }, { status: 400 }) + } + + logger.info(`[${requestId}] Firecrawl parse request`, { + fileName: userFile.name, + size: userFile.size, + }) + + const buffer = await downloadFileFromStorage(userFile, requestId, logger) + + const formData = new FormData() + const blob = new Blob([new Uint8Array(buffer)], { + type: userFile.type || 'application/octet-stream', + }) + formData.append('file', blob, userFile.name) + + if (validatedData.options && Object.keys(validatedData.options).length > 0) { + formData.append('options', JSON.stringify(validatedData.options)) + } + + const firecrawlResponse = await fetch('https://api.firecrawl.dev/v2/parse', { + method: 'POST', + headers: { + Authorization: `Bearer ${validatedData.apiKey}`, + }, + body: formData, + }) + + if (!firecrawlResponse.ok) { + const errorText = await firecrawlResponse.text() + logger.error(`[${requestId}] Firecrawl API error:`, errorText) + return NextResponse.json( + { + success: false, + error: `Firecrawl API error: ${errorText || firecrawlResponse.statusText}`, + }, + { status: firecrawlResponse.status } + ) + } + + const firecrawlData = await firecrawlResponse.json() + + logger.info(`[${requestId}] Firecrawl parse successful`) + + return NextResponse.json({ + success: true, + output: firecrawlData.data ?? firecrawlData, + }) + } catch (error) { + if (error instanceof z.ZodError) { + logger.warn(`[${requestId}] Invalid request data`, { errors: error.errors }) + return NextResponse.json( + { success: false, error: 'Invalid request data', details: error.errors }, + { status: 400 } + ) + } + + logger.error(`[${requestId}] Error in Firecrawl parse:`, error) + + return NextResponse.json({ success: false, error: toError(error).message }, { status: 500 }) + } +}) diff --git a/apps/sim/app/workspace/[workspaceId]/w/[workflowId]/components/panel/components/editor/components/sub-block/components/short-input/short-input.tsx b/apps/sim/app/workspace/[workspaceId]/w/[workflowId]/components/panel/components/editor/components/sub-block/components/short-input/short-input.tsx index 6de90a072e2..ba5ff8461b1 100644 --- a/apps/sim/app/workspace/[workspaceId]/w/[workflowId]/components/panel/components/editor/components/sub-block/components/short-input/short-input.tsx +++ b/apps/sim/app/workspace/[workspaceId]/w/[workflowId]/components/panel/components/editor/components/sub-block/components/short-input/short-input.tsx @@ -347,7 +347,7 @@ export const ShortInput = memo(function ShortInput({ <> } - className='allow-scroll w-full overflow-auto text-transparent caret-foreground [-ms-overflow-style:none] [scrollbar-width:none] selection:text-transparent placeholder:text-muted-foreground/50 [&::-webkit-scrollbar]:hidden' + className='allow-scroll w-full overflow-auto text-transparent caret-foreground [-ms-overflow-style:none] [scrollbar-width:none] placeholder:text-muted-foreground/50 [&::-webkit-scrollbar]:hidden' readOnly={readOnly} placeholder={placeholder ?? ''} type='text' diff --git a/apps/sim/blocks/blocks/firecrawl.ts b/apps/sim/blocks/blocks/firecrawl.ts index f50f1731985..13c9710cf55 100644 --- a/apps/sim/blocks/blocks/firecrawl.ts +++ b/apps/sim/blocks/blocks/firecrawl.ts @@ -1,6 +1,7 @@ import { FirecrawlIcon } from '@/components/icons' -import type { BlockConfig } from '@/blocks/types' +import type { BlockConfig, SubBlockType } from '@/blocks/types' import { AuthMode, IntegrationType } from '@/blocks/types' +import { normalizeFileInput } from '@/blocks/utils' import type { FirecrawlResponse } from '@/tools/firecrawl/types' export const FirecrawlBlock: BlockConfig = { @@ -28,9 +29,39 @@ export const FirecrawlBlock: BlockConfig = { { label: 'Map', id: 'map' }, { label: 'Extract', id: 'extract' }, { label: 'Agent', id: 'agent' }, + { label: 'Parse Document', id: 'parse' }, ], value: () => 'scrape', }, + { + id: 'fileUpload', + title: 'Document', + type: 'file-upload' as SubBlockType, + canonicalParamId: 'document', + acceptedTypes: + 'application/pdf,application/vnd.openxmlformats-officedocument.wordprocessingml.document,application/msword,application/vnd.oasis.opendocument.text,application/rtf,text/rtf,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,application/vnd.ms-excel,text/html', + placeholder: 'Upload a document (PDF, DOCX, HTML, XLSX, etc.)', + mode: 'basic', + maxSize: 50, + condition: { + field: 'operation', + value: 'parse', + }, + required: true, + }, + { + id: 'fileReference', + title: 'File Reference', + type: 'short-input' as SubBlockType, + canonicalParamId: 'document', + placeholder: 'File reference from previous block', + mode: 'advanced', + condition: { + field: 'operation', + value: 'parse', + }, + required: true, + }, { id: 'url', title: 'Website URL', @@ -180,7 +211,7 @@ Example 2 - Product Data: type: 'switch', condition: { field: 'operation', - value: 'scrape', + value: ['scrape', 'parse'], }, }, { @@ -190,7 +221,7 @@ Example 2 - Product Data: placeholder: '["markdown", "html"]', condition: { field: 'operation', - value: 'scrape', + value: ['scrape', 'parse'], }, }, { @@ -219,7 +250,7 @@ Example 2 - Product Data: placeholder: '60000', condition: { field: 'operation', - value: ['scrape', 'search'], + value: ['scrape', 'search', 'parse'], }, }, { @@ -232,6 +263,83 @@ Example 2 - Product Data: value: ['crawl', 'map', 'search'], }, }, + { + id: 'includeTags', + title: 'Include Tags', + type: 'long-input', + placeholder: '["article", "main"]', + mode: 'advanced', + condition: { + field: 'operation', + value: 'parse', + }, + }, + { + id: 'excludeTags', + title: 'Exclude Tags', + type: 'long-input', + placeholder: '["nav", "footer"]', + mode: 'advanced', + condition: { + field: 'operation', + value: 'parse', + }, + }, + { + id: 'parsers', + title: 'Parsers', + type: 'long-input', + placeholder: '[{"type": "pdf", "mode": "auto"}]', + mode: 'advanced', + condition: { + field: 'operation', + value: 'parse', + }, + }, + { + id: 'removeBase64Images', + title: 'Remove Base64 Images', + type: 'switch', + mode: 'advanced', + condition: { + field: 'operation', + value: 'parse', + }, + }, + { + id: 'blockAds', + title: 'Block Ads', + type: 'switch', + mode: 'advanced', + condition: { + field: 'operation', + value: 'parse', + }, + }, + { + id: 'proxy', + title: 'Proxy Mode', + type: 'dropdown', + options: [ + { id: 'basic', label: 'Basic' }, + { id: 'auto', label: 'Auto' }, + ], + mode: 'advanced', + condition: { + field: 'operation', + value: 'parse', + }, + }, + { + id: 'zeroDataRetention', + title: 'Zero Data Retention', + type: 'switch', + mode: 'advanced', + condition: { + field: 'operation', + value: 'parse', + }, + }, { id: 'query', title: 'Search Query', @@ -278,6 +386,7 @@ Example 2 - Product Data: 'firecrawl_map', 'firecrawl_extract', 'firecrawl_agent', + 'firecrawl_parse', ], config: { tool: (params) => { @@ -294,6 +403,8 @@ Example 2 - Product Data: return 'firecrawl_extract' case 'agent': return 'firecrawl_agent' + case 'parse': + return 'firecrawl_parse' default: return 'firecrawl_scrape' } @@ -375,6 +486,68 @@ Example 2 - Product Data: if (prompt) result.prompt = prompt break + case 'parse': { + const file = normalizeFileInput(params.document, { single: true }) + if (!file) { + throw new Error('A document file is required for the parse operation') + } + result.file = file + if (formats) { + if (Array.isArray(formats)) { + result.formats = formats + } else if (typeof formats === 'string') { + try { + const parsed = JSON.parse(formats) + result.formats = Array.isArray(parsed) ? parsed : ['markdown'] + } catch { + result.formats = ['markdown'] + } + } + } + if (onlyMainContent != null) result.onlyMainContent = onlyMainContent + if (timeout) result.timeout = Number.parseInt(timeout) + + const parseStringArray = (value: unknown): string[] | undefined => { + if (Array.isArray(value)) return value as string[] + if (typeof value === 'string' && value.trim() !== '') { + try { + const parsed = JSON.parse(value) + return Array.isArray(parsed) ? parsed : undefined + } catch { + return undefined + } + } + return undefined + } + + const includeTagsParsed = parseStringArray(params.includeTags) + if (includeTagsParsed) result.includeTags = includeTagsParsed + + const excludeTagsParsed = parseStringArray(params.excludeTags) + if (excludeTagsParsed) result.excludeTags = excludeTagsParsed + + if (params.parsers) { + if (Array.isArray(params.parsers)) { + result.parsers = params.parsers + } else if (typeof params.parsers === 'string' && params.parsers.trim() !== '') { + try { + const parsed = JSON.parse(params.parsers) + if (Array.isArray(parsed)) result.parsers = parsed + } catch { + // Skip invalid parsers config + } + } + } + + if (params.removeBase64Images != null) + result.removeBase64Images = params.removeBase64Images + if (params.blockAds != null) result.blockAds = params.blockAds + if (params.proxy) result.proxy = params.proxy + if (params.zeroDataRetention != null) + result.zeroDataRetention = params.zeroDataRetention + break + } + case 'agent': if (agentPrompt) result.prompt = agentPrompt if (agentUrls) { @@ -451,6 +624,14 @@ Example 2 - Product Data: }, maxCredits: { type: 'number', description: 'Maximum credits to spend' }, strictConstrainToURLs: { type: 'boolean', description: 'Limit agent to provided URLs only' }, + document: { type: 'json', description: 'Document input (file upload or file reference)' }, + includeTags: { type: 'json', description: 'HTML tags to include during parsing' }, + excludeTags: { type: 'json', description: 'HTML tags to exclude during parsing' }, + parsers: { type: 'json', description: 'Parser configuration (e.g., [{"type": "pdf"}])' }, + removeBase64Images: { type: 'boolean', description: 'Remove base64 images, keep alt text' }, + blockAds: { type: 'boolean', description: 'Block ads and popups during parsing' }, + proxy: { type: 'string', description: 'Proxy mode (basic or auto)' }, + zeroDataRetention: { type: 'boolean', description: 'Enable zero data retention' }, }, outputs: { // Scrape output @@ -471,5 +652,9 @@ Example 2 - Product Data: // Agent output status: { type: 'string', description: 'Agent job status' }, expiresAt: { type: 'string', description: 'Result expiration timestamp' }, + // Parse output + summary: { type: 'string', description: 'Generated summary of the parsed document' }, + rawHtml: { type: 'string', description: 'Unprocessed raw HTML from the parsed document' }, + screenshot: { type: 'string', description: 'Screenshot URL or base64 (when requested)' }, }, } diff --git a/apps/sim/tools/firecrawl/index.ts b/apps/sim/tools/firecrawl/index.ts index 94060bdf491..9d868ba7d37 100644 --- a/apps/sim/tools/firecrawl/index.ts +++ b/apps/sim/tools/firecrawl/index.ts @@ -2,6 +2,7 @@ import { agentTool } from '@/tools/firecrawl/agent' import { crawlTool } from '@/tools/firecrawl/crawl' import { extractTool } from '@/tools/firecrawl/extract' import { mapTool } from '@/tools/firecrawl/map' +import { parseTool } from '@/tools/firecrawl/parse' import { scrapeTool } from '@/tools/firecrawl/scrape' import { searchTool } from '@/tools/firecrawl/search' @@ -11,3 +12,4 @@ export const firecrawlCrawlTool = crawlTool export const firecrawlMapTool = mapTool export const firecrawlExtractTool = extractTool export const firecrawlAgentTool = agentTool +export const firecrawlParseTool = parseTool diff --git a/apps/sim/tools/firecrawl/parse.ts b/apps/sim/tools/firecrawl/parse.ts new file mode 100644 index 00000000000..756c53b36a3 --- /dev/null +++ b/apps/sim/tools/firecrawl/parse.ts @@ -0,0 +1,225 @@ +import type { ParseParams, ParseResponse } from '@/tools/firecrawl/types' +import type { ToolConfig } from '@/tools/types' + +export const parseTool: ToolConfig = { + id: 'firecrawl_parse', + name: 'Firecrawl Document Parser', + description: + 'Parse uploaded documents (PDF, DOCX, HTML, etc.) into clean markdown using Firecrawl. Supports .html, .htm, .pdf, .docx, .doc, .odt, .rtf, .xlsx, .xls.', + version: '1.0.0', + + params: { + file: { + type: 'file', + required: true, + visibility: 'user-only', + description: 'Document file to be parsed', + }, + formats: { + type: 'array', + required: false, + visibility: 'user-or-llm', + description: 'Output formats to return (e.g., ["markdown"]). Defaults to markdown.', + }, + onlyMainContent: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Exclude headers, navs, footers. Defaults to true.', + }, + includeTags: { + type: 'array', + required: false, + visibility: 'user-or-llm', + description: 'HTML tags to include', + }, + excludeTags: { + type: 'array', + required: false, + visibility: 'user-or-llm', + description: 'HTML tags to exclude', + }, + timeout: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Timeout in milliseconds (max 300000). Defaults to 30000.', + }, + parsers: { + type: 'array', + required: false, + visibility: 'user-or-llm', + description: 'Parser configuration (e.g., [{ "type": "pdf" }])', + }, + removeBase64Images: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Remove base64 images, keep alt text. Defaults to true.', + }, + blockAds: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Block ads and popups. Defaults to true.', + }, + proxy: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'Proxy mode: "basic" or "auto"', + }, + zeroDataRetention: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Enable zero data retention. Defaults to false.', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Firecrawl API key', + }, + }, + + hosting: { + envKeyPrefix: 'FIRECRAWL_API_KEY', + apiKeyParam: 'apiKey', + byokProviderId: 'firecrawl', + pricing: { + type: 'custom', + getCost: (_params, output) => { + const creditsUsed = (output.metadata as { creditsUsed?: number })?.creditsUsed + if (creditsUsed == null) { + throw new Error('Firecrawl response missing creditsUsed field') + } + + if (Number.isNaN(creditsUsed)) { + throw new Error('Firecrawl response returned a non-numeric creditsUsed field') + } + + return { + cost: creditsUsed * 0.001, + metadata: { creditsUsed }, + } + }, + }, + rateLimit: { + mode: 'per_request', + requestsPerMinute: 100, + }, + }, + + request: { + method: 'POST', + url: '/api/tools/firecrawl/parse', + headers: () => ({ + 'Content-Type': 'application/json', + Accept: 'application/json', + }), + body: (params) => { + if (!params.apiKey || typeof params.apiKey !== 'string' || params.apiKey.trim() === '') { + throw new Error('Missing or invalid API key: A valid Firecrawl API key is required') + } + if (!params.file || typeof params.file !== 'object') { + throw new Error('File input is required') + } + + const options: Record = {} + if (params.formats) options.formats = params.formats + if (typeof params.onlyMainContent === 'boolean') + options.onlyMainContent = params.onlyMainContent + if (params.includeTags) options.includeTags = params.includeTags + if (params.excludeTags) options.excludeTags = params.excludeTags + if (params.timeout != null) options.timeout = Number(params.timeout) + if (params.parsers) options.parsers = params.parsers + if (typeof params.removeBase64Images === 'boolean') + options.removeBase64Images = params.removeBase64Images + if (typeof params.blockAds === 'boolean') options.blockAds = params.blockAds + if (params.proxy) options.proxy = params.proxy + if (typeof params.zeroDataRetention === 'boolean') + options.zeroDataRetention = params.zeroDataRetention + + return { + apiKey: params.apiKey, + file: params.file, + options, + } + }, + }, + + transformResponse: async (response: Response) => { + const data = await response.json() + if (!data || typeof data !== 'object') { + throw new Error('Invalid response format from Firecrawl parse API') + } + + const result = data.output ?? data.data ?? data + + return { + success: true, + output: { + markdown: result.markdown ?? '', + summary: result.summary ?? null, + html: result.html ?? null, + rawHtml: result.rawHtml ?? null, + screenshot: result.screenshot ?? null, + links: result.links ?? [], + metadata: result.metadata ?? null, + warning: result.warning ?? null, + }, + } + }, + + outputs: { + markdown: { type: 'string', description: 'Parsed document content in markdown format' }, + summary: { + type: 'string', + description: 'Generated summary of the document', + optional: true, + }, + html: { + type: 'string', + description: 'Processed HTML content', + optional: true, + }, + rawHtml: { + type: 'string', + description: 'Unprocessed raw HTML content', + optional: true, + }, + screenshot: { + type: 'string', + description: 'Screenshot URL or base64 (when requested)', + optional: true, + }, + links: { + type: 'array', + description: 'URLs discovered in the document', + optional: true, + items: { type: 'string', description: 'Discovered URL' }, + }, + metadata: { + type: 'object', + description: 'Document metadata', + optional: true, + properties: { + title: { type: 'string', description: 'Document title', optional: true }, + description: { type: 'string', description: 'Document description', optional: true }, + language: { type: 'string', description: 'Document language code', optional: true }, + sourceURL: { type: 'string', description: 'Source URL', optional: true }, + url: { type: 'string', description: 'Final URL', optional: true }, + keywords: { type: 'string', description: 'Document keywords', optional: true }, + statusCode: { type: 'number', description: 'HTTP status code', optional: true }, + contentType: { type: 'string', description: 'Document content type', optional: true }, + error: { type: 'string', description: 'Error message if parse failed', optional: true }, + }, + }, + warning: { + type: 'string', + description: 'Warning message from the parse operation', + optional: true, + }, + }, +} diff --git a/apps/sim/tools/firecrawl/types.ts b/apps/sim/tools/firecrawl/types.ts index ff2a44fe2b6..28a64eef0db 100644 --- a/apps/sim/tools/firecrawl/types.ts +++ b/apps/sim/tools/firecrawl/types.ts @@ -521,6 +521,44 @@ export interface AgentResponse extends ToolResponse { } } +export interface ParseParams { + apiKey: string + file: unknown + formats?: Array<{ type: string } | string> + onlyMainContent?: boolean + includeTags?: string[] + excludeTags?: string[] + timeout?: number + parsers?: Array<{ type: string; mode?: string } | string> + removeBase64Images?: boolean + blockAds?: boolean + proxy?: 'basic' | 'auto' + zeroDataRetention?: boolean +} + +export interface ParseResponse extends ToolResponse { + output: { + markdown: string + summary?: string | null + html?: string | null + rawHtml?: string | null + screenshot?: string | null + links?: string[] + metadata?: { + title?: string | string[] + description?: string | string[] + language?: string | string[] | null + sourceURL?: string + url?: string + keywords?: string | string[] + statusCode?: number + contentType?: string + error?: string | null + } | null + warning?: string | null + } +} + export type FirecrawlResponse = | ScrapeResponse | SearchResponse @@ -528,3 +566,4 @@ export type FirecrawlResponse = | MapResponse | ExtractResponse | AgentResponse + | ParseResponse diff --git a/apps/sim/tools/registry.ts b/apps/sim/tools/registry.ts index dac92d1d879..8ee2f4f45f7 100644 --- a/apps/sim/tools/registry.ts +++ b/apps/sim/tools/registry.ts @@ -637,6 +637,7 @@ import { firecrawlCrawlTool, firecrawlExtractTool, firecrawlMapTool, + firecrawlParseTool, firecrawlScrapeTool, firecrawlSearchTool, } from '@/tools/firecrawl' @@ -3114,6 +3115,7 @@ export const tools: Record = { firecrawl_map: firecrawlMapTool, firecrawl_extract: firecrawlExtractTool, firecrawl_agent: firecrawlAgentTool, + firecrawl_parse: firecrawlParseTool, fireflies_list_transcripts: firefliesListTranscriptsTool, fireflies_get_transcript: firefliesGetTranscriptTool, fireflies_get_user: firefliesGetUserTool,