Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions apps/docs/content/docs/en/tools/firecrawl.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -234,4 +234,48 @@ Autonomous web data extraction agent. Searches and gathers information based on
| `expiresAt` | string | Timestamp when the results expire \(24 hours\) |
| `sources` | object | Array of source URLs used by the agent |

### `firecrawl_parse`

Parse uploaded documents (PDF, DOCX, HTML, etc.) into clean markdown using Firecrawl. Supports .html, .htm, .pdf, .docx, .doc, .odt, .rtf, .xlsx, .xls.

#### Input

| Parameter | Type | Required | Description |
| --------- | ---- | -------- | ----------- |
| `file` | file | Yes | Document file to be parsed |
| `formats` | array | No | Output formats to return \(e.g., \["markdown"\]\). Defaults to markdown. |
| `onlyMainContent` | boolean | No | Exclude headers, navs, footers. Defaults to true. |
| `includeTags` | array | No | HTML tags to include |
| `excludeTags` | array | No | HTML tags to exclude |
| `timeout` | number | No | Timeout in milliseconds \(max 300000\). Defaults to 30000. |
| `parsers` | array | No | Parser configuration \(e.g., \[\{ "type": "pdf" \}\]\) |
| `removeBase64Images` | boolean | No | Remove base64 images, keep alt text. Defaults to true. |
| `blockAds` | boolean | No | Block ads and popups. Defaults to true. |
| `proxy` | string | No | Proxy mode: "basic" or "auto" |
| `zeroDataRetention` | boolean | No | Enable zero data retention. Defaults to false. |
| `apiKey` | string | Yes | Firecrawl API key |
| `rateLimit` | string | No | Rate limit tier applied to Firecrawl API requests |

#### Output

| Parameter | Type | Description |
| --------- | ---- | ----------- |
| `markdown` | string | Parsed document content in markdown format |
| `summary` | string | Generated summary of the document |
| `html` | string | Processed HTML content |
| `rawHtml` | string | Unprocessed raw HTML content |
| `screenshot` | string | Screenshot URL or base64 \(when requested\) |
| `links` | array | URLs discovered in the document |
| `metadata` | object | Document metadata |
| ↳ `title` | string | Document title |
| ↳ `description` | string | Document description |
| ↳ `language` | string | Document language code |
| ↳ `sourceURL` | string | Source URL |
| ↳ `url` | string | Final URL |
| ↳ `keywords` | string | Document keywords |
| ↳ `statusCode` | number | HTTP status code |
| ↳ `contentType` | string | Document content type |
| ↳ `error` | string | Error message if parse failed |
| `warning` | string | Warning message from the parse operation |


2 changes: 0 additions & 2 deletions apps/docs/content/docs/en/tools/notion.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -256,8 +256,6 @@ Create a new database in Notion with custom properties

### `notion_add_database_row`

Add a new row to a Notion database with specified properties

#### Input

| Parameter | Type | Required | Description |
Expand Down
6 changes: 5 additions & 1 deletion apps/sim/app/(landing)/integrations/data/integrations.json
Original file line number Diff line number Diff line change
Expand Up @@ -4020,9 +4020,13 @@
{
"name": "Agent",
"description": "Autonomous web data extraction agent. Searches and gathers information based on natural language prompts without requiring specific URLs."
},
{
"name": "Parse Document",
"description": "Parse uploaded documents (PDF, DOCX, HTML, etc.) into clean markdown using Firecrawl. Supports .html, .htm, .pdf, .docx, .doc, .odt, .rtf, .xlsx, .xls."
}
],
"operationCount": 6,
"operationCount": 7,
"triggers": [],
"triggerCount": 0,
"authType": "api-key",
Expand Down
104 changes: 104 additions & 0 deletions apps/sim/app/api/tools/firecrawl/parse/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import { createLogger } from '@sim/logger'
import { toError } from '@sim/utils/errors'
import { type NextRequest, NextResponse } from 'next/server'
import { z } from 'zod'
import { checkInternalAuth } from '@/lib/auth/hybrid'
import { generateRequestId } from '@/lib/core/utils/request'
import { withRouteHandler } from '@/lib/core/utils/with-route-handler'
import { RawFileInputSchema } from '@/lib/uploads/utils/file-schemas'
import { processFilesToUserFiles } from '@/lib/uploads/utils/file-utils'
import { downloadFileFromStorage } from '@/lib/uploads/utils/file-utils.server'

// Opt out of static rendering: this route must run per-request (it performs
// an auth check and an upstream Firecrawl call with a caller-supplied key).
export const dynamic = 'force-dynamic'

const logger = createLogger('FirecrawlParseAPI')

// Request-body contract for POST /api/tools/firecrawl/parse.
const FirecrawlParseSchema = z.object({
  // Caller-supplied Firecrawl API key, forwarded as a Bearer token upstream.
  apiKey: z.string().min(1, 'API key is required'),
  // Reference to an uploaded file, validated by the shared upload schema and
  // later resolved to bytes via downloadFileFromStorage.
  file: RawFileInputSchema,
  // Opaque Firecrawl parse options (e.g. formats, onlyMainContent); forwarded
  // verbatim as the multipart 'options' field without further validation here.
  options: z.record(z.unknown()).optional(),
})

export const POST = withRouteHandler(async (request: NextRequest) => {
const requestId = generateRequestId()

try {
const authResult = await checkInternalAuth(request, { requireWorkflowId: false })

if (!authResult.success || !authResult.userId) {
logger.warn(`[${requestId}] Unauthorized Firecrawl parse attempt`, {
error: authResult.error || 'Missing userId',
})
return NextResponse.json(
{ success: false, error: authResult.error || 'Unauthorized' },
{ status: 401 }
)
}

const body = await request.json()
const validatedData = FirecrawlParseSchema.parse(body)

const [userFile] = processFilesToUserFiles([validatedData.file], requestId, logger)
if (!userFile) {
return NextResponse.json({ success: false, error: 'File input is required' }, { status: 400 })
}

logger.info(`[${requestId}] Firecrawl parse request`, {
fileName: userFile.name,
size: userFile.size,
})

const buffer = await downloadFileFromStorage(userFile, requestId, logger)

const formData = new FormData()
const blob = new Blob([new Uint8Array(buffer)], {
type: userFile.type || 'application/octet-stream',
})
formData.append('file', blob, userFile.name)

if (validatedData.options && Object.keys(validatedData.options).length > 0) {
formData.append('options', JSON.stringify(validatedData.options))
}

const firecrawlResponse = await fetch('https://api.firecrawl.dev/v2/parse', {
method: 'POST',
headers: {
Authorization: `Bearer ${validatedData.apiKey}`,
},
body: formData,
})

if (!firecrawlResponse.ok) {
Comment thread
waleedlatif1 marked this conversation as resolved.
const errorText = await firecrawlResponse.text()
logger.error(`[${requestId}] Firecrawl API error:`, errorText)
return NextResponse.json(
{
success: false,
error: `Firecrawl API error: ${errorText || firecrawlResponse.statusText}`,
},
{ status: firecrawlResponse.status }
)
Comment thread
waleedlatif1 marked this conversation as resolved.
}

const firecrawlData = await firecrawlResponse.json()

logger.info(`[${requestId}] Firecrawl parse successful`)

return NextResponse.json({
success: true,
output: firecrawlData.data ?? firecrawlData,
})
} catch (error) {
if (error instanceof z.ZodError) {
logger.warn(`[${requestId}] Invalid request data`, { errors: error.errors })
return NextResponse.json(
{ success: false, error: 'Invalid request data', details: error.errors },
{ status: 400 }
)
}

logger.error(`[${requestId}] Error in Firecrawl parse:`, error)

return NextResponse.json({ success: false, error: toError(error).message }, { status: 500 })
}
})
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ export const ShortInput = memo(function ShortInput({
<>
<Input
ref={ref as React.RefObject<HTMLInputElement>}
className='allow-scroll w-full overflow-auto text-transparent caret-foreground [-ms-overflow-style:none] [scrollbar-width:none] selection:text-transparent placeholder:text-muted-foreground/50 [&::-webkit-scrollbar]:hidden'
className='allow-scroll w-full overflow-auto text-transparent caret-foreground [-ms-overflow-style:none] [scrollbar-width:none] placeholder:text-muted-foreground/50 [&::-webkit-scrollbar]:hidden'
readOnly={readOnly}
placeholder={placeholder ?? ''}
type='text'
Expand Down
Loading
Loading