Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 49 additions & 35 deletions apps/sim/app/api/files/parse/route.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,6 @@ const {
mockFsWriteFile,
mockJoin,
actualPath,
mockFileExistsInWorkspace,
mockListWorkspaceFiles,
mockUploadWorkspaceFile,
} = vi.hoisted(() => {
// eslint-disable-next-line @typescript-eslint/no-require-imports
Expand Down Expand Up @@ -62,9 +60,19 @@ const {
return actualPath.join(...args)
}),
actualPath,
mockFileExistsInWorkspace: vi.fn().mockResolvedValue(false),
mockListWorkspaceFiles: vi.fn().mockResolvedValue([]),
mockUploadWorkspaceFile: vi.fn().mockResolvedValue({}),
mockUploadWorkspaceFile: vi
.fn()
.mockImplementation(
async (workspaceId: string, _userId: string, _buffer: Buffer, fileName: string) => ({
id: 'wf_test',
name: fileName,
size: 0,
type: 'application/octet-stream',
url: `/api/files/serve/${workspaceId}/${fileName}`,
key: `${workspaceId}/${fileName}`,
context: 'workspace',
})
),
}
})

Expand Down Expand Up @@ -110,9 +118,7 @@ vi.mock('@/lib/uploads/contexts/execution', () => ({
uploadExecutionFile: vi.fn(),
}))

vi.mock('@/lib/uploads/contexts/workspace', () => ({
fileExistsInWorkspace: mockFileExistsInWorkspace,
listWorkspaceFiles: mockListWorkspaceFiles,
vi.mock('@/lib/uploads/contexts/workspace/workspace-file-manager', () => ({
uploadWorkspaceFile: mockUploadWorkspaceFile,
}))

Expand Down Expand Up @@ -190,9 +196,7 @@ describe('File Parse API Route', () => {
mockFsStat.mockResolvedValue({ isFile: () => true, size: 17 })
mockFsReadFile.mockResolvedValue(Buffer.from('test file content'))
mockIsSupportedFileType.mockReturnValue(true)
mockFileExistsInWorkspace.mockResolvedValue(false)
mockListWorkspaceFiles.mockResolvedValue([])
mockUploadWorkspaceFile.mockResolvedValue({})
mockUploadWorkspaceFile.mockClear()
mockParseFile.mockResolvedValue({
content: 'parsed content',
metadata: { pageCount: 1 },
Expand Down Expand Up @@ -384,34 +388,32 @@ describe('File Parse API Route', () => {
)
})

it('should preserve the full download cap when an external URL reuses a workspace file', async () => {
it('should never dedup external URL fetches by path filename — two URLs sharing image.png both download', async () => {
inputValidationMockFns.mockValidateUrlWithDNS.mockResolvedValue({
isValid: true,
resolvedIP: '203.0.113.10',
})
inputValidationMockFns.mockSecureFetchWithPinnedIP.mockResolvedValue(
new Response('file content', {
status: 200,
headers: { 'content-type': 'text/plain' },
})
)
mockFileExistsInWorkspace.mockResolvedValueOnce(false).mockResolvedValueOnce(true)
mockListWorkspaceFiles.mockResolvedValueOnce([
{ name: 'file2.txt', key: 'workspace-file2.txt' },
])

mockParseBuffer
.mockResolvedValueOnce({
content: 'a'.repeat(4 * 1024 * 1024),
metadata: { pageCount: 1 },
})
.mockResolvedValueOnce({
content: 'second file',
metadata: { pageCount: 1 },
})
inputValidationMockFns.mockSecureFetchWithPinnedIP
.mockResolvedValueOnce(
new Response('first image bytes', {
status: 200,
headers: { 'content-type': 'image/png' },
})
)
.mockResolvedValueOnce(
new Response('second image bytes — different content', {
status: 200,
headers: { 'content-type': 'image/png' },
})
)
mockIsSupportedFileType.mockReturnValue(false)
permissionsMockFns.mockGetUserEntityPermissions.mockResolvedValue('write')

const req = createMockRequest('POST', {
filePath: ['https://example.com/file1.txt', 'https://example.com/file2.txt'],
filePath: [
'https://files.slack.com/files-pri/T07-FAAA/download/image.png',
'https://files.slack.com/files-pri/T07-FBBB/download/image.png',
],
workspaceId: 'workspace-id',
})

Expand All @@ -420,9 +422,21 @@ describe('File Parse API Route', () => {

expect(response.status).toBe(200)
expect(data.results).toHaveLength(2)
expect(storageServiceMockFns.mockDownloadFile).toHaveBeenCalledWith(
expect.objectContaining({ key: 'workspace-file2.txt', maxBytes: 100 * 1024 * 1024 })
expect(inputValidationMockFns.mockSecureFetchWithPinnedIP).toHaveBeenCalledTimes(2)
expect(inputValidationMockFns.mockSecureFetchWithPinnedIP).toHaveBeenNthCalledWith(
1,
'https://files.slack.com/files-pri/T07-FAAA/download/image.png',
'203.0.113.10',
expect.any(Object)
)
expect(inputValidationMockFns.mockSecureFetchWithPinnedIP).toHaveBeenNthCalledWith(
2,
'https://files.slack.com/files-pri/T07-FBBB/download/image.png',
'203.0.113.10',
expect.any(Object)
)
expect(mockUploadWorkspaceFile).toHaveBeenCalledTimes(2)
expect(storageServiceMockFns.mockDownloadFile).not.toHaveBeenCalled()
})

it('should stop multi-file parsing once the combined parsed output is too large', async () => {
Expand Down
153 changes: 34 additions & 119 deletions apps/sim/app/api/files/parse/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,15 @@ import { type NextRequest, NextResponse } from 'next/server'
import { fileParseContract } from '@/lib/api/contracts/storage-transfer'
import { getValidationErrorMessage, parseRequest } from '@/lib/api/server'
import { checkInternalAuth } from '@/lib/auth/hybrid'
import {
secureFetchWithPinnedIP,
validateUrlWithDNS,
} from '@/lib/core/security/input-validation.server'
import { sanitizeUrlForLog } from '@/lib/core/utils/logging'
import {
assertKnownSizeWithinLimit,
DEFAULT_MAX_ERROR_BODY_BYTES,
isPayloadSizeLimitError,
readResponseTextWithLimit,
readResponseToBufferWithLimit,
} from '@/lib/core/utils/stream-limits'
import { assertKnownSizeWithinLimit, isPayloadSizeLimitError } from '@/lib/core/utils/stream-limits'
import { isSupportedFileType, parseFile } from '@/lib/file-parsers'
import { isUsingCloudStorage, type StorageContext, StorageService } from '@/lib/uploads'
import { uploadExecutionFile } from '@/lib/uploads/contexts/execution'
import {
ExternalUrlValidationError,
fetchExternalUrlToWorkspace,
} from '@/lib/uploads/contexts/workspace'
import { UPLOAD_DIR_SERVER } from '@/lib/uploads/core/setup.server'
import { getFileMetadataByKey } from '@/lib/uploads/server/metadata'
import {
Expand All @@ -36,7 +30,6 @@ import {
inferContextFromKey,
isInternalFileUrl,
} from '@/lib/uploads/utils/file-utils'
import { getUserEntityPermissions } from '@/lib/workspaces/permissions/utils'
import { verifyFileAccess } from '@/app/api/files/authorization'
import type { UserFile } from '@/executor/types'
import '@/lib/uploads/core/setup.server'
Expand Down Expand Up @@ -453,9 +446,16 @@ function validateFilePath(filePath: string): { isValid: boolean; error?: string
}

/**
* Handle external URL
* If workspaceId is provided, checks if file already exists and saves to workspace if not
* If executionContext is provided, also stores the file in execution storage and returns UserFile
* Handle external URL.
*
* Always fetches the URL fresh — there is no filename-based dedup. Distinct URLs
* commonly share a path tail (e.g. every Slack clipboard paste is `image.png`),
* so keying a cache by filename returns stale bytes. `fetchExternalUrlToWorkspace`
* delegates to `uploadWorkspaceFile`, which suffix-disambiguates collisions on save.
*
* Workspace save is skipped when the URL already points at our execution-files
* bucket (re-uploading our own bytes is wasteful and would generate `image (1).png`
* style aliases for files we already own).
*/
async function handleExternalUrl(
url: string,
Expand All @@ -470,23 +470,6 @@ async function handleExternalUrl(
): Promise<ParseResult> {
try {
logger.info('Fetching external URL:', url)
logger.info('WorkspaceId for URL save:', workspaceId)

const urlValidation = await validateUrlWithDNS(url, 'fileUrl')
if (!urlValidation.isValid) {
logger.warn(`Blocked external URL request: ${urlValidation.error}`)
return {
success: false,
error: urlValidation.error || 'Invalid external URL',
filePath: url,
}
}

const urlPath = new URL(url).pathname
const filename = urlPath.split('/').pop() || 'download'
const extension = path.extname(filename).toLowerCase().substring(1)

logger.info(`Extracted filename: ${filename}, workspaceId: ${workspaceId}`)

const {
S3_EXECUTION_FILES_CONFIG,
Expand All @@ -511,104 +494,27 @@ async function handleExternalUrl(
isExecutionFile = false
}

// Only apply workspace deduplication if:
// 1. WorkspaceId is provided
// 2. URL is NOT from execution files bucket/container
const shouldCheckWorkspace = workspaceId && !isExecutionFile

if (shouldCheckWorkspace) {
const permission = await getUserEntityPermissions(userId, 'workspace', workspaceId)
if (permission === null) {
logger.warn('User does not have workspace access for file parse', {
userId,
workspaceId,
filename,
})
return {
success: false,
error: 'File not found',
filePath: url,
}
}

const { fileExistsInWorkspace, listWorkspaceFiles } = await import(
'@/lib/uploads/contexts/workspace'
)
const exists = await fileExistsInWorkspace(workspaceId, filename)

if (exists) {
logger.info(`File ${filename} already exists in workspace, using existing file`)
const workspaceFiles = await listWorkspaceFiles(workspaceId)
const existingFile = workspaceFiles.find((f) => f.name === filename)

if (existingFile) {
const storageFilePath = `/api/files/serve/${existingFile.key}`
return handleCloudFile(
storageFilePath,
fileType,
'workspace',
userId,
executionContext,
maxDownloadBytes,
maxParsedOutputBytes
)
}
}
}

const response = await secureFetchWithPinnedIP(url, urlValidation.resolvedIP!, {
timeout: DOWNLOAD_TIMEOUT_MS,
maxResponseBytes: maxDownloadBytes,
signal,
...(headers && Object.keys(headers).length > 0 && { headers }),
})
if (!response.ok) {
await readResponseTextWithLimit(response, {
maxBytes: DEFAULT_MAX_ERROR_BODY_BYTES,
label: 'file download error response',
signal,
}).catch(() => '')
throw new Error(`Failed to fetch URL: ${response.status} ${response.statusText}`)
}

const buffer = await readResponseToBufferWithLimit(response, {
maxBytes: maxDownloadBytes,
label: 'file download',
const { filename, buffer, mimeType } = await fetchExternalUrlToWorkspace({
url,
userId,
workspaceId: workspaceId || undefined,
saveToWorkspace: Boolean(workspaceId) && !isExecutionFile,
headers,
signal,
maxDownloadBytes,
timeoutMs: DOWNLOAD_TIMEOUT_MS,
})
const extension = path.extname(filename).toLowerCase().substring(1)

logger.info(`Downloaded file from URL: ${url}, size: ${buffer.length} bytes`)

let userFile: UserFile | undefined
const mimeType = response.headers.get('content-type') || getMimeTypeFromExtension(extension)

if (executionContext) {
try {
userFile = await uploadExecutionFile(executionContext, buffer, filename, mimeType, userId)
logger.info(`Stored file in execution storage: ${filename}`, { key: userFile.key })
} catch (uploadError) {
logger.warn(`Failed to store file in execution storage:`, uploadError)
// Continue without userFile - parsing can still work
}
}

if (shouldCheckWorkspace) {
try {
const permission = await getUserEntityPermissions(userId, 'workspace', workspaceId)
if (permission !== 'admin' && permission !== 'write') {
logger.warn('User does not have write permission for workspace file save', {
userId,
workspaceId,
filename,
permission,
})
} else {
const { uploadWorkspaceFile } = await import('@/lib/uploads/contexts/workspace')
await uploadWorkspaceFile(workspaceId, userId, buffer, filename, mimeType)
logger.info(`Saved URL file to workspace storage: ${filename}`)
}
} catch (saveError) {
logger.warn(`Failed to save URL file to workspace:`, saveError)
logger.warn('Failed to store file in execution storage:', uploadError)
}
}

Expand Down Expand Up @@ -657,6 +563,15 @@ async function handleExternalUrl(
}
}

if (error instanceof ExternalUrlValidationError) {
logger.warn(`Blocked external URL request: ${error.message}`)
return {
success: false,
error: error.message,
filePath: url,
}
}

return {
success: false,
error: `Error fetching URL: ${(error as Error).message}`,
Expand Down
Loading
Loading