diff --git a/.github/ISSUE_TEMPLATE/agent_scenario_request.yml b/.github/ISSUE_TEMPLATE/agent_scenario_request.yml new file mode 100644 index 00000000000..a76f4c31653 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/agent_scenario_request.yml @@ -0,0 +1,46 @@ +name: "❤️🔥ᴬᴳᴱᴺᵀ Agent scenario request" +description: Propose an agent scenario request for RAGFlow. +title: "[Agent Scenario Request]: " +labels: ["❤️🔥ᴬᴳᴱᴺᵀ agent scenario"] +body: + - type: checkboxes + attributes: + label: Self Checks + description: "Please check the following in order to receive a timely response :)" + options: + - label: I have [searched for existing issues](https://github.com/infiniflow/ragflow/issues), including closed ones. + required: true + - label: I confirm that I am using English to submit this report ([Language Policy](https://github.com/infiniflow/ragflow/issues/5910)). + required: true + - label: Non-English title submissions will be closed directly ( 非英文标题的提交将会被直接关闭 ) ([Language Policy](https://github.com/infiniflow/ragflow/issues/5910)). + required: true + - label: "Please do not modify this template :) and fill in all the required fields." + required: true + - type: textarea + attributes: + label: Is your feature request related to a scenario? + description: | + A clear and concise description of what the scenario is. Ex. I'm always frustrated when [...] + render: Markdown + validations: + required: false + - type: textarea + attributes: + label: Describe the feature you'd like + description: A clear and concise description of what you want to happen. + validations: + required: true + - type: textarea + attributes: + label: Documentation, adoption, use case + description: If you can, explain some scenarios in which users might use this and situations it would be helpful in. Any API designs, mockups, or diagrams are also helpful. 
+ render: Markdown + validations: + required: false + - type: textarea + attributes: + label: Additional information + description: | + Add any other context or screenshots about the feature request here. + validations: + required: false \ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 489daab3e74..cfdb3c15a97 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -16,52 +16,52 @@ concurrency: jobs: release: - runs-on: [ "self-hosted", "overseas" ] + runs-on: [ "self-hosted", "ragflow-test" ] steps: - name: Ensure workspace ownership - run: echo "chown -R $USER $GITHUB_WORKSPACE" && sudo chown -R $USER $GITHUB_WORKSPACE + run: echo "chown -R ${USER} ${GITHUB_WORKSPACE}" && sudo chown -R ${USER} ${GITHUB_WORKSPACE} # https://github.com/actions/checkout/blob/v3/README.md - name: Check out code uses: actions/checkout@v4 with: - token: ${{ secrets.MY_GITHUB_TOKEN }} # Use the secret as an environment variable + token: ${{ secrets.GITHUB_TOKEN }} # Use the secret as an environment variable fetch-depth: 0 fetch-tags: true - name: Prepare release body run: | - if [[ $GITHUB_EVENT_NAME == 'create' ]]; then + if [[ ${GITHUB_EVENT_NAME} == "create" ]]; then RELEASE_TAG=${GITHUB_REF#refs/tags/} - if [[ $RELEASE_TAG == 'nightly' ]]; then + if [[ ${RELEASE_TAG} == "nightly" ]]; then PRERELEASE=true else PRERELEASE=false fi - echo "Workflow triggered by create tag: $RELEASE_TAG" + echo "Workflow triggered by create tag: ${RELEASE_TAG}" else RELEASE_TAG=nightly PRERELEASE=true echo "Workflow triggered by schedule" fi - echo "RELEASE_TAG=$RELEASE_TAG" >> $GITHUB_ENV - echo "PRERELEASE=$PRERELEASE" >> $GITHUB_ENV + echo "RELEASE_TAG=${RELEASE_TAG}" >> ${GITHUB_ENV} + echo "PRERELEASE=${PRERELEASE}" >> ${GITHUB_ENV} RELEASE_DATETIME=$(date --rfc-3339=seconds) - echo Release $RELEASE_TAG created from $GITHUB_SHA at $RELEASE_DATETIME > release_body.md + echo Release ${RELEASE_TAG} created from 
${GITHUB_SHA} at ${RELEASE_DATETIME} > release_body.md - name: Move the existing mutable tag # https://github.com/softprops/action-gh-release/issues/171 run: | git fetch --tags - if [[ $GITHUB_EVENT_NAME == 'schedule' ]]; then + if [[ ${GITHUB_EVENT_NAME} == "schedule" ]]; then # Determine if a given tag exists and matches a specific Git commit. # actions/checkout@v4 fetch-tags doesn't work when triggered by schedule - if [ "$(git rev-parse -q --verify "refs/tags/$RELEASE_TAG")" = "$GITHUB_SHA" ]; then - echo "mutable tag $RELEASE_TAG exists and matches $GITHUB_SHA" + if [ "$(git rev-parse -q --verify "refs/tags/${RELEASE_TAG}")" = "${GITHUB_SHA}" ]; then + echo "mutable tag ${RELEASE_TAG} exists and matches ${GITHUB_SHA}" else - git tag -f $RELEASE_TAG $GITHUB_SHA - git push -f origin $RELEASE_TAG:refs/tags/$RELEASE_TAG - echo "created/moved mutable tag $RELEASE_TAG to $GITHUB_SHA" + git tag -f ${RELEASE_TAG} ${GITHUB_SHA} + git push -f origin ${RELEASE_TAG}:refs/tags/${RELEASE_TAG} + echo "created/moved mutable tag ${RELEASE_TAG} to ${GITHUB_SHA}" fi fi @@ -69,50 +69,26 @@ jobs: # https://github.com/actions/upload-release-asset has been replaced by https://github.com/softprops/action-gh-release uses: softprops/action-gh-release@v2 with: - token: ${{ secrets.MY_GITHUB_TOKEN }} # Use the secret as an environment variable + token: ${{ secrets.GITHUB_TOKEN }} # Use the secret as an environment variable prerelease: ${{ env.PRERELEASE }} tag_name: ${{ env.RELEASE_TAG }} # The body field does not support environment variable substitution directly. body_path: release_body.md - # https://github.com/marketplace/actions/docker-login - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: infiniflow - password: ${{ secrets.DOCKERHUB_TOKEN }} - - # https://github.com/marketplace/actions/build-and-push-docker-images - - name: Build and push full image - uses: docker/build-push-action@v6 - with: - context: . 
- push: true - tags: infiniflow/ragflow:${{ env.RELEASE_TAG }} - file: Dockerfile - platforms: linux/amd64 - - # https://github.com/marketplace/actions/build-and-push-docker-images - - name: Build and push slim image - uses: docker/build-push-action@v6 - with: - context: . - push: true - tags: infiniflow/ragflow:${{ env.RELEASE_TAG }}-slim - file: Dockerfile - build-args: LIGHTEN=1 - platforms: linux/amd64 - - - name: Build ragflow-sdk + - name: Build and push ragflow-sdk if: startsWith(github.ref, 'refs/tags/v') run: | - cd sdk/python && \ - uv build + cd sdk/python && uv build && uv publish --token ${{ secrets.PYPI_API_TOKEN }} - - name: Publish package distributions to PyPI + - name: Build and push ragflow-cli if: startsWith(github.ref, 'refs/tags/v') - uses: pypa/gh-action-pypi-publish@release/v1 - with: - packages-dir: sdk/python/dist/ - password: ${{ secrets.PYPI_API_TOKEN }} - verbose: true + run: | + cd admin/client && uv build && uv publish --token ${{ secrets.PYPI_API_TOKEN }} + + - name: Build and push image + run: | + sudo docker login --username infiniflow --password-stdin <<< ${{ secrets.DOCKERHUB_TOKEN }} + sudo docker build --build-arg NEED_MIRROR=1 -t infiniflow/ragflow:${RELEASE_TAG} -f Dockerfile . + sudo docker tag infiniflow/ragflow:${RELEASE_TAG} infiniflow/ragflow:latest + sudo docker push infiniflow/ragflow:${RELEASE_TAG} + sudo docker push infiniflow/ragflow:latest diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b5ea772d5e2..4357bf98278 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -9,8 +9,11 @@ on: - 'docs/**' - '*.md' - '*.mdx' - pull_request: - types: [ opened, synchronize, reopened, labeled ] + # The only difference between pull_request and pull_request_target is the context in which the workflow runs: + # — pull_request_target workflows use the workflow files from the default branch, and secrets are available. 
+ # — pull_request workflows use the workflow files from the pull request branch, and secrets are unavailable. + pull_request_target: + types: [ synchronize, ready_for_review ] paths-ignore: - 'docs/**' - '*.md' @@ -28,26 +31,63 @@ jobs: name: ragflow_tests # https://docs.github.com/en/actions/using-jobs/using-conditions-to-control-job-execution # https://github.com/orgs/community/discussions/26261 - if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'ci') }} - runs-on: [ "self-hosted", "debug" ] + if: ${{ github.event_name != 'pull_request_target' || contains(github.event.pull_request.labels.*.name, 'ci') }} + runs-on: [ "self-hosted", "ragflow-test" ] steps: # https://github.com/hmarr/debug-action #- uses: hmarr/debug-action@v2 - - name: Show who triggered this workflow + - name: Ensure workspace ownership run: | echo "Workflow triggered by ${{ github.event_name }}" - - - name: Ensure workspace ownership - run: echo "chown -R $USER $GITHUB_WORKSPACE" && sudo chown -R $USER $GITHUB_WORKSPACE + echo "chown -R ${USER} ${GITHUB_WORKSPACE}" && sudo chown -R ${USER} ${GITHUB_WORKSPACE} # https://github.com/actions/checkout/issues/1781 - name: Check out code uses: actions/checkout@v4 with: + ref: ${{ (github.event_name == 'pull_request' || github.event_name == 'pull_request_target') && format('refs/pull/{0}/merge', github.event.pull_request.number) || github.sha }} fetch-depth: 0 fetch-tags: true + - name: Check workflow duplication + if: ${{ !cancelled() && !failure() }} + run: | + if [[ ${GITHUB_EVENT_NAME} != "pull_request_target" && ${GITHUB_EVENT_NAME} != "schedule" ]]; then + HEAD=$(git rev-parse HEAD) + # Find a PR that introduced a given commit + gh auth login --with-token <<< "${{ secrets.GITHUB_TOKEN }}" + PR_NUMBER=$(gh pr list --search ${HEAD} --state merged --json number --jq .[0].number) + echo "HEAD=${HEAD}" + echo "PR_NUMBER=${PR_NUMBER}" + if [[ -n "${PR_NUMBER}" ]]; then + 
PR_SHA_FP=${RUNNER_WORKSPACE_PREFIX}/artifacts/${GITHUB_REPOSITORY}/PR_${PR_NUMBER} + if [[ -f "${PR_SHA_FP}" ]]; then + read -r PR_SHA PR_RUN_ID < "${PR_SHA_FP}" + # Calculate the hash of the current workspace content + HEAD_SHA=$(git rev-parse HEAD^{tree}) + if [[ "${HEAD_SHA}" == "${PR_SHA}" ]]; then + echo "Cancel myself since the workspace content hash is the same with PR #${PR_NUMBER} merged. See ${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${PR_RUN_ID} for details." + gh run cancel ${GITHUB_RUN_ID} + while true; do + status=$(gh run view ${GITHUB_RUN_ID} --json status -q .status) + [ "${status}" = "completed" ] && break + sleep 5 + done + exit 1 + fi + fi + fi + elif [[ ${GITHUB_EVENT_NAME} == "pull_request_target" ]]; then + PR_NUMBER=${{ github.event.pull_request.number }} + PR_SHA_FP=${RUNNER_WORKSPACE_PREFIX}/artifacts/${GITHUB_REPOSITORY}/PR_${PR_NUMBER} + # Calculate the hash of the current workspace content + PR_SHA=$(git rev-parse HEAD^{tree}) + echo "PR #${PR_NUMBER} workspace content hash: ${PR_SHA}" + mkdir -p ${RUNNER_WORKSPACE_PREFIX}/artifacts/${GITHUB_REPOSITORY} + echo "${PR_SHA} ${GITHUB_RUN_ID}" > ${PR_SHA_FP} + fi + # https://github.com/astral-sh/ruff-action - name: Static check with Ruff uses: astral-sh/ruff-action@v3 @@ -55,121 +95,145 @@ jobs: version: ">=0.11.x" args: "check" - - name: Build ragflow:nightly-slim - run: | - RUNNER_WORKSPACE_PREFIX=${RUNNER_WORKSPACE_PREFIX:-$HOME} - sudo docker pull ubuntu:22.04 - sudo docker build --progress=plain --build-arg LIGHTEN=1 --build-arg NEED_MIRROR=1 -f Dockerfile -t infiniflow/ragflow:nightly-slim . - - name: Build ragflow:nightly run: | - sudo docker build --progress=plain --build-arg NEED_MIRROR=1 -f Dockerfile -t infiniflow/ragflow:nightly . 
- - - name: Start ragflow:nightly-slim - run: | - echo -e "\nRAGFLOW_IMAGE=infiniflow/ragflow:nightly-slim" >> docker/.env - sudo docker compose -f docker/docker-compose.yml up -d - - - name: Stop ragflow:nightly-slim - if: always() # always run this step even if previous steps failed - run: | - sudo docker compose -f docker/docker-compose.yml down -v + RUNNER_WORKSPACE_PREFIX=${RUNNER_WORKSPACE_PREFIX:-${HOME}} + RAGFLOW_IMAGE=infiniflow/ragflow:${GITHUB_RUN_ID} + echo "RAGFLOW_IMAGE=${RAGFLOW_IMAGE}" >> ${GITHUB_ENV} + sudo docker pull ubuntu:22.04 + sudo DOCKER_BUILDKIT=1 docker build --build-arg NEED_MIRROR=1 -f Dockerfile -t ${RAGFLOW_IMAGE} . + if [[ ${GITHUB_EVENT_NAME} == "schedule" ]]; then + export HTTP_API_TEST_LEVEL=p3 + else + export HTTP_API_TEST_LEVEL=p2 + fi + echo "HTTP_API_TEST_LEVEL=${HTTP_API_TEST_LEVEL}" >> ${GITHUB_ENV} + echo "RAGFLOW_CONTAINER=${GITHUB_RUN_ID}-ragflow-cpu-1" >> ${GITHUB_ENV} - name: Start ragflow:nightly run: | - echo -e "\nRAGFLOW_IMAGE=infiniflow/ragflow:nightly" >> docker/.env - sudo docker compose -f docker/docker-compose.yml up -d + # Determine runner number (default to 1 if not found) + RUNNER_NUM=$(sudo docker inspect $(hostname) --format '{{index .Config.Labels "com.docker.compose.container-number"}}' 2>/dev/null || true) + RUNNER_NUM=${RUNNER_NUM:-1} + + # Compute port numbers using bash arithmetic + ES_PORT=$((1200 + RUNNER_NUM * 10)) + OS_PORT=$((1201 + RUNNER_NUM * 10)) + INFINITY_THRIFT_PORT=$((23817 + RUNNER_NUM * 10)) + INFINITY_HTTP_PORT=$((23820 + RUNNER_NUM * 10)) + INFINITY_PSQL_PORT=$((5432 + RUNNER_NUM * 10)) + MYSQL_PORT=$((5455 + RUNNER_NUM * 10)) + MINIO_PORT=$((9000 + RUNNER_NUM * 10)) + MINIO_CONSOLE_PORT=$((9001 + RUNNER_NUM * 10)) + REDIS_PORT=$((6379 + RUNNER_NUM * 10)) + TEI_PORT=$((6380 + RUNNER_NUM * 10)) + KIBANA_PORT=$((6601 + RUNNER_NUM * 10)) + SVR_HTTP_PORT=$((9380 + RUNNER_NUM * 10)) + ADMIN_SVR_HTTP_PORT=$((9381 + RUNNER_NUM * 10)) + SVR_MCP_PORT=$((9382 + RUNNER_NUM * 10)) + 
SANDBOX_EXECUTOR_MANAGER_PORT=$((9385 + RUNNER_NUM * 10)) + SVR_WEB_HTTP_PORT=$((80 + RUNNER_NUM * 10)) + SVR_WEB_HTTPS_PORT=$((443 + RUNNER_NUM * 10)) + + # Persist computed ports into docker/.env so docker-compose uses the correct host bindings + echo "" >> docker/.env + echo -e "ES_PORT=${ES_PORT}" >> docker/.env + echo -e "OS_PORT=${OS_PORT}" >> docker/.env + echo -e "INFINITY_THRIFT_PORT=${INFINITY_THRIFT_PORT}" >> docker/.env + echo -e "INFINITY_HTTP_PORT=${INFINITY_HTTP_PORT}" >> docker/.env + echo -e "INFINITY_PSQL_PORT=${INFINITY_PSQL_PORT}" >> docker/.env + echo -e "MYSQL_PORT=${MYSQL_PORT}" >> docker/.env + echo -e "MINIO_PORT=${MINIO_PORT}" >> docker/.env + echo -e "MINIO_CONSOLE_PORT=${MINIO_CONSOLE_PORT}" >> docker/.env + echo -e "REDIS_PORT=${REDIS_PORT}" >> docker/.env + echo -e "TEI_PORT=${TEI_PORT}" >> docker/.env + echo -e "KIBANA_PORT=${KIBANA_PORT}" >> docker/.env + echo -e "SVR_HTTP_PORT=${SVR_HTTP_PORT}" >> docker/.env + echo -e "ADMIN_SVR_HTTP_PORT=${ADMIN_SVR_HTTP_PORT}" >> docker/.env + echo -e "SVR_MCP_PORT=${SVR_MCP_PORT}" >> docker/.env + echo -e "SANDBOX_EXECUTOR_MANAGER_PORT=${SANDBOX_EXECUTOR_MANAGER_PORT}" >> docker/.env + echo -e "SVR_WEB_HTTP_PORT=${SVR_WEB_HTTP_PORT}" >> docker/.env + echo -e "SVR_WEB_HTTPS_PORT=${SVR_WEB_HTTPS_PORT}" >> docker/.env + + echo -e "COMPOSE_PROFILES=\${COMPOSE_PROFILES},tei-cpu" >> docker/.env + echo -e "TEI_MODEL=BAAI/bge-small-en-v1.5" >> docker/.env + echo -e "RAGFLOW_IMAGE=${RAGFLOW_IMAGE}" >> docker/.env + echo "HOST_ADDRESS=http://host.docker.internal:${SVR_HTTP_PORT}" >> ${GITHUB_ENV} + + sudo docker compose -f docker/docker-compose.yml -p ${GITHUB_RUN_ID} up -d + uv sync --python 3.10 --only-group test --no-default-groups --frozen && uv pip install sdk/python - name: Run sdk tests against Elasticsearch run: | export http_proxy=""; export https_proxy=""; export no_proxy=""; export HTTP_PROXY=""; export HTTPS_PROXY=""; export NO_PROXY="" - export HOST_ADDRESS=http://host.docker.internal:9380 - 
until sudo docker exec ragflow-server curl -s --connect-timeout 5 ${HOST_ADDRESS} > /dev/null; do + until sudo docker exec ${RAGFLOW_CONTAINER} curl -s --connect-timeout 5 ${HOST_ADDRESS} > /dev/null; do echo "Waiting for service to be available..." sleep 5 done - if [[ $GITHUB_EVENT_NAME == 'schedule' ]]; then - export HTTP_API_TEST_LEVEL=p3 - else - export HTTP_API_TEST_LEVEL=p2 - fi - UV_LINK_MODE=copy uv sync --python 3.10 --only-group test --no-default-groups --frozen && uv pip install sdk/python && uv run --only-group test --no-default-groups pytest -s --tb=short --level=${HTTP_API_TEST_LEVEL} test/testcases/test_sdk_api + source .venv/bin/activate && pytest -s --tb=short --level=${HTTP_API_TEST_LEVEL} test/testcases/test_sdk_api - name: Run frontend api tests against Elasticsearch run: | export http_proxy=""; export https_proxy=""; export no_proxy=""; export HTTP_PROXY=""; export HTTPS_PROXY=""; export NO_PROXY="" - export HOST_ADDRESS=http://host.docker.internal:9380 - until sudo docker exec ragflow-server curl -s --connect-timeout 5 ${HOST_ADDRESS} > /dev/null; do + until sudo docker exec ${RAGFLOW_CONTAINER} curl -s --connect-timeout 5 ${HOST_ADDRESS} > /dev/null; do echo "Waiting for service to be available..." 
sleep 5 done - cd sdk/python && UV_LINK_MODE=copy uv sync --python 3.10 --group test --frozen && source .venv/bin/activate && cd test/test_frontend_api && pytest -s --tb=short get_email.py test_dataset.py + source .venv/bin/activate && pytest -s --tb=short sdk/python/test/test_frontend_api/get_email.py sdk/python/test/test_frontend_api/test_dataset.py - name: Run http api tests against Elasticsearch run: | export http_proxy=""; export https_proxy=""; export no_proxy=""; export HTTP_PROXY=""; export HTTPS_PROXY=""; export NO_PROXY="" - export HOST_ADDRESS=http://host.docker.internal:9380 - until sudo docker exec ragflow-server curl -s --connect-timeout 5 ${HOST_ADDRESS} > /dev/null; do + until sudo docker exec ${RAGFLOW_CONTAINER} curl -s --connect-timeout 5 ${HOST_ADDRESS} > /dev/null; do echo "Waiting for service to be available..." sleep 5 done - if [[ $GITHUB_EVENT_NAME == 'schedule' ]]; then - export HTTP_API_TEST_LEVEL=p3 - else - export HTTP_API_TEST_LEVEL=p2 - fi - UV_LINK_MODE=copy uv sync --python 3.10 --only-group test --no-default-groups --frozen && uv run --only-group test --no-default-groups pytest -s --tb=short --level=${HTTP_API_TEST_LEVEL} test/testcases/test_http_api + source .venv/bin/activate && pytest -s --tb=short --level=${HTTP_API_TEST_LEVEL} test/testcases/test_http_api - name: Stop ragflow:nightly if: always() # always run this step even if previous steps failed run: | - sudo docker compose -f docker/docker-compose.yml down -v + sudo docker compose -f docker/docker-compose.yml -p ${GITHUB_RUN_ID} down -v || true + sudo docker ps -a --filter "label=com.docker.compose.project=${GITHUB_RUN_ID}" -q | xargs -r sudo docker rm -f - name: Start ragflow:nightly run: | - sudo DOC_ENGINE=infinity docker compose -f docker/docker-compose.yml up -d + sed -i '1i DOC_ENGINE=infinity' docker/.env + sudo docker compose -f docker/docker-compose.yml -p ${GITHUB_RUN_ID} up -d - name: Run sdk tests against Infinity run: | export http_proxy=""; export 
https_proxy=""; export no_proxy=""; export HTTP_PROXY=""; export HTTPS_PROXY=""; export NO_PROXY="" - export HOST_ADDRESS=http://host.docker.internal:9380 - until sudo docker exec ragflow-server curl -s --connect-timeout 5 ${HOST_ADDRESS} > /dev/null; do + until sudo docker exec ${RAGFLOW_CONTAINER} curl -s --connect-timeout 5 ${HOST_ADDRESS} > /dev/null; do echo "Waiting for service to be available..." sleep 5 done - if [[ $GITHUB_EVENT_NAME == 'schedule' ]]; then - export HTTP_API_TEST_LEVEL=p3 - else - export HTTP_API_TEST_LEVEL=p2 - fi - UV_LINK_MODE=copy uv sync --python 3.10 --only-group test --no-default-groups --frozen && uv pip install sdk/python && DOC_ENGINE=infinity uv run --only-group test --no-default-groups pytest -s --tb=short --level=${HTTP_API_TEST_LEVEL} test/testcases/test_sdk_api + source .venv/bin/activate && DOC_ENGINE=infinity pytest -s --tb=short --level=${HTTP_API_TEST_LEVEL} test/testcases/test_sdk_api - name: Run frontend api tests against Infinity run: | export http_proxy=""; export https_proxy=""; export no_proxy=""; export HTTP_PROXY=""; export HTTPS_PROXY=""; export NO_PROXY="" - export HOST_ADDRESS=http://host.docker.internal:9380 - until sudo docker exec ragflow-server curl -s --connect-timeout 5 ${HOST_ADDRESS} > /dev/null; do + until sudo docker exec ${RAGFLOW_CONTAINER} curl -s --connect-timeout 5 ${HOST_ADDRESS} > /dev/null; do echo "Waiting for service to be available..." 
sleep 5 done - cd sdk/python && UV_LINK_MODE=copy uv sync --python 3.10 --group test --frozen && source .venv/bin/activate && cd test/test_frontend_api && pytest -s --tb=short get_email.py test_dataset.py + source .venv/bin/activate && DOC_ENGINE=infinity pytest -s --tb=short sdk/python/test/test_frontend_api/get_email.py sdk/python/test/test_frontend_api/test_dataset.py - name: Run http api tests against Infinity run: | export http_proxy=""; export https_proxy=""; export no_proxy=""; export HTTP_PROXY=""; export HTTPS_PROXY=""; export NO_PROXY="" - export HOST_ADDRESS=http://host.docker.internal:9380 - until sudo docker exec ragflow-server curl -s --connect-timeout 5 ${HOST_ADDRESS} > /dev/null; do + until sudo docker exec ${RAGFLOW_CONTAINER} curl -s --connect-timeout 5 ${HOST_ADDRESS} > /dev/null; do echo "Waiting for service to be available..." sleep 5 done - if [[ $GITHUB_EVENT_NAME == 'schedule' ]]; then - export HTTP_API_TEST_LEVEL=p3 - else - export HTTP_API_TEST_LEVEL=p2 - fi - UV_LINK_MODE=copy uv sync --python 3.10 --only-group test --no-default-groups --frozen && uv run --only-group test --no-default-groups pytest -s --tb=short --level=${HTTP_API_TEST_LEVEL} test/testcases/test_http_api + source .venv/bin/activate && DOC_ENGINE=infinity pytest -s --tb=short --level=${HTTP_API_TEST_LEVEL} test/testcases/test_http_api - name: Stop ragflow:nightly if: always() # always run this step even if previous steps failed run: | - sudo DOC_ENGINE=infinity docker compose -f docker/docker-compose.yml down -v + # Sometimes `docker compose down` fails due to hung containers, heavy load, etc. Remove such containers to release resources (for example, listening ports). 
+ sudo docker compose -f docker/docker-compose.yml -p ${GITHUB_RUN_ID} down -v || true + sudo docker ps -a --filter "label=com.docker.compose.project=${GITHUB_RUN_ID}" -q | xargs -r sudo docker rm -f + if [[ -n ${RAGFLOW_IMAGE} ]]; then + sudo docker rmi -f ${RAGFLOW_IMAGE} + fi diff --git a/.gitignore b/.gitignore index 52c53277043..fbf80b3aabd 100644 --- a/.gitignore +++ b/.gitignore @@ -149,7 +149,7 @@ out # Nuxt.js build / generate output .nuxt dist - +ragflow_cli.egg-info # Gatsby files .cache/ # Comment in the public line in if your project uses Gatsby and not Next.js @@ -193,3 +193,5 @@ dist # SvelteKit build / generate output .svelte-kit +# Default backup dir +backup diff --git a/.trivyignore b/.trivyignore new file mode 100644 index 00000000000..8f2725fe68d --- /dev/null +++ b/.trivyignore @@ -0,0 +1,15 @@ +**/*.md +**/*.min.js +**/*.min.css +**/*.svg +**/*.png +**/*.jpg +**/*.jpeg +**/*.gif +**/*.woff +**/*.woff2 +**/*.map +**/*.webp +**/*.ico +**/*.ttf +**/*.eot \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000000..7e5d43f9d68 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,116 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +RAGFlow is an open-source RAG (Retrieval-Augmented Generation) engine based on deep document understanding. 
It's a full-stack application with: +- Python backend (Flask-based API server) +- React/TypeScript frontend (built with UmiJS) +- Microservices architecture with Docker deployment +- Multiple data stores (MySQL, Elasticsearch/Infinity, Redis, MinIO) + +## Architecture + +### Backend (`/api/`) +- **Main Server**: `api/ragflow_server.py` - Flask application entry point +- **Apps**: Modular Flask blueprints in `api/apps/` for different functionalities: + - `kb_app.py` - Knowledge base management + - `dialog_app.py` - Chat/conversation handling + - `document_app.py` - Document processing + - `canvas_app.py` - Agent workflow canvas + - `file_app.py` - File upload/management +- **Services**: Business logic in `api/db/services/` +- **Models**: Database models in `api/db/db_models.py` + +### Core Processing (`/rag/`) +- **Document Processing**: `deepdoc/` - PDF parsing, OCR, layout analysis +- **LLM Integration**: `rag/llm/` - Model abstractions for chat, embedding, reranking +- **RAG Pipeline**: `rag/flow/` - Chunking, parsing, tokenization +- **Graph RAG**: `graphrag/` - Knowledge graph construction and querying + +### Agent System (`/agent/`) +- **Components**: Modular workflow components (LLM, retrieval, categorize, etc.) +- **Templates**: Pre-built agent workflows in `agent/templates/` +- **Tools**: External API integrations (Tavily, Wikipedia, SQL execution, etc.) 
+ +### Frontend (`/web/`) +- React/TypeScript with UmiJS framework +- Ant Design + shadcn/ui components +- State management with Zustand +- Tailwind CSS for styling + +## Common Development Commands + +### Backend Development +```bash +# Install Python dependencies +uv sync --python 3.10 --all-extras +uv run download_deps.py +pre-commit install + +# Start dependent services +docker compose -f docker/docker-compose-base.yml up -d + +# Run backend (requires services to be running) +source .venv/bin/activate +export PYTHONPATH=$(pwd) +bash docker/launch_backend_service.sh + +# Run tests +uv run pytest + +# Linting +ruff check +ruff format +``` + +### Frontend Development +```bash +cd web +npm install +npm run dev # Development server +npm run build # Production build +npm run lint # ESLint +npm run test # Jest tests +``` + +### Docker Development +```bash +# Full stack with Docker +cd docker +docker compose -f docker-compose.yml up -d + +# Check server status +docker logs -f ragflow-server + +# Rebuild images +docker build --platform linux/amd64 -f Dockerfile -t infiniflow/ragflow:nightly . 
+``` + +## Key Configuration Files + +- `docker/.env` - Environment variables for Docker deployment +- `docker/service_conf.yaml.template` - Backend service configuration +- `pyproject.toml` - Python dependencies and project configuration +- `web/package.json` - Frontend dependencies and scripts + +## Testing + +- **Python**: pytest with markers (p1/p2/p3 priority levels) +- **Frontend**: Jest with React Testing Library +- **API Tests**: HTTP API and SDK tests in `test/` and `sdk/python/test/` + +## Database Engines + +RAGFlow supports switching between Elasticsearch (default) and Infinity: +- Set `DOC_ENGINE=infinity` in `docker/.env` to use Infinity +- Requires container restart: `docker compose down -v && docker compose up -d` + +## Development Environment Requirements + +- Python 3.10-3.12 +- Node.js >=18.20.4 +- Docker & Docker Compose +- uv package manager +- 16GB+ RAM, 50GB+ disk space \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 67fd2645682..b16a0d7d518 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,8 +4,6 @@ USER root SHELL ["/bin/bash", "-c"] ARG NEED_MIRROR=0 -ARG LIGHTEN=0 -ENV LIGHTEN=${LIGHTEN} WORKDIR /ragflow @@ -17,13 +15,6 @@ RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/huggingface.co /huggingface.co/InfiniFlow/text_concat_xgb_v1.0 \ /huggingface.co/InfiniFlow/deepdoc \ | tar -xf - --strip-components=3 -C /ragflow/rag/res/deepdoc -RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/huggingface.co,target=/huggingface.co \ - if [ "$LIGHTEN" != "1" ]; then \ - (tar -cf - \ - /huggingface.co/BAAI/bge-large-zh-v1.5 \ - /huggingface.co/maidalun1020/bce-embedding-base_v1 \ - | tar -xf - --strip-components=2 -C /root/.ragflow) \ - fi # https://github.com/chrismattmann/tika-python # This is the only way to run python-tika without internet access. Without this set, the default is to check the tika version and pull latest every time from Apache. 
@@ -63,11 +54,11 @@ RUN --mount=type=cache,id=ragflow_apt,target=/var/cache/apt,sharing=locked \ apt install -y ghostscript RUN if [ "$NEED_MIRROR" == "1" ]; then \ - pip3 config set global.index-url https://mirrors.aliyun.com/pypi/simple && \ - pip3 config set global.trusted-host mirrors.aliyun.com; \ + pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \ + pip3 config set global.trusted-host pypi.tuna.tsinghua.edu.cn; \ mkdir -p /etc/uv && \ echo "[[index]]" > /etc/uv/uv.toml && \ - echo 'url = "https://mirrors.aliyun.com/pypi/simple"' >> /etc/uv/uv.toml && \ + echo 'url = "https://pypi.tuna.tsinghua.edu.cn/simple"' >> /etc/uv/uv.toml && \ echo "default = true" >> /etc/uv/uv.toml; \ fi; \ pipx install uv @@ -151,15 +142,11 @@ COPY pyproject.toml uv.lock ./ # uv records index url into uv.lock but doesn't failover among multiple indexes RUN --mount=type=cache,id=ragflow_uv,target=/root/.cache/uv,sharing=locked \ if [ "$NEED_MIRROR" == "1" ]; then \ - sed -i 's|pypi.org|mirrors.aliyun.com/pypi|g' uv.lock; \ + sed -i 's|pypi.org|pypi.tuna.tsinghua.edu.cn|g' uv.lock; \ else \ - sed -i 's|mirrors.aliyun.com/pypi|pypi.org|g' uv.lock; \ + sed -i 's|pypi.tuna.tsinghua.edu.cn|pypi.org|g' uv.lock; \ fi; \ - if [ "$LIGHTEN" == "1" ]; then \ - uv sync --python 3.10 --frozen; \ - else \ - uv sync --python 3.10 --frozen --all-extras; \ - fi + uv sync --python 3.10 --frozen COPY web web COPY docs docs @@ -169,11 +156,7 @@ RUN --mount=type=cache,id=ragflow_npm,target=/root/.npm,sharing=locked \ COPY .git /ragflow/.git RUN version_info=$(git describe --tags --match=v* --first-parent --always); \ - if [ "$LIGHTEN" == "1" ]; then \ - version_info="$version_info slim"; \ - else \ - version_info="$version_info full"; \ - fi; \ + version_info="$version_info"; \ echo "RAGFlow version: $version_info"; \ echo $version_info > /ragflow/VERSION @@ -191,6 +174,7 @@ ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" ENV PYTHONPATH=/ragflow/ COPY web web +COPY admin admin COPY api 
api COPY conf conf COPY deepdoc deepdoc @@ -201,6 +185,7 @@ COPY agentic_reasoning agentic_reasoning COPY pyproject.toml uv.lock ./ COPY mcp mcp COPY plugin plugin +COPY common common COPY docker/service_conf.yaml.template ./conf/service_conf.yaml.template COPY docker/entrypoint.sh ./ diff --git a/Dockerfile_tei b/Dockerfile_tei new file mode 100644 index 00000000000..539002359b8 --- /dev/null +++ b/Dockerfile_tei @@ -0,0 +1,14 @@ +FROM ghcr.io/huggingface/text-embeddings-inference:cpu-1.8 + +# uv tool install huggingface_hub +# hf download --local-dir tei_data/BAAI/bge-small-en-v1.5 BAAI/bge-small-en-v1.5 +# hf download --local-dir tei_data/BAAI/bge-m3 BAAI/bge-m3 +# hf download --local-dir tei_data/Qwen/Qwen3-Embedding-0.6B Qwen/Qwen3-Embedding-0.6B +COPY tei_data /data + +# curl -X POST http://localhost:6380/embed -H "Content-Type: application/json" -d '{"inputs": "Hello, world! This is a test sentence."}' +# curl -X POST http://tei:80/embed -H "Content-Type: application/json" -d '{"inputs": "Hello, world! This is a test sentence."}' +# [[-0.058816575,0.019564206,0.026697718,...]] + +# curl -X POST http://localhost:6380/v1/embeddings -H "Content-Type: application/json" -d '{"input": "Hello, world! This is a test sentence."}' +# {"object":"list","data":[{"object":"embedding","embedding":[-0.058816575,0.019564206,...],"index":0}],"model":"BAAI/bge-small-en-v1.5","usage":{"prompt_tokens":12,"total_tokens":12}} diff --git a/README.md b/README.md index c8e47cdbc92..299bd67fd0a 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@
@@ -22,7 +22,7 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+