🎨 桌面端支持搜索图片 OCR 文本 #3470

siyuan-note · Jan 16, 2023 · 6e1ce2b · 6e1ce2b
1 parent 6cdfde7
commit 6e1ce2b
Show file tree

Hide file tree

Showing 3 changed files with 312 additions and 3 deletions.
diff --git a/...808180117-6v0mkxr/20200923234011-ieuun1p/20210808180303-xaduj2o/20200924100744-br924ar.sy b/...808180117-6v0mkxr/20200923234011-ieuun1p/20210808180303-xaduj2o/20200924100744-br924ar.sy
@@ -6,7 +6,7 @@
 		"id": "20200924100744-br924ar",
 		"title": "Assets",
 		"type": "doc",
-		"updated": "20220920003954"
+		"updated": "20230117003842"
 	},
 	"Children": [
 		{
@@ -659,6 +659,109 @@
 					]
 				}
 			]
+		},
+		{
+			"ID": "20230117003636-epvk98g",
+			"Type": "NodeHeading",
+			"HeadingLevel": 2,
+			"Properties": {
+				"id": "20230117003636-epvk98g",
+				"updated": "20230117003636"
+			},
+			"Children": [
+				{
+					"Type": "NodeText",
+					"Data": "Extracting text from images with OCR"
+				}
+			]
+		},
+		{
+			"ID": "20230117003636-0buf00n",
+			"Type": "NodeParagraph",
+			"Properties": {
+				"id": "20230117003636-0buf00n",
+				"updated": "20230117003758"
+			},
+			"Children": [
+				{
+					"Type": "NodeText",
+					"Data": "This feature is only supported on the desktop, and the "
+				},
+				{
+					"Type": "NodeTextMark",
+					"TextMarkType": "a",
+					"TextMarkAHref": "https://github.com/tesseract-ocr/tesseract",
+					"TextMarkTextContent": "Tesseract OCR"
+				},
+				{
+					"Type": "NodeText",
+					"Data": " program needs to be manually installed first. "
+				},
+				{
+					"Type": "NodeTextMark",
+					"TextMarkType": "tag",
+					"TextMarkTextContent": "Note"
+				},
+				{
+					"Type": "NodeText",
+					"Data": " that you need to select the Chinese language pack when installing Tesseract OCR, and add the Tesseract-OCR installation directory to the PATH environment variable, so that SiYuan can directly call the "
+				},
+				{
+					"Type": "NodeTextMark",
+					"TextMarkType": "code",
+					"TextMarkTextContent": "tesseract"
+				},
+				{
+					"Type": "NodeText",
+					"Data": " command to extract text from images via OCR. If the installation was successful, you can find "
+				},
+				{
+					"Type": "NodeTextMark",
+					"TextMarkType": "code",
+					"TextMarkTextContent": "tesseract-ocr enabled"
+				},
+				{
+					"Type": "NodeText",
+					"Data": " in the kernel boot log."
+				}
+			]
+		},
+		{
+			"ID": "20230117003636-ouxlaen",
+			"Type": "NodeParagraph",
+			"Properties": {
+				"id": "20230117003636-ouxlaen",
+				"updated": "20230117003636"
+			},
+			"Children": [
+				{
+					"Type": "NodeText",
+					"Data": "Image OCR extracts text automatically in the background and currently supports only PNG and JPG images. The extracted text is used by the search function, so that images can be located by text keywords."
+				}
+			]
+		},
+		{
+			"ID": "20230117003636-iel5zll",
+			"Type": "NodeParagraph",
+			"Properties": {
+				"id": "20230117003636-iel5zll",
+				"updated": "20230117003842"
+			},
+			"Children": [
+				{
+					"Type": "NodeText",
+					"Data": "The OCR results are saved in "
+				},
+				{
+					"Type": "NodeTextMark",
+					"TextMarkType": "code",
+					"TextMarkTextContent": "workspace/data/assets/ocr-texts.json"
+				},
+				{
+					"Type": "NodeText",
+					"Data": ". If you need to run OCR again or adjust the extraction results, you can delete or modify this file while SiYuan is closed, and then manually rebuild the index after starting SiYuan."
+				}
+			]
 		}
 	]
 }
diff --git a/...808180117-czj9bvb/20200812220555-lj3enxa/20210808180321-hbvl5c2/20200915214115-42b8zma.sy b/...808180117-czj9bvb/20200812220555-lj3enxa/20210808180321-hbvl5c2/20200915214115-42b8zma.sy
@@ -6,7 +6,7 @@
 		"id": "20200915214115-42b8zma",
 		"title": "资源文件",
 		"type": "doc",
-		"updated": "20220905215650"
+		"updated": "20230117003750"
 	},
 	"Children": [
 		{
@@ -681,6 +681,109 @@
 					]
 				}
 			]
+		},
+		{
+			"ID": "20230117002036-vmc0mx4",
+			"Type": "NodeHeading",
+			"HeadingLevel": 2,
+			"Properties": {
+				"id": "20230117002036-vmc0mx4",
+				"updated": "20230117002509"
+			},
+			"Children": [
+				{
+					"Type": "NodeText",
+					"Data": "图片 OCR 提取文本"
+				}
+			]
+		},
+		{
+			"ID": "20230117002059-ssmlejt",
+			"Type": "NodeParagraph",
+			"Properties": {
+				"id": "20230117002059-ssmlejt",
+				"updated": "20230117003750"
+			},
+			"Children": [
+				{
+					"Type": "NodeText",
+					"Data": "该特性仅在桌面端支持，并且需要先手动安装 "
+				},
+				{
+					"Type": "NodeTextMark",
+					"TextMarkType": "a",
+					"TextMarkAHref": "https://github.com/tesseract-ocr/tesseract",
+					"TextMarkTextContent": "Tesseract OCR"
+				},
+				{
+					"Type": "NodeText",
+					"Data": " 程序。"
+				},
+				{
+					"Type": "NodeTextMark",
+					"TextMarkType": "tag",
+					"TextMarkTextContent": "注意"
+				},
+				{
+					"Type": "NodeText",
+					"Data": "在安装 Tesseract OCR 时需要勾选中文语言包，并将安装后的 Tesseract-OCR 目录路径添加到环境变量 PATH 中，这样思源才能直接调用 "
+				},
+				{
+					"Type": "NodeTextMark",
+					"TextMarkType": "code",
+					"TextMarkTextContent": "tesseract"
+				},
+				{
+					"Type": "NodeText",
+					"Data": " 命令进行图片 OCR 提取文本。如果安装正常的话，在内核启动日志中可以搜索到 "
+				},
+				{
+					"Type": "NodeTextMark",
+					"TextMarkType": "code",
+					"TextMarkTextContent": "tesseract-ocr enabled"
+				},
+				{
+					"Type": "NodeText",
+					"Data": "。"
+				}
+			]
+		},
+		{
+			"ID": "20230117002511-ex9qrjy",
+			"Type": "NodeParagraph",
+			"Properties": {
+				"id": "20230117002511-ex9qrjy",
+				"updated": "20230117003207"
+			},
+			"Children": [
+				{
+					"Type": "NodeText",
+					"Data": "图片 OCR 提取文本过程是自动在后台进行的，目前仅支持 PNG、JPG 图片。提取的文本会被用于搜索功能，这样就能通过文本关键字来定位图片。"
+				}
+			]
+		},
+		{
+			"ID": "20230117003118-da7wjvx",
+			"Type": "NodeParagraph",
+			"Properties": {
+				"id": "20230117003118-da7wjvx",
+				"updated": "20230117003434"
+			},
+			"Children": [
+				{
+					"Type": "NodeText",
+					"Data": "OCR 的结果保存在 "
+				},
+				{
+					"Type": "NodeTextMark",
+					"TextMarkType": "code",
+					"TextMarkTextContent": "工作空间/data/assets/ocr-texts.json"
+				},
+				{
+					"Type": "NodeText",
+					"Data": " 中，如果需要重新 OCR 或者调整提取结果，可以在关闭思源的情况下删除或修改该文件，启动思源以后手动重建索引。"
+				}
+			]
 		}
 	]
 }
diff --git a/...226090932-5lcq56f/20211226115423-d5z1joq/20211226121203-rjjngpz/20211226123038-4umgpxy.sy b/...226090932-5lcq56f/20211226115423-d5z1joq/20211226121203-rjjngpz/20211226123038-4umgpxy.sy
@@ -5,7 +5,7 @@
 	"Properties": {
 		"id": "20211226123038-4umgpxy",
 		"title": "資料文件",
-		"updated": "20220905215759"
+		"updated": "20230117003908"
 	},
 	"Children": [
 		{
@@ -660,6 +660,109 @@
 					]
 				}
 			]
+		},
+		{
+			"ID": "20230117003901-ze4mbzz",
+			"Type": "NodeHeading",
+			"HeadingLevel": 2,
+			"Properties": {
+				"id": "20230117003901-ze4mbzz",
+				"updated": "20230117003901"
+			},
+			"Children": [
+				{
+					"Type": "NodeText",
+					"Data": "圖片 OCR 提取文本"
+				}
+			]
+		},
+		{
+			"ID": "20230117003901-5pfzttc",
+			"Type": "NodeParagraph",
+			"Properties": {
+				"id": "20230117003901-5pfzttc",
+				"updated": "20230117003908"
+			},
+			"Children": [
+				{
+					"Type": "NodeText",
+					"Data": "該特性僅在桌面端支持，並且需要先手動安裝 "
+				},
+				{
+					"Type": "NodeTextMark",
+					"TextMarkType": "a",
+					"TextMarkAHref": "https://github.com/tesseract-ocr/tesseract",
+					"TextMarkTextContent": "Tesseract OCR"
+				},
+				{
+					"Type": "NodeText",
+					"Data": " 程序。"
+				},
+				{
+					"Type": "NodeTextMark",
+					"TextMarkType": "tag",
+					"TextMarkTextContent": "注意"
+				},
+				{
+					"Type": "NodeText",
+					"Data": "在安裝 Tesseract OCR 時需要勾選中文語言包，並將安裝後的 Tesseract-OCR 目錄路徑添加到環境變量 PATH 中，這樣思源才能直接調用 "
+				},
+				{
+					"Type": "NodeTextMark",
+					"TextMarkType": "code",
+					"TextMarkTextContent": "tesseract"
+				},
+				{
+					"Type": "NodeText",
+					"Data": " 命令進行圖片 OCR 提取文本。如果安裝正常的話，在內核啟動日誌中可以搜索到 "
+				},
+				{
+					"Type": "NodeTextMark",
+					"TextMarkType": "code",
+					"TextMarkTextContent": "tesseract-ocr enabled"
+				},
+				{
+					"Type": "NodeText",
+					"Data": "。"
+				}
+			]
+		},
+		{
+			"ID": "20230117003901-j7r2o7f",
+			"Type": "NodeParagraph",
+			"Properties": {
+				"id": "20230117003901-j7r2o7f",
+				"updated": "20230117003901"
+			},
+			"Children": [
+				{
+					"Type": "NodeText",
+					"Data": "圖片 OCR 提取文本過程是自動在後台進行的，目前僅支持 PNG、JPG 圖片。提取的文本會被用於搜索功能，這樣就能通過文本關鍵字來定位圖片。"
+				}
+			]
+		},
+		{
+			"ID": "20230117003901-ds4ok9s",
+			"Type": "NodeParagraph",
+			"Properties": {
+				"id": "20230117003901-ds4ok9s",
+				"updated": "20230117003901"
+			},
+			"Children": [
+				{
+					"Type": "NodeText",
+					"Data": "OCR 的結果保存在 "
+				},
+				{
+					"Type": "NodeTextMark",
+					"TextMarkType": "code",
+					"TextMarkTextContent": "工作空間/data/assets/ocr-texts.json"
+				},
+				{
+					"Type": "NodeText",
+					"Data": " 中，如果需要重新 OCR 或者調整提取結果，可以在關閉思源的情況下刪除或修改該文件，啟動思源以後手動重建索引。"
+				}
+			]
 		}
 	]
 }