Skip to content

Commit

Permalink
add quality field
Browse files Browse the repository at this point in the history
  • Loading branch information
versun committed Apr 28, 2024
1 parent 4e1c9b5 commit daf65fe
Show file tree
Hide file tree
Showing 6 changed files with 76 additions and 32 deletions.
2 changes: 1 addition & 1 deletion core/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def __init__(self, *args, **kwargs):

class Meta:
model = O_Feed
fields = ['feed_url', 'update_frequency', 'max_posts', 'translator', 'translation_display', 'summary_engine', 'summary_detail', 'additional_prompt', 'name']
fields = ['feed_url', 'update_frequency', 'max_posts', 'translator', 'translation_display', 'summary_engine', 'summary_detail', 'additional_prompt', 'name', 'quality']

# 重写save方法,以处理自定义字段的数据
def save(self, commit=True):
Expand Down
22 changes: 22 additions & 0 deletions core/migrations/0011_o_feed_quality.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Generated by Django 5.0.4 on 2024-04-28 06:38

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add the boolean ``quality`` field to the ``o_feed`` model."""

    # This migration is applied on top of migration 0010 of the "core" app.
    dependencies = [
        (
            "core",
            "0010_o_feed_additional_prompt_o_feed_content_type_summary_and_more",
        ),
    ]

    operations = [
        migrations.AddField(
            model_name="o_feed",
            name="quality",
            # The field defaults to False, so the new behaviour is opt-in.
            field=models.BooleanField(
                verbose_name="Best Quality",
                help_text="Formatting such as hyperlinks, bold, italics, etc. will be lost for optimal translation quality.",
                default=False,
            ),
        ),
    ]
1 change: 1 addition & 0 deletions core/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ class O_Feed(models.Model):
valid = models.BooleanField(_("Valid"), null=True,editable=False, )
update_frequency = models.IntegerField(_("Update Frequency"), default=os.getenv("default_update_frequency", 30), help_text=_("Minutes"))
max_posts = models.IntegerField(_("Max Posts"), default=os.getenv("default_max_posts", 20), help_text=_("Max number of posts to be translated"))
quality = models.BooleanField(_("Best Quality"), default=False, help_text=_("Formatting such as hyperlinks, bold, italics, etc. will be lost for optimal translation quality."))

content_type = models.ForeignKey(ContentType, on_delete=models.SET_NULL, null=True, related_name='translator')
object_id = models.PositiveIntegerField(null=True)
Expand Down
19 changes: 12 additions & 7 deletions core/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def update_translated_feed(sid: str, force=False):
if original_feed.entries:
o_feed = obj.o_feed
logging.info("Start translate feed: [%s]%s", obj.language, o_feed.feed_url)
results = translate_feed.call_local(
results = translate_feed(
feed=original_feed,
target_language=obj.language,
translate_engine=o_feed.translator,
Expand All @@ -153,7 +153,8 @@ def update_translated_feed(sid: str, force=False):
summary_engine=o_feed.summary_engine,
summary_detail=o_feed.summary_detail,
max_posts=o_feed.max_posts,
translation_display=o_feed.translation_display
translation_display=o_feed.translation_display,
quality=o_feed.quality
)

if not results:
Expand Down Expand Up @@ -193,7 +194,6 @@ def update_translated_feed(sid: str, force=False):
obj.save()


@db_task()
def translate_feed(
feed: feedparser.FeedParserDict,
target_language: str,
Expand All @@ -204,7 +204,8 @@ def translate_feed(
summary_detail: float,
summary_engine: TranslatorEngine,
max_posts: int = 20,
translation_display: int = 0) -> dict:
translation_display: int = 0,
quality: bool = False) -> dict:
logging.info("Call task translate_feed: %s(%s items)", target_language, len(feed.entries))
translated_feed = feed
total_tokens = 0
Expand Down Expand Up @@ -278,7 +279,8 @@ def translate_feed(

translated_summary, tokens, characters, need_cache = content_translate(content,
target_language,
translate_engine)
translate_engine,
quality)
total_tokens += tokens
translated_characters += characters

Expand Down Expand Up @@ -346,13 +348,16 @@ def bulk_save_cache(need_cache_objs):
return True


def content_translate(original_content: str, target_language: str, engine: TranslatorEngine):
def content_translate(original_content: str, target_language: str, engine: TranslatorEngine, quality: bool = False):
total_tokens = 0
total_characters = 0
need_cache_objs = {}
soup = BeautifulSoup(original_content, 'html.parser')
soup = BeautifulSoup(original_content, 'lxml')

try:
if quality:
soup = BeautifulSoup(text_handler.unwrap_tags(soup), 'lxml')

for element in soup.find_all(string=True):
if text_handler.should_skip(element):
continue
Expand Down
55 changes: 32 additions & 23 deletions locale/zh_Hans/LC_MESSAGES/django.po
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2024-04-23 06:43+0000\n"
"POT-Creation-Date: 2024-04-28 07:05+0000\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language-Team: LANGUAGE <LL@li.org>\n"
Expand Down Expand Up @@ -105,7 +105,7 @@ msgstr "原文 | 译文"
msgid "Translation Display"
msgstr "译文显示"

#: core/models.py:27 core/models.py:81
#: core/models.py:27 core/models.py:82
msgid "Size"
msgstr "文件大小"

Expand All @@ -125,84 +125,93 @@ msgstr "最大条目"
msgid "Max number of posts to be translated"
msgstr "要翻译的最大帖子数"

#: core/models.py:40
#: core/models.py:31
msgid "Best Quality"
msgstr "最佳质量"

#: core/models.py:31
msgid ""
"Formatting such as hyperlinks, bold, italics, etc. will be lost for optimal "
"translation quality."
msgstr "获取最佳翻译质量,将会丢失超链接、加粗、斜体等格式"

#: core/models.py:41
msgid "Summary Detail"
msgstr "摘要细粒度"

#: core/models.py:46
#: core/models.py:47
msgid ""
"Level of detail of summaries of longer articles. 0: Normal, 1: Most detailed "
"(cost more tokens)"
msgstr ""
"较长文章摘要的详细程度。0:正常,1:最详细(花费更多token)"
msgstr "较长文章摘要的详细程度。0:正常,1:最详细(花费更多token)"

#: core/models.py:48
#: core/models.py:49
msgid "Addtional Prompt"
msgstr "额外的提示词"

#: core/models.py:48
#: core/models.py:49
msgid "Addtional Prompt for translation and summary"
msgstr "用于AI翻译和摘要(可选)"

#: core/models.py:54 core/models.py:70
#: core/models.py:55 core/models.py:71
msgid "Original Feed"
msgstr "原始源"

#: core/models.py:55
#: core/models.py:56
msgid "Original Feeds"
msgstr "原始源"

#: core/models.py:68
#: core/models.py:69
msgid "URL Slug(Optional)"
msgstr "URL别名(可选)"

#: core/models.py:68
#: core/models.py:69
msgid ""
"Example: if set to hacker_news, the subscription address will be "
"http://127.0.0.1:8000/rss/hacker_news"
msgstr ""
"比如设置为'hacker_news',则订阅地址为'http://127.0.0.1:8000/rss/hacker_news'"

#: core/models.py:69
#: core/models.py:70
msgid "Language"
msgstr "语言"

#: core/models.py:71
#: core/models.py:72
msgid "Translation Status"
msgstr "翻译状态"

#: core/models.py:73
#: core/models.py:74
msgid "Translate Title"
msgstr "翻译标题"

#: core/models.py:74
#: core/models.py:75
msgid "Translate Content"
msgstr "翻译内容"

#: core/models.py:75
#: core/models.py:76
msgid "Summary"
msgstr "摘要"

#: core/models.py:77
#: core/models.py:78
msgid "Tokens Cost"
msgstr "花费Token数"

#: core/models.py:78
#: core/models.py:79
msgid "Characters Cost"
msgstr "花费字符数"

#: core/models.py:80
#: core/models.py:81
msgid "Last Modified"
msgstr "最后更新"

#: core/models.py:80
#: core/models.py:81
msgid "Last time the feed was translated"
msgstr "最后翻译的时间"

#: core/models.py:86
#: core/models.py:87
msgid "Translated Feed"
msgstr "翻译源"

#: core/models.py:87
#: core/models.py:88
msgid "Translated Feeds"
msgstr "翻译源"
9 changes: 8 additions & 1 deletion utils/text_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def group_chunks(split_chunks: dict, min_size: int, max_size: int,
'''

def should_skip(element):
skip_tags = ['pre', 'code', 'script', 'style', 'head', 'title', 'meta']
skip_tags = ['pre', 'code', 'script', 'style', 'head', 'title', 'meta', 'abbr', 'address', 'samp', 'kbd', 'bdo', 'cite', 'dfn']
if isinstance(element, Comment):
return True
if element.find_parents(skip_tags):
Expand All @@ -171,6 +171,13 @@ def should_skip(element):

return False

def unwrap_tags(soup) -> str:
    """Remove inline formatting tags from ``soup`` but keep their contents.

    Every tag whose name appears in ``tags_to_unwrap`` is replaced by its own
    children through ``tag.unwrap()``, so the text is preserved while the
    surrounding markup (links, emphasis, and so on) is dropped.

    Args:
        soup: A parsed document exposing ``find_all``; presumably a
            BeautifulSoup object (confirm against the caller). It is
            modified in place.

    Returns:
        ``str(soup)`` after all matching tags have been unwrapped.
    """
    tags_to_unwrap = ['i', 'a', 'strong', 'b', 'em', 'span', 'sup', 'sub', 'mark', 'del', 'ins', 'u', 's', 'small']
    # Query all names in a single tree traversal rather than one full
    # traversal per tag name. The matches are collected before the loop
    # body runs, so unwrapping while iterating does not disturb the walk.
    for tag in soup.find_all(tags_to_unwrap):
        tag.unwrap()
    return str(soup)

def set_translation_display(original:str, translation:str, translation_display:int, seprator:str = ' || ') -> str:
if translation_display == 0: #'Only Translation'
return translation
Expand Down

0 comments on commit daf65fe

Please sign in to comment.