In [1]:
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from youdaoai import Translation
import re
import os
from tqdm import tqdm
import time

In [None]:

def extract_ass_events(file_path):
    """
    Parse the Dialogue lines of an ASS subtitle file into a list of dicts.

    Every match of the Dialogue pattern contributes one dict holding
    'Timeline' (the Start and End fields joined with "-") and 'Text'
    (the Text field with every brace-delimited tag such as {...} removed
    and surrounding whitespace stripped).

    :param file_path: path of the ASS file (read as UTF-8)
    :return: list of {'Timeline': str, 'Text': str} dicts, in file order
    """
    # Captures the 2nd, 3rd and 4th comma-separated fields of a Dialogue line
    # plus everything after the ninth comma; only groups 1, 2 and 4
    # (Start, End, Text) are used below.
    dialogue_re = re.compile(
        r'Dialogue:.*?,(.*?),(.*?),(.*?),.*?,.*?,.*?,.*?,.*?,(.*)')
    # Non-greedy so that several {...} tags on one line are removed separately.
    override_tag_re = re.compile(r'{.*?}')

    with open(file_path, 'r', encoding='utf-8') as ass_file:
        raw_content = ass_file.read()

    return [
        {
            'Timeline': found.group(1) + "-" + found.group(2),
            'Text': override_tag_re.sub('', found.group(4)).strip(),
        }
        for found in dialogue_re.finditer(raw_content)
    ]


def compute_similarity(sentence1, sentence2):
    """
    Return the TF-IDF cosine similarity of two sentences.

    Both sentences are segmented with jieba, joined with spaces, vectorised
    together by a fresh TfidfVectorizer, and compared with cosine similarity.

    :param sentence1: first sentence
    :param sentence2: second sentence
    :return: similarity as a float; 0.0 when neither sentence yields any
             token the vectorizer keeps
    """
    # Segment with jieba; TfidfVectorizer then tokenises on the spaces.
    segmented1 = " ".join(jieba.cut(sentence1))
    segmented2 = " ".join(jieba.cut(sentence2))

    # NOTE(review): TfidfVectorizer's default token_pattern only keeps tokens
    # of two or more word characters, so single-character words produced by
    # jieba are ignored. Confirm whether that is intended for this corpus.
    vectorizer = TfidfVectorizer()

    try:
        tfidf_matrix = vectorizer.fit_transform([segmented1, segmented2])
    except ValueError as exc:
        # scikit-learn raises "empty vocabulary" when no token survives in
        # either sentence (empty, punctuation-only or one-character text).
        # Nothing can be compared, so report no similarity instead of
        # aborting the whole batch.
        if "empty vocabulary" not in str(exc):
            raise
        return 0.0

    # cosine_similarity returns a 1x1 matrix for two single-row inputs.
    cosine_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])

    return float(cosine_sim[0][0])


def rank_and_save_similar_texts(events, templates, output_file, top_n=10):
    """
    Score every event against the template sentences and save the best ones.

    Each event's score is the highest compute_similarity() value between its
    'Text' and any template. Events are sorted by score in descending order
    (ties keep their input order) and the first ``top_n`` are written to
    ``output_file``, one per line, as
    "<rank>. Timeline: ..., Context: ..., Similarity: ...".

    :param events: list of dicts with at least 'Timeline' and 'Text' keys;
                   the dicts are copied, never modified
    :param templates: template sentences to compare against; when empty,
                      every event scores 0.0
    :param output_file: path of the UTF-8 text file to (over)write
    :param top_n: number of highest-scoring events to save (default 10);
                  expected to be a non-negative integer
    """
    scored_events = []

    for event in events:
        text = event['Text']
        # default=0.0 keeps an empty template list from raising ValueError.
        best_similarity = max(
            (compute_similarity(text, template) for template in templates),
            default=0.0,
        )

        # Work on a copy so the caller's event dicts stay untouched.
        scored_event = event.copy()
        scored_event['Similarity'] = best_similarity
        scored_events.append(scored_event)

    # list.sort is stable, so equally similar events keep their input order.
    scored_events.sort(key=lambda scored: scored['Similarity'], reverse=True)
    top_events = scored_events[:top_n]

    with open(output_file, 'w', encoding='utf-8') as f:
        for i, event in enumerate(top_events, start=1):
            f.write(
                f"{i}. Timeline: {event['Timeline']}, Context: {event['Text']}, Similarity: {event['Similarity']:.4f}\n")

    print(f"Top {top_n} similar events have been saved to {output_file}")


def read_file_to_list(file_path):
    """
    Read a UTF-8 text file and return its lines with surrounding whitespace
    removed. Blank lines are kept as empty strings.

    :param file_path: path of the text file
    :return: list of stripped lines, in file order
    """
    stripped_lines = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines


def get_file_paths(file_pairs_path):
    """
    Read a task list and build the (bullet, standard) path pair for each line.

    Every non-blank line must have the form "<standard_file>,<bullet_file>".
    Whitespace around either name is ignored and blank lines are skipped.

    :param file_pairs_path: path of the UTF-8 text file listing the pairs
    :return: list of (bullet_path, standard_path) tuples, where bullet_path is
             "bullet/<bullet_file>" and standard_path is
             "standard/<standard_file>"
    :raises ValueError: if a non-blank line does not contain exactly two
                        non-empty, comma-separated names
    """
    paths = []

    with open(file_pairs_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            line = line.strip()

            # Blank lines (e.g. trailing newlines) would otherwise break the
            # two-name unpacking below.
            if not line:
                continue

            names = [name.strip() for name in line.split(',')]
            if len(names) != 2 or not all(names):
                raise ValueError(
                    f"{file_pairs_path}, line {line_number}: expected "
                    f"'<standard_file>,<bullet_file>', got {line!r}")

            # The standard file comes first on the line, but the returned
            # tuple puts the bullet path first.
            standard_file, bullet_file = names

            bullet_path = f"bullet/{bullet_file}"
            standard_path = f"standard/{standard_file}"

            paths.append((bullet_path, standard_path))

    return paths


def generate_output_filename(bullet_file, standard_file):
    """
    Build the result-file path for a (bullet, standard) file pair.

    The file name is "<bullet stem>-<standard stem>.txt", where a stem is the
    base name without its last extension. The 'result' directory is created
    in the current working directory if it does not exist yet.

    :param bullet_file: path or name of the bullet file
    :param standard_file: path or name of the standard file
    :return: path of the output file inside the 'result' directory
    """
    def _stem(path):
        # Base name with directory part and last extension removed.
        return os.path.splitext(os.path.basename(path))[0]

    result_name = f"{_stem(bullet_file)}-{_stem(standard_file)}.txt"

    # Make sure the destination folder is there before handing out the path.
    os.makedirs('result', exist_ok=True)

    return os.path.join('result', result_name)


# Run the ranking pipeline once for every (bullet, standard) pair listed in
# task_list.txt; each pair produces one file under result/.
task_list = get_file_paths("task_list.txt")

for bullet_file, standard_file in tqdm(task_list):
    # Danmaku events parsed from the ASS file of this pair.
    danmaku_list = extract_ass_events(bullet_file)

    # Template sentences the danmaku are compared against.
    templates = read_file_to_list(standard_file)
    output_file = generate_output_filename(bullet_file, standard_file)
    rank_and_save_similar_texts(danmaku_list, templates, output_file)

In [2]:
import os
import re
from youdaoai import Translation


def replace_and_translate(file_path, output_dir, app_key, app_secret, from_lang='zh-CHS', to_lang='en', delay=3):
    """
    Translate the "Context:" field of every line in a result file and save the
    translated copy under ``output_dir``.

    The output file is named after the translated base name of ``file_path``.
    If that translation fails, the original base name is used instead.
    Path separators in the translated name are replaced with "_".

    :param file_path: path of the UTF-8 result file to translate
    :param output_dir: directory for the translated file (created if missing)
    :param app_key: key passed to ``Translation``
    :param app_secret: secret passed to ``Translation``
    :param from_lang: source language code handed to the translator
    :param to_lang: target language code handed to the translator
    :param delay: seconds to sleep before each context translation request
    """
    # Initialize the translation service
    translator = Translation(app_key, app_secret)

    # Read the content of the provided file
    with open(file_path, 'r', encoding='utf-8') as file:
        txt_content = file.read()

    # Extract the base name of the file without extension
    base_name = os.path.splitext(os.path.basename(file_path))[0]

    # Translate the base name; a failed request must not abort the batch, so
    # fall back to the untranslated name (same policy as for the contexts).
    try:
        translated_name = translator.translate(
            base_name, from_lang, to_lang)['translation'][0]
    except Exception as e:
        print(f"Error translating file name {base_name}: {e}")
        translated_name = base_name
    if not isinstance(translated_name, str) or not translated_name.strip():
        translated_name = base_name

    # A path separator inside the translated name would be read as a
    # sub-directory when the output path is built.
    for separator in (os.sep, os.altsep):
        if separator:
            translated_name = translated_name.replace(separator, '_')
    translated_file_name = translated_name + ".txt"

    # Build the full output path, creating the directory when needed
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    full_output_path = os.path.join(output_dir, translated_file_name)

    # Capture the context field. Preferred form: everything up to the comma
    # that precedes the trailing " Similarity: <value>" of the line, so commas
    # inside the text stay part of the context. Lines without that tail fall
    # back to "everything up to the first comma".
    context_pattern = re.compile(
        r"Context: (?:(?P<full>.*),(?= Similarity: \S+$)|(?P<head>.*?),)",
        re.MULTILINE,
    )

    # Function to replace the context with its translation
    def replace_with_translation(match):
        original_text = match.group('full')
        if original_text is None:
            original_text = match.group('head')

        # Nothing to translate: keep the field as it is and skip the request.
        if not original_text.strip():
            return match.group(0)

        try:
            time.sleep(delay)  # Delay the request to avoid hitting rate limits
            result = translator.translate(original_text, from_lang, to_lang)
            if 'translation' in result and result['translation']:
                return f"Context: {result['translation'][0]},"
            else:
                return "Context: Translation failed,"
        except Exception as e:
            print(f"Error translating text {original_text}: {e}")
            return "Context: Error,"

    # Replace all contexts with their translations
    modified_content = context_pattern.sub(
        replace_with_translation, txt_content)

    # Different source files can translate to the same name; make the
    # resulting overwrite visible instead of losing a result silently.
    if os.path.exists(full_output_path):
        print(f"Warning: {full_output_path} already exists and will be overwritten")

    # Write the modified content to a new file
    with open(full_output_path, 'w', encoding='utf-8') as file:
        file.write(modified_content)

    print(f"Modified content saved to {full_output_path}")


def process_all_txt_files(input_dir, output_dir, app_key, app_secret):
    """
    Run replace_and_translate() on every '.txt' entry of ``input_dir``.

    :param input_dir: directory whose entries are scanned (not recursive)
    :param output_dir: directory handed to replace_and_translate()
    :param app_key: translation key handed to replace_and_translate()
    :param app_secret: translation secret handed to replace_and_translate()
    """
    for entry in tqdm(os.listdir(input_dir)):
        # Anything that is not a .txt file is skipped.
        if not entry.endswith('.txt'):
            continue
        source_path = os.path.join(input_dir, entry)
        replace_and_translate(source_path, output_dir, app_key, app_secret)


# Translate every ranked result file. The API credentials are read from the
# environment (YOUDAO_APP_KEY / YOUDAO_APP_SECRET) so that they are not stored
# in the notebook; a missing variable raises KeyError before any request is
# made. The key pair that used to be hard-coded here should be treated as
# leaked and rotated.
process_all_txt_files('result', 'result_translated',
                      os.environ["YOUDAO_APP_KEY"], os.environ["YOUDAO_APP_SECRET"])

  3%|▎         | 1/37 [00:31<18:50, 31.41s/it]

Modified content saved to result_translated/Totoro 1- Harmony in Diversity.txt


  5%|▌         | 2/37 [01:02<18:22, 31.50s/it]

Modified content saved to result_translated/Princess Mononoke 1- One Man.txt


  8%|▊         | 3/37 [01:35<18:01, 31.81s/it]

Modified content saved to result_translated/Make Heavenly palace 1- artificial.txt


 11%|█         | 4/37 [02:07<17:32, 31.90s/it]

Modified content saved to result_translated/Wushan Five Elements 2- Yin and Yang.txt


 14%|█▎        | 5/37 [02:38<16:58, 31.82s/it]

Modified content saved to result_translated/Five centimetres per second. - One thing.txt


 16%|█▌        | 6/37 [03:10<16:29, 31.92s/it]

Modified content saved to result_translated/Son of the Sea Beast 1- The philosophy of Water.txt


 19%|█▉        | 7/37 [03:42<15:56, 31.88s/it]

Modified content saved to result_translated/Spirited Away 1- Harmony in Diversity.txt


 22%|██▏       | 8/37 [04:14<15:25, 31.90s/it]

Modified content saved to result_translated/Deep Sea 1- Inner peace.txt


 24%|██▍       | 9/37 [04:46<14:53, 31.89s/it]

Modified content saved to result_translated/panda2- Yin and Yang.txt


 27%|██▋       | 10/37 [05:18<14:21, 31.91s/it]

Modified content saved to result_translated/Monkey King 1- One of the three poisons in Buddhism.txt


 30%|██▉       | 11/37 [05:50<13:47, 31.81s/it]

Modified content saved to result_translated/Spirited Away 2- Harmony in Diversity.txt


 32%|███▏      | 12/37 [06:22<13:17, 31.91s/it]

Modified content saved to result_translated/panda2- The philosophy of water.txt


 35%|███▌      | 13/37 [06:54<12:47, 31.97s/it]

Modified content saved to result_translated/Sun Wukong 1- Nature is good and nature is evil.txt


 38%|███▊      | 14/37 [07:26<12:16, 32.04s/it]

Modified content saved to result_translated/Reincarnation Room 2- Reincarnation.txt


 41%|████      | 15/37 [07:58<11:43, 31.96s/it]

Modified content saved to result_translated/Spirited Away 1- one of the three poisons in Buddhism.txt


 43%|████▎     | 16/37 [08:30<11:13, 32.07s/it]

Modified content saved to result_translated/Goose Goose Goose 2- one of the three poisons in Buddhism.txt


 46%|████▌     | 17/37 [09:02<10:41, 32.08s/it]

Modified content saved to result_translated/Goose goose goose 2- Reincarnation.txt


 49%|████▊     | 18/37 [09:34<10:09, 32.08s/it]

Modified content saved to result_translated/panda2- Inner peace.txt


 51%|█████▏    | 19/37 [10:06<09:37, 32.08s/it]

Modified content saved to result_translated/Spirited Away 2- Unity of nature and man.txt


 54%|█████▍    | 20/37 [10:38<09:04, 32.04s/it]

Modified content saved to result_translated/Monkey King 2- Destiny.txt


 57%|█████▋    | 21/37 [11:10<08:31, 32.00s/it]

Modified content saved to result_translated/Totoro 2- Harmony in Diversity.txt


 59%|█████▉    | 22/37 [11:42<08:00, 32.01s/it]

Modified content saved to result_translated/Nezha 1- Destiny.txt


 62%|██████▏   | 23/37 [12:14<07:28, 32.01s/it]

Modified content saved to result_translated/Spirited Away 2- one of the three poisons in Buddhism.txt


 65%|██████▍   | 24/37 [12:46<06:55, 31.93s/it]

Modified content saved to result_translated/Goose Goose Goose 1- one of the three poisons in Buddhism.txt


 68%|██████▊   | 25/37 [13:18<06:22, 31.90s/it]

Modified content saved to result_translated/Five centimetres per second. - One thing.txt


 70%|███████   | 26/37 [13:50<05:51, 31.92s/it]

Modified content saved to result_translated/panda1- Inner peace.txt


 73%|███████▎  | 27/37 [14:22<05:19, 31.94s/it]

Modified content saved to result_translated/Princess Mononoke 2- One Man.txt


 76%|███████▌  | 28/37 [14:54<04:47, 31.98s/it]

Modified content saved to result_translated/Misty Mountain five elements 1- Yin and Yang.txt


 78%|███████▊  | 29/37 [15:26<04:15, 31.98s/it]

Modified content saved to result_translated/Spirited Away 1- Unity of nature and man.txt


 81%|████████  | 30/37 [15:58<03:43, 31.90s/it]

Modified content saved to result_translated/Goose goose goose 1- Reincarnation.txt


 84%|████████▍ | 31/37 [16:30<03:11, 31.94s/it]

Modified content saved to result_translated/panda1- Yin and Yang.txt


 86%|████████▋ | 32/37 [17:01<02:39, 31.90s/it]

Modified content saved to result_translated/Nezha 1- Human.txt


 89%|████████▉ | 33/37 [17:33<02:07, 31.85s/it]

Modified content saved to result_translated/Nezha 1- Human effort.txt


 92%|█████████▏| 34/37 [18:05<01:35, 31.90s/it]

Modified content saved to result_translated/The Forest of firefly 1- mourning.txt


 95%|█████████▍| 35/37 [18:38<01:04, 32.16s/it]

Modified content saved to result_translated/Reincarnation Room 1- Reincarnation.txt


 97%|█████████▋| 36/37 [19:11<00:32, 32.38s/it]

Modified content saved to result_translated/Uproar in Heaven 1- Human effort.txt


100%|██████████| 37/37 [19:43<00:00, 32.00s/it]

Modified content saved to result_translated/panda1- The philosophy of water.txt



