In [1]:
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import os
from tqdm import tqdm

def extract_ass_events(file_path):
    """
    Extract the Start, End and Text fields of every Dialogue line in an ASS file.

    Override tags enclosed in braces (for example {\\move(1920, 0, -480, 0)})
    are removed from the Text field and the remainder is whitespace-stripped.

    :param file_path: path to the ASS subtitle file (read as UTF-8)
    :return: list of dicts, one per Dialogue line, with the keys
             'Timeline' ("<Start>-<End>") and 'Text' (cleaned text)
    """
    # Groups 1-3 capture the three fields that follow the first
    # comma-separated field (Start, End and one more that is not used);
    # the next five fields are skipped and group 4 takes the rest of the
    # line, so commas inside the text itself are preserved.
    dialogue_re = re.compile(
        r'Dialogue:.*?,(.*?),(.*?),(.*?),.*?,.*?,.*?,.*?,.*?,(.*)')
    # Non-greedy so that each {...} tag is removed individually.
    tag_re = re.compile(r'{.*?}')

    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_content = handle.read()

    return [
        {
            'Timeline': hit.group(1) + "-" + hit.group(2),
            'Text': tag_re.sub('', hit.group(4)).strip(),
        }
        for hit in dialogue_re.finditer(raw_content)
    ]


def compute_similarity(sentence1, sentence2):
    """
    Return the TF-IDF cosine similarity between two Chinese sentences.

    Both sentences are segmented with jieba, joined with spaces, vectorised
    together by a fresh TfidfVectorizer, and compared with cosine similarity.

    :param sentence1: first sentence (str)
    :param sentence2: second sentence (str)
    :return: similarity score; 0.0 when neither sentence produces any token
             the vectorizer can use
    """
    # Space-join the jieba segments so the vectorizer can tokenise them.
    segmented = [" ".join(jieba.cut(sentence))
                 for sentence in (sentence1, sentence2)]

    # NOTE(review): with scikit-learn's default token_pattern only tokens of
    # two or more word characters are kept, so single-character words do not
    # contribute to the score. Left unchanged to keep existing scores stable.
    vectorizer = TfidfVectorizer()

    try:
        tfidf_matrix = vectorizer.fit_transform(segmented)
    except ValueError:
        # fit_transform raises "empty vocabulary" when no usable token exists
        # in either sentence (empty text, punctuation only, single-character
        # words only). Treat that as "nothing in common" instead of aborting
        # the whole batch.
        return 0.0

    # cosine_similarity returns a 1x1 matrix for two single-row inputs.
    cosine_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])

    return cosine_sim[0][0]


def rank_and_save_similar_texts(events, templates, output_file, top_n=10):
    """
    Score every event against the template sentences and save the best ones.

    For each event the similarity of its 'Text' to every template is computed
    with compute_similarity and the highest value is kept as the event's
    score. Events are then sorted by score in descending order (ties keep
    their original order) and the first ``top_n`` are written to
    ``output_file``, one numbered line per event. Nothing is returned.

    :param events: list of dicts with at least the keys 'Timeline' and 'Text'
    :param templates: list of template sentences to compare against; when it
                      is empty every event gets a score of 0.0
    :param output_file: path of the UTF-8 text file to (over)write
    :param top_n: number of highest-scoring events to save (default 10)
    """
    scored_events = []

    for event in events:
        text = event['Text']
        # default=0.0 keeps an empty template list from raising ValueError.
        best_similarity = max(
            (compute_similarity(text, template) for template in templates),
            default=0.0,
        )

        # Work on a copy so the caller's event dicts are left untouched.
        event_with_score = event.copy()
        event_with_score['Similarity'] = best_similarity
        scored_events.append(event_with_score)

    # list.sort is stable, so equally-scored events keep their input order.
    scored_events.sort(key=lambda scored: scored['Similarity'], reverse=True)
    top_events = scored_events[:top_n]

    with open(output_file, 'w', encoding='utf-8') as f:
        for i, event in enumerate(top_events, start=1):
            f.write(
                f"{i}. Timeline: {event['Timeline']}, Context: {event['Text']}, Similarity: {event['Similarity']:.4f}\n")

    print(f"Top {top_n} similar events have been saved to {output_file}")


def read_file_to_list(file_path):
    """
    Read a UTF-8 text file into a list of whitespace-stripped lines.

    Blank lines are kept and appear as empty strings.

    :param file_path: path of the text file to read
    :return: list with one stripped string per line of the file
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return list(map(str.strip, handle))


def get_file_paths(file_pairs_path):
    """
    Read a task list and return the file paths of every listed pair.

    Each non-blank line of the task list must have the form
    ``standard_file,bullet_file``. Whitespace around either name is ignored
    and blank lines are skipped. The bullet file name is prefixed with
    ``bullet/`` and the standard file name with ``standard/``.

    :param file_pairs_path: path of the UTF-8 text file listing the pairs
    :return: list of ``(bullet_path, standard_path)`` tuples, in file order
             (note: bullet first, i.e. the reverse of the order on the line)
    :raises ValueError: if a non-blank line does not contain exactly two
                        comma-separated names
    """
    paths = []

    with open(file_pairs_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            line = line.strip()

            # A trailing or separating empty line is not a task.
            if not line:
                continue

            names = [name.strip() for name in line.split(',')]
            if len(names) != 2:
                raise ValueError(
                    f"{file_pairs_path}, line {line_number}: expected "
                    f"'standard_file,bullet_file', got {line!r}")
            standard_file, bullet_file = names

            bullet_path = f"bullet/{bullet_file}"
            standard_path = f"standard/{standard_file}"

            paths.append((bullet_path, standard_path))

    return paths


def generate_output_filename(bullet_file, standard_file):
    """
    Build the output path for a bullet/standard file pair.

    The result lives in the ``result`` directory (created if it does not
    exist) and is named ``<bullet stem>-<standard stem>.txt``, where a stem
    is the file's base name without its extension.

    :param bullet_file: path or name of the bullet file
    :param standard_file: path or name of the standard file
    :return: path of the output file inside the ``result`` directory
    """
    # Drop directory components and extensions from both inputs.
    stems = [
        os.path.splitext(os.path.basename(source))[0]
        for source in (bullet_file, standard_file)
    ]

    target_dir = 'result'
    os.makedirs(target_dir, exist_ok=True)

    return os.path.join(target_dir, "-".join(stems) + ".txt")


# Driver: process every (bullet_path, standard_path) pair listed in
# task_list.txt, showing progress with tqdm.
task_list = get_file_paths("task_list.txt")

for bullet_file, standard_file in tqdm(task_list):
    # Danmaku events parsed from the bullet ASS file.
    danmaku_list = extract_ass_events(bullet_file)

    # Template sentences, one per line of the standard file.
    templates = read_file_to_list(standard_file)

    # Rank the events against the templates and write the report.
    output_file = generate_output_filename(bullet_file, standard_file)
    rank_and_save_similar_texts(danmaku_list, templates, output_file)

  0%|          | 0/34 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/yc/gy1c5m3s7bx188v4vp4dh3vm0000gn/T/jieba.cache
Loading model cost 0.277 seconds.
Prefix dict has been built successfully.
  3%|▎         | 1/34 [00:15<08:29, 15.43s/it]

Top 10 similar events have been saved to result/千与千寻1-天人合一.txt


  6%|▌         | 2/34 [00:19<04:36,  8.64s/it]

Top 10 similar events have been saved to result/千与千寻2-天人合一.txt


  9%|▉         | 3/34 [00:30<05:07,  9.93s/it]

Top 10 similar events have been saved to result/幽灵公主1-天人合一.txt


 12%|█▏        | 4/34 [00:35<03:52,  7.74s/it]

Top 10 similar events have been saved to result/幽灵公主2-天人合一.txt


 15%|█▍        | 5/34 [00:42<03:37,  7.51s/it]

Top 10 similar events have been saved to result/轮回的房间1-轮回.txt


 18%|█▊        | 6/34 [00:45<02:46,  5.95s/it]

Top 10 similar events have been saved to result/轮回的房间2-轮回.txt


 21%|██        | 7/34 [00:54<03:07,  6.95s/it]

Top 10 similar events have been saved to result/鹅鹅鹅1-轮回.txt


 24%|██▎       | 8/34 [01:03<03:16,  7.56s/it]

Top 10 similar events have been saved to result/鹅鹅鹅2-轮回.txt


 26%|██▋       | 9/34 [01:50<08:19, 19.97s/it]

Top 10 similar events have been saved to result/秒速五厘米1-物哀.txt


 29%|██▉       | 10/34 [02:38<11:25, 28.57s/it]

Top 10 similar events have been saved to result/秒速五厘米2-物哀.txt


 32%|███▏      | 11/34 [03:06<10:56, 28.56s/it]

Top 10 similar events have been saved to result/萤火之森1-物哀.txt


 35%|███▌      | 12/34 [03:08<07:27, 20.33s/it]

Top 10 similar events have been saved to result/哪吒1-天命.txt


 38%|███▊      | 13/34 [03:11<05:16, 15.09s/it]

Top 10 similar events have been saved to result/孙悟空2-天命.txt


 41%|████      | 14/34 [03:32<05:40, 17.03s/it]

Top 10 similar events have been saved to result/千与千寻1-多样性中的和谐.txt


 44%|████▍     | 15/34 [03:37<04:16, 13.49s/it]

Top 10 similar events have been saved to result/千与千寻2-多样性中的和谐.txt


 47%|████▋     | 16/34 [03:42<03:12, 10.69s/it]

Top 10 similar events have been saved to result/龙猫1-多样性中的和谐.txt


 50%|█████     | 17/34 [03:45<02:25,  8.58s/it]

Top 10 similar events have been saved to result/龙猫2-多样性中的和谐.txt


 53%|█████▎    | 18/34 [03:49<01:55,  7.21s/it]

Top 10 similar events have been saved to result/panda1-水的哲学.txt


 56%|█████▌    | 19/34 [03:54<01:34,  6.32s/it]

Top 10 similar events have been saved to result/panda2-水的哲学.txt


 59%|█████▉    | 20/34 [04:02<01:36,  6.91s/it]

Top 10 similar events have been saved to result/海兽之子1-水的哲学.txt


 62%|██████▏   | 21/34 [04:39<03:27, 15.98s/it]

Top 10 similar events have been saved to result/千与千寻1-佛教中的三毒之一.txt


 65%|██████▍   | 22/34 [04:49<02:50, 14.24s/it]

Top 10 similar events have been saved to result/千与千寻2-佛教中的三毒之一.txt


 68%|██████▊   | 23/34 [04:57<02:14, 12.22s/it]

Top 10 similar events have been saved to result/鹅鹅鹅1-佛教中的三毒之一.txt


 71%|███████   | 24/34 [05:04<01:47, 10.73s/it]

Top 10 similar events have been saved to result/鹅鹅鹅2-佛教中的三毒之一.txt


 74%|███████▎  | 25/34 [05:06<01:14,  8.24s/it]

Top 10 similar events have been saved to result/孙悟空1-佛教中的三毒之一.txt


 76%|███████▋  | 26/34 [05:10<00:55,  6.99s/it]

Top 10 similar events have been saved to result/panda1-内心的平静.txt


 79%|███████▉  | 27/34 [05:15<00:43,  6.16s/it]

Top 10 similar events have been saved to result/panda2-内心的平静.txt


 82%|████████▏ | 28/34 [05:21<00:37,  6.21s/it]

Top 10 similar events have been saved to result/深海1-内心的平静.txt


 85%|████████▌ | 29/34 [05:27<00:30,  6.03s/it]

Top 10 similar events have been saved to result/panda1-阴阳.txt


 88%|████████▊ | 30/34 [05:33<00:24,  6.01s/it]

Top 10 similar events have been saved to result/panda2-阴阳.txt


 91%|█████████ | 31/34 [05:48<00:26,  8.88s/it]

Top 10 similar events have been saved to result/雾山五行1-阴阳.txt


 94%|█████████▍| 32/34 [05:57<00:17,  9.00s/it]

Top 10 similar events have been saved to result/雾山五行2-阴阳.txt


 97%|█████████▋| 33/34 [05:58<00:06,  6.46s/it]

Top 10 similar events have been saved to result/哪吒1-人为.txt


100%|██████████| 34/34 [06:02<00:00, 10.66s/it]

Top 10 similar events have been saved to result/大闹天宫1-人为.txt



