In [50]:
import random
import pandas as pd
import string
import re
from sentence_transformers import util

def change_cha(word:str)->str:
    # 对于每个词，随机选择一个位置，将该位置的字母替换为另一个字母
    word = list(word)
    index = random.randint(0,len(word)-1)
    word[index] = random.choice(string.ascii_lowercase)
    return ''.join(word)

def add_cha(word:str)->str:
    # 对于每个词，随机选择一个位置，在该位置插入一个字母
    word = list(word)
    index = random.randint(0,len(word))
    word.insert(index,random.choice(string.ascii_lowercase))
    return ''.join(word)

def delete_cha(word:str)->str:
    # 对于每个词，随机选择一个位置，删除该位置的字母
    word = list(word)
    index = random.randint(0,len(word)-1)
    word.pop(index)
    return ''.join(word)

def swap_cha(word:str)->str:
    # 对于每个词，随机选择两个位置，交换这两个位置的字母
    word = list(word)
    index1 = random.randint(0,len(word)-1)
    index2 = random.randint(0,len(word)-1)
    word[index1], word[index2] = word[index2], word[index1]
    return ''.join(word)

def change_data(data:pd.DataFrame)->pd.DataFrame:
    data_label = data['s2'].reset_index(drop=True)
    
    # 对于每一行，将文本根据空格进行分割，分割后随机选择1-2个词，将单词切分后随机在一个位置填入一个字母，再讲字母重新拼接成单词
    data_text = data_label.apply(lambda x: x.split())
    for text in data_text:
        # change_or_not = random.random()
        # if change_or_not <= 0.1:
        for i in range(len(text)):
            word_change_or_not = random.random()
            if word_change_or_not <= 0.1 and len(text[i]) > 1:
                change_method = random.random()
                if change_method < 0.25:
                    text[i] = change_cha(text[i])
                elif change_method < 0.5:
                    text[i] = add_cha(text[i])
                elif change_method < 0.75:
                    text[i] = delete_cha(text[i])
                else:
                    text[i] = swap_cha(text[i])
    data_text = data_text.apply(lambda x: ' '.join(x))
    data['s2'] = data_text.copy()

    return data

def mask_data(data:pd.DataFrame)->pd.DataFrame:
    data_label = data['s2'].reset_index(drop=True)
    
    data_text = data_label.apply(lambda x: x.split())
    for text in data_text:
        # mask_or_not = random.random()
        # if mask_or_not <= 0.1:
        for i in range(len(text)):
            word_mask_or_not = random.random()
            if word_mask_or_not <= 0.1 and len(text[i]) > 1:
                text[i] = ' '
    data_text = data_text.apply(lambda x: ' '.join(x))

    data['s2'] = data_text.copy()
    
    return data

def expand_data(data_refer:pd.DataFrame, data:pd.DataFrame)->pd.DataFrame:
    data = data.reset_index(drop=True)
    text_code = data['code_sum']
    data_text = data['s2'].copy()
    for index_code, code in enumerate(text_code):
        # expand_or_not = random.random()
        # if expand_or_not <= 0.1:
        # 从data_refer中找到code_sum列包含code的行
        code = re.escape(code)
        data_refer_text = data_refer[data_refer['code_sum'].str.contains(code)]['narr_accf']
        if (data_refer_text.empty):
            continue
        # 随机选择一个行，将该行的narr_accf列的文本添加到原文本后面
        refer_text = data_refer_text.sample(1).iloc[0]
        orig_text = data_text.loc[index_code]
        data_text.loc[index_code] = orig_text + '。' + refer_text
    data['s2'] = data_text.copy()
    return data

def find_max_index(embeddings1, embeddings2):
    # 计算两个tensor之间的余弦相似度
    cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)
    # 寻找tensor最大值的行号与列号
    values, indices = cosine_scores.max(dim=0)
    max_index = indices[values.argmax()].tolist()
    return max_index

def fliter(embeddings0, embeddings2):
    # 寻找最相关的事故发生阶段与事故发生原因
    index0 = find_max_index(embeddings0, embeddings2)
    embeddings = embeddings0[index0]
    return embeddings

In [44]:
# 导入并预处理语料库，创建语料库的句向量
data = pd.read_csv('./data/describe_sum_carrier.csv', encoding = 'utf-8')
data = pd.DataFrame(data)
keyword = data['Occurrence_description_sum'].tolist()
ntsb_no = data['ntsb_no'].tolist()
refer = data['narr_accf'].tolist()
if os.path.exists('./data/refer_embeddings.pkl'):
    with open('./data/refer_embeddings.pkl', 'rb') as f:
        refer_embeddings = pickle.load(f)
else:
    refer_new = [p(sentence) if len(sentence)<=512 else p(sentence[:128] + sentence[-384:]) 
                for sentence in refer]
    refer_embeddings = model.encode(refer_new, convert_to_tensor = True)
    pickle.dump(refer_embeddings, open('./data/refer_embeddings.pkl', 'wb'))
# 事故发生阶段+原因
if os.path.exists('./data/embeddings0.pkl'):
    with open('./data/embeddings0.pkl', 'rb') as f:
        embeddings0 = pickle.load(f)
else:
    dict = pd.read_csv('./data/describe_category_carrier.csv')
    dict = [phase for phase in dict['Occurrence_description_sum']]
    embeddings0 = model.encode(dict, convert_to_tensor = True)
    #存为pkl文件
    pickle.dump(embeddings0, open('./data/embeddings0.pkl', 'wb'))

In [56]:
string_test = 'Cruise - normal In flight encounter with weather'
data = pd.DataFrame({'Occurrence_description_sum':[string_test]})
data['code_sum'] = '541+240'
embeddings2 = model.encode(data['Occurrence_description_sum'].tolist(), convert_to_tensor = True)
expand_st = fliter(embeddings0, embeddings2)

2


In [51]:
# data_refer = pd.read_csv('./data/describe_sum_carrier.csv')
string_test = 'Cruise - normal In flight encounter with weather'
data = pd.DataFrame({'s2':[string_test]})
data['label'] = '541+240'
change_st = change_data(data)
print(change_st)

                                                 s2    label
0  Cruism - normal In flight encounter with weather  541+240


In [10]:
import pickle
import torch
type(pickle._Unpickler)
type(torch.nn)

module

In [1]:
import torch
torch.__version__

'2.4.0+cpu'

In [4]:
from neuspell import BertChecker

# 初始化并加载预训练模型
checker = BertChecker()
checker.from_pretrained("./neuspell_bert/")

# 进行拼写检查
text = "This is an exmple of a text with speling erors."
corrected_text = checker.correct(text)

print(f"Original: {text}")
print(f"Corrected: {corrected_text}")


'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /bert-base-cased/resolve/main/vocab.txt (Caused by ProxyError('Cannot connect to proxy.', OSError(0, 'Error')))"), '(Request ID: 173bc47a-4956-4bca-8713-fcd52189b3f2)')' thrown while requesting HEAD https://huggingface.co/bert-base-cased/resolve/main/vocab.txt


ProxyError: (MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /bert-base-cased/resolve/main/vocab.txt (Caused by ProxyError('Cannot connect to proxy.', OSError(0, 'Error')))"), '(Request ID: 173bc47a-4956-4bca-8713-fcd52189b3f2)')

In [13]:
import pickle
import torch
import os
from functools import partial
from neuspell import BertChecker

# Adjust pickle.load with encoding
pickle.load = partial(pickle.load, encoding="latin1")
pickle.Unpickler = partial(pickle._Unpickler, encoding="latin1")

# Custom Unpickler to handle encoding
class CustomUnpickler(pickle._Unpickler):
    def __init__(self, file, *args, **kwargs):
        kwargs['encoding'] = 'latin1'
        super().__init__(file, *args, **kwargs)

    def find_class(self, module, name):
        # Handle the 'code' type properly
        if module == "builtins" and name == "code":
            import types
            return types.CodeType
        return super().find_class(module, name)

# Override torch load to use the custom Unpickler
def custom_load(f, map_location=None, pickle_module=pickle, **pickle_load_args):
    if isinstance(f, str):
        with open(f, 'rb') as opened_file:
            return custom_load(opened_file, map_location, pickle_module, **pickle_load_args)
    return torch.serialization._legacy_load(f, map_location, CustomUnpickler, **pickle_load_args)

torch.load = custom_load

# Your existing code to initialize and use the checker
checker = BertChecker()
checker.from_pretrained()
checker.correct("I luk foward to receving your reply")

loading vocab from path:d:\anaconda\Lib\site-packages\neuspell\../data/checkpoints/subwordbert-probwordnoise\vocab.pkl
initializing model
SubwordBert(
  (bert_dropout): Dropout(p=0.2, inplace=False)
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
         

AttributeError: type object 'CustomUnpickler' has no attribute 'Unpickler'

In [62]:
import neuspell
from neuspell import BertChecker
import pickle
from functools import partial

# Adjust pickle.load with encoding
pickle_load_with_encoding = partial(pickle.load, encoding="latin1")
pickle.Unpickler = lambda f: pickle._Unpickler(f, encoding="latin1")

""" select spell checkers & load """
checker = BertChecker()
checker.from_pretrained()

""" spell correction """
checker.correct("I luk foward to receving your reply")
# → "I look forward to receiving your reply"
checker.correct_strings(["I luk foward to receving your reply", ])
# → ["I look forward to receiving your reply"]

loading vocab from path:d:\anaconda\Lib\site-packages\neuspell\../data/checkpoints/subwordbert-probwordnoise\vocab.pkl
initializing model
SubwordBert(
  (bert_dropout): Dropout(p=0.2, inplace=False)
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
         

AttributeError: type object 'UnpicklerWrapper' has no attribute 'Unpickler'

In [22]:
import pandas as pd

data_refer = pd.read_csv('./data/describe_sum_carrier.csv')
data_query = pd.read_csv("./data/describe_category_carrier.csv")
data_train = pd.read_csv("./data/corpus_short.csv")
# 随机提取10%的数据样本
data_change = data_train.sample(frac=0.1, random_state=1).reset_index(drop=True)
data_mask = data_train.sample(frac=0.1, random_state=2).reset_index(drop=True)
data_expand = data_train.sample(frac=0.1, random_state=3).reset_index(drop=True)

In [9]:
data_query_change = change_data(data_query.copy())
data_query_mask = mask_data(data_query.copy())
data_query_expand = expand_data(data_refer, data_query.copy())

data_query_change.to_csv('./data/describe_category_carrier_change.csv', index=False)
data_query_mask.to_csv('./data/describe_category_carrier_mask.csv', index=False)
data_query_expand.to_csv('./data/describe_category_carrier_expand.csv', index=False)

In [6]:
del data_query_change , data_query_mask, data_query_expand

In [12]:
import pandas as pd
data_query = pd.read_csv("./data/describe_category_carrier.csv")
data_example = data_query.sample(frac=0.1, random_state=4)

In [24]:
# 对数据进行处理
data_change_noise = change_data(data_change.reset_index(drop=True))
data_mask_noise = mask_data(data_mask.reset_index(drop=True))

In [27]:
#将数据与原数据合并
data01 = pd.concat([data_train, data_change_noise], axis=0)
data02 = pd.concat([data01, data_mask_noise], axis=0)

In [28]:
data02.to_csv('./data/corpus_short_noise.csv', index=False)

In [1]:
# -*- coding: utf-8 -*-
from sentence_transformers import SentenceTransformer, util, models
import torch
import pandas as pd
import preprocessor
import evaluation_SBERT
import torch
import os
import pickle

model_idx = 5
file_idx = 1
max_seq_length = 512


# 实例化文本预处理类
p = preprocessor.EnglishPreProcessor()


def get_model(model_save_path):
    model = SentenceTransformer(model_save_path)
    return model
    
def evaluation_model(model, ndcg = True, mean_ap = True, topk = 10, file_idx = 1):
    if os.path.exists('./data/refer_embeddings.pkl'):
        with open('./data/refer_embeddings.pkl', 'rb') as f:
            embeddings2 = pickle.load(f)
    else:
        # 导入语料库
        data = pd.read_csv('./data/describe_sum_carrier.csv', encoding = 'utf-8')
        data = pd.DataFrame(data)
        refer = [p(sentence) if len(sentence)<=512 else p(sentence[:128] + sentence[-384:]) 
                for sentence in data['narr_accf']]
        # 创建语料库的句向量
        embeddings2 = model.encode(refer, convert_to_tensor = True)
        pickle.dump(embeddings2, open('./data/refer_embeddings.pkl', 'wb'))
    # 基于NDCG@N的评价方法
    if ndcg:
        avg_ndcg = evaluation_SBERT.cal_ndcg(model, embeddings2, topk)
        print ('Dataset %s ndcg: %s' % (file_idx, avg_ndcg))
    # 基于MAP的评价方法
    if mean_ap:
        mean_ap = evaluation_SBERT.cal_map(model, embeddings2, topk)
        print ('Dataset %s map: %s' % (file_idx, mean_ap))
        
    return avg_ndcg, mean_ap

def evaluation_model_n(model, n = 100, file_idx = 1):
    if os.path.exists('./data/refer_embeddings.pkl'):
        with open('./data/refer_embeddings.pkl', 'rb') as f:
            embeddings2 = pickle.load(f)
    else:
        # 导入语料库
        data = pd.read_csv('./data/describe_sum_carrier.csv', encoding = 'utf-8')
        data = pd.DataFrame(data)
        refer = [p(sentence) if len(sentence)<=512 else p(sentence[:128] + sentence[-384:]) 
                for sentence in data['narr_accf']]
        # 创建语料库的句向量
        embeddings2 = model.encode(refer, convert_to_tensor = True)
        pickle.dump(embeddings2, open('./data/refer_embeddings.pkl', 'wb'))
    # 灵敏度测试：取前n个评价的影响
    score_ndcg = evaluation_SBERT.cal_ndcg(model, embeddings2, topk = 100, repeat=True)
    score_map = evaluation_SBERT.cal_map(model, embeddings2, topk = 100, repeat=True)
    # 保存评价结果
    ndcg = pd.DataFrame(pd.read_csv('./result_noise/ndcg.csv', encoding = 'utf-8'))
    map = pd.DataFrame(pd.read_csv('./result_noise/map.csv', encoding = 'utf-8'))
    ndcg['Dataset ' + str(file_idx)] = score_ndcg
    map['Dataset ' + str(file_idx)] = score_map
    ndcg.to_csv('./result_noise/ndcg.csv', index=0)
    map.to_csv('./result_noise/map.csv', index=0)
    return score_ndcg, score_map

def predict(model, queries, queries_ori = None, nosie = False, n = 10):
    """
    :param queries : 查询语句集, list, [str1, str2, ...]
    :
    """
    # 导入并预处理语料库，创建语料库的句向量
    data = pd.read_csv('./data/describe_sum_carrier.csv', encoding = 'utf-8')
    data = pd.DataFrame(data)
    keyword = data['Occurrence_description_sum'].tolist()
    ntsb_no = data['ntsb_no'].tolist()
    refer = data['narr_accf'].tolist()
    if os.path.exists('./data/refer_embeddings.pkl'):
        with open('./data/refer_embeddings.pkl', 'rb') as f:
            refer_embeddings = pickle.load(f)
    else:
        refer_new = [p(sentence) if len(sentence)<=512 else p(sentence[:128] + sentence[-384:]) 
                    for sentence in refer]
        refer_embeddings = model.encode(refer_new, convert_to_tensor = True)
        pickle.dump(refer_embeddings, open('./data/refer_embeddings.pkl', 'wb'))
    # 事故发生阶段+原因
    if os.path.exists('./data/embeddings0.pkl'):
        with open('./data/embeddings0.pkl', 'rb') as f:
            embeddings0 = pickle.load(f)
    else:
        dict = pd.read_csv('./data/describe_category_carrier.csv')
        dict = [phase for phase in dict['Occurrence_description_sum']]
        embeddings0 = model.encode(dict, convert_to_tensor = True)
        #存为pkl文件
        pickle.dump(embeddings0, open('./data/embeddings0.pkl', 'wb'))

    embeddings2 = []
    # 预处理查询语句集
    for i in range(len(queries)):
        query = queries[i]
        # 输入的query
        if (len(query) > 100):
            query = query.split('.')
            query = [sentence.split(',') for sentence in query]
            query = [p(sentence) for sentences in query for sentence in sentences if len(sentence) > 2]
        embeddings2.append(model.encode(query, convert_to_tensor = True))
    # 基于余弦相似度为每条查询语句寻找前n个相似的语料库文本
    top_k = min(n, refer_embeddings.shape[0])
    total_top_results = []
    contents = []
    for i in range(len(queries)):
        # 创建查询语句的句向量
        query_embedding = fliter(embeddings0, embeddings2[i])
        # 通过余弦相似度和torch.topk获得前n个最高分数
        cos_scores = util.cos_sim(query_embedding, refer_embeddings)[0]
        top_results = torch.topk(cos_scores, k = top_k)
        total_top_results.append(top_results)
        # 记录top_results对应的文本内容
        content = [keyword[idx] for idx in top_results[1]]
        contents.append(content)
        #eval_pr
        if not nosie:
            eval_pr = eval_predict(queries[i], top_results)
        else:
            eval_pr = eval_predict(queries_ori[i], top_results)
        # # 打印预测结果
        # print("\n\n========================================================\n\n")
        # print("查询语句:", queries[i])
        # print("\n语料库中前 {0} 个最相似的语句:".format(top_k))
        # j = 1
        # for score, idx in zip(top_results[0], top_results[1]):
        #     print("【%s】【NTSB NO.%s】" % (j, ntsb_no[idx]), refer[idx].strip(),
        #           "(Score: {:.4f})".format(score))
        #     j += 1
        print("(eval_pr: {:.4f})".format(eval_pr/len(top_results[0])))
    return contents

def eval_predict(query,results):
    # 导入语料库
    data = pd.read_csv('./data/describe_sum_carrier.csv', encoding = 'utf-8')
    data = pd.DataFrame(data)
    occurences = data['Occurrence_description_sum'].tolist()
    eval_qr = 0
    for score, idx in zip(results[0], results[1]):
        occurence = occurences[idx]
        if query in occurence:
            eval_qr += 1
    return eval_qr

  from tqdm.autonotebook import tqdm, trange





In [63]:
# -*- coding: utf-8 -*-
from sentence_transformers import SentenceTransformer, util, models
import torch
import pandas as pd
import evaluation_SBERT
import torch


model_idx = 5
file_idx = 1
max_seq_length = 512


# 实例化文本预处理类
p = preprocessor.EnglishPreProcessor()


def get_model(model_save_path):
    model = SentenceTransformer(model_save_path)
    return model
    
def evaluation_model(model, ndcg = True, mean_ap = True, topk = 10, file_idx = 1):
    # 导入语料库
    data = pd.read_csv('./data/describe_sum_carrier.csv', encoding = 'utf-8')
    data = pd.DataFrame(data)
    refer = [p(sentence) if len(sentence)<=512 else p(sentence[:128] + sentence[-384:]) 
             for sentence in data['narr_accf']]
    # 创建语料库的句向量
    embeddings2 = model.encode(refer, convert_to_tensor = True)
    # 基于NDCG@N的评价方法
    if ndcg:
        avg_ndcg = evaluation_SBERT.cal_ndcg(model, embeddings2, topk)
        print ('Dataset %s ndcg: %s' % (file_idx, avg_ndcg))
    # 基于MAP的评价方法
    if mean_ap:
        mean_ap = evaluation_SBERT.cal_map(model, embeddings2, topk)
        print ('Dataset %s map: %s' % (file_idx, mean_ap))
        
    return avg_ndcg, mean_ap

def evaluation_model_n(n = 100,file_idx = 1):
    # 导入语料库
    data = pd.read_csv('./data/describe_sum_carrier.csv', encoding = 'utf-8')
    data = pd.DataFrame(data)
    refer = [p(sentence) if len(sentence)<=512 else p(sentence[:128] + sentence[-384:]) 
             for sentence in data['narr_accf']]
    # 创建语料库的句向量
    embeddings2 = model.encode(refer, convert_to_tensor = True)
    # 灵敏度测试：取前n个评价的影响
    score_ndcg = evaluation_SBERT.cal_ndcg(model, embeddings2, repeat=True)
    score_map = evaluation_SBERT.cal_map(model, embeddings2, repeat=True)
    # 保存评价结果
    ndcg = pd.DataFrame(pd.read_csv('./result_noise/ndcg_change.csv', encoding = 'utf-8'))
    map = pd.DataFrame(pd.read_csv('./result_noise/map_change.csv', encoding = 'utf-8'))
    ndcg['Dataset ' + str(file_idx)] = score_ndcg
    map['Dataset ' + str(file_idx)] = score_map
    ndcg.to_csv('./result_noise/ndcg_change.csv', index=0)
    map.to_csv('./result_noise/map_change.csv', index=0)
    return score_ndcg, score_map

def predict(model, queries, queries_ori = None, nosie = False, n = 10):
    """
    :param queries : 查询语句集, list, [str1, str2, ...]
    :
    """
    # 导入并预处理语料库，创建语料库的句向量
    data = pd.read_csv('./data/describe_sum_carrier.csv', encoding = 'utf-8')
    data = pd.DataFrame(data)
    keyword = data['Occurrence_description_sum'].tolist()
    ntsb_no = data['ntsb_no'].tolist()
    refer = data['narr_accf'].tolist()
    refer_new = [p(sentence) if len(sentence)<=512 else p(sentence[:128] + sentence[-384:]) 
                 for sentence in refer]
    refer_embeddings = model.encode(refer_new, convert_to_tensor = True)
    # 预处理查询语句集
    queries_new = [p(sentence) if len(sentence)<=512 else p(sentence[:128] + sentence[-384:]) 
                   for sentence in queries]
    # 基于余弦相似度为每条查询语句寻找前n个相似的语料库文本
    top_k = min(n, len(refer))
    total_top_results = []
    contents = []
    for i in range(len(queries)):
        # 创建查询语句的句向量
        query_embedding = model.encode(queries_new[i], convert_to_tensor = True)
        # 通过余弦相似度和torch.topk获得前n个最高分数
        cos_scores = util.cos_sim(query_embedding, refer_embeddings)[0]
        top_results = torch.topk(cos_scores, k = top_k)
        total_top_results.append(top_results)
        # 记录top_results对应的文本内容
        content = [keyword[idx] for idx in top_results[1]]
        contents.append(content)
        #eval_pr
        if not nosie:
            eval_pr = eval_predict(queries[i], top_results)
        else:
            eval_pr = eval_predict(queries_ori[i], top_results)
        # # 打印预测结果
        # print("\n\n========================================================\n\n")
        # print("查询语句:", queries[i])
        # print("\n语料库中前 {0} 个最相似的语句:".format(top_k))
        # j = 1
        # for score, idx in zip(top_results[0], top_results[1]):
        #     print("【%s】【NTSB NO.%s】" % (j, ntsb_no[idx]), refer[idx].strip(),
        #           "(Score: {:.4f})".format(score))
        #     j += 1
        print("(eval_pr: {:.4f})".format(eval_pr/len(top_results[0])))
    return contents

def eval_predict(query,results):
    # 导入语料库
    data = pd.read_csv('./data/describe_sum_carrier.csv', encoding = 'utf-8')
    data = pd.DataFrame(data)
    occurences = data['Occurrence_description_sum'].tolist()
    eval_qr = 0
    for score, idx in zip(results[0], results[1]):
        occurence = occurences[idx]
        if query in occurence:
            eval_qr += 1
    return eval_qr

In [228]:
data_example

Unnamed: 0,Occurrence_description_sum,code_sum,count
677,Standing - engine(s) not operating Miscellaneo...,"504+430,504+430",1
653,Takeoff Airframe/component/system failure/malf...,"520+130,540+130",1
530,"Takeoff - initial climb Explosion,Takeoff - in...","522+172,522+352,571+180",1
104,Descent Near collision between aircraft,550+280,3
179,Landing - flare/touchdown In flight collision ...,571+220,2
...,...,...,...
358,"Maneuvering Abrupt maneuver,Maneuvering Abrupt...","580+100,580+100",1
641,Taxi - pushback/tow Propeller/rotor contact to...,511+370,1
575,Taxi - to takeoff On ground/water collision wi...,"512+310,512+130,574+232",1
665,"Taxi Miscellaneous/other,Takeoff Overrun,Takeo...","510+430,520+340,520+230",1


In [31]:
# 相关参数设定
model_idx = 5
file_idx = 1
max_seq_length = 512
model_save_path = './model/model_sbert_supervised_' + str(model_idx)

# 加载模型
model = get_model(model_save_path)

queries = data_example['Occurrence_description_sum'].tolist()
contents = predict(model, queries, n = 10)

(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.2000)
(eval_pr: 0.2000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.5000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.2000)
(eval_pr: 0.6000)
(eval_pr: 0.4000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.2000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.4000)
(eval_pr: 0.3000)
(eval_pr: 0.0000)
(eval_pr: 0.4000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.2000)
(eval_pr: 0.1000)
(eval_pr: 

In [33]:
# 相关参数设定
model_idx = 1
file_idx = 1
max_seq_length = 512
model_save_path = './model/model.sbert_noise_' + str(model_idx)

# 加载模型
model = get_model(model_save_path)

queries = data_change_noise['Occurrence_description_sum'].tolist()
queries_ori = data_change['Occurrence_description_sum'].tolist()
contents = predict(model, queries, queries_ori, nosie= True, n = 10)

(eval_pr: 0.1000)
(eval_pr: 1.0000)
(eval_pr: 0.4000)
(eval_pr: 0.5000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.6000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.2000)
(eval_pr: 0.1000)
(eval_pr: 0.2000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.2000)
(eval_pr: 0.5000)
(eval_pr: 0.3000)
(eval_pr: 0.1000)
(eval_pr: 0.4000)
(eval_pr: 0.1000)
(eval_pr: 0.6000)
(eval_pr: 0.8000)
(eval_pr: 0.1000)
(eval_pr: 0.2000)
(eval_pr: 0.0000)
(eval_pr: 0.2000)
(eval_pr: 0.7000)
(eval_pr: 0.2000)
(eval_pr: 0.1000)
(eval_pr: 0.2000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.2000)
(eval_pr: 0.1000)
(eval_pr: 0.2000)
(eval_pr: 0.2000)
(eval_pr: 0.2000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.2000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 

In [35]:
# 相关参数设定
model_idx = 5
file_idx = 1
max_seq_length = 512
model_save_path = './model/model_sbert_supervised_' + str(model_idx)

# 加载模型
model = get_model(model_save_path)

queries = data_change_noise['Occurrence_description_sum'].tolist()
contents = predict(model, queries, n = 10)

(eval_pr: 0.0000)
(eval_pr: 0.9000)
(eval_pr: 0.2000)
(eval_pr: 0.6000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.7000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.5000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.6000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.4000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.6000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.2000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.3000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.2000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 

In [37]:
# 相关参数设定
model_idx = 1
model_save_path = './model/model.sbert_noise_' + str(model_idx)

# 加载模型
model = get_model(model_save_path)

queries = data_mask_noise['Occurrence_description_sum'].tolist()
queries_ori = data_mask['Occurrence_description_sum'].tolist()
contents = predict(model, queries, queries_ori, nosie=True, n = 10)

(eval_pr: 0.3000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.4000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.2000)
(eval_pr: 0.0000)
(eval_pr: 0.5000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.3000)
(eval_pr: 0.1000)
(eval_pr: 1.0000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.3000)
(eval_pr: 0.2000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.4000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.2000)
(eval_pr: 0.0000)
(eval_pr: 

In [39]:
# 相关参数设定
model_idx = 5
model_save_path = './model/model_sbert_supervised_' + str(model_idx)

# 加载模型
model = get_model(model_save_path)

queries = data_mask_noise['Occurrence_description_sum'].tolist()
contents = predict(model, queries, n = 10)

(eval_pr: 0.2000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.4000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.2000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.2000)
(eval_pr: 0.0000)
(eval_pr: 0.8000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.2000)
(eval_pr: 0.0000)
(eval_pr: 0.3000)
(eval_pr: 0.2000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.2000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.1000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 

In [40]:
# 相关参数设定
model_idx = 1
model_save_path = './model/model.sbert_noise_' + str(model_idx)

# 加载模型
model = get_model(model_save_path)

queries = data_expand_noise['Occurrence_description_sum'].tolist()
queries_ori = data_expand['Occurrence_description_sum'].tolist()
contents = predict(model, queries, n = 10)

(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 

In [42]:
# 相关参数设定
model_idx = 5
model_save_path = './model/model_sbert_supervised_' + str(model_idx)

# 加载模型
model = get_model(model_save_path)

queries = data_expand_noise['Occurrence_description_sum'].tolist()
contents = predict(model, queries, n = 10)

(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 0.0000)
(eval_pr: 

In [2]:
# 相关参数设定
model_idx = 1
model_save_path = './model/model.sbert_noise_' + str(model_idx)

# 加载模型
model = get_model(model_save_path)

score_ndcg, score_map = evaluation_model_n(model, n = 100, file_idx = model_idx)

In [2]:
# 相关参数设定
model_idx = 5
model_save_path = './model/model_sbert_supervised_' + str(model_idx)

# 加载模型
model = get_model(model_save_path)

score_ndcg, score_map = evaluation_model_n(model, n = 100, file_idx = model_idx)

  return self.fget.__get__(instance, owner)()


In [38]:
# 处理长文本，将其按照句号划分，组成分块，每个分块的长度不超过512个字符
def sliding_window(text, max_length=512, overlap=1):
    sentences = text.split(".")
    sentences = [s.strip() + "." for s in sentences if s.strip()] # Remove empty sentences and add back the period
    chunks = []
    chunk = ""
    
    for i, sentence in enumerate(sentences):
        if len(chunk) + len(sentence) <= max_length:
            chunk += sentence
        else:
            if len(chunk) == 0:
                chunk = sentence[:max_length]
            chunks.append(chunk)
            chunk = ""
            
            overlap_sen = ""
            if i > 0 and overlap > 0:
                if i < overlap:
                    overlap_sen = "".join(sentences[:i])
                else:
                    overlap_sen = "".join(sentences[i-overlap:i])
            chunk = overlap_sen + sentence
            
    if chunk:
        chunks.append(chunk)
        
    return chunks

In [40]:
text = "The captain briefed a no go-around for a night visual approach to a Special Airport. The approach was not stabilized, and the airspeed decreased to the point of a stall. The airplane struck the runway in a nose high pitch attitude, on the aft fuselage, and settled on the landing gear. The first officer made initial callouts of slow airspeed and then stopped when the captain failed to respond to her callouts.  After landing, the airplane was taxied to the gate where a post flight inspection limited to the main landing gear did not find the damage.  When interviewed, the captain reported that she briefed ""no go-around because no takeoffs were authorized on the runway at night or in IMC conditions; however, the first officer knew this was incorrect, but did not challenge the captain.  Both pilots had received CRM training, which included crewmember assertiveness, methods of fostering crew input, and situational awareness, and training on special use airports; however it was not followed by either pilot.  The captain's handling of the airplane was outside the parameters specified in the company manuals.  Both pilots were described to having good flying skills.  The captain said the first officer was passive and quiet.  The first officer reported the captain was defensive and did not take criticism very well.  A definition of stabilized approach criteria was not found in the company manuals.  An FAA Advisory Circular dated August 10, 2000 defined stabilized approach criteria, and actions to be taken if the approach was not stabilized.,Landing Loss of control - in flight,Landing - flare/touchdown Hard landing。The captain briefed a no go-around for a night visual approach to a Special Airport.  The approach was not  stabilized, and the airspeed decreased to the point of a stall.  The airplane struck the runway in a nose high pitch attitude, on the aft fuselage, and settled on the landing gear.  The first officer made initial callouts of slow airspeed and then stopped when the captain failed to respond to her callouts.  After landing, the airplane was taxied to the gate where a post flight inspection limited to the main landing gear did not find the damage.  When interviewed, the captain reported that she briefed no go-around because no takeoffs were authorized on the runway at night or in IMC conditions; however, the first officer knew this was incorrect, but did not challenge the captain.  Both pilots had received CRM training, which included crewmember assertiveness, methods of fostering crew input, and situational awareness, and training on special use airports; however it was not followed by either pilot.  The captain's handling of the airplane was outside the parameters specified in the company manuals.  Both pilots were described to having good flying skills.  The captain said the first officer was passive and quiet.  The first officer reported the captain was defensive and did not take criticism very well.  A definition of stabilized approach criteria was not found in the company manuals.  An FAA Advisory Circular dated August 10, 2000 defined stabilized approach criteria, and actions to be taken if the approach was not stabilized."
chunks = sliding_window(text)
print(len(chunks))

11


In [44]:
from transformers import BartTokenizer, BartForConditionalGeneration

# 加载预训练的BART模型和分词器
model_name = './model/fine_tuned_bart_narr32'
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

def summarize(text):
    # 将输入文本进行分词
    inputs = tokenizer.encode("summarize: " + text, return_tensors='pt', max_length=1024, truncation=True)
    
    # 使用模型生成摘要
    summary_ids = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    
    # 将生成的摘要解码为文本
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# 示例文本
text = "the pilot was flying over a prospective landing site located atop a mountain ridge, when he inadvertently allowed the wheels of the airplane to touchdown. the airplane bounced, and the pilot applied full engine power in an attempt to abort the landing. during the aborted landing, the right wing struck a tree at the departure end of the site, and the airplane descended onto soft tundra. the airplane nosed over and received damage to the right wing and the right wing lift strut. the pilot noted that there were no preaccident mechanical anomalies with the airplane."

# 生成摘要
summary = summarize(text)
print("Summary:", summary)

Summary: recovery from a bounced landing was delayed by the pilot, which resulted in an in-flight collision with a tree and soft terrain, and subsequent nose over and nose over.
