In [2]:
import re

def read_text(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()
    return text

def preprocess_text(text):
    """文本预处理：清洗、分句、去除无关字符"""
    # 分句
    sentences = re.split(r'(?<=[。！？\?])', text)
    sentences = [s.strip() for s in sentences if len(s.strip()) > 5]
    
    # 清洗
    cleaned_sentences = []
    for sentence in sentences:
        # 去除引号、括号等字符
        cleaned = re.sub(r'[《》“”\n（）()]', '', sentence)
        cleaned_sentences.append(cleaned)
    return cleaned_sentences

In [3]:
text1 = read_text("text1.txt")
preprocess_text(text1)

['据外媒报道，美国两架海军军机26日分别坠毁在南海，无人员伤亡。',
 '第一起坠机事件涉及一架MH-60R海鹰直升机。',
 '根据美国海军太平洋舰队的声明，这架直升机在尼米兹号航空母舰进行例行操作时坠入南海。',
 '声明称，直升机上的三名机组人员被搜救队救起。',
 '半小时后，一架波音F/A-18F超级大黄蜂战斗机在尼米兹号航空母舰执行例行任务时也坠毁在南海。',
 '机上两名机组人员成功弹射逃生，被安全救起。',
 '据美国海军称，所有相关人员均安全且情况稳定。',
 '两起事故原因正在调查中。',
 ' 军事专家张军社27日接受环球时报采访时表示，美国在南海一天内先后坠毁一架舰载战斗机和一架直升机，这一事件并非偶然。',
 '美军长期在南海、亚太及全球范围内维持高强度战备状态，不断进行军事部署，以维持其霸权地位和国际警察角色。',
 '长期高压运作使美军兵力紧张、人员疲惫，事故发生的风险自然随之上升。',
 '他认为，此次事故很可能与操作疏忽或过度疲劳等因素有关。',
 '军事专家宋忠平27日接受环球时报采访时也持相似观点。',
 '他指出，美军长期以所谓航行自由为借口，在南海频繁炫耀武力，意在彰显其军事存在。',
 '表面上看，美国作为军事霸主仍在维持强势姿态，但实际上，即便拥有11 艘航空母舰，面对如此繁重的任务，美军也已力不从心。',
 '宋忠平分析称，美军航母肩负全球部署和多重任务，长期在中东及其他地区高强度执行作战和训练，加之部分官兵存在懈怠、厌战情绪，导致安全风险上升。',
 '因此，同一天发生两起坠机事故虽令人震惊，但并不令人意外。']

In [44]:
import spacy
import jieba
import jieba.posseg as pseg
import re
from collections import defaultdict

class DependencyTripleExtractor:
    def __init__(self):
        self.nlp = spacy.load("zh_core_web_sm")
        
        # 初始化关系映射规则
        self.init_relation_relus()

    def init_relation_relus(self):
        """初始化依存关系到语义关系的映射规则"""
        # 基础依存关系映射
        self.dep_to_relation = {
            'nsubj': '',  # 主语，使用动词本身
            'dobj': '',   # 宾语，使用动词本身
            'nmod:poss': '的',  # 属格关系
            'prep': '',   # 介词，与动词组合
            'pobj': '',   # 介词宾语
            'appos': '是',  # 同位语关系
            'amod': '',   # 形容词修饰
            'nummod': ''  # 数量修饰
        }
    
    def preprocess_text(self, text):
        """文本预处理：清洗、分句、去除无关字符"""
        # 分句
        sentences = re.split(r'(?<=[。！？\?])', text)
        sentences = [s.strip() for s in sentences if len(s.strip()) > 5]
        
        # 清洗
        cleaned_sentences = []
        for sentence in sentences:
            # 去除引号、括号等字符
            cleaned = re.sub(r'[《》“”\n（）()]', '', sentence)
            cleaned_sentences.append(cleaned)
        return cleaned_sentences
    
    def analyze_sentence(self, sentence):
        """分析句子结构：分词、词性标注、依存句法分析"""
        # 实验spacy进行依存分析
        doc = self.nlp(sentence)

        # 构建依存关系图
        dependency_info = []
        root_token = None
        for token in doc:
            dep_info = {
                'text': token.text,
                'lemma': token.lemma_,
                'pos': token.pos_,
                'dep': token.dep_,
                'head_text': token.head.text,
                'head_pos': token.head.pos_,
                'is_root': token.dep == 'ROOT'
            }
            dependency_info.append(dep_info)
            if token.dep == 'ROOT':
                root_token = token
        return doc, dependency_info, root_token

    def extract_triples_by_rules(self, doc, root_token):
        """基于规则的三元组抽取"""
        triples = []

        # 规则1：主谓宾结构
        svo_triples = self.extract_svo_triples(doc, root_token)
        triples.extend(svo_triples)

        # 规则2：介词结构
        prep_triples = self.extract_preposition_triples(doc)
        triples.extend(prep_triples)

        # 规则3：属格关系
        poss_triples = self.extract_possessive_triples(doc)
        triples.extend(poss_triples)

        # 规则4：同位语关系
        appos_triples = self.extract_appos_triples(doc)
        triples.extend(appos_triples)

        return triples
    
    def extract_svo_triples(self, doc, root_token):
        """抽取主谓宾结构的三元组"""
        triples = []
        # 找到所有动词
        verbs = [token for token in doc if token.pos_ == 'VERB']
        if not verbs and root_token:
            verbs = [root_token]
        
        for verb in verbs:
            # 寻找主语
            subjects = []
            for child in verb.children:
                if child.dep_ in ['nsubj', 'nsubj:pass']:
                    subjects.append(child)
            
            # 寻找宾语
            objects = []
            for child in verb.children:
                if child.dep_ in ['dobj', 'obj']:
                    objects.append(child)
                elif child.dep_ == 'prep':
                    objects.append(child)
            
            # 生成三元组
            for subject in subjects:
                for object in objects:
                    # 获取完整的短语
                    subject_phrase = self.get_complete_phrase(subject)
                    object_phrase = self.get_complete_phrase(object)
                    relation = verb.text

                    # 验证是否合理
                    if self.is_valid_triple(subject_phrase, relation, object_phrase):
                        triple = {
                            'subject': subject_phrase,
                            'relation': relation,
                            'object': object_phrase,
                            'subject_type': self.classify_entity_type(subject),
                            'object_type': self.classify_entity_type(object),
                            'confidence': 0.8,
                            'rule': 'SVO'
                        }
                        triples.append(triple)
        return triples
    
    def extract_preposition_triples(self, doc):
        """抽取介词结构和三元组"""
        triples = []
        # 找到所有介词
        prepositions = [token for token in doc if token.dep_ in ['prep', 'case']]

        for prep in prepositions:
            head = prep.head
            # 寻找介词的宾语
            pobjects = []
            # 1. 直接查找 pobj 关系
            for child in prep.children:
                if child.dep_ == 'pobj':
                    pobjects.append(child)
            
            # 2. 如果介词是case关系，其父节点就是宾语的核心
            if prep.dep_ == 'case' and not pobjects:
                # prep.head 就是宾语的核心词
                # 找到以核心词为根的完整名词短语
                if head.pos_ != 'VERB':
                    pobjects.append(head)
            # 3. 查找其他可能的宾语关系
            if not pobjects:
                for child in head.children:
                    if child.dep_ in ['dobj', 'obj', 'nmod']:
                        pobjects.append(child)
            
            for pobj in pobjects:
                # 确定关系动词
                verb = None
                # 1. 介词依附于动词
                if head.pos_ == 'VERB':
                    verb = head
                else: # 2. 寻找包含这个介词结构的短语
                    # 向上查找动词祖先
                    current = head
                    while current.head != current:  
                        if current.head.pos_ == 'VERB':
                            verb = current.head
                            break
                        current = current.head
                if verb:
                    # 寻找动词的主语
                    subjects = [child for child in verb.children if child.dep_ in ['nsubj', 'nusbj:pass']]
                    # 如果找不到主语，尝试查找话题主语
                    if not subjects:
                        for child in verb.children:
                            if child.dep_ in ['csubj']:
                                subjects.append(child)
                    
                    for subject in subjects:
                        subject_phrase = self.get_complete_phrase(subject)
                        object_phrase = self.get_complete_phrase(pobj)
                        # 关系 = 动词 + 介词
                        relation = f"{verb.text}{prep.text}"
                        # 验证三元组是否合理
                        if self.is_valid_triple(subject_phrase, relation, object_phrase):
                            triple = {
                                'subject': subject_phrase,
                                'relation': relation,
                                'object': object_phrase,
                                'subject_type': self.classify_entity_type(subject),
                                'object_type': self.classify_entity_type(pobj),
                                'confidence': 0.7,
                                'rule': 'PREP',
                                'preposition': prep.text,
                                'verb': verb.text
                            }
                            triples.append(triple)
        return triples 


    def extract_possessive_triples(self, doc):
        """抽取属格关系的三元组"""
        triples = []
        # 找到所有属格关系
        poss_relations = [token for token in doc if token.dep_ in ['poss', 'nmod:poss']]
        
        for poss in poss_relations:
            possessor = poss # 拥有者
            possessed = poss.head # 被拥有物

            possessor_phrase = self.get_complete_phrase(possessor)
            possessed_phrase = self.get_complete_phrase(possessed)
            # 使用“的”关系
            relation = "的"
            if self.is_valid_triple(possessor_phrase, relation, possessed_phrase):
                triples = {
                    'subject': possessor_phrase,
                    'relation': relation,
                    'object': possessed_phrase,
                    'subject_type': self.classify_entity_type(possessor),
                    'object_type': self.classify_entity_type(possessed),
                    'confidence': 0.9,
                    'rule': 'POSS'
                }
                triples.append(triples)
        return triples
    
    def extract_appos_triples(self, doc):
        """
        抽取同位语关系的三元组
        """
        triples = []
        
        # 找到所有同位语关系
        appos_relations = [token for token in doc if token.dep_ == 'appos']
        
        for appos in appos_relations:
            entity1 = appos  # 同位语
            entity2 = appos.head  # 主要实体
            
            entity1_phrase = self.get_complete_phrase(entity1)
            entity2_phrase = self.get_complete_phrase(entity2)
            
            relation = "是"
            
            if self.is_valid_triple(entity1_phrase, relation, entity2_phrase):
                triple = {
                    'subject': entity1_phrase,
                    'relation': relation,
                    'object': entity2_phrase,
                    'subject_type': self.classify_entity_type(entity1),
                    'object_type': self.classify_entity_type(entity2),
                    'confidence': 0.85,
                    'rule': 'APPOS'
                }
                triples.append(triple)
        
        return triples

    def get_complete_phrase(self, token):
        """获取完整的短语"""
        phrase_tokens = []
        
        # 使用栈进行深度优先遍历
        stack = [token]
        visited = set()

        while stack:
            current_token = stack.pop()
            if current_token.i in visited:
                continue
            visited.add(current_token.i)
            # 添加当前token
            phrase_tokens.append((current_token.i, current_token.text))
            # 添加所有修饰当前token的子节点
            for child in current_token.children:
                if child.dep_ in ['amod', 'nummod', 'compound', 'nmod']:
                    stack.append(child)
        
        # 按原始顺序排序并组合
        phrase_tokens.sort(key=lambda x: x[0])
        phrase = ''.join([text for _, text in phrase_tokens])

        return phrase


    def optimize_relation(self, relation, rule_type):
        """
        基于领域知识优化关系表达
        """
        # 首先检查军事领域特定映射
        if relation in self.military_relation_map:
            return self.military_relation_map[relation]
        
        # 移除停用词
        stop_words = {'了', '的', '在', '是', '有', '着', '过'}
        for word in stop_words:
            relation = relation.replace(word, '')
        
        return relation.strip()
    
    def classify_entity_type(self, token):
        """基于词性和依存关系分类实体类型"""

        if token.pos_ in ['PROPN']: # 专有名词
            return 'ENTITY'
        elif token.pos_ == 'NOUN': # 普通名词
            return 'NOUN'
        elif token.pos_ == 'VERB': # 动词
            return 'ACTION'
        elif token.ent_type_ != '': # 已识别的实体类型
            return token.ent_type_
        else:
            return 'OTHER'
    
    def is_valid_triple(self, subject, relation, object):
        """验证三元组的有效性"""
        # 检查长度
        if len(subject) < 2 or len(object) < 2:
            return False
        # 检查关系有效性
        if len(relation) == 0 or relation in ['', ' ']:
            return False
        # 检查是否为无意义的组合
        meaningless = ['是是', '的的', '在在']
        for m in meaningless:
            if m in (subject + relation + object):
                return False
        # 代词/指示词过滤（单独作为实体不够具体）
        # pronouns = {'他','她','它','他们','我们','你','我','这','该','其','它们','这里','那里','这些','那些','这个','那个'}
        # if subject in pronouns or object in pronouns:
        #     return False
        # 检查主体和客体是否相同
        if subject == object:
            return False
        return True

    def post_process_triples(self, triples):
        """
        后处理：去重、排序、过滤
        """
        # 去重
        seen = set()
        unique_triples = []
        
        for triple in triples:
            # 创建唯一标识
            key = (triple['subject'], triple['relation'], triple['object'])
            
            if key not in seen:
                seen.add(key)
                unique_triples.append(triple)
        
        # 按置信度排序
        unique_triples.sort(key=lambda x: x['confidence'], reverse=True)
        
        return unique_triples
    
    def extract_from_text(self, text):
        """
        从文本中抽取三元组的主入口函数
        """
        print("开始处理文本...")
        print(f"文本长度: {len(text)} 字符")
        
        # 1. 文本预处理
        sentences = self.preprocess_text(text)
        print(f"分割为 {len(sentences)} 个句子")
        
        all_triples = []
        
        # 2. 逐句处理
        for i, sentence in enumerate(sentences):
            print(f"\n【处理第 {i+1} 句】")
            
            # 分析句子结构
            doc, dependency_info, root_token = self.analyze_sentence(sentence)
            
            # 打印依存分析结果（调试用）
            self.print_dependency_info(dependency_info)
            
            # 抽取三元组
            sentence_triples = self.extract_triples_by_rules(doc, root_token)
            
            print(f"本句抽取到 {len(sentence_triples)} 个三元组")
            for triple in sentence_triples:
                print(f"  ✓ ({triple['subject']}, {triple['relation']}, {triple['object']}) [{triple['rule']}]")
            
            all_triples.extend(sentence_triples)
        
        # 3. 后处理
        final_triples = self.post_process_triples(all_triples)
        
        print(f"\n{'='*60}")
        print(f"处理完成！共抽取 {len(final_triples)} 个唯一三元组")
        print(f"{'='*60}")
        
        return final_triples
    
    def print_dependency_info(self, dependency_info):
        """
        打印依存分析信息（用于调试）
        """
        print("依存分析结果:")
        for info in dependency_info:
            marker = "★" if info['is_root'] else " "
            print(f"  {marker} {info['text']}({info['pos']}) --{info['dep']}--> {info['head_text']}({info['head_pos']})")

In [45]:
model = DependencyTripleExtractor()
sentences = model.preprocess_text(text1)
doc, dependency_info, root_token = model.analyze_sentence(sentences[3])
material1_triples = model.extract_from_text(text1)

开始处理文本...
文本长度: 651 字符
分割为 17 个句子

【处理第 1 句】
依存分析结果:
    据(ADP) --case--> 报道(VERB)
    外媒(NOUN) --nsubj--> 报道(VERB)
    报道(VERB) --nmod:prep--> 坠毁(VERB)
    ，(PUNCT) --punct--> 坠毁(VERB)
    美国(PROPN) --nmod--> 军机(NOUN)
    两(NUM) --nummod--> 军机(NOUN)
    架(NUM) --mark:clf--> 两(NUM)
    海军(NOUN) --compound:nn--> 军机(NOUN)
    军机(NOUN) --nsubj--> 坠毁(VERB)
    26日(NOUN) --nmod:tmod--> 坠毁(VERB)
    分别(ADV) --advmod--> 坠毁(VERB)
    坠毁(VERB) --ROOT--> 坠毁(VERB)
    在(ADP) --case--> 南海(PROPN)
    南海(PROPN) --nmod:prep--> 坠毁(VERB)
    ，(PUNCT) --punct--> 坠毁(VERB)
    无(VERB) --conj--> 坠毁(VERB)
    人员(NOUN) --nsubj--> 伤亡(VERB)
    伤亡(VERB) --ccomp--> 无(VERB)
    。(PUNCT) --punct--> 坠毁(VERB)
本句抽取到 1 个三元组
  ✓ (美国两军机, 坠毁在, 南海) [PREP]

【处理第 2 句】
依存分析结果:
    第一(NUM) --nummod--> 事件(NOUN)
    起(NUM) --mark:clf--> 第一(NUM)
    坠机(NOUN) --compound:nn--> 事件(NOUN)
    事件(NOUN) --nsubj--> 涉及(VERB)
    涉及(VERB) --ROOT--> 涉及(VERB)
    一(NUM) --nummod--> 直升机(NOUN)
    架(NUM) --mark:clf--> 一(NUM)
    MH(PROPN) --

In [10]:
doc, dependency_info, root_token = model.analyze_sentence("李克强总理今天来我家了,我感到非常荣幸")
doc, dependency_info, root_token = model.analyze_sentence(doc)
triples = model.extract_preposition_triples(doc)
print(triples)
for token in doc:
    print(f"词: {token.text}, 依存关系: {token.dep_}, 父节点: {token.head.text}")

[]
词: 李克强, 依存关系: nmod:assmod, 父节点: 总理
词: 总理, 依存关系: nsubj, 父节点: 我家
词: 今天, 依存关系: nmod:tmod, 父节点: 我家
词: 来, 依存关系: xcomp, 父节点: 我家
词: 我家, 依存关系: ROOT, 父节点: 我家
词: 了, 依存关系: dep, 父节点: 我家
词: ,, 依存关系: punct, 父节点: 我家
词: 我, 依存关系: nsubj, 父节点: 感到
词: 感到, 依存关系: conj, 父节点: 我家
词: 非常, 依存关系: advmod, 父节点: 荣幸
词: 荣幸, 依存关系: ccomp, 父节点: 感到


In [None]:
# 查看依存关系
doc = model.nlp(sentences[0])
for token in doc:
    print(f"词: {token.text}, 依存关系: {token.dep_}, 父节点: {token.head.text}")

AttributeError: 'dict' object has no attribute 'text'

In [34]:
sentences[0]

'据外媒报道，美国两架海军军机26日分别坠毁在南海，无人员伤亡。'