In [1]:
import re

def read_text(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()
    return text

def preprocess_text(text):
    """文本预处理：清洗、分句、去除无关字符"""
    # 分句
    sentences = re.split(r'(?<=[。！？\?])', text)
    sentences = [s.strip() for s in sentences if len(s.strip()) > 5]
    
    # 清洗
    cleaned_sentences = []
    for sentence in sentences:
        # 去除引号、括号等字符
        cleaned = re.sub(r'[《》“”\n（）()]', '', sentence)
        cleaned_sentences.append(cleaned)
    return cleaned_sentences

In [2]:
text1 = read_text("text1.txt")
preprocess_text(text1)

['据外媒报道，美国两架海军军机26日分别坠毁在南海，无人员伤亡。',
 '第一起坠机事件涉及一架MH-60R海鹰直升机。',
 '根据美国海军太平洋舰队的声明，这架直升机在尼米兹号航空母舰进行例行操作时坠入南海。',
 '声明称，直升机上的三名机组人员被搜救队救起。',
 '半小时后，一架波音F/A-18F超级大黄蜂战斗机在尼米兹号航空母舰执行例行任务时也坠毁在南海。',
 '机上两名机组人员成功弹射逃生，被安全救起。',
 '据美国海军称，所有相关人员均安全且情况稳定。',
 '两起事故原因正在调查中。',
 ' 军事专家张军社27日接受环球时报采访时表示，美国在南海一天内先后坠毁一架舰载战斗机和一架直升机，这一事件并非偶然。',
 '美军长期在南海、亚太及全球范围内维持高强度战备状态，不断进行军事部署，以维持其霸权地位和国际警察角色。',
 '长期高压运作使美军兵力紧张、人员疲惫，事故发生的风险自然随之上升。',
 '他认为，此次事故很可能与操作疏忽或过度疲劳等因素有关。',
 '军事专家宋忠平27日接受环球时报采访时也持相似观点。',
 '他指出，美军长期以所谓航行自由为借口，在南海频繁炫耀武力，意在彰显其军事存在。',
 '表面上看，美国作为军事霸主仍在维持强势姿态，但实际上，即便拥有11 艘航空母舰，面对如此繁重的任务，美军也已力不从心。',
 '宋忠平分析称，美军航母肩负全球部署和多重任务，长期在中东及其他地区高强度执行作战和训练，加之部分官兵存在懈怠、厌战情绪，导致安全风险上升。',
 '因此，同一天发生两起坠机事故虽令人震惊，但并不令人意外。']

In [27]:
import spacy
import jieba
import jieba.posseg as pseg
import re
from collections import defaultdict

class DependencyTripleExtractor:
    def __init__(self):
        self.nlp = spacy.load("zh_core_web_sm")
        # 加载领域词典
        # self.load_domain_dictionary()
        
        # 初始化关系映射规则
        self.init_relation_relus()
    
    def init_relation_relus(self):
        """初始化依存关系到语义关系的映射规则"""
        # 基础依存关系映射
        self.dep_to_relation = {
            'nsubj': '',  # 主语，使用动词本身
            'dobj': '',   # 宾语，使用动词本身
            'nmod:poss': '的',  # 属格关系
            'prep': '',   # 介词，与动词组合
            'pobj': '',   # 介词宾语
            'appos': '是',  # 同位语关系
            'amod': '',   # 形容词修饰
            'nummod': ''  # 数量修饰
        }
    
    def preprocess_text(self, text):
        """文本预处理：清洗、分句、去除无关字符"""
        # 分句
        sentences = re.split(r'(?<=[。！？\?])', text)
        sentences = [s.strip() for s in sentences if len(s.strip()) > 5]
        
        # 清洗
        cleaned_sentences = []
        for sentence in sentences:
            # 去除引号、括号等字符
            cleaned = re.sub(r'[《》“”\n（）()]', '', sentence)
            cleaned_sentences.append(cleaned)
        return cleaned_sentences
    
    def analyze_sentence(self, sentence):
        """分析句子结构：分词、词性标注、依存句法分析"""
        # 实验spacy进行依存分析
        doc = self.nlp(sentence)
        
        # 构建依存关系图
        dependency_info = []
        root_token = None
        for token in doc:
            dep_info = {
                'text': token.text,
                'lemma': token.lemma_,
                'pos': token.pos_,
                'dep': token.dep_,
                'head_text': token.head.text,
                'head_pos': token.head.pos_,
                'is_root': token.dep == 'ROOT'
            }
            dependency_info.append(dep_info)
            if token.dep == 'ROOT':
                root_token = token
        return doc, dependency_info, root_token
    
    def extract_triples_by_rules(self, doc, dependency_info, root_token):
        """基于规则的三元组抽取"""
        triples = []

        # 规则1：主谓宾结构
        svo_triples = self.extract_svo_triples(doc, root_token)
        triples.extend(svo_triples)

        # 规则2：介词结构
        prep_triples = self.extract_preposition_triples(doc)
        triples.extend(prep_triples)

        # 规则3：属格关系
        poss_triples = self.extract_possessive_triples(doc)
        triples.extend(poss_triples)

        # 规则4：同位语关系
        appos_triples = self.extract_appos_triples(doc)
        triples.extend(appos_triples)

        return triples
    
    def extract_svo_triples(self, doc, root_token):
        """抽取主谓宾结构的三元组"""
        triples = []
        # 找到所有动词
        verbs = [token for token in doc if token.pos_ == 'VERB']
        if not verbs and root_token:
            verbs = [root_token]
        
        for verb in verbs:
            # 寻找主语
            subjects = []
            for child in verb.children:
                if child.dep_ in ['nsubj', 'nsubj:pass']:
                    subjects.append(child)
            
            # 寻找宾语
            objects = []
            for child in verb.children:
                if child.dep_ in ['dobj', 'obj']:
                    objects.append(child)
                elif child.dep_ == 'prep':
                    objects.append(child)
            
            # 生成三元组
            for subject in subjects:
                for object in objects:
                    # 获取完整的短语
                    subject_phrase = self.get_complete_phrase(subject)
                    object_phrase = self.get_complete_phrase(object)
                    relation = verb.text

                    # 验证是否合理
                    if self.is_valid_triple(subject_phrase, relation, object_phrase):
                        triple = {
                            'subject': subject_phrase,
                            'relation': relation,
                            'object': object_phrase,
                            'subject_type': self.classify_entity_type(subject),
                            'object_type': self.classify_entity_type(object),
                            'confidence': 0.8,
                            'rule': 'SVO'
                        }
                        triples.append(triple)
        return triples
    
    def extract_preposition_triples(self, doc):
        """抽取介词结构的三元组"""
        triples = []
        # 方法1：基于介词修饰关系
        triples.extend(self._extract_prep_triples(doc))
        # 方法2：基于名词修饰关系（nmod）
        triples.extend(self._extract_nmod_triples(doc))
        # 方法3：基于被动语态的施事关系
        # triples.extend(self._extract_agent_triples(doc))

        return triples

    def _extract_prep_triples(self, doc):
        """抽取基于prep关系的介词结构"""
        print("Extracting preposition triples...")
        triples = []
        # 找到所有介词
        prepositions = [token for token in doc if token.dep_ == 'prep']
        for prep in prepositions:
            head_verb = prep.head
            print(f"head_verb: {head_verb.text}")
            # 介词的宾语
            pobjects = [child for child in prep.children if child.dep_ == 'pobj']
            print(f"pobjects: {[p.text for p in pobjects]}")
            if pobjects and head_verb.pos_ == 'VERB':
                # 找到动词的主语
                subjects = [child for child in head_verb.children if child.dep_ in ['nsubj', 'nsubj:pass']]
                for subject in subjects:
                    for pobj in pobjects:
                        subject_phrase = self.get_complete_phrase(subject)
                        object_phrase = self.get_complete_phrase(pobj)
                        # 关系 = 动词 + 介词
                        relation = f"{head_verb.text}{prep.text}"
                        print(f"relation: {relation}")
                        # 验证是否合理
                        if self.is_valid_triple(subject_phrase, relation, object_phrase):
                            triple = {
                                'subject': subject_phrase,
                                'relation': relation,
                                'object': object_phrase,
                                'subject_type': self.classify_entity_type(subject),
                                'object_type': self.classify_entity_type(object),
                                'confidence': 0.7,
                                'rule': 'PREP'
                            }
                        triples.append(triple)

    def _extract_nmod_triples(self, doc):
        """抽取基于nmod关系的介词结构"""
        triples = []
        for token in doc:
            if token.dep_ in ['nmod', 'nmod:prep']:
                print(f"nmod token: {token.text}")
                head_noun = token.head
                modifier = token

                # 检查是否有介词标记
                preposition = None
                for child in modifier.children:
                    if child.dep_ == 'case':
                        preposition = child.text
                        break
                
                if preposition:
                    subject_phrase = self.get_complete_phrase(head_noun)
                    object_phrase = self.get_complete_phrase(modifier)
                    if self.is_valid_triple(subject_phrase, preposition, object_phrase):
                        triple = {
                            'subject': subject_phrase,
                            'relation': preposition,
                            'object': object_phrase,
                            'subject_type': self.classify_entity_type(head_noun),
                            'object_type': self.classify_entity_type(modifier),
                            'confidence': 0.6,
                            'rule': 'PREP'
                        }
                        triples.append(triple)
        return triples

    def extract_possessive_triples(self, doc):
        """抽取属格关系的三元组"""
        triples = []
        # 找到所有属格关系
        poss_relations = [token for token in doc if token.dep_ in ['poss', 'nmod:poss']]
        

    def get_complete_phrase(self, token):
        """获取完整的短语"""
        phrase_tokens = []
        
        # 使用栈进行深度优先遍历
        stack = [token]
        visited = set()

        while stack:
            current_token = stack.pop()
            if current_token.i in visited:
                continue
            visited.add(current_token.i)
            # 添加当前token
            phrase_tokens.append((current_token.i, current_token.text))
            # 添加所有修饰当前token的子节点
            for child in current_token.children:
                if child.dep_ in ['amod', 'nummod', 'compound', 'nmod']:
                    stack.append(child)
        
        # 按原始顺序排序并组合
        phrase_tokens.sort(key=lambda x: x[0])
        phrase = ''.join([text for _, text in phrase_tokens])

        return phrase

    def classify_entity_type(self, token):
        """基于词性和依存关系分类实体类型"""

        if token.pos_ in ['PROPN']: # 专有名词
            return 'ENTITY'
        elif token.pos_ == 'NOUN': # 普通名词
            return 'NOUN'
        elif token.pos_ == 'VERB': # 动词
            return 'ACTION'
        elif token.ent_type_ != '': # 已识别的实体类型
            return token.ent_type_
        else:
            return 'OTHER'
    
    def is_valid_triple(self, subject, relation, object):
        """验证三元组的有效性"""
        # 检查长度
        if len(subject) < 2 or len(object) < 2:
            return False
        # 检查关系有效性
        if len(relation) == 0 or relation in ['', ' ']:
            return False
        # 检查是否为无意义的组合
        meaningless = ['是是', '的的', '在在']
        for m in meaningless:
            if m in (subject + relation + object):
                return False
        # 代词/指示词过滤（单独作为实体不够具体）
        # pronouns = {'他','她','它','他们','我们','你','我','这','该','其','它们','这里','那里','这些','那些','这个','那个'}
        # if subject in pronouns or object in pronouns:
        #     return False
        # 检查主体和客体是否相同
        if subject == object:
            return False
        return True


In [28]:
model = DependencyTripleExtractor()
sentences = model.preprocess_text(text1)
doc, dependency_info, root_token = model.analyze_sentence(sentences[0])
triples = model._extract_nmod_triples(doc)
triples

nmod token: 报道
nmod token: 美国
nmod token: 南海


[{'subject': '坠毁',
  'relation': '据',
  'object': '报道',
  'subject_type': 'ACTION',
  'object_type': 'ACTION',
  'confidence': 0.6,
  'rule': 'PREP'},
 {'subject': '坠毁',
  'relation': '在',
  'object': '南海',
  'subject_type': 'ACTION',
  'object_type': 'ENTITY',
  'confidence': 0.6,
  'rule': 'PREP'}]

In [37]:
doc, dependency_info, root_token = model.analyze_sentence("学生们在图书馆认真学习")
doc, dependency_info, root_token = model.analyze_sentence(doc)
triples = model.extract_svo_triples(doc, root_token)
print(triples)

[]


In [33]:
# 查看依存关系
doc = model.nlp(sentences[0])
for token in doc:
    print(f"词: {token.text}, 依存关系: {token.dep_}, 父节点: {token.head.text}")

词: 据, 依存关系: case, 父节点: 报道
词: 外媒, 依存关系: nsubj, 父节点: 报道
词: 报道, 依存关系: nmod:prep, 父节点: 坠毁
词: ，, 依存关系: punct, 父节点: 坠毁
词: 美国, 依存关系: nmod, 父节点: 军机
词: 两, 依存关系: nummod, 父节点: 军机
词: 架, 依存关系: mark:clf, 父节点: 两
词: 海军, 依存关系: compound:nn, 父节点: 军机
词: 军机, 依存关系: nsubj, 父节点: 坠毁
词: 26日, 依存关系: nmod:tmod, 父节点: 坠毁
词: 分别, 依存关系: advmod, 父节点: 坠毁
词: 坠毁, 依存关系: ROOT, 父节点: 坠毁
词: 在, 依存关系: case, 父节点: 南海
词: 南海, 依存关系: nmod:prep, 父节点: 坠毁
词: ，, 依存关系: punct, 父节点: 坠毁
词: 无, 依存关系: conj, 父节点: 坠毁
词: 人员, 依存关系: nsubj, 父节点: 伤亡
词: 伤亡, 依存关系: ccomp, 父节点: 无
词: 。, 依存关系: punct, 父节点: 坠毁


In [34]:
sentences[0]

'据外媒报道，美国两架海军军机26日分别坠毁在南海，无人员伤亡。'