#1. Import thư viện

In [None]:
import re
import nltk
import pprint
import copy
import glob
from pprint import pprint
from nltk.tree import Tree
from itertools import groupby

# 2. Các function thông dụng

In [None]:
# Use a head rule (a regular expression) to test whether a label is a head.
def is_head(rule, label):
  """Search `label` for the regular expression `rule`.

  Returns the match object when the rule matches (truthy) and None
  otherwise, so the result can be used directly in a condition.
  """
  matcher = re.compile(rule)
  return matcher.search(label)

In [None]:
# Phrase addresses are lists, but they are stored as dictionary keys in their
# string form (dic[str(address)] = headword) because a list cannot be a key.
# This converts such a stringified address back into a real list of ints.
def str_to_list(s):
  """Parse a stringified list of non-negative ints such as '[0, 1, 3]'.

  Square brackets are dropped, commas separate the items, and inside each
  item every non-numeric character (e.g. the space after a comma) is
  ignored.  An empty list literal ('[]', or a string holding only brackets
  and whitespace) yields [] instead of failing on int('').

  Raises:
    ValueError: if an item contains no digit at all (e.g. '[1,,2]').
  """
  inner = s.replace('[', '').replace(']', '')
  if not inner.strip():
    # '[]' has no items; without this guard int('') would raise.
    return []
  return [int(''.join(ch for ch in item if ch.isnumeric()))
          for item in inner.split(',')]

In [None]:
# Collect the index of every direct child that is itself a subtree.
def get_all_index_in_tree(tree):
  """Return [[i], ...] for each direct child of `tree` that is not a plain str.

  Children whose type is exactly `str` (the leaves/words) are skipped; each
  remaining child index is wrapped in a one-element list so it can be
  concatenated onto a parent address.
  """
  return [[position] for position, child in enumerate(tree)
          if type(child) is not str]

In [None]:
# Return the subtree found at a given address.
def get_subtree(subtree_address, tree):
  """Walk `tree` along `subtree_address` and return the node reached.

  `subtree_address` is either a sequence of child indexes, the string
  'root' (the tree itself is returned), or a stringified list such as
  '[0, 1]' which is first converted with str_to_list.
  """
  if type(subtree_address) == str:
    if subtree_address == 'root':
      return tree
    subtree_address = str_to_list(subtree_address)
  node = tree
  for step in subtree_address:
    node = node[step]
  return node

In [None]:
# Discover every phrase of the tree with a breadth-first search.
def get_all_subtree_address(tree):
  """Return the addresses of all non-leaf nodes below `tree` in BFS order.

  Each address is a list of child indexes from the root; the root itself is
  not included.
  """
  pending = get_all_index_in_tree(tree)
  visited = []
  while pending:
    address = pending.pop(0)
    if address in visited:
      continue
    visited.append(address)
    child_indexes = get_all_index_in_tree(get_subtree(address, tree))
    # Children are queued behind everything already waiting (FIFO => BFS).
    pending.extend(address + child_index for child_index in child_indexes)
  return visited

In [None]:
# The same word may occur several times in one sentence, which makes it hard
# to tell which occurrence is the headword of a phrase.  Every word is
# therefore replaced by a number, counting from 1.
def from_word_to_number(tree):
  """Replace each leaf of `tree`, in place, by its 1-based position as a str.

  The (mutated) tree is also returned for convenience.
  """
  for number, leaf_position in enumerate(tree.treepositions('leaves'), start=1):
    tree[leaf_position] = str(number)
  return tree

In [None]:
# The rules that infer dependency labels also need the original tag of a word.
# The original tag is the node closest to the leaf (the word).
def get_POS_of_word(tree):
  """Map every leaf of `tree` to the full label of its direct parent node."""
  return {tree[leaf_position]: tree[leaf_position[:-1]].label()
          for leaf_position in tree.treepositions('leaves')}

In [None]:
# Return the original tag of every word, used to fill the CoNLL-U columns.
# Note: for NULL elements 'NONE' is not taken as the tag, because NONE is of
# no use when re-linking a NULL that acts as head during post-processing.
def get_all_POS(tree):
  """Return, leaf by leaf, the first '-'-separated part of the nearest
  ancestor label that is not 'NONE'."""
  tags = []
  for leaf_position in tree.treepositions('leaves'):
    cut = -1
    while True:
      tag = tree[leaf_position[:cut]].label().split('-')[0]
      if tag != 'NONE':
        break
      # Climb one level higher while the label is still NONE.
      cut -= 1
    tags.append(tag)
  return tags

In [None]:
# Following Choi's guideline, function tags are kept as a secondary feature
# that is written into the CoNLL-U output.
# Returns the function tags of C (the highest node headed by each word).
def get_function_tag(tree):
  """Map every leaf of `tree` to the function tags found on its C node.

  The label of C is split on '-'; the parts that belong to the recognised
  function-tag set are re-joined with '-'.  A word whose C carries none of
  them is mapped to '_'.
  """
  headword_of_phrase = assign_headword_for_phrase(tree)[0]
  C_of_headword = get_C_of_headword(headword_of_phrase)
  recognised_tags = ('PRD', 'CMP', 'LGS', 'MDP', 'TMP', 'LOC', 'MNR',
                     'PRP', 'ADV', 'CND', 'CNC')
  function_tags_of_word = {}
  for word in tree.leaves():
    label_parts = get_subtree(C_of_headword[word], tree).label().split('-')
    kept = [part for part in label_parts if part in recognised_tags]
    function_tags_of_word[word] = '-'.join(kept) if kept else '_'
  return function_tags_of_word

#3. Tìm headword cho các phrase
**Các bước cụ thể:**

+ Bước 1: Tìm head cho các phrase
+ Bước 2: Loop bước 1 đi sâu xuống cây để xác định được headword

**Lưu ý trong bước 1:**

  Nếu số phần tử list head trả về > 1 thì xét:
  + Nếu các label trong list head **giống nhau** thì thực hiện gán nhãn conjunction
  + Nếu các label trong list head **khác nhau**:
    + Xét trong khoảng từ first label đến last label trong list head: nếu có các nhãn như Cp, CONJP, UCP thì gán nhãn conjunction.
    + Nếu không phải trường hợp conjunction thì xét theo luật phụ








In [None]:
# Return the candidate head positions (indexes) of a phrase together with the
# labels found at those positions.
def finding_head_of_tree(tree):
  """Apply the head percolation rules to the direct children of `tree`.

  The phrase type is the part of tree.label() before the first '-'.  Its
  rules are tried in priority order; the first rule that matches at least
  one child wins and *all* children matching that rule are returned, left
  to right.  Phrase types without an entry only use the explicit head
  marker rule "-H$".  When nothing matches, the first child is the head.

  Returns:
    (index_list, label_list): index_list holds one-element lists [i];
    label_list holds the label of the child at each of those indexes.
  """
  # Change log of the rule table:
  # - S: swapped VP and S, added -PRD; SBAR: "VP, S" became "VP, S, SBAR"
  # - added SQ next to S in both S and SBAR
  # - SQ: "VP, SQ, S" -> added VP and S
  # - added SPL; SPL is treated like S inside S and SBAR
  # - S: -PRD moved in front of S|SQ|SPL
  # - S: VP split into two kinds
  head_percolation_rules = {
    "S":["-H$","^VP(-CMP|_|$)","-PRD$","^VP-","^(S|SQ|SPL)(-|_|$)","^ADJP(-|_|$)","^NP(-|_|$)"],
    "SBAR":["-H$","^VP(-|_|$)","^(S|SQ|SPL)(-|_|$)","^SBAR(-|_|$)","^ADJP(-|_|$)","^NP(-|_|$)"],
    "SQ": ["-H$","^VP(-|_|$)","^QVP(-|_|$)","^SQ(-|_|$)","^S(-|_|$)","^ADJP(-|_|$)","^NP(-|_|$)", "^QPP(-|_|$)"],
    "NP":["-H$","^NP(-|_|$)","^(Nc|Ncs|Nu|Nun|Nt|Nq|Num|Nw|Nr|Nn)(-|_|$)","^(Pd|Pp)(-|_|$)","^VP(-|_|$)"],
    "VP":["-H$","^VP(-|_|$)","^(Ve|Vc|D|Vcp|Vv)(-|_|$)","^(An|Aa)(-|_|$)","^ADJP(-|_|$)","^(Nc|Ncs|Nu|Nun|Nt|Nq|Num|Nw|Nr|Nn)(-|_|$)","^NP(-|_|$)","^SBAR(-|_|$)","^S(-|_|$)","^R(-|_|$)","^RP(-|_|$)","^PP(-|_|$)"],
    "ADJP":["-H$","^ADJP(-|_|$)","^(An|Aa)(-|_|$)","^(Nc|Ncs|Nu|Nun|Nt|Nq|Num|Nw|Nr|Nn)(-|_|$)","^S(-|_|$)"],
    "RP":["-H$","^RP(-|_|$)","^R(-|_|$)","^NP(-|_|$)"],
    "PP":["-H$","^PP(-|_|$)","^Cs(-|_|$)","^VP(-|_|$)","^SBAR(-|_|$)","^ADJP(-|_|$)","^QP(-|_|$)"],
    #"ADJP":["-H$","^ADJP(-|_|$)","^A(-|_|$)","^N(-|_|$)","^S(-|_|$)"],
    "QP":["-H$","^QP(-|_|$)","^Nq(-|_|$)","^Num(-|_|$)","^Nw(-|_|$)"],
    #"XP":["-H$","^XP(-|_|$)","^X(-|_|$)"],
    #"YP":["-H$","^YP(-|_|$)","^Y(-|_|$)"],
    "MDP":["-H$","^MDP(-|_|$)", "^Cs(-|_|$)", "^(An|Aa)(-|_|$)","^(Pd|Pp)(-|_|$)", "^R(-|_|$)", "^X(-|_|$)"],
    "QNP":["-H$","^QNP(-|_|$)","^NP(-|_|$)","^(Nc|Ncs|Nu|Nun|Nt|Nq|Num|Nw|Nr|Nn)(-|_|$)","^(Pd|Pp)(-|_|$)"],
    "QADJP":["-H$","^QADJP(-|_|$)","^(An|Aa)(-|_|$)","^(Nc|Ncs|Nu|Nun|Nt|Nq|Num|Nw|Nr|Nn)(-|_|$)","^(Ve|Vc|D|Vcp|Vv)(-|_|$)","^(Pd|Pp)(-|_|$)","^X(-|_|$)"],
    "QRP":["-H$","^QRP(-|_|$)","^(Pd|Pp)(-|_|$)","^Cs(-|_|$)","^X(-|_|$)"],
    "QPP":["-H$","^QPP(-|_|$)","^Cs(-|_|$)","^(Pd|Pp)(-|_|$)","^X(-|_|$)"],
    #"QXP":["-H$","^XP(-|_|$)","^X(-|_|$)"],
    #"QVP":["-H$","^(Ve|Vc|D|Vcp|Vv)(-|_|$)"],
    "UCP":["-H$"],
    "SPL":["-H$","^VP(-|_|$)","^SPL(-|_|$)", "^ADJP(-|_|$)","^NP(-|_|$)"]
    }

  phrase_type = tree.label().split('-')[0]
  # Unknown phrase types only honour the explicit head marker.
  rules = head_percolation_rules.get(phrase_type, ["-H$"])
  child_labels = [child.label() for child in tree]

  # NOTE: RP used to be scanned right-to-left and the hits reversed before
  # returning; because every hit of the winning rule is kept, that yields
  # exactly the same lists as this single left-to-right scan.
  for rule in rules:
    hits = [(position, label) for position, label in enumerate(child_labels)
            if is_head(rule, label)]
    if hits:
      return [[position] for position, _ in hits], [label for _, label in hits]

  # No rule matched: fall back to the first child.
  return [[0]], [tree[0].label()]

In [None]:
# Used when the labels in the head list differ from each other: collect every
# child between the first and the last head candidate (inclusive) so the
# caller can look for Cp, UCP or CONJP among them.
def deepen_head_list(mother_tree, tree_address, head_index_list):
  """Return the addresses and labels of the children spanning the head list.

  `tree_address` is the address of the phrase inside `mother_tree` (or the
  string 'root'); `head_index_list` holds one-element lists [i] of child
  indexes.  The result is (address_list, label_list), aligned item by item.
  """
  first_index = head_index_list[0][0]
  last_index = head_index_list[-1][0]
  # Children of the root have one-element addresses of their own.
  prefix = [] if tree_address == 'root' else tree_address
  deep_address_list = [prefix + [index]
                       for index in range(first_index, last_index + 1)]
  deep_label_list = [get_subtree(address, mother_tree).label()
                     for address in deep_address_list]
  return deep_address_list, deep_label_list

In [None]:
# Assign the conjunction labels.
def get_conjunction(P_address, C_address_list, C_label_list):
  """Chain the members of a coordination to each other.

  The first address is the anchor.  Every later member is attached to the
  current anchor with 'PUNCT' (label 'PU'), 'CC' (label starting with Cp or
  CONJP) or 'CONJ' (anything else).  A member becomes the new anchor unless
  its label is exactly 'PU', exactly 'Cp', or contains 'CONJP'.

  Returns a dict {str(child_address): (str(anchor_address), relation)}.
  `P_address` is accepted for interface compatibility but is not used.
  """
  links = {}
  anchor = str(C_address_list[0])
  for address, label in zip(C_address_list[1:], C_label_list[1:]):
    key = str(address)
    if label == 'PU':
      relation = 'PUNCT'
    elif re.search('^(Cp|CONJP)', label):
      relation = 'CC'
    else:
      relation = 'CONJ'
    links[key] = (anchor, relation)
    keeps_anchor = label == 'PU' or label == 'Cp' or 'CONJP' in label
    if not keeps_anchor:
      anchor = key
  return links

In [None]:
def has_SPL_and_S(unique):
  """Tell whether `unique` contains both an S label and an SPL label.

  A label counts when it is exactly 'S' / 'SPL' or starts with 'S-' / 'SPL-'.
  """
  found_S = any(re.search('^S(-|$)', label) for label in unique)
  found_SPL = any(re.search('^SPL(-|$)', label) for label in unique)
  return found_S and found_SPL

In [None]:
# Check whether a list of labels forms a coordination (conjunction).
def is_conjunction(C_label_list):
  """Decide from the labels alone whether the span is a coordination.

  Let `unique` be the set of distinct labels:
    * 1 distinct label  -> always a coordination;
    * 2 distinct labels -> coordination when there is punctuation ('PU'), or
      a connector ('Cp'/'CONJP') that is not the first label, or both an S
      and an SPL;
    * 3 distinct labels -> coordination when there is punctuation together
      with a non-initial connector, or when there is punctuation or a
      connector together with both an S and an SPL.
  Anything else is not a coordination.

  The "connector is not the first label" guard was previously written as
  `('Cp' != first) or ('CONJP' != first)`, which is always true; it is now
  the conjunction of both tests, matching the equivalent check made in
  identify_head.
  """
  unique = set(C_label_list)
  has_punct = 'PU' in unique
  has_connector = ('Cp' in unique) or ('CONJP' in unique)
  # Short-circuits on an empty list, so C_label_list[0] is never evaluated
  # when there is no label at all.
  connector_not_first = has_connector and C_label_list[0] not in ('Cp', 'CONJP')

  if len(unique) == 1:
    return True
  if len(unique) == 2:
    if has_punct or connector_not_first:
      return True
    return has_SPL_and_S(unique)
  if len(unique) == 3:
    if has_punct and connector_not_first:
      return True
    return (has_punct or has_connector) and has_SPL_and_S(unique)
  return False

def has_same_phrase_type(C_label_list):
  """Return True when all labels share the same phrase type.

  The phrase type is the part of a label before the first '-'.  An empty
  list counts as sharing one type.
  """
  distinct_types = {label.split('-')[0] for label in C_label_list}
  return len(distinct_types) < 2

# Used to pick the head when the head list holds more than one candidate.
def identify_head(tree, P_address, C_address_list, C_label_list, head_C_address_list, head_C_label_list):
  """Choose one head among several candidates, or detect a coordination.

  Args:
    tree: the whole sentence tree (addresses are resolved against it).
    P_address: address of the phrase whose head is being decided.
    C_address_list, C_label_list: addresses/labels of all children between
      the first and the last head candidate (see deepen_head_list).
    head_C_address_list, head_C_label_list: addresses/labels of the head
      candidates themselves.

  Returns:
    [head_address] when a single head is chosen,
    [head_address, P_of_C_dic] when the span is treated as a coordination
    (the first child is the head and P_of_C_dic comes from get_conjunction),
    or the string "Nope" when no rule applies.

  Raises:
    KeyError: if the candidates have different phrase types, no 'Cp'
      coordination is found, and the phrase type of P has no entry in
      head_exception_rules.
  """

  # 1) The labels alone already look like a coordination.
  if is_conjunction(C_label_list):
    P_of_C_dic = get_conjunction(P_address, C_address_list, C_label_list)
    return [C_address_list[0], P_of_C_dic]

  if has_same_phrase_type(head_C_label_list):
    # 2a) Same phrase type with a connector somewhere after the first child.
    if (('Cp' in C_label_list) or ('CONJP' in C_label_list)) and (C_label_list[0] != 'Cp') and (C_label_list[0] != 'CONJP'):
      P_of_C_dic = get_conjunction(P_address, C_address_list, C_label_list)
      return [C_address_list[0], P_of_C_dic]
    else:
      # 2b) Otherwise look at the sibling just before the first candidate:
      # a connector there also signals a coordination.
      first_element = head_C_address_list[0]
      if first_element[-1] >= 1:
        pre_address_of_head_C_address_list = first_element[:-1] + [first_element[-1]-1] 
        pre_subtree = get_subtree(pre_address_of_head_C_address_list, tree)
        pre_subtree_label = pre_subtree.label()
        if re.search('^(Cp|CONJP)', pre_subtree_label):
          P_of_C_dic = get_conjunction(P_address, C_address_list, C_label_list)
          return [C_address_list[0], P_of_C_dic]

    # 2c) Not a coordination: prefer the first candidate without any
    # '-' suffix in its label ...
    for head_C_address, head_C_label in zip(head_C_address_list, head_C_label_list):
      if '-' not in head_C_label:
        return [head_C_address]

    # ... then the first candidate that is not marked as subject.
    for head_C_address, head_C_label in zip(head_C_address_list, head_C_label_list):
      if '-SBJ' not in head_C_label:
        return [head_C_address]
  else:
    # 3a) Different phrase types joined by a non-initial 'Cp'.
    if ('Cp' in C_label_list) and (C_label_list[0] != 'Cp'):
      P_of_C_dic = get_conjunction(P_address, C_address_list, C_label_list)
      return [C_address_list[0], P_of_C_dic]

    else:
      # 3b) Fall back to the secondary rules, keyed by the phrase type of P.
      P_phrase_type = get_subtree(P_address, tree).label().split('-')[0]
      # NOTE(review): "VP" appears twice in this literal; Python keeps the
      # last entry, so the first "VP" rule list below is never used.
      head_exception_rules = {
          "NP":["^(Nn_swsp|Nn_w)(-|$)","^(Nn|Nu|Nun|Nt)(-|$)","^(Num|Nq|Nr)(-|$)", "^(Pd|Pp)"],
          "ADJP":["^(Aa)"],
          "QP":["^Nq(-|$)","^Num(-|$)"],
          "Nn_swsp":["^(Ncs|Nc)(-|$)"],
          "VP":["^(Ve|Vc|D|Vcp|Vv)(-|$)"],
          "S":["^(S|SQ|SPL)($)", "^(ADJP)"],
          "SBAR":["^(S|SQ|SPL)($)"],
          "PP":["^(Cs)"],
          "VP":["^(Vv|Vc|Ve)","^(Nq)"] # Rule added for trees that were annotated incorrectly

      }
      for rule in head_exception_rules[P_phrase_type]:
              for head_C_address, head_C_label  in zip(head_C_address_list, head_C_label_list):
                if is_head(rule, head_C_label):
                  return [head_C_address]
  # No rule could decide; callers receive this sentinel string.
  return "Nope"      

In [None]:
# Return the list of phrases that share the same headword.
def from_phrase_to_headword(mother_tree, tree, tree_address):
  """Follow the head child from a phrase down to its headword.

  Args:
    mother_tree: the whole sentence tree; all addresses refer to it.
    tree: the phrase to start from (re-read from `mother_tree` whenever
      `tree_address` is not 'root').
    tree_address: address of that phrase (a list of indexes) or 'root'.

  Returns:
    [phrase_to_headword, P_of_C_dic] where phrase_to_headword lists the
    addresses visited on the way down followed by the headword itself (the
    last element), and P_of_C_dic collects the coordination links reported
    by identify_head along the way.

  Raises:
    ValueError: if identify_head cannot decide between several candidates.
  """
  phrase_to_headword = [tree_address]
  if tree_address != 'root':
    tree = get_subtree(tree_address, mother_tree)
  P_of_C_dic = {}
  # Descend until the current node directly dominates a word.
  while type(tree[0]) != str:
    # The original called an undefined name here; finding_head_of_tree is
    # the function that returns the (index list, label list) pair unpacked
    # on this line.
    head_index_list, head_label_list = finding_head_of_tree(tree)

    if len(head_index_list) == 1:
      if tree_address != 'root':
        tree_address = tree_address + head_index_list[0]
      else:
        tree_address = head_index_list[0]
    else:
      address_list, label_list = deepen_head_list(mother_tree, tree_address, head_index_list)
      head_address_list = []
      for head_index in head_index_list:
        if tree_address != 'root':
          head_address_list.append(tree_address + head_index)
        else:
          head_address_list.append(head_index)
      result = identify_head(mother_tree, tree_address, address_list, label_list, head_address_list, head_label_list)
      if result == "Nope":
        # Previously the sentinel was indexed like a list and the failure
        # surfaced later as an obscure ValueError from int('').
        raise ValueError(
          'cannot decide the head of phrase {} among {} {}'.format(
            tree_address, head_address_list, head_label_list))
      tree_address = result[0]
      if len(result) == 2:
        P_of_C_dic.update(result[1])

    tree = get_subtree(tree_address, mother_tree)
    phrase_to_headword.append(tree_address)
  phrase_to_headword.append(tree[0])
  return [phrase_to_headword, P_of_C_dic]

In [None]:
def assign_headword_for_phrase(tree):
  """Compute the headword of every phrase of `tree`.

  Returns:
    [headword_of_phrase, P_of_C_dic] where headword_of_phrase maps
    str(address) (or 'root') to the headword leaf, and P_of_C_dic maps the
    stringified address of a coordinated child to (anchor address, relation)
    as produced by get_conjunction.
  """
  conjunction_links = {}
  headword_by_address = {}
  subtree_addresses = get_all_subtree_address(tree)

  # Find the headword of each phrase that is not already covered by the
  # head chain of a phrase visited earlier.
  for address in ['root'] + subtree_addresses:
    if str(address) in headword_by_address:
      continue
    node = tree if address == 'root' else get_subtree(address, tree)
    if type(node[0]) == str:
      # A node sitting directly on a word is headed by that word.
      headword_by_address[str(address)] = node[0]
      continue
    chain, links = from_phrase_to_headword(tree, node, address)
    conjunction_links.update(links)
    headword = chain[-1]
    for head_phrase_address in chain[:-1]:
      headword_by_address[str(head_phrase_address)] = headword

  # Children of a UCP are always linked to each other as a coordination.
  for address in subtree_addresses:
    node = get_subtree(address, tree)
    if node.label().split('-')[0] != 'UCP':
      continue
    child_addresses = [address + [child_index] for child_index in range(len(node))]
    child_labels = [get_subtree(child_address, tree).label()
                    for child_address in child_addresses]
    conjunction_links.update(get_conjunction(address, child_addresses, child_labels))
  return [headword_by_address, conjunction_links]

In [None]:
# Demo cell: parse one bracketed sentence, draw it, replace its words by their
# 1-based positions and compute the headword of every phrase.
tree = Tree.fromstring('(S (PP-LOC (Cs-H Trong) (NP (Nn-H khoảnh) (Nn đất) (NP (QP (R khoảng) (Num-H 100)) (Nu-H m2)) (VP (ID-H che_mưa_che_nắng) (PP-MNR (Cs-H bằng) (NP (ADJP (Aa-H nhiều)) (Nn_swsp (Nc-H-1 tấm) (Nn bạt)) (VP (Vv *P*) (VP (Vv-H chắp_vá) (NP-DOB (Nn *D*-1))))))) (Pd ấy))) (NP-PRD (Nn-H bụi)) (ADJP-PRD (Aa-H mù_mịt)) (PU .))')
tree.pretty_print()
# Alternative example sentence (disabled):
# tree = Tree.fromstring('(S (NP-SBJ (Nn-H Câu_chuyện) (Vv đánh_ghen) (NP-TMP (Nt-H ngày_mai))) (VP (R càng) (Vv-H chứng_tỏ) (NP-DOB (Nn-H điều) (Pd đó))) (PU ,) (NP-TMP (Nt-H khi) (SBAR (Cs *0*) (S (NP-SBJ (Num một) (Nn_swsp (Nc-H người) (Nn vợ)) (ADJP (Aa-H thảo_hiền))) (VP (Vv-H hoá) (VP-CMP (Vv-H thành) (NP-CMP (Num một) (Nn-H hoạn_thư) (NP (NP (Nn-H kế) (ADJP (Aa-H độc))) (PU ,) (NP (Nn-H mưu) (ADJP (Aa-H sâu)))))))))) (PU ...))')
# from_word_to_number mutates `tree` in place.
from_word_to_number(tree)
assign_headword_for_phrase(tree)

                                                               S                                                                     
                    ___________________________________________|___________________________________________________________________   
                 PP-LOC                                                                                             |       |      | 
   ________________|_____________________________________                                                           |       |      |  
  |                                                      NP                                                         |       |      | 
  |      ________________________________________________|_____________________________________________________     |       |      |  
  |     |     |               |                          VP                                                    |    |       |      | 
  |     |     |               |                __________|_

[{'[0, 0]': '1',
  '[0, 1, 0]': '2',
  '[0, 1, 1]': '3',
  '[0, 1, 2, 0, 0]': '4',
  '[0, 1, 2, 0, 1]': '5',
  '[0, 1, 2, 0]': '5',
  '[0, 1, 2, 1]': '6',
  '[0, 1, 2]': '6',
  '[0, 1, 3, 0]': '7',
  '[0, 1, 3, 1, 0]': '8',
  '[0, 1, 3, 1, 1, 0, 0]': '9',
  '[0, 1, 3, 1, 1, 0]': '9',
  '[0, 1, 3, 1, 1, 1, 0]': '10',
  '[0, 1, 3, 1, 1, 1, 1]': '11',
  '[0, 1, 3, 1, 1, 1]': '10',
  '[0, 1, 3, 1, 1, 2, 0]': '12',
  '[0, 1, 3, 1, 1, 2, 1, 0]': '13',
  '[0, 1, 3, 1, 1, 2, 1, 1, 0]': '14',
  '[0, 1, 3, 1, 1, 2, 1, 1]': '14',
  '[0, 1, 3, 1, 1, 2, 1]': '13',
  '[0, 1, 3, 1, 1, 2]': '13',
  '[0, 1, 3, 1, 1]': '10',
  '[0, 1, 3, 1]': '8',
  '[0, 1, 3]': '7',
  '[0, 1, 4]': '15',
  '[0, 1]': '2',
  '[0]': '1',
  '[1, 0]': '16',
  '[1]': '16',
  '[2, 0]': '17',
  '[2]': '17',
  '[3]': '18',
  'root': '17'},
 {}]

#4. Dán nhãn 
Các bước cụ thể:
+ Bước 1: Xác định C(node cao nhất của headword) và P(node parent của C)
+ Bước 2: Viết luật dán nhãn được suy ra từ label của C và P


In [None]:
# Get C (the highest node headed by it) for every word.
def get_C_of_headword(headword_of_phrase):
  """Map each headword to the shortest address string it heads.

  `headword_of_phrase` maps address strings (plus the key 'root', which is
  ignored here) to headwords.  Among the addresses of one headword the one
  with the fewest characters wins; ties go to the address that sorts first.

  Raises:
    KeyError: if `headword_of_phrase` has no 'root' entry.
  """
  without_root = dict(headword_of_phrase)
  del without_root['root']
  addresses_by_headword = {}
  for address, headword in sorted(without_root.items()):
    if headword not in addresses_by_headword:
      addresses_by_headword[headword] = []
    addresses_by_headword[headword].append(address)
  return {headword: min(addresses, key=len)
          for headword, addresses in addresses_by_headword.items()}

In [None]:
# Get P (the parent) of C.
def get_P_of_C(phrase_address, root_address):
  """Return the stringified address of the parent of `phrase_address`.

  `phrase_address` is a stringified address such as '[0, 1]'.  When it has
  no parent below the root (a one-element address), `root_address` is
  returned unchanged instead.
  """
  parent_address = str_to_list(phrase_address)[:-1]
  return str(parent_address) if parent_address else root_address

In [None]:
# Assign a dependency label based on the rules.
def get_dependency_relation(P_address, C_address, P_index, C_index, tree):
  """Infer the dependency relation between a child node C and its parent P.

  Args:
    P_address, C_address: addresses of P and C, as accepted by get_subtree.
    P_index, C_index: the leaves (word numbers) heading P and C; they are
      used to look up the tags returned by get_POS_of_word.
    tree: the whole sentence tree.

  Returns:
    The relation produced by the first rule that fires, tried in a fixed
    priority order, or 'DEP' when no rule applies.
  """
  # Labels of C and P.
  C_tree = get_subtree(C_address, tree)
  C = C_tree.label()
  P = get_subtree(P_address, tree).label()

  POS_of_word = get_POS_of_word(tree)
  p = POS_of_word[P_index]
  c = POS_of_word[C_index]
  # When C is a UCP the relation is decided by the leftmost child of the UCP.
  if 'UCP' in C:
    C = C_tree[0].label()

  # Rules in priority order, each with the arguments it is called with.
  ordered_rules = (
    (has_SBJ, (C,)),
    (is_ADJUNCT, (C,)),
    (is_ADVCL, (C,)),
    (is_NP_ADVMOD, (C,)),
    (is_ADJP_ADVMOD, (P, C)),
    (is_PARATAXIS, (C,)),
    (is_VOCATIVE, (C,)),
    (is_APPOS, (P, C)),
    (is_SOUND, (c,)),
    (is_VMOD_or_RCMOD, (P, C, C_tree)),
    (is_NUM, (P, C)),
    (is_NN, (P, C)),
    (is_PREP, (C,)),
    (is_POBJ_or_PCOMP, (P, C)),
    (is_PUNCT, (C,)),
    (is_CLF_or_NN, (P, p)),
    (is_AMOD, (P, C)),
    (is_NUMBER_or_QUANTMOD, (P, C)),
    (is_NN, (P, C)),  # listed twice in the original order as well
    (is_DET, (P, C)),
    (is_ATTR, (p, C)),
    (is_IOBJ, (P, C)),
    (is_OBJ_or_NP_ADVMOD, (P, C)),
    (is_AOBJ, (P, C)),
    (is_CCOMP_or_XCOMP, (P, C, C_tree)),
    (is_ACOMP, (P, C)),
    (is_SINO, (P,)),
    (is_INTJ, (C,)),
    (is_CC, (C,)),
    (is_MARK, (C,)),
    # Secondary rule.
    (is_NSUBJ, (C, P)),
  )
  for rule, arguments in ordered_rules:
    relation = rule(*arguments)
    if relation:
      return relation

  return 'DEP'

## Các luật chuyển đổi

In [None]:
# Tells whether a clause has an empty subject, which is what separates XCOMP
# from CCOMP.
def has_empty_subject(tree):
  """Return True when the clause has an NP-SBJ child whose first child is NONE.

  For a node whose label contains 'SBAR' the check is made on its first
  child labelled S (exactly 'S' or starting with 'S-'); for an S node it is
  made on the node itself.  Any other node yields False.
  """
  def clause_has_null_subject(clause):
    return any('NP-SBJ' in child.label() and 'NONE' in child[0].label()
               for child in clause)

  label = tree.label()
  if 'SBAR' in label:
    for child in tree:
      if re.search('^S(-|$)', child.label()):
        # Only the first S child is inspected.
        return clause_has_null_subject(child)
    return False
  if re.search('^S(-|$)', label):
    return clause_has_null_subject(tree)
  return False

In [None]:
# Tells whether a clause contains a child with the wanted label,
# e.g. whether it has a VP, which helps to separate CCOMP from XCOMP.
def has_POS(tree, POS):
  """Return True when the clause has a direct child labelled `POS`.

  A child counts when its label is exactly `POS` or starts with `POS` + '-'.
  For a node whose label contains 'SBAR' the children of its first S child
  are inspected; for an S node its own children are.  Any other node yields
  False.
  """
  wanted = '^{}(-|$)'.format(POS)

  def clause_has_child(clause):
    return any(re.search(wanted, child.label()) for child in clause)

  label = tree.label()
  if 'SBAR' in label:
    for child in tree:
      if re.search('^S(-|$)', child.label()):
        return clause_has_child(child)
    return False
  if re.search('^S(-|$)', label):
    return clause_has_child(tree)
  return False

In [None]:
# Tells whether a VP contains a noun.
def has_Noun(tree):
  """Return True when a direct child of `tree` has a nominal label.

  A label is nominal when it starts with NP or with one of the noun tags
  (Nc, Ncs, Nu, Nun, Nt, Nq, Num, Nw, Nr, Nn).  The alternatives are grouped
  so that '^' anchors all of them; previously it only anchored 'NP' and the
  noun tags matched anywhere inside a label.
  """
  return any(re.search('^(NP|Nc|Ncs|Nu|Nun|Nt|Nq|Num|Nw|Nr|Nn)', child.label())
             for child in tree)

In [None]:
#S(-|$) phân biệt vs SBAR
def has_SBJ(C):
  """Return the subject relation announced by label `C`, or False.

  NP-SBJ/QNP-SBJ -> 'NSUBJ', ADJP-SBJ -> 'ASUBJ', VP-SBJ -> 'VSUBJ',
  S-SBJ/SPL-SBJ/SBAR-SBJ/SQ-SBJ -> 'CSUBJ'.  The markers are looked up as
  substrings of `C`, in that order.
  """
  subject_relations = (
    (('NP-SBJ', 'QNP-SBJ'), 'NSUBJ'),
    (('ADJP-SBJ',), 'ASUBJ'),
    (('VP-SBJ',), 'VSUBJ'),
    (('S-SBJ', 'SPL-SBJ', 'SBAR-SBJ', 'SQ-SBJ'), 'CSUBJ'),
  )
  for markers, relation in subject_relations:
    if any(marker in C for marker in markers):
      return relation
  return False

def is_NSUBJ(C, P):
  """Secondary rule: a bare NP directly under a bare S is a nominal subject.

  Returns 'NSUBJ' when P is exactly 'S' and C is exactly 'NP', else False.
  """
  parent_is_bare_S = re.search('^S$', P)
  child_is_bare_NP = re.search('^NP$', C)
  return 'NSUBJ' if parent_is_bare_S and child_is_bare_NP else False
# def is_VMOD_or_RCMOD(P, C):#Ve|Vc|D|Vcp|Vv
#   if re.search('^(VP)', C):
#     if re.search('^(VP|ADJP|S)', P):
#       return 'VMOD'
#     elif re.search('^(NP|QNP)', P):
#       return 'RCMOD'
#     else:
#       return False
#   elif re.search('^(NP)', P) and re.search('^(Vv)', C) :
#     return 'VMOD'
#   else:
#     return False

def is_VMOD_or_RCMOD(P, C, C_tree):
  """Label a verbal child as 'VMOD' or 'RCMOD', or return False.

  Under a parent starting with VP/ADJP/S, a child starting with VP or a verb
  tag (Ve, Vc, D, Vcp, Vv, VN) is 'VMOD'.  Under NP/QNP, a VP child is
  'RCMOD' when it contains a noun (has_Noun on `C_tree`) and 'VMOD'
  otherwise, while a bare verb tag is 'VMOD'.
  """
  if re.search('^(VP|ADJP|S)', P):
    return 'VMOD' if re.search('^(VP|Ve|Vc|D|Vcp|Vv|VN)', C) else False
  if not re.search('^(NP|QNP)', P):
    return False
  if re.search('^(VP)', C):
    # A relative clause must contain a noun (see cases 24_90295, 44_7105).
    return 'RCMOD' if has_Noun(C_tree) else 'VMOD'
  if re.search('^(Ve|Vc|D|Vcp|Vv|VN)', C):
    return 'VMOD'
  return False

def is_NUM(P, C):
  """Return 'NUM' for a Num/QP child under an NP/QNP/Nn parent, else False.

  Both labels are tested by prefix.
  """
  parent_is_nominal = re.search('^(NP|QNP|Nn)', P)
  child_is_numeral = re.search('^(Num|QP)', C)
  return 'NUM' if parent_is_nominal and child_is_numeral else False

# def is_NN(P, C):
#   if re.search('^(NP)', P) and re.search('^(NP|ID)', C):
#     return 'NN'
#   else:
#     return False

def is_PREP(C):
  """Return 'PREP' when label `C` starts with PP or QPP, else False."""
  if not re.search('^(PP|QPP)', C):
    return False
  return 'PREP'

def is_POBJ_or_PCOMP(P, C):
  """Label the child of a PP/QPP parent, or return False for other parents.

  A child starting with NP is 'POBJ'; any other child is 'PCOMP'.
  """
  if not re.search('^(PP|QPP)', P):
    return False
  return 'POBJ' if re.search('^(NP)', C) else 'PCOMP'
    
def is_PUNCT(C):
  """Return 'PUNCT' when label `C` starts with PU, LBRK or RBRK, else False."""
  return 'PUNCT' if re.search('^(PU|LBRK|RBRK)', C) else False

def is_CLF_or_NN(P, p):
  """Label a child of an Nn_swsp parent from the tag `p` of the parent's head.

  Returns 'NCS' when `p` starts with Ncs, 'NC' when it starts with Nc, 'NN'
  otherwise, and False when `P` does not start with Nn_swsp.
  """
  if not re.search('^(Nn_swsp)', P):
    return False
  # Ncs must be tested before Nc because Nc is a prefix of Ncs.
  for tag_pattern, relation in (('^(Ncs)', 'NCS'), ('^(Nc)', 'NC')):
    if re.search(tag_pattern, p):
      return relation
  return 'NN'

def is_AMOD(P, C):
  """Return 'AMOD' for an adjectival child (VA, NA, Aa, An, ADJP) under an
  NP/QNP parent, else False.  Both labels are tested by prefix."""
  parent_is_nominal = re.search('^(NP|QNP)', P)
  child_is_adjectival = re.search('^(VA|NA|Aa|An|ADJP)', C)
  if parent_is_nominal and child_is_adjectival:
    return 'AMOD'
  return False

def is_NUMBER_or_QUANTMOD(P, C):
  """Label the child of a QP parent, or return False for other parents.

  A child starting with Num or Nq is 'NUMBER'; any other child is
  'QUANTMOD'.
  """
  if not re.search('^(QP)', P):
    return False
  return 'NUMBER' if re.search('^(Num|Nq)', C) else 'QUANTMOD'

def is_NN(P, C):
  """Return 'NN' for a nominal child (NP, Nr, Nt, Nu, Nun, Nn, ID) under an
  NP/QNP parent, else False.  Both labels are tested by prefix."""
  if re.search('^(NP|QNP)', P) is None:
    return False
  if re.search('^(NP|Nr|Nt|Nu|Nun|Nn|ID)', C) is None:
    return False
  return 'NN'

def is_DET(P, C):
  """Return 'DET' for an Nw/Nq/Pd/Pp child under an NP/QNP parent, else
  False.  Both labels are tested by prefix."""
  parent_is_nominal = re.search('^(NP|QNP)', P)
  child_is_determiner = re.search('^(Nw|Nq|Pd|Pp)', C)
  return 'DET' if parent_is_nominal and child_is_determiner else False

def is_ATTR(p, C):
  """Return 'ATTR' when the parent's head tag `p` starts with Vc and the
  child label `C` starts with NP or QNP, else False."""
  if not re.search('^(Vc)', p):
    return False
  if not re.search('^(NP|QNP)', C):
    return False
  return 'ATTR'

def is_IOBJ(P, C):
  """Return 'IOBJ' for an NP-IOB child under a VP parent, else False.

  Both labels are tested by prefix.
  """
  under_verb_phrase = re.search('^(VP)', P)
  marked_indirect = re.search('^(NP-IOB)', C)
  return 'IOBJ' if under_verb_phrase and marked_indirect else False

def is_OBJ_or_NP_ADVMOD(P, C):
  """Label a nominal child of a VP, or anything tagged -DOB.

  Under a VP parent, a child starting with NP-MNR is 'NP_ADVMOD' and any
  other child starting with NP, QNP or Nn is 'OBJ'.  A label containing
  '-DOB' is 'OBJ' whatever the parent.  Otherwise False.
  """
  under_verb_phrase = re.search('^(VP)', P)
  if under_verb_phrase and re.search('^(NP)(-MNR)', C):
    return 'NP_ADVMOD'
  if (under_verb_phrase and re.search('^(NP|QNP|Nn)', C)) or '-DOB' in C:
    return 'OBJ'
  return False

def is_AOBJ(P, C):
  """Return 'AOBJ' for an NP/QNP/Nn child under an ADJP parent, or for any
  label containing '-DOB'; otherwise False."""
  nominal_under_adjp = re.search('^(ADJP)', P) and re.search('^(NP|QNP|Nn)', C)
  if nominal_under_adjp or '-DOB' in C:
    return 'AOBJ'
  return False

def is_CCOMP_or_XCOMP(P, C, C_tree):
  """Label a clausal complement as 'CCOMP' or 'XCOMP', or return False.

  The parent label must start with VP, NP, ADJP, RP, SQ, S, QVP or QNP.
  An SQ/SPL child is always 'CCOMP'.  An S or SBAR child (bare, co-indexed
  with a digit, or tagged -CMP) is 'XCOMP' when its subject is empty and it
  contains a VP (has_empty_subject / has_POS on `C_tree`), else 'CCOMP'.
  """
  if not re.search('^(VP|NP|ADJP|RP|SQ|S|QVP|QNP)', P):
    return False
  if re.search('^(SQ|SPL)(-|$)', C):
    return 'CCOMP'
  # S and SBAR complements are handled identically.
  is_clause = (re.search('^(S)(-[0-9]|-CMP|$)', C)
               or re.search('^(SBAR)(-[0-9]|-CMP|$)', C))
  if not is_clause:
    return False
  if has_empty_subject(C_tree) and has_POS(C_tree, 'VP'):
    return 'XCOMP'
  return 'CCOMP'

def is_ADJUNCT(C):
  """Return 'ADJUNCT' when `C` is R, RP or QRP (bare or followed by '-'),
  else False."""
  return 'ADJUNCT' if re.search('^(R|RP|QRP)(-|$)', C) else False

def is_NP_ADVMOD(C):
  """Return 'NP_ADVMOD' when `C` is an NP or QP directly followed by one of
  the tags -TMP, -MNR, -ADV, -LOC, -PRP, -CND, -CNC; otherwise False."""
  if not re.search('^(NP|QP)(-TMP|-MNR|-ADV|-LOC|-PRP|-CND|-CNC)', C):
    return False
  return 'NP_ADVMOD'

def is_ADJP_ADVMOD(P, C):
  """Return 'ADJP_ADVMOD' for an adverbially used ADJP, else False.

  That is an ADJP directly followed by -MNR, -ADV, -TMP, -LOC, -PRP, -CND or
  -CNC under any parent, or any ADJP child of an ADJP parent.
  """
  tagged_adverbial = re.search('^(ADJP)(-MNR|-ADV|-TMP|-LOC|-PRP|-CND|-CNC)', C)
  nested_adjp = re.search('^(ADJP)', P) and re.search('^(ADJP)', C)
  return 'ADJP_ADVMOD' if tagged_adverbial or nested_adjp else False

def is_ACOMP(P, C):
  """Return 'ACOMP' for a bare ADJP or an ADJP-CMP child under a VP parent,
  else False."""
  if not re.search('^(VP)', P):
    return False
  return 'ACOMP' if re.search('^(ADJP)(-CMP|$)', C) else False

def is_ADVCL(C):
  """Return 'ADVCL' when `C` is an S, SPL or SBAR directly followed by one of
  the tags -CND, -TMP, -PRP, -ADV, -MNR, -LOC, -CNC; otherwise False."""
  if re.search('^(S|SPL|SBAR)(-CND|-TMP|-PRP|-ADV|-MNR|-LOC|-CNC)', C):
    return 'ADVCL'
  return False

def is_PARATAXIS(C):
  """Return 'PARATAXIS' when label `C` contains '-PRN', else False."""
  return 'PARATAXIS' if '-PRN' in C else False

def is_SINO(P):
  """Return 'SINO' when the parent label `P` contains '_w', else False."""
  return 'SINO' if '_w' in P else False

def is_INTJ(C):
  """Return 'INTJ' when `C` starts with E or M, or contains '-MDP'; else
  False."""
  starts_like_interjection = re.search('^(E|M)', C)
  if starts_like_interjection or '-MDP' in C:
    return 'INTJ'
  return False

def is_CC(C):
  """Return 'CC' when label `C` starts with Cp or CONJP, else False."""
  if not re.search('^(Cp|CONJP)', C):
    return False
  return 'CC'

def is_MARK(C):
  """Return 'MARK' when label `C` is exactly 'Cs', else False."""
  return 'MARK' if C == 'Cs' else False

def is_VOCATIVE(C):
  """Return 'VOCATIVE' when label `C` contains '-VOC', else False."""
  return 'VOCATIVE' if '-VOC' in C else False

def is_SOUND(c):
  """Return 'SOUND' when the word tag `c` starts with ON, else False.

  Added to fix case 25_7276.
  """
  if not re.search('^ON', c):
    return False
  return 'SOUND'

def is_APPOS(P, C):
  """Return 'APPOS' when `P` contains 'NP' and `C` contains 'HLN' or 'TTL',
  else False.  The parent is required to be nominal."""
  parent_is_nominal = 'NP' in P
  child_is_title = 'HLN' in C or 'TTL' in C
  return 'APPOS' if parent_is_nominal and child_is_title else False

##Hàm main

In [None]:
def get_all_relation(tree):
  """Compute the head and the dependency relation of every word of `tree`.

  The leaves of `tree` are first replaced, in place, by their 1-based
  positions (from_word_to_number).

  Returns:
    A dict {word_number: [head_number, relation]}; the headword of the whole
    sentence gets head '0' and relation 'ROOT'.
  """
  tree = from_word_to_number(tree)

  headword_of_phrase, P_of_C = assign_headword_for_phrase(tree)
  C_of_headword = get_C_of_headword(headword_of_phrase)

  root_word = headword_of_phrase['root']
  root_address = C_of_headword[root_word]
  relation_dic = {root_word: ['0', 'ROOT']}

  for C_index in tree.leaves():
    if C_index in relation_dic:
      continue
    C_address = C_of_headword[C_index]
    if C_address in P_of_C:
      # Coordination links were already labelled while the heads were found.
      P_address, relation = P_of_C[C_address]
      P_index = headword_of_phrase[P_address]
    else:
      P_address = get_P_of_C(C_address, root_address)
      P_index = headword_of_phrase[P_address]
      relation = get_dependency_relation(P_address, C_address, P_index, C_index, tree)
    relation_dic[C_index] = [P_index, relation]
  return relation_dic

In [None]:
# Demo cell: compute head and relation for every word of the tree built above.
get_all_relation(tree)

{'1': ['17', 'PREP'],
 '10': ['8', 'POBJ'],
 '11': ['10', 'NC'],
 '12': ['13', 'VMOD'],
 '13': ['10', 'VMOD'],
 '14': ['13', 'OBJ'],
 '15': ['2', 'DET'],
 '16': ['17', 'AOBJ'],
 '17': ['0', 'ROOT'],
 '18': ['17', 'PUNCT'],
 '2': ['1', 'POBJ'],
 '3': ['2', 'NN'],
 '4': ['5', 'ADJUNCT'],
 '5': ['6', 'NUM'],
 '6': ['2', 'NN'],
 '7': ['2', 'VMOD'],
 '8': ['7', 'PREP'],
 '9': ['10', 'AMOD']}

#Hậu xử lý

## Hàm tạo file thông dụng

In [None]:
def to_oneline(folder, filename):
  """Rewrite a treebank file so that each sentence occupies one line.

  Reads /content/NIIVTB-1/<folder>/<filename>, extracts every span enclosed
  in <s>...</s> and writes it, with each run of whitespace collapsed to a
  single space and the ends stripped, as one line of
  /content/OneLine/<folder>/[Line]<filename>.

  Both patterns are raw strings (a non-raw '\\s' is an invalid escape
  sequence) and are compiled once rather than once per sentence.
  """
  sentence_pattern = re.compile(r'(?<=<s>).+?(?=</s>)', re.M | re.I | re.S)
  whitespace_pattern = re.compile(r'[\s\t\n]+', re.I | re.M)

  with open(f'/content/NIIVTB-1/{folder}/{filename}','r',encoding='utf8') as reader:
    data = reader.read()
  sentences = sentence_pattern.findall(data)

  with open(f'/content/OneLine/{folder}/[Line]{filename}','w',encoding='utf8') as writer:
    for sentence in sentences:
      writer.write(whitespace_pattern.sub(' ', sentence).strip())
      writer.write('\n')
      

## Hàm thông dụng

In [None]:
# Return the dependency trees (as lists) of one file.
def get_dependency_tree_list(folder, filename):
  """Convert every sentence of a one-line treebank file to CoNLL-U style rows.

  Reads /content/OneLine/<folder>/[Line]<filename> (one bracketed tree per
  line).  For each sentence a list of rows is produced, one row per word:
  [index, word, '_', POS, '_', function_tag, head_index, relation, '_', '_'].

  Returns the list of those per-sentence lists.
  """
  #exception_trees = ['(S (PP-TMP (Cs-H Sau) (NP (Nt-H khi) (SBAR (Cs *0*) (S (NP-SBJ (NONE *-1)) (VP (Nn-H thoả_thuận) (NP (Nn-H giá_cả) (NP (Num 10.000) (Nu-H đồng_[]_Nu-H kg)))))))) (PU ,) (NP-SBJ (Nn-H công_việc) (Ve còn_lại) (PP (Cs-H của) (NP-1 (Nn-H anh_em) (Nr Bùi_A)))) (VP (Vc-H là) (VP-CMP (Vv-H bắt) (NP (Nn-H chó)) (VP (D-H vào) (NP (Nn-H rọ))))) (PU .))']
  with open(f'/content/OneLine/{folder}/[Line]{filename}','r',encoding='utf8') as reader:
    lines = reader.readlines()

  dependency_tree_list = []
  for line in lines:
    # get_all_relation overwrites the leaves of `tree` with numbers, so a
    # second, untouched parse keeps the real words.
    tree = Tree.fromstring(line)
    original_tree = Tree.fromstring(line)
    relations = get_all_relation(tree)
    POS_tags = get_all_POS(tree)
    function_tag_list = get_function_tag(tree)

    rows = []
    for position, word in enumerate(original_tree.leaves()):
      word_index = str(position + 1)
      head_index, relation = relations[word_index][0], relations[word_index][1]
      rows.append([word_index, word, '_', POS_tags[position], '_',
                   function_tag_list[word_index], head_index, relation, '_', '_'])
    dependency_tree_list.append(rows)
  return dependency_tree_list

## Thêm second relation
+ Nguyên nhân: một số nhãn NULL có index tham chiếu tới các phrase, nên khi khử nhãn NULL cần sửa các relation đang chỉ đến NULL sang chỉ đến phrase mà NULL đề cập tới. Tuy nhiên làm vậy là sai về mặt cấu trúc vì một phrase chỉ được phép nhận 1 head, nên việc sửa này được coi là một đặc trưng phụ (second relation).
+ Các bước: 

In [None]:
# Collect the labels that carry an index.
def get_phrase_contain_index(tree):
  """Group the addresses of co-indexed phrases by '<phrase type>-<index>'.

  A label is co-indexed when it ends with a digit.  Its key is the part of
  the label before the first '-' joined to the part after the last '-'.

  Returns a dict {key: [address, ...]} whose keys are in sorted order.
  """
  indexed_phrases = []
  for phrase_address in get_all_subtree_address(tree):
    label = get_subtree(phrase_address, tree).label()
    if not re.search('[0-9]$', label):
      continue
    label_parts = label.split('-')
    indexed_phrases.append((phrase_address, label_parts[0] + '-' + label_parts[-1]))

  phrase_index_dic = dict()
  # The sort is stable, so addresses of one key keep their discovery order.
  for phrase_address, phrase_index in sorted(indexed_phrases, key=lambda pair: pair[1]):
    phrase_index_dic.setdefault(phrase_index, []).append(phrase_address)
  return phrase_index_dic

In [None]:
def get_phrase_has_linked_NULL(tree):
  """Locate the co-indexed NULL words of a sentence and the phrase that holds each.

  A word qualifies when it starts with ``*`` and ends with a number (for
  example ``*-1``). For each one, the function climbs from the leaf to the
  nearest ancestor whose label is not ``NONE`` and is entirely upper-case.

  Args:
    tree: the parsed sentence; it is handed to ``from_word_to_number`` and
      the tree returned by that helper is used for all lookups.

  Returns:
    A list of ``[word_number, phrase_index, address]`` entries, where
    ``word_number`` is the leaf value after ``from_word_to_number``,
    ``phrase_index`` is ``"<phrase type>-<co-index>"`` and ``address`` is the
    ancestor's tree position as a list.
  """
  word_list = tree.leaves()
  tree = from_word_to_number(tree)
  result = []
  for word, leaf_pos in zip(word_list, tree.treepositions('leaves')):
    # Take the whole trailing number so a co-index such as 12 is not cut to "2".
    index_match = re.search(r'\d+$', word)
    if not word.startswith('*') or index_match is None:
      continue
    index_word = tree[leaf_pos]
    address = leaf_pos[:-1]
    label = tree[address].label()
    # Stop at the root (empty address) at the latest; without that bound the
    # climb would never end when the root label itself is rejected.
    while address and (label == 'NONE' or not label.isupper()):
      address = address[:-1]
      label = tree[address].label()
    phrase_type = label.split('-')[0]
    phrase_index = phrase_type + '-' + index_match.group()
    result.append([index_word, phrase_index, list(address)])
  return result

In [None]:
def get_distance(phrase_address, map_address):
  """Return how many leading positions two tree addresses have in common.

  Used to choose among several candidate phrases for a NULL word: a larger
  value means the two addresses share a longer common prefix.
  """
  shared_prefix = 0
  for own_step, candidate_step in zip(phrase_address, map_address):
    if own_step != candidate_step:
      break
    shared_prefix += 1
  return shared_prefix

In [None]:
def find_map_phrase_address(phrase_of_NULL, map_phrase_list):
  """Find the address of the indexed phrase that a co-indexed NULL word refers to.

  Args:
    phrase_of_NULL: ``[word_number, phrase_index, address]``; only the phrase
      index (``"<type>-<index>"``) and the address are read.
    map_phrase_list: dict mapping ``"<type>-<index>"`` keys to lists of
      candidate addresses.

  Returns:
    The selected address, or ``False`` when no phrase matches. When a key has
    several candidate addresses, the one sharing the longest prefix with the
    NULL's address wins (the first one on ties).
  """
  phrase_index = phrase_of_NULL[1]
  phrase_address = phrase_of_NULL[2]

  def closest(candidates):
    # max() returns the first maximal item, same as list.index(max(...)).
    if len(candidates) >= 2:
      return max(candidates, key=lambda address: get_distance(phrase_address, address))
    return candidates[0]

  # Priority 1: a phrase with exactly the same type and index.
  if phrase_index in map_phrase_list:
    return closest(map_phrase_list[phrase_index])

  # Priority 2: a phrase with the same index whose type is an accepted
  # stand-in for the NULL's phrase type.
  # NOTE(review): keys produced by get_phrase_contain_index always look like
  # "<type>-<index>", so the 'SQ' pattern '^S($)' can never match them --
  # confirm whether '^S-' was intended.
  map_phrase_exception = {
      'NP':'^(Nc|Ncs|Nu|Nun|Nt|Nq|Num|Nw|Nr|Nn)',
      'VP':'^(Ve|Vc|D|Vcp|Vv)',
      'ADJP':'^(An|Aa)',
      'S':'^SQ',
      'SQ':'^S($)'
  }
  pieces = phrase_index.split('-')
  if len(pieces) < 2:
    return False
  phrase_type, index_of_phrase = pieces[0], pieces[1]
  pattern = map_phrase_exception.get(phrase_type)
  if pattern is None:
    return False
  for map_phrase, map_phrase_address in map_phrase_list.items():
    # Compare the full index, not just its last character, so that index
    # "2" does not match a phrase indexed "12".
    if re.search(pattern, map_phrase) and map_phrase.split('-')[-1] == index_of_phrase:
      return closest(map_phrase_address)
  return False

In [None]:
def add_second_relation(tree, dependency_tree, linked_NULL_list, map_phrase_list):
  """Copy each linked NULL word's primary relation onto the phrase it refers to.

  For every entry of ``linked_NULL_list`` whose target phrase can be found,
  ``"<head>:<relation>"`` of the NULL row is written into field 8 of the row
  of the target phrase's headword (appended with ``|`` when that field is
  already filled).

  The rows of ``dependency_tree`` are modified in place; the same list is
  returned.
  """
  headword_of_phrase = assign_headword_for_phrase(tree)[0]
  for linked_NULL in linked_NULL_list:
    null_row = dependency_tree[int(linked_NULL[0])-1]
    head_index, relation = null_row[6], null_row[7]

    target_address = find_map_phrase_address(linked_NULL, map_phrase_list)
    if target_address is False:
      continue  # no phrase to attach the secondary relation to

    target_index = headword_of_phrase[str(target_address)]
    target_row = dependency_tree[int(target_index)-1]
    addition = head_index + ':' + relation
    if target_row[8] == '_':
      target_row[8] = addition
    else:
      target_row[8] = target_row[8] + '|' + addition
  return dependency_tree

In [None]:
# Manual check of add_second_relation on one sample sentence.
# The same bracketing is parsed three times: `tree` feeds the relation/POS/
# function-tag helpers, `original_tree` supplies the words, and `dup_tree` is
# used for the NULL-linking step (presumably because the helpers modify the
# tree they receive -- TODO confirm).
tree = Tree.fromstring('(S (S (NP-SBJ (NONE *-1)) (VP-MNR (Vv-H Thấy) (SBAR-CMP (Cs *0*) (S (NP-SBJ (Pp-H chúng_tôi)) (VP (Vv-H thất_vọng) (VP (Vv-H quay) (R ra))))))) (PU ,) (NP-SBJ-1 (NP (Nn-H chị) (PU ") (Nn dịch_vụ) (PU ") (VP (Vv-H mời) (NP-DOB-1 (Pp-H chúng_tôi)) (PP (Cs-H vào) (NP (Cs trong))) (NP-TMP (Nt-H lúc) (SBAR (Cs *0*) (S (NP-SBJ (NONE *-1)) (VP (R mới) (Vv-H đến))))))) (Cp và) (NP (Nn_swsp (Nc-H ông) (Nn thường_trực)))) (VP (Cp vừa) (Vv-H nói) (Cp vừa) (Vv-H cười) (SBAR-CMP (Cs *0*) (PU :) (PU ") (S (NP-SBJ (NONE *E*)) (VP (VP (Vv-H Thấy) (R chưa)) (PU ,) (VP (Ve-H mất) (NP-CMP (M có) (Num 10) (Nun-H đôla) (PP (Cs-H cho) (NP (Pp-H chúng_tôi))))) (Cp thì) (VP (R có_khi) (NP-TMP (Pd-H bây_giờ)) (VP (R đã) (VA-H xong) (R rồi))) (PU ,) (VP (Vv-H thích) (VP (Vv-H làm) (ADJP (Aa-H thẳng)))) (Cp thì) (VP (Vv-H chờ) (PP-TMP (Cs-H đến) (NP (Nt-H chiều) (Aa muộn))) (M nhé)))) (PU "))) (PU .))')
original_tree = Tree.fromstring('(S (S (NP-SBJ (NONE *-1)) (VP-MNR (Vv-H Thấy) (SBAR-CMP (Cs *0*) (S (NP-SBJ (Pp-H chúng_tôi)) (VP (Vv-H thất_vọng) (VP (Vv-H quay) (R ra))))))) (PU ,) (NP-SBJ-1 (NP (Nn-H chị) (PU ") (Nn dịch_vụ) (PU ") (VP (Vv-H mời) (NP-DOB-1 (Pp-H chúng_tôi)) (PP (Cs-H vào) (NP (Cs trong))) (NP-TMP (Nt-H lúc) (SBAR (Cs *0*) (S (NP-SBJ (NONE *-1)) (VP (R mới) (Vv-H đến))))))) (Cp và) (NP (Nn_swsp (Nc-H ông) (Nn thường_trực)))) (VP (Cp vừa) (Vv-H nói) (Cp vừa) (Vv-H cười) (SBAR-CMP (Cs *0*) (PU :) (PU ") (S (NP-SBJ (NONE *E*)) (VP (VP (Vv-H Thấy) (R chưa)) (PU ,) (VP (Ve-H mất) (NP-CMP (M có) (Num 10) (Nun-H đôla) (PP (Cs-H cho) (NP (Pp-H chúng_tôi))))) (Cp thì) (VP (R có_khi) (NP-TMP (Pd-H bây_giờ)) (VP (R đã) (VA-H xong) (R rồi))) (PU ,) (VP (Vv-H thích) (VP (Vv-H làm) (ADJP (Aa-H thẳng)))) (Cp thì) (VP (Vv-H chờ) (PP-TMP (Cs-H đến) (NP (Nt-H chiều) (Aa muộn))) (M nhé)))) (PU "))) (PU .))')
dup_tree = Tree.fromstring('(S (S (NP-SBJ (NONE *-1)) (VP-MNR (Vv-H Thấy) (SBAR-CMP (Cs *0*) (S (NP-SBJ (Pp-H chúng_tôi)) (VP (Vv-H thất_vọng) (VP (Vv-H quay) (R ra))))))) (PU ,) (NP-SBJ-1 (NP (Nn-H chị) (PU ") (Nn dịch_vụ) (PU ") (VP (Vv-H mời) (NP-DOB-1 (Pp-H chúng_tôi)) (PP (Cs-H vào) (NP (Cs trong))) (NP-TMP (Nt-H lúc) (SBAR (Cs *0*) (S (NP-SBJ (NONE *-1)) (VP (R mới) (Vv-H đến))))))) (Cp và) (NP (Nn_swsp (Nc-H ông) (Nn thường_trực)))) (VP (Cp vừa) (Vv-H nói) (Cp vừa) (Vv-H cười) (SBAR-CMP (Cs *0*) (PU :) (PU ") (S (NP-SBJ (NONE *E*)) (VP (VP (Vv-H Thấy) (R chưa)) (PU ,) (VP (Ve-H mất) (NP-CMP (M có) (Num 10) (Nun-H đôla) (PP (Cs-H cho) (NP (Pp-H chúng_tôi))))) (Cp thì) (VP (R có_khi) (NP-TMP (Pd-H bây_giờ)) (VP (R đã) (VA-H xong) (R rồi))) (PU ,) (VP (Vv-H thích) (VP (Vv-H làm) (ADJP (Aa-H thẳng)))) (Cp thì) (VP (Vv-H chờ) (PP-TMP (Cs-H đến) (NP (Nt-H chiều) (Aa muộn))) (M nhé)))) (PU "))) (PU .))')
# Per-word annotations computed from the working copy of the tree.
relations = get_all_relation(tree)
POS_tags = get_all_POS(tree)
function_tag_list = get_function_tag(tree)
# Assemble the 10-field rows, one per word, with 1-based string indices.
dependency_tree = []
for index, word in enumerate(original_tree.leaves()):
  word_index = str(index+1)
  head_index = relations[word_index][0]
  relation = relations[word_index][1]
  POS = POS_tags[index]
  function_tag = function_tag_list[word_index]
  dependency_tree.append([word_index, word, '_', POS, '_', function_tag, head_index, relation,'_','_'])

# Link co-indexed NULL words to their phrases and record the secondary relations.
linked_NULL_list = get_phrase_has_linked_NULL(dup_tree)
map_phrase_list = get_phrase_contain_index(dup_tree)
new_dependency_tree = add_second_relation(dup_tree, dependency_tree, linked_NULL_list, map_phrase_list)


In [None]:
def edit_second_relation_of_NULL(tree, dependency_tree, linked_NULL_list, map_phrase_list):
  """Move the secondary relations held by linked NULL words up to their phrases.

  For every linked NULL row whose field 8 is not ``'_'`` and whose target
  phrase can be found, that field is copied onto the row of the target
  phrase's headword (appended with ``|`` when the target already has one).

  The rows of ``dependency_tree`` are modified in place; the same list is
  returned.
  """
  headword_of_phrase = assign_headword_for_phrase(tree)[0]
  for linked_NULL in linked_NULL_list:
    null_row = dependency_tree[int(linked_NULL[0])-1]
    inherited = null_row[8]
    if inherited == '_':
      continue  # nothing to hand over

    target_address = find_map_phrase_address(linked_NULL, map_phrase_list)
    if target_address is False:
      continue  # no phrase found for this NULL word

    target_index = headword_of_phrase[str(target_address)]
    target_row = dependency_tree[int(target_index)-1]
    if target_row[8] == '_':
      target_row[8] = inherited
    else:
      target_row[8] = target_row[8] + '|' + inherited
  return dependency_tree

## Relink headNULL
+ Nguyên nhân: do khi áp dụng luật chuyển đổi vô tình lấy nhãn NULL làm head

In [None]:
def get_dep_dic_of_NULL(dependency_tree):
  """Describe every NULL word (word starting with ``*``) that heads other words.

  Args:
    dependency_tree: list of rows; fields 0, 1, 3, 6 and 7 are read as
      index, word, POS, head index and relation.

  Returns:
    A dict keyed by the NULL word's index. Each value is
    ``[NULL POS, NULL head index, NULL relation, dependent indices,
    dependent POS tags, dependent words]``. NULL words without dependents
    are left out.
  """
  # Group the rows by head once instead of rescanning the tree per NULL word.
  rows_by_head = {}
  for row in dependency_tree:
    rows_by_head.setdefault(row[6], []).append(row)

  dep_dic = {}
  for element in dependency_tree:
    if not element[1].startswith('*'):
      continue
    word_index = element[0]
    dependents = rows_by_head.get(word_index)
    if not dependents:
      continue
    dep_dic[word_index] = [
        element[3],
        element[6],
        element[7],
        [row[0] for row in dependents],
        [row[3] for row in dependents],
        [row[1] for row in dependents],
    ]
  return dep_dic

In [None]:
def select_index(NULL_pos, dep_index_list, pos_list, word_list):
  """Pick the dependent that replaces a NULL word as head.

  Candidates are tried in this order, and words containing ``*`` are never
  chosen:
    1. for each POS family pattern that matches ``NULL_pos``, the first
       dependent whose POS matches the same pattern;
    2. the first dependent whose POS is none of ``PU``, ``Cp``, ``Cs``;
    3. the first dependent whose POS is not ``PU``.

  Returns the chosen dependent index, or ``None`` when nothing qualifies.
  """
  def first_accepted(accepts_pos):
    # First dependent with an accepted POS that is not itself a NULL word.
    for dep_index, pos, word in zip(dep_index_list, pos_list, word_list):
      if accepts_pos(pos) and '*' not in word:
        return dep_index
    return None

  family_patterns = (
      '^(NP|Nc|Ncs|Nu|Nun|Nt|Nq|Num|Nw|Nr|Nn)',
      '^(VP|Ve|Vc|D|Vcp|Vv|VN)',
      '^(VA|NA|ADJP|An|Aa)',
  )
  for family in family_patterns:
    if not re.search(family, NULL_pos):
      continue
    chosen = first_accepted(lambda pos: re.search(family, pos))
    if chosen is not None:
      return chosen

  chosen = first_accepted(lambda pos: pos not in ('PU', 'Cp', 'Cs'))
  if chosen is not None:
    return chosen

  return first_accepted(lambda pos: pos != 'PU')

In [None]:
def _replace_second_head(second_relation_field, old_head, new_head):
  """Rewrite every ``old_head:deprel`` pair of a ``|``-joined field to ``new_head:deprel``."""
  rewritten = []
  for second_relation in second_relation_field.split('|'):
    # Split on the first ':' only so that a deprel containing ':' is kept whole.
    head, separator, deprel = second_relation.partition(':')
    if separator and head == old_head:
      second_relation = new_head + ':' + deprel
    rewritten.append(second_relation)
  return '|'.join(rewritten)


def relink_head_NULL(dependency_tree):
  """Replace every NULL word that acts as a head by one of its dependents.

  For each NULL head reported by ``get_dep_dic_of_NULL``:
    * with a single dependent that is not itself a NULL word, that dependent
      takes over the NULL's head and relation;
    * with two or more dependents, ``select_index`` picks the replacement,
      which takes over the NULL's head and relation, and every row still
      attached to the NULL is re-attached to the replacement.
  In both cases secondary relations (field 8) pointing at the NULL are
  redirected to the replacement.

  The rows of ``dependency_tree`` are modified in place; the same list is
  returned.
  """
  head_NULL_dic = get_dep_dic_of_NULL(dependency_tree)
  for NULL_index, dep_list in head_NULL_dic.items():
    NULL_pos = dep_list[0]
    NULL_head_index = dep_list[1]
    NULL_relation = dep_list[2]
    dep_index_list = dep_list[3]
    pos_list = dep_list[4]
    word_list = dep_list[5]

    if len(word_list) == 1:
      if '*' in word_list[0]:
        continue  # the only dependent is itself a NULL word: leave as is
      selected_index = dep_index_list[0]
      reattach_dependents = False
    elif len(word_list) >= 2:
      # NOTE(review): select_index returns None when no dependent qualifies,
      # in which case int() below raises TypeError.
      selected_index = select_index(NULL_pos, dep_index_list, pos_list, word_list)
      reattach_dependents = True
    else:
      continue

    selected_element = dependency_tree[int(selected_index)-1]
    selected_element[6] = NULL_head_index
    selected_element[7] = NULL_relation

    for element in dependency_tree:
      if reattach_dependents and element[6] == NULL_index:
        element[6] = selected_index
      element[8] = _replace_second_head(element[8], NULL_index, selected_index)
  return dependency_tree

## Khử nhãn NULL

In [None]:
def minus_1(start_index, end_index):
  """Map every index in ``start_index..end_index`` (inclusive) to its predecessor.

  Keys and values are strings, e.g. ``minus_1(3, 4) == {'3': '2', '4': '3'}``.
  """
  return {str(index): str(index - 1) for index in range(start_index, end_index + 1)}

In [None]:
def map_index(tree_dependency, minus_1):
  """Renumber word, head and secondary-head indices according to a mapping.

  Args:
    tree_dependency: list of rows; field 0 (index), field 6 (head index) and
      the heads inside field 8 (``head:deprel`` pairs joined by ``|``) are
      rewritten.
    minus_1: dict mapping old index strings to new index strings; indices
      absent from it are left untouched.

  Returns:
    The same ``tree_dependency`` list, whose rows were modified in place.
  """
  for relation in tree_dependency:
    relation[0] = minus_1.get(relation[0], relation[0])
    relation[6] = minus_1.get(relation[6], relation[6])

    if relation[8] != '_':
      new_second_dependency = []
      for second_dependency in relation[8].split('|'):
        # Split on the first ':' only, so a deprel such as "nmod:poss"
        # survives the renumbering intact.
        number, separator, dep = second_dependency.partition(':')
        if separator and number in minus_1:
          second_dependency = minus_1[number] + ':' + dep
        new_second_dependency.append(second_dependency)
      relation[8] = '|'.join(new_second_dependency)

  return tree_dependency
    

In [None]:
def remove_NULL(tree_dependency):
  """Delete NULL words (words starting with ``*``) and renumber what remains.

  Two steps:
    1. In every row, drop the secondary relations (field 8) whose head is the
       row's own position or equals its primary head; an emptied field
       becomes ``'_'``.
    2. Remove the NULL rows one at a time. Before each deletion, every index
       after the removed position is shifted down by one with ``minus_1`` and
       ``map_index``.

  The list and its rows are modified in place; the same list is returned.
  An empty list is returned unchanged.
  """
  # Step 1: remove redundant secondary relations.
  for position, element in enumerate(tree_dependency, start=1):
    second_relation_field = element[8]
    if second_relation_field == '_':
      continue
    redundant_heads = (str(position), element[6])
    kept = [
        second_relation
        for second_relation in second_relation_field.split('|')
        if second_relation.split(':')[0] not in redundant_heads
    ]
    element[8] = '|'.join(kept) if kept else '_'

  # Step 2: remove NULL rows. Rows before `position` are already known not to
  # be NULL, so the scan resumes where it stopped instead of restarting. The
  # bound on len() also ends the loop once the list is empty.
  position = 0
  while position < len(tree_dependency):
    if tree_dependency[position][1].startswith('*'):
      shift = minus_1(position + 2, len(tree_dependency))
      tree_dependency = map_index(tree_dependency, shift)
      del tree_dependency[position]
    else:
      position += 1
  return tree_dependency

# Tạo file CONLLU

In [None]:
def finish_dependency_tree(folder, filename, dependency_treebank):
  """Post-process every dependency tree of a file and write it out as CoNLL-U.

  Reads the bracketed sentences from
  ``/content/OneLine/{folder}/[Line]{filename}``, pairs them with
  ``dependency_treebank`` and, for each pair, adds secondary relations,
  relinks NULL heads and removes NULL words. The result is written to
  ``/content/VnDep/{folder}/[VnDep]<filename without its last 4 characters>.conllu``
  as a ``# ID = n`` line, one tab-separated row per word and a blank line.

  The trees in ``dependency_treebank`` are modified in place. The file name
  and each sentence are also printed as progress output.
  """
  with open(f'/content/OneLine/{folder}/[Line]{filename}','r',encoding='utf8') as reader:
    lines = reader.readlines()
  new_filename = filename[:-4] + '.conllu'
  with open(f'/content/VnDep/{folder}/[VnDep]{new_filename}','w',encoding='utf8') as writer:
    for tree_index, (line, dependency_tree) in enumerate(zip(lines, dependency_treebank), start=1):
      print(filename,line)
      tree = Tree.fromstring(line)
      linked_NULL_list = get_phrase_has_linked_NULL(tree)
      map_phrase_list = get_phrase_contain_index(tree)

      new_dependency_tree = add_second_relation(tree, dependency_tree, linked_NULL_list, map_phrase_list)
      edit_dependency_tree = edit_second_relation_of_NULL(tree, new_dependency_tree, linked_NULL_list, map_phrase_list)
      relink_headNULL_dependency_tree = relink_head_NULL(edit_dependency_tree)
      remove_NULL_dependency_tree = remove_NULL(relink_headNULL_dependency_tree)

      writer.write('# ID = {}\n'.format(tree_index))
      # Write the result of the last stage explicitly rather than an earlier
      # variable that merely happens to alias the same list.
      for element in remove_NULL_dependency_tree:
        writer.write('\t'.join(element))
        writer.write('\n')
      writer.write('\n')

## Kiểm tra lại cây sau khi khử NULL

In [None]:
def get_second_head_index(second_relation_field):
  """Return the head indices listed in a secondary-relation field.

  The field holds ``head:deprel`` pairs joined by ``|``. Returns ``False``
  when the field is the placeholder ``'_'``, otherwise the list of heads
  (the text before the first ``:`` of every pair).
  """
  if second_relation_field == '_':
    return False
  return [pair.split(':')[0] for pair in second_relation_field.split('|')]

In [None]:
def print_dependency_tree(dependency_tree):
  """Print every row of a dependency tree on its own line."""
  for row in dependency_tree:
    print(row)

In [None]:
def check_dependency_tree(folder, filename, dependency_treebank):
  """Run the NULL-removal pipeline on a file and report dangling head indices.

  For every sentence of ``/content/OneLine/{folder}/[Line]{filename}`` paired
  with its tree from ``dependency_treebank``, the tree goes through the same
  stages as in ``finish_dependency_tree``. Afterwards every primary head
  (other than ``'0'``) and every secondary head must be the index of a
  remaining word; for each one that is not, the file name, the sentence, the
  offending index and the tree as it was before NULL removal are printed.

  The trees in ``dependency_treebank`` are modified in place. Nothing is
  returned.
  """
  with open(f'/content/OneLine/{folder}/[Line]{filename}','r',encoding='utf8') as reader:
    lines = reader.readlines()

  for line, dependency_tree in zip(lines, dependency_treebank):
    tree = Tree.fromstring(line)
    linked_NULL_list = get_phrase_has_linked_NULL(tree)
    map_phrase_list = get_phrase_contain_index(tree)

    new_dependency_tree = add_second_relation(tree, dependency_tree, linked_NULL_list, map_phrase_list)
    edit_dependency_tree = edit_second_relation_of_NULL(tree, new_dependency_tree, linked_NULL_list, map_phrase_list)
    relink_headNULL_dependency_tree = relink_head_NULL(edit_dependency_tree)
    # Deep copy: remove_NULL renumbers the row lists in place, so a shallow
    # copy would show the already-renumbered rows in the report.
    before_remove_NULL = copy.deepcopy(relink_headNULL_dependency_tree)
    remove_NULL_dependency_tree = remove_NULL(relink_headNULL_dependency_tree)

    # Gather the indices that exist and the indices that are referenced.
    index_word_list = []
    head_index_list = []
    second_head_index_list = []
    for element in remove_NULL_dependency_tree:
      index_word_list.append(element[0])
      if element[6] != '0':
        head_index_list.append(element[6])
      second_heads = get_second_head_index(element[8])
      if second_heads:
        second_head_index_list.append(second_heads)

    for head_index in head_index_list:
      if head_index not in index_word_list:
        print(filename, line)
        print('head_index', head_index)
        print_dependency_tree(before_remove_NULL)

    for second_head_index in second_head_index_list:
      for index in second_head_index:
        if index not in index_word_list:
          print(filename, line)
          print('second_index', index)
          print_dependency_tree(before_remove_NULL)

In [None]:
# from google.colab import files
# file_list = ['Dev_4784','Dev_25432','Dev_25480','Dev_25600','Dev_26554','Dev_46137','Dev_7105','Dev_7276','Dev_81347','Dev_90295','Dev_9539']
# dependency_treebank_list = []
# for file in file_list:
#   to_oneline(file)
#   dependency_treebank = get_dependency_tree_list(file)
#   finish_dependency_tree(file, dependency_treebank)

#Thực hiện trên toàn NIIVTB

In [None]:
!cp /content/drive/MyDrive/NIIVTB/NIIVTB-1.zip /content

In [None]:
!unzip /content/NIIVTB-1.zip

Archive:  /content/NIIVTB-1.zip
   creating: NIIVTB-1/
   creating: NIIVTB-1/Dev/
  inflating: NIIVTB-1/Dev/Dev_25283.prd  
  inflating: NIIVTB-1/Dev/Dev_25283.raw  
  inflating: NIIVTB-1/Dev/Dev_25300.prd  
  inflating: NIIVTB-1/Dev/Dev_25300.raw  
  inflating: NIIVTB-1/Dev/Dev_25302.prd  
  inflating: NIIVTB-1/Dev/Dev_25302.raw  
  inflating: NIIVTB-1/Dev/Dev_25432.prd  
  inflating: NIIVTB-1/Dev/Dev_25432.raw  
  inflating: NIIVTB-1/Dev/Dev_25433.prd  
  inflating: NIIVTB-1/Dev/Dev_25433.raw  
  inflating: NIIVTB-1/Dev/Dev_25435.prd  
  inflating: NIIVTB-1/Dev/Dev_25435.raw  
  inflating: NIIVTB-1/Dev/Dev_25480.prd  
  inflating: NIIVTB-1/Dev/Dev_25480.raw  
  inflating: NIIVTB-1/Dev/Dev_25600.prd  
  inflating: NIIVTB-1/Dev/Dev_25600.raw  
  inflating: NIIVTB-1/Dev/Dev_26554.prd  
  inflating: NIIVTB-1/Dev/Dev_26554.raw  
  inflating: NIIVTB-1/Dev/Dev_46137.prd  
  inflating: NIIVTB-1/Dev/Dev_46137.raw  
  inflating: NIIVTB-1/Dev/Dev_4784.prd  
  inflating: NIIVTB-1/Dev/Dev_4784.ra

In [None]:
!mkdir /content/OneLine

In [None]:
!mkdir /content/OneLine/Train 
!mkdir /content/OneLine/Dev 
!mkdir /content/OneLine/Test 

In [None]:
!mkdir /content/VnDep

In [None]:
!mkdir /content/VnDep/Train
!mkdir /content/VnDep/Dev
!mkdir /content/VnDep/Test

In [None]:
# Collect every .prd file located one directory level below /content/NIIVTB-1.
path_list = glob.glob('/content/NIIVTB-1/*/*.prd')

In [None]:
# Convert every collected file: flatten it to one sentence per line, build the
# dependency trees, then post-process them and write the .conllu output.
# The running file count is printed as progress.
for index, path in enumerate(path_list):
  print(index+1)
  # The last two path components are the split folder and the file name.
  folder, filename = path.split('/')[-2:]
  to_oneline(folder, filename)
  dependency_treebank = get_dependency_tree_list(folder, filename)
  finish_dependency_tree(folder, filename, dependency_treebank)
  # Optional sanity check, disabled by default:
  #   check_dependency_tree(folder, filename, dependency_treebank)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Train_5932.prd (S (NP-SBJ (Pp-H Ông)) (VP (Vv-H nói) (SBAR-CMP (Cs *0*) (PU :) (PU ") (S (S-SBJ (NP-TMP (Nt-H Ngày_xưa)) (NP-SBJ (Pp-H chúng_tôi)) (VP (Vv-H lặn) (NP (Nn-H bộ)) (VP-PRP (Vv-H bắt) (NP-DOB (Nc-H con) (NP (Nn-H vẹm) (PU ,) (Nn-H hến_cơm)))))) (VP (Vc-H là) (VP-CMP (Ve-H hết)))))) (PU .))

Train_5932.prd (S (S (NP-SBJ (NONE *E*)) (VP (Vv-H Lặn) (NP-TMP (ID-H quanh_năm_suốt_tháng)))) (Cp nhưng) (S (NP-SBJ (Nn-H nhà)) (ADJP-PRD (R vẫn) (Aa-H nghèo) (SBAR-PRP (Cs vì) (S (S (VP-SBJ (Vv-H lặn) (Nn-H bộ)) (VP (R không) (Ve-H hết) (NP-CMP (Nn-H khả_năng) (PP (Cs-H của) (NP (Pp-H mình)))))) (PU ,) (Cp với_lại) (SPL (NP-TMP (Nt-H hồi) (Pd đó)) (VP (R chưa) (Ve-H có) (NP-CMP (Nn-H phong_trào) (VP (Vv-H ăn) (NP (Nn-H hải_sản))) (PP (Cs-H như) (NP-TMP (Pd-H bây_giờ)))))))))) (PU .))

Train_5932.prd (S (S-PRP (NP-TMP (Nt-H Nay)) (NP-SBJ (Nn-H thanh_niên)) (VP (R toàn) (Vv-H lặn) (Nn-H máy))) (Cp nên) (S (S-SBJ (NP-SBJ (P

In [None]:
# Parse a small sample bracketing and draw it as ASCII art for visual inspection.
tree = Tree.fromstring('(SPL-HLN (NP (Nn-H Mùi)(NP (Nn-H hổ))(PP-LOC (Cs-H ở)(NP (Nun-H thành_phố))))(PU .))')
tree.pretty_print()
# Another bracketing kept around for manual experiments:
#(SQ (Cp Vì) (QNP-PRP (Pp-H sao)) (NP (Pp-H vậy))) (NP-VOC (M hở) (Nr-H Thuỳ)) (PU ?))

          SPL-HLN                     
       ______|______________________   
      NP                            | 
  ____|_____________                |  
 |    |           PP-LOC            | 
 |    |       ______|________       |  
 |    NP     |               NP     | 
 |    |      |               |      |  
Nn-H Nn-H   Cs-H           Nun-H    PU
 |    |      |               |      |  
Mùi   hổ     ở           thành_phố  . 



In [None]:
!zip -r '/content/drive/MyDrive/NIIVTB/VnDependencyTreebank_v6.zip' '/content/VnDep'

  adding: content/VnDep/ (stored 0%)
  adding: content/VnDep/Dev/ (stored 0%)
  adding: content/VnDep/Dev/[VnDep]Dev_7105.conllu (deflated 76%)
  adding: content/VnDep/Dev/[VnDep]Dev_7276.conllu (deflated 75%)
  adding: content/VnDep/Dev/[VnDep]Dev_25600.conllu (deflated 77%)
  adding: content/VnDep/Dev/[VnDep]Dev_46137.conllu (deflated 76%)
  adding: content/VnDep/Dev/[VnDep]Dev_25433.conllu (deflated 66%)
  adding: content/VnDep/Dev/[VnDep]Dev_5770.conllu (deflated 76%)
  adding: content/VnDep/Dev/[VnDep]Dev_59422.conllu (deflated 75%)
  adding: content/VnDep/Dev/[VnDep]Dev_25435.conllu (deflated 65%)
  adding: content/VnDep/Dev/[VnDep]Dev_82515.conllu (deflated 76%)
  adding: content/VnDep/Dev/[VnDep]Dev_26554.conllu (deflated 77%)
  adding: content/VnDep/Dev/[VnDep]Dev_82172.conllu (deflated 75%)
  adding: content/VnDep/Dev/[VnDep]Dev_25283.conllu (deflated 74%)
  adding: content/VnDep/Dev/[VnDep]Dev_81347.conllu (deflated 74%)
  adding: content/VnDep/Dev/[VnDep]Dev_4784.conllu (de