#Installation and Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import re
import os

In [2]:
pip install tagme

Collecting tagme
  Downloading https://files.pythonhosted.org/packages/7b/ea/bbdb46fec64423ea0b28fd508ab8ee8b59a918db090d5e073dd6f3bf227f/tagme-0.1.3-py2.py3-none-any.whl
Installing collected packages: tagme
Successfully installed tagme-0.1.3


In [3]:
import sys
!{sys.executable} -m pip install -U germanetpy

Collecting germanetpy
[?25l  Downloading https://files.pythonhosted.org/packages/12/16/d17862422eae401706fc31b0f5eb3d45d18bef211a3b284167c18cd24935/germanetpy-0.2.1-py3-none-any.whl (54kB)
[K     |██████                          | 10kB 15.9MB/s eta 0:00:01[K     |████████████                    | 20kB 8.5MB/s eta 0:00:01[K     |██████████████████▏             | 30kB 7.6MB/s eta 0:00:01[K     |████████████████████████▏       | 40kB 6.8MB/s eta 0:00:01[K     |██████████████████████████████▏ | 51kB 4.1MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 3.2MB/s 
Collecting fastenum>=0.0.1
  Downloading https://files.pythonhosted.org/packages/62/8d/364b584a546dbce6e046efaa7926446f4f681900c6992258c740f73c39fb/fastenum-0.0.8.tar.gz
Collecting python-Levenshtein==0.12.0
[?25l  Downloading https://files.pythonhosted.org/packages/42/a9/d1785c85ebf9b7dfacd08938dd028209c34a0ea3b1bcdb895208bd40a67d/python-Levenshtein-0.12.0.tar.gz (48kB)
[K     |███████████████████████████

In [4]:
from pathlib import Path
from germanetpy.germanet import Germanet

data_path = "/content/drive/My Drive/Colab Notebooks/AutomaticOnt/germanet/GN_V150/GN_V150_XML"
frequencylist_nouns = "/content/drive/My Drive/Colab Notebooks/AutomaticOnt/germanet/GN_V150/FreqLists/noun_freqs_decow14_16.txt"
germanet = Germanet(data_path)

Load GermaNet data...: 100%|█████████▉| 99.99999999999996/100 [01:21<00:00,  1.22it/s]
Load Wictionary data...: 100%|██████████| 100.0/100 [00:03<00:00, 31.06it/s]
Load Ili records...: 100%|██████████| 100.0/100 [00:01<00:00, 95.36it/s]


#Files Reading

**First Reading Datev Glossary terms**

In [145]:
glossary_file = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/AutomaticOnt/Datev_content/glossary.xlsx')
glossary_file = glossary_file[['Term', 'Status']]
glossary_file.head()

Unnamed: 0,Term,Status
0,(Kanzlei-)Rechnungswesen,deprecated
1,(pro),deprecated
2,@-Consulting,preferred
3,**ANREDE**,non-term
4,**DATUM**,non-term


In [146]:
glossary_file['Status'].unique()

array(['deprecated', 'preferred', 'non-term', 'admitted', 'proposed'],
      dtype=object)

In [152]:
# saving all terms in a list and lower casing all terms
glossary_terms = list(glossary_file['Term'])
for i in range(len(glossary_terms)):
  glossary_terms[i] = glossary_terms[i].lower()

In [5]:
from os import listdir
from os.path import isfile, join
import json
import re
onlyfiles = [f for f in listdir("/content/drive/My Drive/Colab Notebooks/AutomaticOnt/Corpus/GermanLegalFiles/all_files/") if isfile(join("/content/drive/My Drive/Colab Notebooks/AutomaticOnt/Corpus/GermanLegalFiles/all_files/", f))]
onlyfiles.sort()

folder_path = "/content/drive/My Drive/Colab Notebooks/AutomaticOnt/Corpus/GermanLegalFiles/all_files/"

In [6]:
# 0019421
# 0057729
# 0061693
# 0077309
# 0084252
# 0105285
# 0124330
# 0138573
# 0165920
# 0173722
# 0380318
# 0382251
# 0382370
# 0443434
# 0556087
# 0578343
# 1150029

In [7]:
def remove_html_tags(text):
  text_arr = text.split('</lxbase:bibliographische-angaben>')
  text_use = text_arr[1]
  # marking para
  text_use = re.sub(re.compile('</lxbase:a>'), '^^para^^', text_use)
  # remove new lines
  text_use = re.sub(re.compile('\n'), ' ', text_use)
  clean = re.compile('<.*?>')
  return re.sub(clean, '', text_use)

In [8]:
def getLegalParaList(data):
  data = data.replace('EStG', 'EStG|').replace('EStG|)', 'EStG)|').replace('GDL', 'GDL|').replace('BewG', 'BewG|').replace('EStDV', 'EStDV|').replace('AO', 'AO|')

  words_pattern = r'§[ §a-zA-Z0-9,.]+'
  legal_para_list = re.findall(words_pattern, data, flags=re.IGNORECASE)
  legal_para_list = list(set(legal_para_list))
  return legal_para_list

def getHeadingParaSequence(data):
  # mark paragraphs in the document
  data_arr = data.split('^^para^^')
  # get type of text chunks - roman heading, number heading or paragraph
  roman_heading_pattern = r"[IXV]+[. ][^\n]+"
  num_heading_pattern = r"[0-9]+. [^\n]+"
  heading_para_list = []
  for sequence in data_arr:
    if re.match(roman_heading_pattern, sequence):
      heading_para_list.append(['roman_head', sequence])
    elif re.match(num_heading_pattern, sequence):
      heading_para_list.append(['num_head', sequence])
    else:
      heading_para_list.append(['para', sequence])
  sequence_type_df = pd.DataFrame(heading_para_list, columns=['type', 'text'])
  return sequence_type_df

In [40]:
def fileToHeadParaPair(fileFullpath):
  with open(fileFullpath, 'r') as file:
    data = file.read()
  data = remove_html_tags(data)
  
  # get Heading Para Sequence
  sequence_type_df = getHeadingParaSequence(data)
  # Prepare Heading Para pair 
  document_sequence_list = []
  num_head_flag = 0
  roman_head_flag = 0
  para_text = ""
  heading_text = ""
  for index, rows in sequence_type_df.iterrows():
    seq_type = rows["type"]
    seq_text = rows["text"]

    if seq_type == "roman_head":
      if para_text != "":
        document_sequence_list.append([heading_text, para_text])
        heading_text = ""
        para_text = ""
        roman_head_flag = 0
      roman_head_flag = roman_head_flag + 1
      heading_text = heading_text + seq_text
    elif seq_type == "num_head":
      num_head_flag = num_head_flag + 1
      if roman_head_flag > 0:
        para_text = para_text + seq_text
      else:
        if para_text != "":
          document_sequence_list.append([heading_text, para_text])
          roman_head_flag = 0
          para_text = ""
          heading_text = ""
        heading_text = heading_text + seq_text
    else:
      para_text = para_text + seq_text

  document_sequence_df = pd.DataFrame(document_sequence_list, columns=["heading", "para"])

  return document_sequence_df

In [39]:
# sequence_type_df = fileToHeadParaPair(folder_path + "0578343.xml")
# sequence_type_df.head()
# for i in range(len(sequence_type_df)): 
#   print(sequence_type_df.loc[i, "type"], sequence_type_df.loc[i, "text"]) 

#Processing Heading and Para

In [102]:
heading_para_df = fileToHeadParaPair(folder_path+ onlyfiles[1])

In [106]:
heading_para_df.head()

Unnamed: 0,heading,para
0,,Bodengewinnbesteuerung nach § 55 EStGInhaltsüb...
1,I. Allgemeines zur Bodengewinnbesteuerung,1. Gesetzliche Grundlagen2. Zeitlicher Anwendu...
2,III. Land- und forstwirtschaftlich genutzte Gr...,1. Allgemeines2. Wirtschaftliches Eigentum3. G...
3,IV. Übertragung eines land- und forstwirtschaf...,1. Übertragung des Betriebs mit allen wesentli...
4,V. Die Fälle der Betriebsaufgabe bei land- und...,1. Betriebseinstellung2. Betriebsverpachtung


In [107]:
def getSynsets(word):
  synsets = germanet.get_synsets_by_orthform(word)
  return synsets

def getFirstSynset(word):
  synsets = getSynsets(word)
  if len(synsets) > 0:
    first_synset = germanet.get_synset_by_id(synsets[0].id)
  else:
    return 0
  return first_synset

In [46]:
def getHypernymPathsIds(synset):
  paths = synset.hypernym_paths()
  paths_ids = []
  for path in paths:
    temp_path_ids = []
    for item in path:
      temp_path_ids.append(item.id)
    paths_ids.append(temp_path_ids)
  return paths_ids

def issubclass(a_synset, b_synset):
  a_hyper_paths = getHypernymPathsIds(a_synset)
  b_hyper_paths = getHypernymPathsIds(b_synset)
  a_synset_id = a_synset.id
  b_synset_id = b_synset.id
  # check if b is in hypernyms path of a
  flag = 0
  for path in a_hyper_paths:
    if b_synset_id in path:
      flag = 1                # it means b_synset is hypernym of a_synset
  return flag

# a_synset = getFirstSynset("Oberfinanzdirektion")
# b_synset = getFirstSynset("Finanzamt")
# issubclass(a_synset, b_synset)

In [47]:
import tagme
# Set the authorization token for subsequent calls.
tagme.GCUBE_TOKEN = "0cd18564-f4c2-4891-af4d-6745c23b43a4-843339462"

def getTermList(text):
  all_terms = []
  # doc = nlp(text) 
  # for token in doc: 
  #   if getFirstSynset(token.lemma_) != 0:
  #     all_terms.append(token.lemma_)

  # get terms using Tagme
  temp_annotations = tagme.annotate(text, lang="de")
  if len(temp_annotations.annotations) > 0:
    for ann in temp_annotations.get_annotations(0.1):
      if getFirstSynset(ann.mention) != 0:
        all_terms.append(ann.mention)
      elif getFirstSynset(ann.entity_title) != 0:
        all_terms.append(ann.entity_title)
        # all_terms.append(ann.mention)
      else:
        all_terms.append(ann.mention)
  all_terms = list(set(all_terms))
  return all_terms

In [132]:
# word_a = "ausländische"
# print(len(word_a.split(' ')))
# word_b = "§ 34d EStG"
# para = "ausländische Einkünfte nach § 34d EStG"
# conn_str_list = re.findall(rf'{word_a}.+? {word_b}',para)
# print(conn_str_list)
# margin_length = len(word_a.split(' ')) + len(word_b.split(' ')) + 6
# if len(conn_str_list) > 0:
#   for sent in conn_str_list:
#     if len(sent.split(' ')) < margin_length and len(sent.split(' ')) > 2:
#       filtered_sent = sent.rstrip("§ 34d EStG").strip()
#       left_word_arr = word_a.split(' ')
#       for s in left_word_arr:
#         filtered_sent = filtered_sent.lstrip(s).strip()
#       print(filtered_sent)

1
['ausländische Einkünfte nach § 34d EStG']
Einkünfte nach


In [156]:
 distance_relation_pair = []
 paragraphs = []
 dirty_tags = ['Abs', 'EStG', 'bzw', 'EStR', 'EStDV', 'BewG', 'DM', 'GDL', 'AO']
 for index, rows in heading_para_df.iterrows():
  para_id = "para_" + str(index)

  heading = rows["heading"]
  para = rows["para"]

  # Saving all paragraphs in a different dataframe
  paragraphs.append([para_id, para])

  if heading != "":
    head_terms = list(set(getTermList(heading))) + getLegalParaList(heading)
  else:
    head_terms = []
  if para != "":
    para_terms = list(set(getTermList(para))) + getLegalParaList(para)
  else:
    para_terms = []

  # check if para terms are subclass of any head_term
  # head_para_pair = []
  # for para_term in para_terms:
  #   for head_term in head_terms:
  #     para_synset = getFirstSynset(para_term)
  #     head_synset = getFirstSynset(head_term)
  #     if para_synset != head_synset:
  #       if para_synset != 0 and head_synset != 0:
  #         if issubclass(para_synset, head_synset) == 1:
  #           head_para_pair.append([para_term, 'isSubclassof', head_term])
  #         else:
  #           head_para_pair.append([para_term, 'hasTheme', head_term])
  
  # # check if para terms have subclasses with each others
  # para_para_pair = []  #(child, parent)
  # for para_term1 in para_terms:
  #   for para_term2 in para_terms:
  #     a_synset = getFirstSynset(para_term1)
  #     b_synset = getFirstSynset(para_term2)
  #     if a_synset != b_synset:
  #       if a_synset != 0 and b_synset != 0:
  #           if issubclass(a_synset, b_synset) == 1:
  #             para_para_pair.append([para_term1, 'isSubclassof', para_term2])

  # Check if there is single or double word relation between any word pairs
  # in a Single paragraph
  for i in range(0, len(para_terms)):
    for j in range(i+1, len(para_terms)):
      glossary_status = ""
      word_a = para_terms[i]
      word_b = para_terms[j]
      if word_a in dirty_tags or word_b in dirty_tags:
        continue;

      # Check if subject or object are in Glossary list
      if word_a in glossary_terms:
        glossary_status = word_a + " |"
      
      if word_b in glossary_terms:
        glossary_status = glossary_status + "| " + word_b

      # Maximum 6 word relation between source and target
      margin_length = len(word_a.split(' ')) + len(word_b.split(' ')) + 6

      # Extracting Relation between two tags
      conn_str_list = re.findall(rf'{word_a}.+? {word_b}',para)
  
      if len(conn_str_list) > 0:
        for sent in conn_str_list:
          if len(sent.split(' ')) < margin_length and len(sent.split(' ')) > 4:
            filtered_sent = sent.rstrip(word_b).strip()
            left_word_arr = word_a.split(' ')
            for s in left_word_arr:
              filtered_sent = filtered_sent.lstrip(s).strip()
            if filtered_sent != '':
              distance_relation_pair.append([word_a, filtered_sent, word_b, para_id, glossary_status])

temp_relation_df = pd.DataFrame(distance_relation_pair, columns=['Word1', 'Relation', 'Word2', 'Para_id', 'Glossary_status'])
paragraph_df = pd.DataFrame(paragraphs, columns=['Para_id', 'Text'])
# para_para_relation_df = pd.DataFrame(para_para_pair, columns=['Word1', 'Relation', 'Word2'])
# head_para_relation_df = pd.DataFrame(head_para_pair, columns=['Word1', 'Relation', 'Word2'])

In [158]:
paragraph_df

Unnamed: 0,Para_id,Text
0,para_0,Bodengewinnbesteuerung nach § 55 EStGInhaltsüb...
1,para_1,1. Gesetzliche Grundlagen2. Zeitlicher Anwendu...
2,para_2,1. Allgemeines2. Wirtschaftliches Eigentum3. G...
3,para_3,1. Übertragung des Betriebs mit allen wesentli...
4,para_4,1. Betriebseinstellung2. Betriebsverpachtung
5,para_5,1. Allgemeines2. Ausgangsbeträge nach § 55 Abs...
6,para_6,"1. Antragstellung, Ermittlung und Feststellung..."
7,para_7,1. Gesetzliche GrundlagenDurch das Zweite Steu...
8,para_8,1. Ernsthafte land- und forstwirtschaftliche B...
9,para_9,1. Allgemeines Land- und forstwirtschaftlich ...


In [136]:
temp_relation_df

Unnamed: 0,Word1,Relation,Word2
0,Festsetzung,Festsetzung von Vorauszahlungen,Vorauszahlungen
1,höheren,höheren Teilwerts nach § 55 Abs,Abs
2,Bodens,Bodens X. Besteuerung,Besteuerung
3,Bodens,Bodens bei Stpfl. mit Gewinnermittlung,Gewinnermittlung
4,Bodens,Bodens X. Besteuerung der Boden,Boden
5,Besteuerung,Besteuerung der Bodengewinne1. Bei Gewinnermit...,Gewinnermittlung
6,Besteuerung,Besteuerung der Boden,Boden
7,Steuer,Steuerarten VIII,VIII
8,VIII,VIII. Verlustklausel IX,IX
9,gemä,gemäß § 55 Abs,Abs


**Temp Relation Dataframe Cleaning**

In [57]:
pd.set_option('display.max_rows', None)

In [76]:
temp_relation_df.shape

(1340, 3)

In [90]:
def isLawPara(inputString):
  matched = re.match("§[ §a-zA-Z0-9,.]+", inputString)
  is_match = bool(matched)
  return is_match

In [92]:
# Remove duplicates
temp_relation_df = temp_relation_df.drop_duplicates()
law_para_list = []
for index, rows in temp_relation_df.iterrows():
  word1 = rows["Word1"]
  word2 = rows["Word2"]
  relation = rows["Relation"]
  # filter out rows with Law Para in different Dataframe
  if isLawPara(word1) or isLawPara(word2):
    print(word1+" | " + relation+ " | " +word2)

ausländische Einkünfte --- Einkünfte nach § 34d----§ 34d EStG
Einkünfte --- die von § 2----§ 2 AStG
Anwendung --- des § 2----§ 2 AStG
Steuerermäßigung --- des § 34c----§ 34c EStG
Zugangsfiktion --- des § 123----§ 123 AO
AStG --- i.V.m. § 17----§ 17 EStG
Besteuerung --- nach § 6 AStG----§ 6 AStG ausl
Vorschrift --- des § 17----§ 17 EStG
Anschaffungskosten --- sowie § 9----§ 9 BewG
Werbungskosten --- nach § 160----§ 160 AO
Verfahren --- nach § 18----§ 18 AStG
vorläufig --- nach § 165----§ 165 AO
Außensteuergesetzes --- §§Fassung----§§Fassung desf
