In [1]:
#-------------------------------------------------------------------------------------
# Script for extracting items from decision texts
#-------------------------------------------------------------------------------------
# Author: Patrik Schwalm
# E-Mail: schwapa3@students.zhaw.ch
# Last update: 15.05.2022
# Version 1.21

#------------------------------
# Setup
#------------------------------
# Install Python (Anaconda)
# Install the necessary Python libraries (see Python libraries)
# Save labeled training and evaluation data in the working directory
# Save 'sentence_data.json' from the text categorizer in the working directory
# Create the file 'ner_base_config.cfg' at: https://spacy.io/usage/training#quickstart


#------------------------------
# README
#------------------------------
# The trained model is saved to the folder 'output'


#-------------------------------------------------------------------------------------
# Python libraries
#-------------------------------------------------------------------------------------

import json
import re
import subprocess
import sys
from datetime import datetime
from pathlib import Path

import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

#-------------------------------------------------------------------------------------
# Create training data
#-------------------------------------------------------------------------------------

# This function is based on: https://github.com/amrrs/custom-ner-with-spacy3
def create_training_data():
    """Convert the labeled JSON annotations into spaCy's binary DocBin format.

    Reads 'train_data.json' and 'valid_data.json' from the working directory.
    Both files must contain an 'annotations' list whose items are pairs of
    ``text`` and ``{"entities": [[start, end, label], ...]}``.

    Writes './data/train.spacy' and './data/valid.spacy'. The './data' folder
    is created if it does not exist yet.
    """
    # get training data
    with open('train_data.json') as json_file:
        train_data = json.load(json_file)

    # get validation data
    with open('valid_data.json') as json_file:
        valid_data = json.load(json_file)

    # The DocBin files are written straight to these paths, so make sure the
    # target folder exists before the (slow) conversion starts.
    output_dir = Path('./data')
    output_dir.mkdir(parents=True, exist_ok=True)

    # A blank pipeline is sufficient here: it is only used to tokenize the texts.
    nlp = spacy.blank('en')

    # create training data
    print('Create training data:')
    _annotations_to_docbin(nlp, train_data['annotations']).to_disk(output_dir / 'train.spacy')

    # create validation data
    print('Create validation data:')
    _annotations_to_docbin(nlp, valid_data['annotations']).to_disk(output_dir / 'valid.spacy')


def _annotations_to_docbin(nlp, annotations):
    """Return a DocBin holding one Doc per annotated text.

    Entities whose character offsets cannot be aligned to token boundaries
    (``char_span`` returns None) are reported and left out of the Doc.
    """
    doc_bin = DocBin()
    for text, annot in tqdm(annotations):
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                # Show the text as well so the faulty label can be located.
                print("Skipping entity")
                print(text)
            else:
                ents.append(span)
        doc.ents = ents
        doc_bin.add(doc)
    return doc_bin

#-------------------------------------------------------------------------------------
# Create config-file
#-------------------------------------------------------------------------------------

def create_config():
    """Create './ner_config.cfg' by auto-filling './ner_base_config.cfg'.

    Runs ``spacy init fill-config`` with the interpreter of the running kernel
    (``sys.executable``), so the spaCy installation that this notebook imports
    is the one that is used. The command output is printed.

    Raises:
        subprocess.CalledProcessError: if the spaCy command exits with a
            non-zero status.
    """
    command = [
        sys.executable, '-m', 'spacy', 'init', 'fill-config',
        './ner_base_config.cfg', './ner_config.cfg',
    ]
    # Capture the output and print it from Python so that it is shown in the
    # notebook regardless of how the kernel handles subprocess streams.
    completed = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        errors='replace',
    )
    print(completed.stdout, end='')
    # Do not continue silently with a missing or stale config file.
    completed.check_returncode()

#-------------------------------------------------------------------------------------
# Train the NER model
#-------------------------------------------------------------------------------------

def train_model():
    """Train the NER pipeline described by 'ner_config.cfg'.

    Runs ``spacy train`` with the interpreter of the running kernel
    (``sys.executable``) and saves the result to './output'. The training log
    is printed line by line while the command runs.

    Raises:
        subprocess.CalledProcessError: if the spaCy command exits with a
            non-zero status.
    """
    # '-u' disables output buffering in the child process so that the progress
    # table appears while training is still running.
    command = [
        sys.executable, '-u', '-m', 'spacy', 'train',
        'ner_config.cfg', '--output', './output',
    ]
    with subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        errors='replace',
    ) as process:
        for line in process.stdout:
            print(line, end='')

    # Leaving the 'with' block waits for the process, so returncode is set here.
    if process.returncode != 0:
        raise subprocess.CalledProcessError(process.returncode, command)

#-------------------------------------------------------------------------------------
# Span categorizer
#-------------------------------------------------------------------------------------

def span_categorizer():
    """Extract the items from all decision texts and save them as JSON.

    Reads 'sentence_data.json' (output of the text categorizer) and loads the
    trained NER model from 'output/model-best'. For every merger the phase is
    calculated and, for every decision text, the affected laws, conditions &
    obligations, violations, decisions and penalties are extracted.

    The result is written to 'final_merger_data.json'. Decision texts without
    sentences are kept in the result and marked for a manual check.
    """
    with open('sentence_data.json') as json_file:
        data = json.load(json_file)

    nlp_ner = spacy.load("output/model-best")
    # Set max document length on the pipeline that actually processes the
    # texts; otherwise its own default limit still applies.
    nlp_ner.max_length = _MAX_TEXT_LENGTH

    new_data = {'mergers': []}

    for merger in tqdm(data['mergers']):
        new_decisions = []

        # Extract items from the decision texts
        for decision in merger['decisions']:
            if not decision:
                continue

            new_decisions.append({
                'decision type': decision['decision type'],
                'decision date': decision['decision date'],
                'decision texts': [
                    _extract_decision_items(nlp_ner, decision_text)
                    for decision_text in decision['decision texts']
                ]
            })

        new_data['mergers'].append({
            'case number': merger['case number'],
            'companies': merger['companies'],
            'notification date': merger['notification date'],
            'NACE': merger['NACE'],
            'Phase': _merger_phase(merger),
            'decisions': new_decisions
        })

    with open('final_merger_data.json', 'w') as outfile:
        json.dump(new_data, outfile, indent = 4)


# Maximum number of characters of a text that is passed to the NER model
_MAX_TEXT_LENGTH = 3000000

# Up to this many days between notification and first decision count as Phase I
_PHASE_ONE_MAX_DAYS = 49

_MANUAL_CHECK_MESSAGE = "Error - Please check the pdf-file manually"

# Entity label of the NER model -> key in the output file
_LABEL_TO_KEY = {
    "AFFECTED LAW": 'dec affected laws',
    "CONDITIONS & OBLIGATIONS": 'dec conditions & obligations',
    "VIOLATION": 'dec violations',
    "DECISION": 'dec decisions',
}

# Raw strings keep the patterns identical while avoiding invalid-escape warnings.
_FINE_PATTERN = re.compile(
    r'[Aa](\stotal)? fine of ((EUR)|(\u20ac)|(ECU)) [0-9]*[0-9\s,]*(\smillion)? is\s?(hereby)? imposed on [\s\S]*?(?=( pursuant)|( for))'
)
_PERIODIC_PENALTY_PATTERN = re.compile(
    r'periodic penalty payments .+?(((EUR)|(\u20ac)|(ECU)) [0-9]*[0-9\s,]*(\smillion)?)'
)


def _merger_phase(merger):
    """Return 'Phase I' or 'Phase II' from the notification and first decision date."""
    # calculate phase of the merger
    try:
        not_date = datetime.strptime(merger['notification date'], '%d.%m.%Y').date()
        dec_date = datetime.strptime(merger['decisions'][0]['decision date'], '%d.%m.%Y').date()
    except (KeyError, IndexError, TypeError, ValueError):
        # Missing or malformed dates, or no usable first decision
        return "Error - Could not calculate phase"

    if (dec_date - not_date).days <= _PHASE_ONE_MAX_DAYS:
        return "Phase I"
    return "Phase II"


def _extract_penalties(text):
    """Return the matched fine and periodic-penalty phrases found in the text."""
    # finditer + group(0) yields the complete matched phrase; findall would
    # return tuples of the individual capture groups instead.
    penalties = [match.group(0) for match in _FINE_PATTERN.finditer(text)]
    penalties.extend(match.group(0) for match in _PERIODIC_PENALTY_PATTERN.finditer(text))
    return penalties


def _extract_decision_items(nlp_ner, decision_text):
    """Return the output entry (language, link and extracted items) for one decision text."""
    text = decision_text['dec sentences']

    if not text:
        # Nothing to analyze: flag every item so the file is checked by hand
        items = {key: [_MANUAL_CHECK_MESSAGE] for key in _LABEL_TO_KEY.values()}
        penalties = [_MANUAL_CHECK_MESSAGE]
    else:
        items = {key: [] for key in _LABEL_TO_KEY.values()}
        penalties = []

        # NOTE(review): texts at or above the length limit are not analyzed
        # and end up with empty item lists.
        if len(text) < nlp_ner.max_length:
            for ent in nlp_ner(text).ents:
                key = _LABEL_TO_KEY.get(ent.label_)
                if key is not None:
                    items[key].append(ent.text)

            penalties = _extract_penalties(text)

    return {
        'language': decision_text['language'],
        'link': decision_text['link'],
        'dec affected laws': items['dec affected laws'],
        'dec conditions & obligations': items['dec conditions & obligations'],
        'dec violations': items['dec violations'],
        'dec decisions': items['dec decisions'],
        'dec penalties': penalties
    }
                                
                                

In [2]:
# Convert the labeled JSON data to ./data/train.spacy and ./data/valid.spacy
create_training_data()

Create training data:


100%|██████████| 204/204 [00:04<00:00, 47.17it/s]


Create validation data:


100%|██████████| 53/53 [00:00<00:00, 86.23it/s] 


In [3]:
# Generate ner_config.cfg from ner_base_config.cfg
create_config()

[+] Auto-filled config with all values
[+] Saved config
ner_config.cfg
You can now add your data and train your pipeline:
python -m spacy train ner_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [4]:
# Train the NER model with ner_config.cfg; the result is saved to ./output
train_model()

[i] Saving to output directory: output

[2022-05-12 15:59:00,602] [INFO] Set up nlp object from config
[2022-05-12 15:59:00,617] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-05-12 15:59:00,622] [INFO] Created vocabulary
[2022-05-12 15:59:00,628] [INFO] Finished initializing nlp object
[2022-05-12 15:59:05,224] [INFO] Initialized pipeline components: ['tok2vec', 'ner']



[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     64.89    0.00    0.00    0.00    0.00
  0     200       3429.62  13910.13   57.97   61.54   54.79    0.58
  2     400       2086.34   1107.08   71.79   67.47   76.71    0.72
  3     600       3038.84    803.24   79.08   75.62   82.88    0.79
  4     800       2373.15    608.47   83.92   85.71   82.19    0.84
  5    1000      10292.31    755.44   83.28   79.87   86.99    0.83
  7    1200      26459.18   1167.06   86.62   89.13   84.25    0.87
  9    1400      15090.89    496.00   91.10   91.10   91.10    0.91
 11    1600       8886.76    382.73   90.97   92.25   89.73    0.91
 14    1800        302.42    164.71   89.29   93.28   85.62    0.89
 17    2000        999.40    196.72   89.51   91.43   87.67    0.90
 20

In [4]:
# Extract the items from sentence_data.json and write final_merger_data.json
span_categorizer()

100%|██████████| 8447/8447 [02:40<00:00, 52.51it/s] 
