In [1]:
#-------------------------------------------------------------------------------------
# Extract text with the "relevant" information from decision texts
#-------------------------------------------------------------------------------------
# Author: Patrik Schwalm
# E-Mail: schwapa3@students.zhaw.ch
# Last update: 10.05.2022
# Version 2.00

#------------------------------
# Setup
#------------------------------
# Install Python (Anaconda)
# Install the necessary Python libraries (see Python libraries)
# Save the 'final_train_data.json' file and the 'final_valid_data.json' in the working directory
# Save the file 'clean_input_data.json' from the data cleaner in the working directory
# Create the file 'textcat_base_config.cfg' at: https://spacy.io/usage/training#quickstart

#------------------------------
# README
#------------------------------
# The trained model is saved to the folder 'output'


#-------------------------------------------------------------------------------------
# Python libraries
#-------------------------------------------------------------------------------------

import json
import os
import subprocess
import sys

import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

#-------------------------------------------------------------------------------------
# Set categories for labeled data
#-------------------------------------------------------------------------------------

# This function is based on: https://towardsdatascience.com/building-sentiment-classifier-using-spacy-3-0-transformers-c744bfc767b
def make_docs(data):
    """Turn labelled texts into spaCy docs carrying one-hot text categories.

    Parameters:
        data: sized collection of (text, label) pairs; it is handed to
              nlp.pipe(..., as_tuples=True) and its length is used for the
              progress bar.

    Returns:
        A list with one Doc per input pair. Each doc.cats holds the keys
        'beg', 'end' and 'out'; exactly one of them is 1. Any label other
        than 'beg' or 'end' is treated as 'out'.
    """
    nlp = spacy.load("en_core_web_lg")
    category_names = ('beg', 'end', 'out')

    annotated = []
    for parsed, tag in tqdm(nlp.pipe(data, as_tuples=True), total = len(data)):
        # Everything that is neither a start nor an end marker counts as 'out'
        active = 'beg' if tag == 'beg' else 'end' if tag == 'end' else 'out'
        for name in category_names:
            parsed.cats[name] = 1 if name == active else 0
        annotated.append(parsed)

    return annotated

#-------------------------------------------------------------------------------------
# Create training and validation data
#-------------------------------------------------------------------------------------

# This function is based on: https://towardsdatascience.com/building-sentiment-classifier-using-spacy-3-0-transformers-c744bfc767b
# This function is based on: https://medium.com/analytics-vidhya/building-a-text-classifier-with-spacy-3-0-dd16e9979a
def create_training_data():
    """Convert the labelled JSON files into spaCy's binary training format.

    Reads 'final_train_data.json' and 'final_valid_data.json' from the
    working directory, converts each with make_docs() and writes the results
    to './data/train.spacy' and './data/valid.spacy'.

    The './data' folder is created if it does not exist yet.
    """
    with open('final_train_data.json') as json_file:
        train_data = json.load(json_file)

    with open('final_valid_data.json') as json_file:
        valid_data = json.load(json_file)

    # DocBin.to_disk opens the target file directly and does not create
    # missing parent folders. Create the folder before the (slow) parsing
    # step so a fresh checkout does not fail at the very end.
    os.makedirs("./data", exist_ok=True)

    print('Create training data:')
    train_docs = make_docs(train_data)
    DocBin(docs=train_docs).to_disk("./data/train.spacy")

    print('Create validation data:')
    valid_docs = make_docs(valid_data)
    DocBin(docs=valid_docs).to_disk("./data/valid.spacy")
    
#-------------------------------------------------------------------------------------
# Create the config file
#-------------------------------------------------------------------------------------

def create_config():
    """Expand the base config into a complete spaCy training config.

    Runs 'spacy init fill-config ./textcat_base_config.cfg ./textcat_config.cfg'
    with the interpreter of the running kernel and echoes the command's
    output into the cell.

    Raises:
        subprocess.CalledProcessError: if the spaCy command exits with a
            non-zero status.
    """
    # sys.executable guarantees that the spaCy installation of this kernel is
    # used; a bare 'python' is looked up on PATH and may be another environment.
    # '-u' keeps the child's output unbuffered so it shows up as it is produced.
    command = [
        sys.executable, "-u", "-m", "spacy", "init", "fill-config",
        "./textcat_base_config.cfg", "./textcat_config.cfg",
    ]

    with subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        errors="replace",
    ) as process:
        # Forward the output line by line so it appears in the notebook cell
        for line in process.stdout:
            print(line, end="")

    # Leaving the 'with' block waits for the process, so returncode is set
    if process.returncode != 0:
        raise subprocess.CalledProcessError(process.returncode, command)

#-------------------------------------------------------------------------------------
# Train the model
#-------------------------------------------------------------------------------------

def train_model():
    """Train the text categorizer described by 'textcat_config.cfg'.

    Runs 'spacy train textcat_config.cfg --output ./output' with the
    interpreter of the running kernel and echoes the training log into the
    cell. No '--paths.*' overrides are passed, so the config file itself is
    expected to point at the training and validation corpora.

    Raises:
        subprocess.CalledProcessError: if training exits with a non-zero
            status. Without this check a failed run would go unnoticed and a
            stale model in './output' could be used afterwards.
    """
    # sys.executable guarantees that the spaCy installation of this kernel is
    # used; a bare 'python' is looked up on PATH and may be another environment.
    # '-u' keeps the child's output unbuffered so progress rows appear promptly.
    command = [
        sys.executable, "-u", "-m", "spacy", "train",
        "textcat_config.cfg", "--output", "./output",
    ]

    with subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        errors="replace",
    ) as process:
        # Forward the output line by line so it appears in the notebook cell
        for line in process.stdout:
            print(line, end="")

    # Leaving the 'with' block waits for the process, so returncode is set
    if process.returncode != 0:
        raise subprocess.CalledProcessError(process.returncode, command)
    

#-------------------------------------------------------------------------------------
# Text categorizer
#-------------------------------------------------------------------------------------

def _select_decision_sentences(text, nlp, nlp_textcat):
    """Return the sentences of the last 'beg' ... 'end' passage in text.

    The text is split into sentences with nlp and every sentence is scored
    with nlp_textcat. A sentence whose 'beg' score is the strict maximum
    starts a new passage and discards everything collected before it. A
    sentence whose 'end' score is the strict maximum closes the passage and
    is kept if a passage was open. All other sentences are kept only while a
    passage is open.
    """
    selected = []
    inside_passage = False

    for sent in nlp(text).sents:
        sentence = sent.text
        cats = nlp_textcat(sentence).cats
        beg, end, out = cats['beg'], cats['end'], cats['out']

        if beg > end and beg > out:
            # A new start marker always restarts the collection
            selected = [sentence]
            inside_passage = True
        elif end > beg and end > out:
            if inside_passage:
                selected.append(sentence)
            inside_passage = False
        elif inside_passage:
            selected.append(sentence)

    return selected


def text_categorizer():
    """Extract the relevant passage of every decision text.

    Reads 'clean_input_data.json', runs each non-empty decision text through
    the trained categorizer in 'output/model-best' and writes the result to
    'sentence_data.json'. The output mirrors the input structure; every
    decision text entry carries 'language', 'link' and 'dec sentences', where
    'dec sentences' is the selected sentences joined with a leading blank
    each. Empty decisions and decision texts without text are left out.
    Texts that reach nlp.max_length are not analysed and get an empty
    'dec sentences' string.
    """
    with open('clean_input_data.json') as json_file:
        data = json.load(json_file)

    nlp_textcat = spacy.load("output/model-best")
    nlp = spacy.load("en_core_web_lg")
    nlp.max_length = 3000000

    new_data = {'mergers': []}

    # Iterating over tqdm advances and closes the progress bar automatically
    for merger in tqdm(data['mergers']):
        new_decisions = []

        for decision in merger['decisions']:
            if not decision:
                continue

            new_decision_texts = []
            for decision_text in decision['decision texts']:
                text = decision_text['text']
                if not text:
                    continue

                if len(text) < nlp.max_length:
                    dec_sentences = _select_decision_sentences(text, nlp, nlp_textcat)
                else:
                    # tqdm.write keeps the message from breaking the progress bar
                    tqdm.write('Error - Text was too big to be analyzed')
                    dec_sentences = []

                new_decision_texts.append({
                    'language': decision_text['language'],
                    'link': decision_text['link'],
                    # Every sentence is prefixed with one blank
                    'dec sentences': "".join(" " + sentence for sentence in dec_sentences)
                })

            new_decisions.append({
                'decision type': decision['decision type'],
                'decision date': decision['decision date'],
                'decision texts': new_decision_texts
            })

        new_data['mergers'].append({
            'case number': merger['case number'],
            'companies': merger['companies'],
            'notification date': merger['notification date'],
            'NACE': merger['NACE'],
            'decisions': new_decisions
        })

    with open('sentence_data.json', 'w') as outfile:
        json.dump(new_data, outfile, indent = 4)
        

In [2]:
create_training_data()

Create training data:


100%|██████████| 1440/1440 [00:12<00:00, 116.16it/s]


Create validation data:


100%|██████████| 360/360 [00:03<00:00, 115.09it/s]


In [3]:
create_config()

[+] Auto-filled config with all values
[+] Saved config
textcat_config.cfg
You can now add your data and train your pipeline:
python -m spacy train textcat_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [4]:
train_model()

[i] Saving to output directory: output
[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['textcat']
[i] Initial learn rate: 0.001
E    #       LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ----------  ------
  0       0          0.03        0.00    0.00
  0     200         12.59       93.02    0.93
  0     400          7.25       94.78    0.95
  1     600          1.16       96.40    0.96
  2     800          0.46       97.14    0.97
  3    1000          0.20       97.17    0.97
  4    1200          0.09       97.32    0.97
  6    1400          0.05       97.91    0.98
  8    1600          0.02       97.91    0.98
 10    1800          0.01       97.91    0.98
 13    2000          0.01       98.18    0.98
 16    2200          0.01       98.05    0.98
 21    2400          0.00       98.33    0.98
 25    2600          0.00       98.47    0.98
 29    2800          0.00       98.47    0.98
 34    3000          0.00       98.47    0.98
 38    3200          0.00   

[2022-05-07 15:04:14,679] [INFO] Set up nlp object from config
[2022-05-07 15:04:14,705] [INFO] Pipeline: ['textcat']
[2022-05-07 15:04:14,714] [INFO] Created vocabulary
[2022-05-07 15:04:14,731] [INFO] Finished initializing nlp object
[2022-05-07 15:04:18,703] [INFO] Initialized pipeline components: ['textcat']


In [4]:
text_categorizer()

100%|██████████| 8447/8447 [7:50:04<00:00,  3.34s/it]     
