In [1]:
#-------------------------------------------------------------------------------------
# Create labeled data from the file clean_train_data.json from the data_cleaner_vx.xx
#-------------------------------------------------------------------------------------
# Author: Patrik Schwalm
# E-Mail: schwapa3@students.zhaw.ch
# Last update: 01.05.2022
# Version 2.02

#------------------------------
# Setup
#------------------------------
# Install Python (Anaconda)
# Install the necessary Python libraries (see Python libraries)
# Save the JSON file 'clean_train_data.json' produced by the data cleaner in the working directory

#------------------------------
# README
#------------------------------
# The labeled data still has to be verified manually by a human


#-------------------------------------------------------------------------------------
# Python libraries
#-------------------------------------------------------------------------------------

import json
import spacy
import re
from tqdm import tqdm
import json

#-------------------------------------------------------------------------------------
# Read data from JSON-file
#-------------------------------------------------------------------------------------

# get clean_train_data.json
def get_json_file(filename = 'clean_train_data.json'):
    """Load a JSON file and return its parsed content.

    Args:
        filename: Path of the JSON file to read. Defaults to
            'clean_train_data.json' in the current working directory.

    Returns:
        The deserialized JSON document, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding (pure-ASCII files read the same).
    with open(filename, encoding='utf-8') as json_file:
        return json.load(json_file)

#-------------------------------------------------------------------------------------
# Label the sentences of the decision texts
#-------------------------------------------------------------------------------------

# Patterns identifying a sentence that opens the operative part of a decision.
# They are compiled once instead of being rebuilt for every sentence, and they
# are raw strings so that '\s' / '\S' are regex escapes rather than invalid
# string escapes.
_BEG_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'[Hh][Aa][Ss]\s[Aa]-?[Dd][Oo][Pp][Tt]-?[Ee][Dd]\s[Tt][Hh][Ii][Ss]\s[Dd][Ee]-?[Cc][Ii]-?[Ss][Ii][Oo][Nn]',
    r'For (the|these) above (mentioned)?\s?reasons,? the (European)?\s?Commission has (therefore)?\s?(decided|(come to the conclusion))',
    r'The Commission has therefore decided',
    r'For the reasons set out in the Notice on a simplified procedure,? the European Commission has decided',
    r'In (light|view) of the above,? the Commission',
    r'For the above reasons,? subject to full compliance with the commitments submitted by the notifying party, the Commission has decided',
    r'Based on the above considerations,? the Commission has decided',
    r'For these reasons,? the Commission concludes',
    r'For the above reasons,? and given that [\s\S]* expressed its agreement,? ',
))

# Patterns identifying a sentence that closes a decision.
_END_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'For the Commission',
    r'ELECTRONICALLY RE-CREATED TEXT',
))


def _classify_sentence(text):
    """Return 'beg', 'end' or 'out' for one sentence; 'beg' is checked first."""
    if any(pattern.search(text) for pattern in _BEG_PATTERNS):
        return 'beg'
    if any(pattern.search(text) for pattern in _END_PATTERNS):
        return 'end'
    return 'out'


def label_data(data):
    """Split every decision text into sentences and label each sentence.

    Each sentence found by the spaCy pipeline 'en_core_web_lg' is labeled
    'beg' when it matches one of the beginning patterns, otherwise 'end' when
    it matches one of the end patterns, otherwise 'out'.

    Args:
        data: Dict with a 'mergers' list. Every merger provides 'case number'
            and a 'decisions' list; every decision provides 'decision type'
            and a 'decision texts' list whose items provide 'link' and 'text'.

    Returns:
        A tuple ``(beg, end, out, skipped)``. ``beg``, ``end`` and ``out`` are
        dicts of the form ``{'labeled datasets': [record, ...]}`` where each
        record holds the keys 'text', 'label', 'decision type', 'link' and
        'case number'. ``skipped`` is the number of decision texts that were
        not processed because they reached the maximum document length.

    Raises:
        KeyError: If one of the keys listed above is missing in ``data``.
    """
    labeled = {label: {'labeled datasets': []} for label in ('beg', 'end', 'out')}

    nlp = spacy.load("en_core_web_lg")
    # Set max document length
    nlp.max_length = 3000000
    skipped_files_counter = 0

    for merger in tqdm(data['mergers'], total = len(data['mergers'])):
        case_number = merger['case number']
        for decision in merger['decisions']:
            decision_type = decision['decision type']
            for decision_text in decision['decision texts']:
                decision_link = decision_text['link']
                text = decision_text['text']

                # Texts at or above the length limit are counted, not parsed.
                if len(text) >= nlp.max_length:
                    skipped_files_counter += 1
                    continue

                for sent in nlp(text).sents:
                    sentence = sent.text
                    label = _classify_sentence(sentence)
                    labeled[label]['labeled datasets'].append({
                        'text': sentence,
                        'label': label,
                        'decision type': decision_type,
                        'link': decision_link,
                        'case number': case_number
                    })

    return(labeled['beg'], labeled['end'], labeled['out'], skipped_files_counter)

#-------------------------------------------------------------------------------------
# Data labeler
#-------------------------------------------------------------------------------------

def en_data_labler():
    """Label the cleaned training data and store the result as JSON files.

    Reads the input through ``get_json_file()`` (default file name), labels it
    with ``label_data()`` and writes the three label groups to 'beg.json',
    'end.json' and 'out.json' in the current working directory. Progress
    messages and the number of skipped texts are printed to stdout.
    """
    rule = '----------------------------------------------------'

    def banner(message):
        # Every status message is framed by a separator line above and below.
        print(rule)
        print(message)
        print(rule)

    source = get_json_file()
    beg_part, end_part, out_part, skipped = label_data(source)

    banner('Writing data to JSON-File ...Please wait')

    # Written in the same order as the labels are returned: beg, end, out.
    targets = (
        ('beg.json', beg_part),
        ('end.json', end_part),
        ('out.json', out_part),
    )
    for target_name, payload in targets:
        with open(target_name, 'w') as outfile:
            json.dump(payload, outfile, indent = 4)

    banner('JSON-File successfully saved')
    banner(f'Skipped {skipped} files because they were too large.')
    banner('Script sucessfully terminated')


In [None]:
# Run the labeler: reads 'clean_train_data.json' from the working directory
# and writes 'beg.json', 'end.json' and 'out.json' next to it.
en_data_labler()

100%|██████████| 6946/6946 [7:52:10<00:00,  4.08s/it]     


----------------------------------------------------
Writing data to JSON-File ...Please wait
----------------------------------------------------
