In [1]:
#-------------------------------------------------------------------------------------
# Script for cleaning the output data from web_scraper_vx.xx
#-------------------------------------------------------------------------------------
# Author: Patrik Schwalm
# E-Mail: schwapa3@students.zhaw.ch
# Last update: 10.05.2022
# Version 1.31

#------------------------------
# Setup
#------------------------------
# Install Python (Anaconda)
# Install the necessary Python libraries (see Python libraries)
# Create a JSON-file with the notebook web_scraper_vx.xx.ipynb
# Save the JSON-file in the working directory with the name 'raw_data.json'

#------------------------------
# README
#------------------------------
# The cleaned JSON-files are saved in the working directory with the names 'clean_train_data.json' and 'clean_input_data.json'
# Only decision texts in English can be cleaned

#-------------------------------------------------------------------------------------
# Python libraries
#-------------------------------------------------------------------------------------

import json
import re

#-------------------------------------------------------------------------------------
# Read data from JSON-file
#-------------------------------------------------------------------------------------

def get_json_file(filename = 'raw_data.json'):
    """Read a JSON-file and return its parsed content.

    Args:
        filename: Path of the JSON-file to read. Defaults to 'raw_data.json'
            in the current working directory.

    Returns:
        The Python object produced by ``json.load`` for the file content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, so the same file is parsed identically on every OS.
    with open(filename, encoding='utf-8') as json_file:
        return json.load(json_file)
        
#-------------------------------------------------------------------------------------
# Clean company names
#-------------------------------------------------------------------------------------

def clean_company_names(data):
    """Remove annotations, double quotes and outer whitespace from company names.

    Every entry of ``merger['companies']`` in every element of
    ``data['mergers']`` is rewritten in place. For example
    'SYNTHELABO (Art.14 proc.) (see M.1397 and M.1542)' becomes 'SYNTHELABO'.

    Args:
        data: Dict with a 'mergers' list; each merger holds a 'companies'
            list whose entries are expected to be strings.

    Returns:
        The same ``data`` object, modified in place.
    """
    # Raw strings keep regex escapes such as \( and \s intact; written as
    # normal string literals they are invalid escape sequences.
    # Note: the '.' directly after 'rt' is unescaped and therefore matches
    # any single character (kept as-is to preserve the existing matching).
    substitutions = [
        (re.compile(r'\([Aa]rt.[0-9a-zA-Z_\s.]*\)'), ''),  # e.g. '(Art.14 proc.)'
        (re.compile(r'\([Ss]ee[0-9a-zA-Z_\s.]*\)'), ''),   # e.g. '(see M.1397 and M.1542)'
        (re.compile(r'"'), ''),                            # drop every double quote
        (re.compile(r'^\s*'), ''),                         # leading whitespace
        (re.compile(r'\s*$'), ''),                         # trailing whitespace
    ]

    for merger in data['mergers']:
        companies = merger['companies']
        for company_index, company in enumerate(companies):
            # Apply the substitutions in order: the whitespace trimming must
            # run last, after the annotations have been removed.
            for pattern, new in substitutions:
                company = pattern.sub(new, company)
            companies[company_index] = company

    return data

#-------------------------------------------------------------------------------------
# Clean texts of some (cid:)
#-------------------------------------------------------------------------------------

def clean_text_of_unprintable_chars(data):
    """Replace selected '(cid:NNN)' placeholders in every decision text.

    The placeholders (cid:146), (cid:147), (cid:148), (cid:133) and (cid:150)
    are replaced by an apostrophe, double quotes, an ellipsis ('...') and a
    hyphen respectively. All other text, including any other '(cid:...)'
    placeholder, is left untouched.

    Args:
        data: Dict with a 'mergers' list; each merger has a 'decisions' list,
            each decision a 'decision texts' list of dicts with a 'text' str.

    Returns:
        The same ``data`` object, with the 'text' values updated in place.
    """
    # Only (cid:133) is needed for the sentencizer in the data_analyzer, the others are just cosmetics
    # The placeholders are fixed strings, so plain str.replace is used; this
    # avoids regex escaping of the parentheses altogether.
    replacements = [
        ('(cid:146)', "'"),
        ('(cid:147)', '"'),
        ('(cid:148)', '"'),
        ('(cid:133)', '...'),
        ('(cid:150)', '-'),
    ]

    for merger in data['mergers']:
        for decision in merger['decisions']:
            for decision_text in decision['decision texts']:
                text = decision_text['text']
                for placeholder, new in replacements:
                    text = text.replace(placeholder, new)
                decision_text['text'] = text

    return data

#-------------------------------------------------------------------------------------
# delete all decision texts except the english ones
#-------------------------------------------------------------------------------------

def get_en_decision_texts_only(data):
    """Keep only usable English decision texts.

    A decision text is removed when its 'language' is not "en", or when its
    'text' contains one of the error markers
    "Error - PDF-file is not searchable" or
    "Error - link does not point to a valid PDF-file".
    Decisions and mergers themselves are never removed here, even if their
    'decision texts' list ends up empty.

    Args:
        data: Dict with a 'mergers' list; each merger has a 'decisions' list,
            each decision a 'decision texts' list of dicts with the keys
            'language' and 'text'.

    Returns:
        The same ``data`` object; the 'decision texts' lists are filtered
        in place.
    """
    error_markers = (
        "Error - PDF-file is not searchable",
        "Error - link does not point to a valid PDF-file",
    )

    def is_usable(decision_text):
        # The language is checked first, so 'text' is only inspected for
        # English entries.
        if decision_text['language'] != "en":
            return False
        text = decision_text['text']
        return not any(marker in text for marker in error_markers)

    for merger in data['mergers']:
        for decision in merger['decisions']:
            decision_texts = decision['decision texts']
            # Slice assignment filters the existing list object in a single
            # pass, instead of recording indexes and popping them one by one.
            decision_texts[:] = [entry for entry in decision_texts if is_usable(entry)]

    return data

#-------------------------------------------------------------------------------------
# remove all \n (only tested with en decision texts)
#-------------------------------------------------------------------------------------

def clean_decision_texts_of_newline(data):
    """Collapse every run of whitespace in the decision texts to one space.

    Newlines, tabs and repeated spaces are replaced by a single space, and
    leading/trailing whitespace is dropped. The 'text' value of each entry in
    every 'decision texts' list is overwritten in place.

    Returns:
        The same ``data`` object that was passed in.
    """
    all_entries = (
        entry
        for merger in data['mergers']
        for decision in merger['decisions']
        for entry in decision['decision texts']
    )

    for entry in all_entries:
        # Split on any whitespace, then re-join with single spaces.
        # Based on: https://stackoverflow.com/a/37001613
        words = entry['text'].split()
        entry['text'] = ' '.join(words)

    return data

#-------------------------------------------------------------------------------------
# remove all merger cases without a decision text
#-------------------------------------------------------------------------------------

def remove_mergers_without_decision_texts(data):
    """Drop decisions without decision texts, then mergers without decisions.

    First every decision whose 'decision texts' value is empty is removed
    from its merger. Afterwards every merger whose 'decisions' list is empty
    (originally, or because all of its decisions were just removed) is
    removed from ``data['mergers']``.

    Args:
        data: Dict with a 'mergers' list; each merger has a 'decisions' list,
            each decision a 'decision texts' list.

    Returns:
        The same ``data`` object; the lists are filtered in place.
    """
    mergers = data['mergers']

    # Step 1: keep only the decisions that still carry at least one text.
    for merger in mergers:
        decisions = merger['decisions']
        decisions[:] = [decision for decision in decisions if decision['decision texts']]

    # Step 2: keep only the mergers that still carry at least one decision.
    # Slice assignment keeps the original list object, so other references
    # to data['mergers'] see the filtered content as well.
    mergers[:] = [merger for merger in mergers if merger['decisions']]

    return data


#-------------------------------------------------------------------------------------
# Write data to JSON-file
#-------------------------------------------------------------------------------------

def write_to_json_file(data, filename):
    """Serialize ``data`` as JSON into '<filename>.json'.

    Args:
        data: JSON-serializable object to store.
        filename: Target path without extension; '.json' is appended.
    """
    target_path = filename+'.json'
    # An existing file with the same name is overwritten.
    with open(target_path, 'w') as handle:
        json.dump(data, handle, indent = 4)

#-------------------------------------------------------------------------------------
# Create a JSON-file with training data for the data labeler
#-------------------------------------------------------------------------------------

def create_en_train_data():
    """Build the cleaned training data set and store it on disk.

    Reads the raw data via ``get_json_file()`` (default file), runs the
    cleaning steps below in order and writes the result with
    ``write_to_json_file`` under the name 'clean_train_data'. Unlike the
    analysis data, mergers left without any decision text are removed.
    """
    cleaned = get_json_file()

    # The order matters: texts are filtered and normalized before empty
    # mergers are dropped.
    for step in (
        clean_company_names,
        get_en_decision_texts_only,
        clean_decision_texts_of_newline,
        remove_mergers_without_decision_texts,
        clean_text_of_unprintable_chars,
    ):
        cleaned = step(cleaned)

    write_to_json_file(cleaned, 'clean_train_data')
    
#-------------------------------------------------------------------------------------
# Create clean input data for the model
#-------------------------------------------------------------------------------------

def create_en_analysis_data():
    """Build the cleaned model input data set and store it on disk.

    Reads the raw data via ``get_json_file()`` (default file), runs the
    cleaning steps below in order and writes the result with
    ``write_to_json_file`` under the name 'clean_input_data'. Mergers without
    decision texts are kept in this data set.
    """
    cleaned = get_json_file()

    for step in (
        clean_company_names,
        get_en_decision_texts_only,
        clean_decision_texts_of_newline,
        clean_text_of_unprintable_chars,
    ):
        cleaned = step(cleaned)

    write_to_json_file(cleaned, 'clean_input_data')
    

In [2]:
# Run both cleaning pipelines. Each call reads the raw data file on its own
# and writes a separate output file ('clean_train_data.json' and
# 'clean_input_data.json' respectively).
create_en_train_data()
create_en_analysis_data()