In [1]:
#-------------------------------------------------------------------------------------
# Script for scraping merger cases on the European Commission competition website
#-------------------------------------------------------------------------------------
# Author: Patrik Schwalm
# E-Mail: schwapa3@students.zhaw.ch
# Last update: 29.04.2022
# Version 2.02

#------------------------------
# Setup
#------------------------------
# Install Python (Anaconda)
# Install the necessary Python libraries (see Python libraries)
# Install Google Chrome
# Download ChromeDriver (https://chromedriver.chromium.org/downloads)
# Save the ChromeDriver in the working directory
# Create a file called 'user_agents.txt' in the working directory with one user agent per line
# Create a folder called 'tmp' in the working directory

#------------------------------
# README
#------------------------------
# The web scraper saves the data in a JSON-file with the following naming convention 'data%d%m%Y-%H%M%S.json'
# The web scraper will write the data to the file once all the data is scraped
# This means the web scraper cannot be paused
# It takes around 9 hours for the web scraper to finish
# The created JSON-file needs roughly 500 MB of disk space
# There are 6 PDF-files on the European Commission competition website that are not searchable (as of 06.02.2022)


#-------------------------------------------------------------------------------------
# Python libraries
#-------------------------------------------------------------------------------------

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import random
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from datetime import datetime
import json
import os
import requests
import pdfplumber
from tqdm import tqdm

#-------------------------------------------------------------------------------------
# User agents
#-------------------------------------------------------------------------------------

ua_path = 'user_agents.txt'

# Read one user agent per line
# The 'with' statement closes the file handle again after reading
# Blank lines are skipped so that an empty user agent can never be chosen
with open(ua_path) as ua_file:
    ua_list = [line.rstrip('\n') for line in ua_file if line.strip()]

#----------------------------------------------------------------------------------------
# Driver settings
#----------------------------------------------------------------------------------------

def new_chrome_driver(headless = False):
    """Create a Chrome driver that uses a randomly chosen user agent.

    Parameters
    ----------
    headless : bool, optional
        If true, Chrome is started with '--headless'.
        Otherwise a window size of 1200x800 is requested (default).

    Returns
    -------
    The driver object created by 'webdriver.Chrome', started from the
    'chromedriver.exe' in the working directory.
    """
    opts = Options()
    opts.add_argument('user-agent=' + random.choice(ua_list))

    # Driver headless (True/False)
    if headless:
        opts.add_argument('--headless')
    else:
        opts.add_argument('--window-size=1200,800')

    # Driver
    # NOTE(review): 'executable_path' is deprecated in Selenium 4 - confirm the installed Selenium version
    return webdriver.Chrome(executable_path='chromedriver.exe', options=opts)

#-------------------------------------------------------------------------------------
# Download PDF-file and extract the text
#-------------------------------------------------------------------------------------

def pdf_handler(link, language):
    """Download the PDF-file behind 'link' and extract its text.

    Parameters
    ----------
    link : str
        URL of the PDF-file. A missing/empty link, the known corrupt link
        'https://ec.europa.eu/competition/mergers//' and links without a
        file name are not downloaded.
    language : str
        Language label of the decision text. An empty label becomes 'N/A'.

    Returns
    -------
    tuple
        (language, text). 'text' is the extracted text or one of
        'Error - PDF-file is not searchable' /
        'Error - link does not point to a valid PDF-file'.

    Raises
    ------
    Exception
        If the download still fails after 'max_retries' attempts.
    """
    invalid_link_text = 'Error - link does not point to a valid PDF-file'
    corrupt_link = 'https://ec.europa.eu/competition/mergers//'

    # Check if language is empty
    if not language:
        language = 'N/A'

    # Filter out corrupt links without PDF-files
    # A link without a file name can never be saved to the 'tmp'-folder, so it is rejected
    # right away instead of being retried with an ever increasing delay
    if not link or link == corrupt_link or not os.path.basename(link):
        return (language, invalid_link_text)

    headers = {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7",
                "Cache-Control": "max-age=0",
                "Connection": "keep-alive",
                "Host": "ec.europa.eu",
                "Referer": "https://ec.europa.eu/competition/elojade/isef/index.cfm?fuseaction=dsp_result&policy_area_id=2",
                "sec-ch-ua": '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
                "sec-ch-ua-mobile": "?0",
                "sec-ch-ua-platform": '"Windows"',
                "Sec-Fetch-Dest": "document",
                "Sec-Fetch-Mode": "navigate",
                "Sec-Fetch-Site": "same-origin",
                "Sec-Fetch-User": "?1",
                "Upgrade-Insecure-Requests": "1",
                "User-Agent": random.choice(ua_list)
            }

    file_path = os.path.join('./tmp', os.path.basename(link))
    delay = 5
    max_retries = 10
    last_error = None

    # Download the PDF-file
    # Retry to download the PDF-file after a ConnectionResetError 10054 with increased delay
    # ConnectionResetError 10054 may occur at irregular intervals
    # The error handling of the ConnectionResetError 10054 is based on: https://stackoverflow.com/a/67499291
    for attempt in range(1, max_retries + 1):
        try:
            # The timeout prevents a stalled connection from blocking the scraper forever
            response = requests.get(link, headers=headers, timeout=60)
            with open(file_path, 'wb') as f:
                f.write(response.content)
            break
        # 'Exception' instead of a bare 'except' so that Ctrl+C (KeyboardInterrupt) still stops the script
        except Exception as error:
            last_error = error
            print('Information: ConnectionResetError 10054 occured')
            # No need to wait after the last attempt, the error is raised right away
            if attempt < max_retries:
                time.sleep(delay)
                delay *= 2
    else:
        raise Exception('Download of ' + link + ' failed after ' + str(max_retries) + ' attempts') from last_error

    try:
        # Extract text from the PDF-file
        # 'or ""' guards against pages for which extract_text() returns None instead of a string
        # (presumably pages without a text layer, depending on the pdfplumber version)
        with pdfplumber.open(file_path) as pdf:
            text = ''.join(page.extract_text() or '' for page in pdf.pages)

        # Handle PDF-files that are not searchable
        if text == '':
            text = 'Error - PDF-file is not searchable'

    # Links to the PDF-files can be corrupt and do not point to a PDF-file
    # If that is the case the generated PDF-file can not be opened with pdfplumber and an error is thrown
    except Exception:
        text = invalid_link_text

    finally:
        # Delete the PDF-file to save space, whether or not the extraction succeeded
        # Best effort: a leftover file only costs disk space and must not abort the scraping
        try:
            os.remove(file_path)
        except OSError:
            pass

    return (language, text)

#-------------------------------------------------------------------------------------
# Scraping functionalities
#-------------------------------------------------------------------------------------

def eu_merger_scraper(case_number_filter = 'none'):
    """Scrape the merger cases from the European Commission competition website.

    The search form is restricted to the 'Decision Types' containing 'Art.',
    every result page is visited, and for each merger the case number, the
    companies, the notification date, the NACE codes and the decisions
    (including the text of the linked PDF-files) are collected.
    All data is written to 'data<%d%m%Y-%H%M%S>.json' in the working directory
    once the last page has been processed.

    Parameters
    ----------
    case_number_filter : str, optional
        Value typed into the 'Case Number' field of the search form (used for
        testing). The default 'none' leaves the field empty.

    Returns
    -------
    None

    Notes
    -----
    If the scraping fails, the progress bar and the browser are closed and the
    exception is passed on; no JSON-file is written in that case.
    """
    # Imported inside the function so that it is available without changing the import section
    # find_element(By...) replaces the find_element_by_* helpers, which were removed in Selenium 4.3
    from selenium.webdriver.common.by import By

    print('----------------------------------------------------')
    print('Starting the script ...Please wait')
    print('----------------------------------------------------')

    # Initialisation
    driver = new_chrome_driver()
    date_time = datetime.now().strftime("%d%m%Y-%H%M%S")
    last_site = False
    data = {'mergers': []}
    pbar = None

    try:
        # Access the website and delete all cookies
        driver.get('https://ec.europa.eu/competition/elojade/isef/index.cfm?clear=1&policy_area_id=2')
        driver.delete_all_cookies()
        time.sleep(5)

        # 'Select All' Decision Types
        driver.find_element(By.XPATH, '//*[@id="adv_search_2"]/fieldset/div[3]/a[1]').click()
        time.sleep(3)

        # Deselect the 'Decision Types' without an 'Article'
        no_art_decisions = driver.find_elements(By.XPATH, '//*[@id="decision_type_id_2"]/option[not(contains(text(),"Art."))]')

        # A click with the CONTROL key held down toggles a single option without changing the others
        for no_art_decision in no_art_decisions:
            ActionChains(driver) \
                .key_down(Keys.CONTROL) \
                .click(no_art_decision) \
                .key_up(Keys.CONTROL) \
                .perform()
        time.sleep(3)

        # Define filter 'Case Number' --> this is used for testing purposes
        if case_number_filter != 'none':
            driver.find_element(By.XPATH, '//*[@id="case_number"]').send_keys(case_number_filter)

        # Click 'Search' Button
        driver.find_element(By.XPATH, '//*[@id="BodyContent"]/form/input[1]').click()
        time.sleep(5)

        # Get the number of mergers
        # The first 10 characters of the cell text are skipped, the remainder has to be an integer
        number_of_mergers = driver.find_element(By.XPATH, '//*[@id="BodyContent"]/table[3]/tbody/tr[2]/td').text[10:]
        pbar = tqdm(total=int(number_of_mergers))

        #------------------------------
        # Loop through all the pages
        #------------------------------
        while not last_site:

            # Open all the 'Show details' tabs
            details = driver.find_elements(By.CLASS_NAME, 'details')
            while details:

                for detail in details:
                    detail.click()
                    time.sleep(0.4)

                # Check if all 'Show details' tabs are opened
                time.sleep(1)
                details = driver.find_elements(By.XPATH, '//input[@value="Show details"]')

            # Get all the mergers on the page
            mergers = driver.find_elements(By.CLASS_NAME, 'test')

            # Loop through all the mergers on the current page
            for merger in mergers:

                # Get the 'Case Number' of the merger
                case_number = merger.find_element(By.XPATH, './/p/span/strong/a[1]').text

                # Get all the companies names
                sel_companies = merger.find_elements(By.XPATH, './/p/span/strong/a[position()>=2]')
                companies = [sel_company.text for sel_company in sel_companies]

                # Check if the merger has a notification date and if so get the notification date
                # The elements are looked up once; the first match is the one find_element would return
                sel_notification_dates = merger.find_elements(By.XPATH, './/*[@class="details"]/tbody/tr[./td[contains(text(),"Notification on")]]/td[2]')
                if sel_notification_dates:
                    notification_date = sel_notification_dates[0].text
                else:
                    notification_date = 'N/A'

                # Get all the 'NACE' codes
                sel_nace_codes = merger.find_elements(By.XPATH, './/*[@class="details"]/tbody/tr[./td[contains(text(),"NACE")]]/td[2]/a')
                nace_codes = [sel_nace_code.text for sel_nace_code in sel_nace_codes]

                decisions = []
                # Get all 'Decisions'
                sel_decisions = merger.find_elements(By.XPATH, './/*[@id="decisions"]/tbody/tr[./td[2]/strong]')

                # Loop through all 'Decisions'
                # The index counts every decision row (also the skipped ones) because it is used in the XPath below
                for sel_decision_index, sel_decision in enumerate(sel_decisions):

                    # Only select 'Decisions' containing the text 'Art'
                    sel_decision_types = sel_decision.find_elements(By.XPATH, './/td[2]/strong[contains(text(),"Art")]')
                    if not sel_decision_types:
                        continue

                    # Get the decision type
                    decision_type = sel_decision_types[0].text
                    # Get the decision date
                    decision_date = sel_decision.find_element(By.XPATH, './/td[1]/strong').text

                    decision_texts = []
                    # Get all the decision texts
                    # Selects the first following row with links that has exactly 'sel_decision_index+1' decision rows before it
                    sel_decision_texts = sel_decision.find_elements(By.XPATH, './/following-sibling::tr[./td[3]/table/tbody/tr[*]/td[2]/a and count(preceding-sibling::tr[./td[2]/strong])='+str(sel_decision_index+1)+'][1]/td[3]/table/tbody/tr[*]/td[2]/a')

                    # Loop through all decision texts (the list is empty if no decision text was found)
                    for sel_decision_text in sel_decision_texts:

                        link = sel_decision_text.get_attribute("href")
                        language = sel_decision_text.text

                        language, text = pdf_handler(link, language)

                        # Append the decision text information to the list 'decision_texts'
                        decision_texts.append({
                            'language': language,
                            'link': link,
                            'text': text
                        })

                    # Append the decision information to the list 'decisions'
                    decisions.append({
                        'decision type': decision_type,
                        'decision date': decision_date,
                        'decision texts': decision_texts
                    })

                # Append the merger information to the list 'mergers'
                data['mergers'].append({
                    'case number': case_number,
                    'companies': companies,
                    'notification date': notification_date,
                    'NACE': nace_codes,
                    'decisions': decisions
                })

                # Update the progress bar
                pbar.update(1)

            # Check if the last site was reached
            if driver.find_elements(By.XPATH, '//*[@id="BodyContent"]/table[3]/tbody/tr[1]/td[@style="display:none"]/input[@value="Next"]'):
                last_site = True
            else:
                driver.find_element(By.XPATH, '//*[@id="BodyContent"]/table[3]/tbody/tr[1]/td[3]/input').click()
                time.sleep(3)

    finally:
        # Close progress bar (it only exists once the number of mergers is known)
        if pbar is not None:
            pbar.close()

        # Quit the driver even if the scraping failed
        # quit() ends the whole browser session, close() would only close the current window
        driver.quit()

    print('----------------------------------------------------')
    print('Writing data to JSON-File ...Please wait')
    print('----------------------------------------------------')

    # Write all the scraped data in the JSON-file
    with open('data' + date_time + '.json', 'w') as outfile:
        json.dump(data, outfile, indent = 4)

    print('----------------------------------------------------')
    print('JSON-File successfully saved')
    print('----------------------------------------------------')

    # delete corrupted PDF-files from the 'tmp'-folder
    folder_path = './tmp'
    for f in os.listdir(folder_path):
        if f.endswith('.pdf'):
            os.remove(os.path.join(folder_path, f))

    print('----------------------------------------------------')
    print('Script sucessfully terminated')
    print('----------------------------------------------------')
    


In [None]:
# Start the scraper without a 'Case Number' filter (the default argument 'none' leaves the filter field empty)
eu_merger_scraper()

----------------------------------------------------
Starting the script ...Please wait
----------------------------------------------------


  3%|▎         | 237/8489 [26:40<9:03:08,  3.95s/it]  