# DB_COMP Web Scraper

## Overview
This code contains a web scraper for the [DB_COMP](https://db-comp.eu/) website, which hosts decisions issued by the European Commission on Competition Law and the Digital Markets Act. The scraper is designed to extract and process relevant information from the website, providing an efficient way to collect and analyze these decisions.

## Features
- Scrapes the paginated list of decisions returned by the DB_COMP website's search.
- Extracts key information for each decision: title, category, release date, and URL.
- Stores the extracted data in a pandas DataFrame and exports it to a CSV file for easy analysis.

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select

import time 
from bs4 import BeautifulSoup
import csv
import pandas as pd
import tqdm
import requests
import os
import re
from datetime import datetime

In [None]:
# Launch Chrome and open the DB_COMP landing page.
driver = webdriver.Chrome()
driver.get("https://db-comp.eu/")

# Wait explicitly (up to 15 s) for each element to become clickable instead of
# sleeping for a fixed 4 s: a fixed pause breaks on slow page loads and wastes
# time on fast ones.
wait = WebDriverWait(driver, 15)

# Dismiss the cookie-consent banner first.
wait.until(EC.element_to_be_clickable((By.LINK_TEXT, "Accept all"))).click()

# Click the form controls with ids "1", "2", "3" and "5", in that order
# (presumably the search filters -- confirm against the page markup).
for control_id in ("1", "2", "3", "5"):
    wait.until(EC.element_to_be_clickable((By.ID, control_id))).click()

# Submit the search.
wait.until(
    EC.element_to_be_clickable((By.ID, "plgslt_Slot_Main_3_search"))
).click()

In [None]:
# Accumulators for the scraped fields: one entry per decision, kept aligned
# index-by-index across the four lists.
l_link = []
l_titulo = []
l_categories = []
l_release_date = []

# Upper bound on the number of result pages to visit.
MAX_PAGES = 69

for _ in range(MAX_PAGES):

    # Parse the results page currently shown in the browser.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.find("div", {"id": "plgslt_Slot_Main_3_results"})
    if table is None:
        # Nothing to parse on this page; stop and keep what was collected
        # instead of failing with an AttributeError on None.
        print("Results container not found; stopping pagination.")
        break

    for row in table.find_all('div', class_='result'):
        # Extract all four fields before appending anything, so the lists
        # stay the same length even if one lookup raises.
        link = row.a['href']
        titulo = row.find('p', class_='title').text.strip()
        categories = row.find('div', class_='categories').text.strip()
        release_date = row.find('p', class_='release_date').text.strip()

        l_link.append(link)
        l_titulo.append(titulo)
        l_categories.append(categories)
        l_release_date.append(release_date)

    # find_elements returns an empty list (rather than raising) when there is
    # no "Next" link, which is how the last page is detected.
    next_links = driver.find_elements(By.LINK_TEXT, "Next")
    if not next_links:
        break
    next_links[0].click()
    time.sleep(3)  # give the next page of results time to render



In [None]:
# Assemble the scraped fields into a table with one row per decision.
column_names = ['Title', 'Category', 'Release Date', 'Link']
records = list(zip(l_titulo, l_categories, l_release_date, l_link))
df = pd.DataFrame(records, columns=column_names)
df

In [None]:
# Clean up the release-date column: remove the literal "Date: " text.

# regex=False forces a literal (non-regex) replacement on every pandas
# version; the default value of `regex` changed from True to False in 2.0.
df['Release Date'] = df['Release Date'].str.replace('Date: ', '', regex=False)

def transform_date(date_str):
    """Rewrite "D Month YYYY" dates inside *date_str* as "DD/MM/YYYY".

    Every occurrence of a 1-2 digit day, an alphabetic word and a 4-digit
    year, each separated by a single whitespace character, is considered.
    The month is identified by the first three letters of the word, ignoring
    case, so "Mar", "March" and "MARCH" all become "03".

    Args:
        date_str: Text that may contain dates such as "7 March 2024".

    Returns:
        The text with each recognized date replaced. Matches whose word is
        not a month name are left untouched, and non-string values (for
        example NaN or None) are returned unchanged.
    """
    # re.sub raises TypeError on non-strings; pass such values through.
    if not isinstance(date_str, str):
        return date_str

    # Day, month word and year, each captured in its own group.
    pattern = r"(\d{1,2})\s([A-Za-z]+)\s(\d{4})"

    # Three-letter month abbreviation -> two-digit month number.
    month_map = {
        'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
        'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
        'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12',
    }

    def replace_date(match):
        # Normalize case so "march" and "MARCH" hit the same key as "March".
        month = month_map.get(match.group(2)[:3].capitalize())
        if month is None:
            # Not a month name (e.g. "3 items 2020"): keep the text as-is
            # instead of raising KeyError.
            return match.group(0)
        day = match.group(1).zfill(2)  # zero-pad single-digit days
        year = match.group(3)
        return f"{day}/{month}/{year}"

    return re.sub(pattern, replace_date, date_str)

# Run transform_date over every cell of the release-date column and store
# the converted values back in place.
release_dates = df['Release Date']
df['Release Date'] = release_dates.apply(transform_date)

In [None]:
# Write the final table to disk as CSV, with a header row and without the
# DataFrame index column.
output_path = r"D:\Proyectos\db_comp\database.csv"
df.to_csv(output_path, index=False, header=True)