# 06. Criterion D: Check if all cookie vendors are mentioned in the cookie statement

In [None]:
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import sqlite3
import csv
import spacy
import requests
from bs4 import BeautifulSoup

In [None]:
def read_cookie_vendors_from_csv(path):
    """Load the cookie-name -> vendor lookup table from a CSV file.

    Args:
        path: Path of a CSV file whose header row contains the columns
            ``Cookie`` and ``Vendor``.

    Returns:
        A dict mapping each ``Cookie`` value to the ``Vendor`` value of the
        same row. If a cookie name occurs more than once, the last row wins.

    Raises:
        KeyError: If the header has no ``Cookie`` or ``Vendor`` column.
        OSError: If the file cannot be opened.
    """
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires; without it, line breaks embedded in quoted
    # fields are rewritten before the parser sees them.
    with open(path, mode='r', newline='') as file:
        return {row['Cookie']: row['Vendor'] for row in csv.DictReader(file)}

In [None]:
# Function that fetches and returns the text of a webpage
def fetch_page(url, timeout=30):
    """Download ``url`` and return its body when the response is HTML.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument. Without a timeout a single unresponsive server could
            block the run indefinitely.

    Returns:
        The response text when the request succeeds and the ``Content-Type``
        header contains ``text/html``; otherwise ``None``. Request errors
        (including timeouts and HTTP error statuses) are printed and also
        yield ``None``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

    # Check if the response is an HTML page
    if "text/html" not in response.headers.get("Content-Type", ""):
        print(f"{url} is not an HTML page.")
        return None
    return response.text

In [None]:
def accept_cookies(driver):
    """Try to click the "accept all cookies" button on the current page.

    Waits up to 10 seconds for a clickable ``<button>`` whose text is exactly
    ``Accept all`` or ``Accepteer cookies`` and clicks it.

    Args:
        driver: The Selenium WebDriver showing the page.

    Returns:
        True if the button was found and clicked, False if anything went
        wrong (for example, no matching button appeared in time).
    """
    try:
        # Wait for the cookie pop-up to appear and accept it
        wait = WebDriverWait(driver, 10)
        accept_button = wait.until(EC.element_to_be_clickable(
            (By.XPATH, "//button[text()='Accept all' or text()='Accepteer cookies']")))
        accept_button.click()
    except Exception as e:
        # Deliberately broad: this is a best-effort step and must not abort
        # the run. Report the problem first, then signal failure.
        print(f"An error occurred while trying to accept cookies: {e}")
        return False
    return True

In [None]:
def get_cookies(driver, wait_seconds=5):
    """Return the cookies reported by ``driver`` after a short pause.

    Args:
        driver: Object exposing a ``get_cookies()`` method (the Selenium
            WebDriver in this notebook).
        wait_seconds: Seconds to sleep before reading the cookies, giving the
            page time to set them. Defaults to 5.

    Returns:
        Whatever ``driver.get_cookies()`` returns.
    """
    # Wait some time for cookies to be set
    time.sleep(wait_seconds)
    return driver.get_cookies()

In [None]:
def match_cookies_with_vendors(cookies, cookie_vendors):
    """Look up the vendor of every cookie whose name is a known key.

    Args:
        cookies: Iterable of mappings, each with a ``'name'`` entry.
        cookie_vendors: Mapping from cookie name to vendor.

    Returns:
        A list with one vendor per recognised cookie, in cookie order.
        Cookies with unknown names are skipped; a vendor appears once for
        each of its cookies.
    """
    cookie_names = (cookie['name'] for cookie in cookies)
    return [
        cookie_vendors[cookie_name]
        for cookie_name in cookie_names
        if cookie_name in cookie_vendors
    ]

In [None]:
def check_vendor_mentions(cookie_statement, matched_vendors):
    """Return the vendors whose name occurs in the cookie statement.

    Matching is a case-insensitive substring search on the text exactly as
    it is passed in. No language model is needed for that, so the text is
    searched directly; this avoids loading a model on every call and places
    no limit on the length of the statement.

    Args:
        cookie_statement: Text to search.
        matched_vendors: Iterable of vendor names to look for.

    Returns:
        A list of the vendors found, in input order; duplicates in
        ``matched_vendors`` are kept.
    """
    # Lowercase the statement once instead of once per vendor.
    statement_lower = cookie_statement.lower()
    return [
        vendor for vendor in matched_vendors
        if vendor.lower() in statement_lower
    ]

In [None]:
# Build the Chrome options from a fixed list of command-line flags
options = Options()
for chrome_flag in (
    "--headless",  # Run in headless mode
    "--disable-gpu",
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
    options.add_argument(chrome_flag)

# Start Chrome through the chromedriver binary at /usr/bin/chromedriver
service = ChromeService(executable_path='/usr/bin/chromedriver')
driver = webdriver.Chrome(service=service, options=options)

In [None]:
# Combine all previously defined functions to check if all cookies are mentioned
def all_vendors_mentioned(website):
    """Check whether every recognised cookie vendor is named in the cookie statement.

    Uses the module-level ``driver`` (browser session) and ``cookie_vendors``
    (cookie name -> vendor lookup).

    Args:
        website: Mapping with the keys ``"url"`` and ``"cookie_statement_url"``.

    Returns:
        True when every vendor matched from the site's cookies is found in
        the cookie statement (also when no vendor was matched at all).
        False when a vendor is missing, the site cannot be loaded,
        ``accept_cookies`` reports failure, or the statement cannot be fetched.
    """
    url = website["url"]
    cookie_statement_url = website["cookie_statement_url"]

    # Visit the website; a page that cannot be loaded cannot be assessed
    try:
        driver.get(url)
    except WebDriverException as e:
        print(f"Error loading {url}: {e}")
        return False

    # Accept all cookies. Test the result of the call (not the function
    # object). Only an explicit False counts as failure, so a None result,
    # meaning no status was reported, lets the check continue.
    if accept_cookies(driver) is False:
        return False

    # Fetch all cookies
    cookies = get_cookies(driver)

    # Match the cookies with their vendors
    matched_vendors = match_cookies_with_vendors(cookies, cookie_vendors)

    # Fetch cookie statement
    html_content = fetch_page(cookie_statement_url)
    if html_content is None:
        return False

    # Check if vendors are mentioned in the cookie statement
    mentioned_vendors = check_vendor_mentions(html_content, matched_vendors)

    # The website is considered compliant if the number of vendors mentioned
    # equals the number of vendors matched
    return len(mentioned_vendors) == len(matched_vendors)

In [None]:
# Open the SQLite database file and create a cursor for running queries
database_path = "data/websites.db"
conn = sqlite3.connect(database_path)
cursor = conn.cursor()

In [None]:
# Read every (url, cookie_statement_url) pair from website_data and
# collect them as a list of dicts
cursor.execute("SELECT url, cookie_statement_url FROM website_data")
rows = cursor.fetchall()

websites = [
    {"url": site_url, "cookie_statement_url": statement_url}
    for site_url, statement_url in rows
]

In [None]:
# Build the cookie name -> vendor lookup from the known-cookies CSV file
known_cookies_path = "data/known_cookies.csv"
cookie_vendors = read_cookie_vendors_from_csv(known_cookies_path)

In [None]:
# Evaluate every website and store the outcome in its website_data row
update_query = """
    UPDATE website_data
    SET all_cookies_mentioned = ?
    WHERE url = ?
    """

for website in websites:
    all_cookies_mentioned = all_vendors_mentioned(website)

    # Save to database, committing after each website rather than once at the end
    update_params = (all_cookies_mentioned, website["url"])
    cursor.execute(update_query, update_params)
    conn.commit()
    