# 03. Criterion A: Cookie statement presence
This script uses the BeautifulSoup library to parse the HTML content of each URL in the database and checks for the presence of an **a** tag with an href attribute that links to a cookie statement or a privacy statement. If a direct link to a cookie statement is not found, the script checks for a privacy statement.

In [None]:
import requests
import sqlite3
import json
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [None]:
# Function that loads the keyword lists used to recognise statement links
def get_keywords(path):
    """Load the cookie and privacy keyword lists from a JSON file.

    Args:
        path: Path to a JSON file whose top-level object may contain the
            keys "cookie" and "privacy".

    Returns:
        A tuple ``(cookie_keywords, privacy_keywords)``. A key that is
        missing from the file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; relying on the platform default
    # encoding would garble (or fail on) non-ASCII keywords.
    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    return data.get("cookie", []), data.get("privacy", [])

In [None]:
# Function that fetches and returns the text of a webpage
def fetch_page(url, timeout=10):
    """Download a page and return its body if the server labels it as HTML.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        The response text, or None when the request fails, the server
        answers with an error status, or the Content-Type is not HTML.
        Failures are reported with ``print`` rather than raised.
    """
    try:
        # requests waits indefinitely unless a timeout is given, so a single
        # unresponsive server would otherwise stall the whole run.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

    # Media type names are case-insensitive, so normalise before comparing
    content_type = response.headers.get("Content-Type", "")
    if "text/html" in content_type.lower():
        return response.text

    print(f"{url} is not an HTML page.")
    return None

In [None]:
# Helper that returns the absolute URL of the first link whose text contains a keyword
def _first_matching_link(links, keywords, base_url):
    """Return the resolved href of the first link matching any keyword.

    Args:
        links: Parsed ``a`` tags, in document order.
        keywords: Strings to look for inside each link's visible text.
        base_url: URL of the page the links came from, used to resolve
            relative hrefs.

    Returns:
        The absolute URL of the first matching link, or None if no link
        text contains any of the keywords.
    """
    # The link text is lower-cased below, so lower-case the keywords as well;
    # empty keywords are dropped because "" is a substring of every text.
    needles = [keyword.lower() for keyword in keywords if keyword]
    for link in links:
        text = link.text.lower()
        if any(needle in text for needle in needles):
            return urljoin(base_url, link.get('href'))
    return None


# Function that retrieves the cookie statement for a given URL
def get_cookie_statement_url(url):
    """Find the link to a cookie statement (or, failing that, a privacy statement).

    The page at ``url`` is downloaded and every ``a`` tag that has an href
    is inspected. Links whose text contains a cookie keyword take
    precedence over links whose text contains a privacy keyword, wherever
    they appear on the page.

    Args:
        url: Address of the page to inspect.

    Returns:
        The absolute URL of the first matching link, or None when the page
        could not be fetched or no link text matches any keyword.
    """
    # fetch_page reports request errors itself and returns None, so no
    # requests exception handling is needed here.
    html_content = fetch_page(url)
    if html_content is None:
        return None

    # Parse the HTML content
    soup = BeautifulSoup(html_content, "html.parser")

    # Define keyword groups for cookie and privacy statements
    cookie_keywords, privacy_keywords = get_keywords("data/keywords_statements.json")

    # Collect the candidate links once; both searches scan the same list
    links = soup.find_all('a', href=True)

    # Try the cookie keywords first and fall back to the privacy keywords
    for keywords in (cookie_keywords, privacy_keywords):
        statement_url = _first_matching_link(links, keywords, url)
        if statement_url is not None:
            return statement_url

    return None

In [None]:
# Open the SQLite database file; `conn` and `cursor` are reused by the cells below
conn = sqlite3.connect("data/websites.db")
cursor = conn.cursor()

In [None]:
# Read every URL stored in the website_data table into a list
cursor.execute('SELECT url FROM website_data')
rows = cursor.fetchall()

# Each row is a tuple; the selected url is its first element
urls = [row[0] for row in rows]

In [None]:
# Look up the statement link for every stored URL and write each hit back to the database
for url in urls:
    cookie_statement_url = get_cookie_statement_url(url)
    if cookie_statement_url is None:
        # No statement link was returned for this URL: leave its row untouched
        continue

    cursor.execute('''
        UPDATE website_data
        SET cookie_statement_url = ?
        WHERE url = ?
        ''', (cookie_statement_url, url))
    # Commit after each update so rows already processed stay saved
    # even if a later URL raises
    conn.commit()