# 04. Criterion B: Cookie statement contains sufficient information
This script downloads the page behind each cookie statement URL, extracts its text with the BeautifulSoup library, and uses the NLP library spaCy to check whether that text contains information about cookies, their purpose, and the data they collect.

In [None]:
import requests
import json
import sqlite3
from bs4 import BeautifulSoup
import spacy

In [None]:
# Load the small spaCy pipelines for English, Dutch and German once, up front,
# so the same loaded models are reused for every page that is analysed
nlp_en = spacy.load("en_core_web_sm")
nlp_nl = spacy.load("nl_core_news_sm")
nlp_de = spacy.load("de_core_news_sm")

In [None]:
# Function that reads the keyword file and returns its three keyword groups
def get_keywords(path):
    """Load the "cookie", "purpose" and "data" keyword groups from a JSON file.

    Args:
        path: Location of the JSON keyword file.

    Returns:
        A tuple ``(cookie_keywords, purpose_keywords, data_keywords)`` holding
        the values stored under the top-level keys "cookie", "purpose" and
        "data". A key that is absent from the file yields an empty list.
        NOTE(review): callers in this file index each group by language code
        ("en", "nl", "de"), so each value is presumably a dict of lists.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; decoding it explicitly keeps non-ASCII
    # keywords intact on platforms whose default encoding is not UTF-8.
    with open(path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Extract the entries for "cookie", "purpose" and "data"
    cookie_keywords = data.get("cookie", [])
    purpose_keywords = data.get("purpose", [])
    data_keywords = data.get("data", [])

    return cookie_keywords, purpose_keywords, data_keywords

In [None]:
# Function that fetches and returns the text of a webpage
def fetch_page(url, timeout=30):
    """Download *url* and return its body as text when the response is HTML.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive server would block the run forever.

    Returns:
        The decoded response body, or None when the request fails, the server
        answers with an error status, or the Content-Type is not HTML. A
        message is printed in each of the None cases.
    """
    # Keep the try body to the calls that can raise a requests exception
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

    # Media types are case-insensitive, so compare in lower case
    content_type = response.headers.get("Content-Type", "")
    if "text/html" in content_type.lower():
        return response.text

    print(f"{url} is not an HTML page.")
    return None

In [None]:
def contains_cookie_information(html_content):
    """Check whether a page mentions cookies, their purpose and the data involved.

    The visible text is extracted from the HTML, lower-cased, and split into
    sentences by the English, Dutch and German spaCy pipelines. A keyword
    category counts as present when any of its keywords occurs as a substring
    of any sentence in the matching language's document.

    Args:
        html_content: HTML source of the page as a string.

    Returns:
        True only if all three categories ("cookie", "purpose" and "data") are
        present in at least one of the three languages; otherwise False.

    Raises:
        KeyError: If a keyword category that needs to be consulted has no
            entry for one of the languages "en", "nl" or "de".
    """
    soup = BeautifulSoup(html_content, "html.parser")
    # Lower-case once and reuse the result for all three pipelines
    text_content = soup.get_text(separator=" ").strip().lower()

    # Process the text with spaCy, one document per supported language
    docs = {
        "en": nlp_en(text_content),
        "nl": nlp_nl(text_content),
        "de": nlp_de(text_content),
    }

    keyword_groups = get_keywords("data/keywords_cookies.json")

    # Function to check keywords in sentences
    def check_keywords(doc, keywords):
        # The text was lower-cased above, so the keywords must be too;
        # otherwise a keyword containing a capital letter could never match.
        lowered = [keyword.lower() for keyword in keywords]
        return any(keyword in sent.text for sent in doc.sents for keyword in lowered)

    # A category is covered when any one language finds one of its keywords
    def mentioned_in_any_language(keywords_by_language):
        return any(
            check_keywords(doc, keywords_by_language[language])
            for language, doc in docs.items()
        )

    # The statement is sufficient only if cookie, purpose and data all appear
    return all(mentioned_in_any_language(group) for group in keyword_groups)

In [None]:
# Open the SQLite database file and create a cursor for running statements against it
conn = sqlite3.connect("data/websites.db")
cursor = conn.cursor()

In [None]:
# Fetch all cookie statement URLs from the table and collect them in a list
cursor.execute("SELECT cookie_statement_url FROM website_data")
rows = cursor.fetchall()

# Rows without a URL (NULL) cannot be fetched, and a URL shared by several rows
# only needs to be checked once; dict.fromkeys removes duplicates while keeping
# the order in which the URLs were read.
cookie_statement_urls = list(
    dict.fromkeys(row[0] for row in rows if row[0] is not None)
)

In [None]:
# Check every cookie statement URL and store the verdict in the database
for cookie_statement_url in cookie_statement_urls:
    html_content = fetch_page(cookie_statement_url)

    # A page that could not be fetched (or is empty) counts as not containing the information
    has_contains_cookie_information = (
        contains_cookie_information(html_content) if html_content else False
    )

    # Write the verdict to every row with this URL and commit straight away
    cursor.execute("""
    UPDATE website_data
    SET cookie_statement_information = ?
    WHERE cookie_statement_url = ?
    """, (has_contains_cookie_information, cookie_statement_url))
    conn.commit()