In [1]:
import re
from bs4 import BeautifulSoup

In [45]:
import json
data_file = "../ico/guidelines/guidelines/monitor_worker.json"
with open(data_file, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [57]:

def find_faq_questions_and_links(content):

    start_pos = content.find("In detail")
    
    # If "In detail" is found, slice the content to start after it
    if start_pos != -1:
        content = content[start_pos:]
    
    # Regular expression to find all <a> tags with href and the associated text. The # here ensures we only find links in the input content
    matches = re.findall(r'<a href="#([^"]+)"[^>]*>(.*?)</a>', content)

    # Initialize an empty set to track unique href references
    seen_hrefs = set()

    # Filter the list to include only valid and unique references that appear later in the document
    questions_with_references = []
    for href, question in matches:
        if href not in seen_hrefs and f'id="{href}"' in content:
            questions_with_references.append((question, href))
            seen_hrefs.add(href)

    return questions_with_references

# Usage
for page in data:
    name = page["section"]
    questions_with_references = find_faq_questions_and_links(page['content'])
    if len(questions_with_references) > 0:
        print(name + " is an FAQ page")
    else:
        print(name + " contains only content, and no FAQ")

Employment practices and data protection: monitoring workers contains only content, and no FAQ
Data protection and monitoring workers is an FAQ page
What do we need to do if we use monitoring tools that use solely automated processes? is an FAQ page
Specific data protection considerations for different ways or methods of monitoring workers is an FAQ page
Can we use biometric data for time and access control and monitoring? is an FAQ page
Checklists contains only content, and no FAQ


In [58]:
def subdivide_content_into_faq(content, questions_with_references):

    # this is the pattern of the question in the body of the content
    section_pattern = r'<a id=\"{}\"></a>.*?'

    # List to store the subsections
    subsections = []

    # Keep track of the last position in the content but only start the search after the heading "In detail" if it exists
    last_pos = 0

    for question, href in questions_with_references:
        # Compile the pattern for the current href
        pattern = re.compile(section_pattern.format(href))
        
        # Find the section start position
        match = pattern.search(content, last_pos)
        
        if match:
            # Get the position of the match
            start_pos = match.start()
            
            if last_pos == 0 and start_pos > 0:
                # Capture text before the first section if there is any
                subsections.append(content[last_pos:start_pos].strip())
            
            # Find the next occurrence of any section or the end of the content
            next_section_match = None
            for next_question, next_href in questions_with_references:
                if next_href != href:
                    next_pattern = re.compile(section_pattern.format(next_href))
                    next_section_match = next_pattern.search(content, start_pos + len(match.group()))
                    if next_section_match:
                        break
            
            if next_section_match:
                end_pos = next_section_match.start()
            else:
                end_pos = len(content)
            
            # Extract the subsection
            subsection = content[start_pos:end_pos].strip()
            subsections.append(subsection)
            
            # Update the last position
            last_pos = end_pos

    # Capture any remaining content after the last section
    if last_pos < len(content):
        subsections.append(content[last_pos:].strip())

    # The variable subsections now contains the list of content parts split by the questions_with_references
    return subsections

In [68]:
page = data[4]
questions_with_references = find_faq_questions_and_links(content)
print(len(questions_with_references))
subsections = subdivide_content_into_faq(content, questions_with_references)
print(len(subsections))


0
1


In [73]:
questions_with_references = find_faq_questions_and_links(data[4]["content"])
questions_with_references


[('What is biometric data?', 'can1'),
 ('When might we use biometric data for time and attendance control and monitoring?',
  'can2'),
 ('What are access controls?', 'can3'),
 ('What is biometric attendance monitoring?', 'monitoring'),
 ('How do we determine if using biometric data for access control is necessary and proportionate?',
  'can4'),
 ('What lawful basis and condition for processing can we rely on when using biometric data?',
  'can5'),
 ('Do we need to carry out a data protection impact assessment (DPIA)?',
  'can6'),
 ('What about accuracy, fairness and rights relating to automated decision-making?',
  'can7'),
 ('What do we need to tell workers about biometric data and access controls?',
  'can8'),
 ('Can workers object to the use of biometric data for access control?',
  'can9'),
 ('<span>What about the security of biometric data?</span>', 'can10'),
 ('Checklist', 'can11')]

In [76]:
# create subdivided text

subdivided_data = []
for page in data:    
    section = page["section"]
    url = page["url"]
    content = page['content']
    questions_with_references = find_faq_questions_and_links(content)
    if len(questions_with_references) == 0:
        entry = {
            "section": section,
            "url": url,
            "content": content,
        }
        subdivided_data.append(entry)
    else:
        subsections = subdivide_content_into_faq(content, questions_with_references)
        if len(subsections) != len(questions_with_references) + 1:            
            print("Could not find links for every FAQ in the ToC for page: " + section)
            break
        else:
            # entry for the ToC
            entry = {
                "section": section,
                "url": url,
                "content": subsections[0],
            }
            subdivided_data.append(entry)

            for i in range(0, len(questions_with_references)):
                question, href = questions_with_references[i]
                entry = {
                    "section": section,
                    "subsection": question,
                    "url": url + "#" + href,
                    "content": subsections[i+1],
                }
                subdivided_data.append(entry)
    
    
# Convert the list to JSON format
# subdivided_data = json.dumps(data, indent=4)

# Optionally, save it to a file

split_data_file = "../ico/guidelines/guidelines/monitor_worker_split.json"
with open(split_data_file, 'w', encoding='utf-8') as f:
    json.dump(subdivided_data, f, indent=4)
    


In [77]:
def extract_visible_text(html_content):
    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Replace <p> tags with double newlines
    for p in soup.find_all('p'):
        p.insert_after('\n\n')

    # Get the visible text
    visible_text = soup.get_text()

    # Normalize whitespace and remove extra newlines
    visible_text = '\n\n'.join([line.strip() for line in visible_text.splitlines() if line.strip()])

    return visible_text

In [86]:
subdivided_data[5]['subsection']
# "subsection": "How do we identify a lawful basis?",
print(extract_visible_text(subdivided_data[5]['content']).replace("\n\n", "\n"))

How do we identify a lawful basis?
How you decide which lawful basis applies depends on your specific purpose and the context of the monitoring. You must think about why you want to monitor workers. You must identify which lawful basis best fits the circumstances. We have listed the available lawful bases below, along with some guidance to help you identify the right basis for your circumstances. You can also use our interactive guidance tool to help you. Carrying out a data protection impact assessment (DPIA) may also help you to identify the most appropriate basis.
You must not adopt a one-size-fits-all approach. No one basis is always better, safer or more important than the others. However, some are likely to be more appropriate than others for employers. We highlight some of these below.
Sometimes, more than one basis might apply. You should identify all those that apply, and document them from the start. Try to get it right first time, as you should not change it later without go

In [66]:
questions_with_references[0]

IndexError: list index out of range

In [27]:
from IPython.display import Markdown, display
display(Markdown(subsections[24]))    


<a id="dp24"></a>Checklist</h2>
<div class="example example-letter">
<p>□ We have checked that the monitoring of workers is necessary for the purpose we have identified. We are satisfied there is no other reasonable and less intrusive way to achieve that purpose.</p>
<p>□ We have considered whether we need to do a DPIA and either completed one or documented the reason we considered one wasn’t required.</p>
<p>□ When making our DPIA decision, we have considered seeking the views of workers and representatives and either done this or documented our decision not to.</p>
<p>□ We have identified a lawful basis for monitoring workers.</p>
<p>□ Where required, we have identified an appropriate special category condition for monitoring workers if we’re likely to capture any special category data as part of our monitoring.</p>
<p>□ We have documented what personal information we are processing when we monitor workers.</p>
<p>□ Where required, we have an appropriate policy document in place.</p>
<p>□ We have included specific information about monitoring workers in our privacy information so that workers are aware of any monitoring taking place. We have made sure that this information is readily accessible to workers.</p>
<p>□ We have considered whether the risks associated with monitoring workers affects our other obligations around data minimisation, security, and appointing Data Protection Officers (DPOs) and representatives.</p>
<p>□ We have considered data protection issues as part of the design and implementation of monitoring systems and practices, including where we use external suppliers for monitoring technology, and where we use the functionalities built into communication and collaboration work tools.</p>
<p>□ Where necessary, we have considered the rules for international transfers.</p>
</div>
<p>You can also view and print off this checklist and all the checklists of this guidance on our <a rel="noopener" data-udi="umb://document/c7011a82892041979f6979eaedc8b603" href="/for-organisations/uk-gdpr-guidance-and-resources/employment/monitoring-workers/checklists/#_Checklists" target="_blank" title="Checklists" data-anchor="#_Checklists" class="link-external">checklists page<span class="invisible"></span></a>.</p>
        </div>