# Text diff

Find the differences in the text between two crawls of richtlijnendatabase.nl.

In [1]:
import csv
import gzip
import pandas as pd

In [2]:
def read_paragraphs_file(file_name, gzip_flag=False):
    """Read a CSV file into a dict keyed by the first cell of each row.

    Args:
        file_name: path of the CSV file to read.
        gzip_flag: when true, the file is opened with `gzip.open` in text
            mode; otherwise it is opened as a plain text file.

    Returns:
        A dict mapping the first cell of every row to the list of the
        remaining cells of that row. When a key occurs more than once, the
        last row wins. Blank lines in the file are skipped.
    """
    opener = gzip.open if gzip_flag else open
    dict_out = {}
    # newline="" lets the csv module do its own newline handling, and the
    # context manager closes the file even when parsing raises.
    with opener(file_name, mode="rt", newline="") as infile:
        for row in csv.reader(infile):
            if not row:
                # csv.reader yields [] for a blank line; there is no key.
                continue
            key, *values = row
            dict_out[key] = values
    return dict_out

## 1. Extract paragraphs and compare

Part of this code is based on the section `Text segmentation` of the notebook `keyword_search.ipynb`.

In [8]:
from bs4 import BeautifulSoup
import os
import re
import przona
import sys

In [4]:
def get_paragraphs(soup):
    """Collect the non-empty texts of paragraph-like elements in a parsed page.

    Args:
        soup: parsed document; its `find_all` method is called with a
            compiled regular expression that matches the tag names `p`,
            `li` and `h` followed by one digit.

    Returns:
        A list with the text of every matched element, in the order
        `find_all` returned them. Runs of whitespace are collapsed to a
        single space and the ends are stripped; elements whose text is
        empty after that are left out.
    """
    tag_pattern = re.compile(r"^(p|li|h[0-9])$")
    paragraphs = []
    for paragraph in soup.find_all(tag_pattern):
        # Raw string: "\s" in a plain string literal is an invalid escape.
        text = re.sub(r"\s+", " ", paragraph.text).strip()
        if text:
            paragraphs.append(text)
    return paragraphs


def read_file(file_name):
    """Return the complete contents of the text file `file_name` as one string.

    The file is opened in text mode with the platform default encoding and
    is closed again even when reading fails.
    """
    with open(file_name, "r") as file_id:
        return file_id.read()


def process_files(target_dir):
    """Extract the paragraphs of every `.html` file below `target_dir`.

    Args:
        target_dir: directory that is walked recursively with `os.walk`.

    Returns:
        A dict mapping each file's location, with the leading `target_dir`
        removed, to the list of paragraphs that `get_paragraphs` extracts
        from the file.
    """
    paragraphs = {}
    counter = 0
    for root, _dirs, files in os.walk(target_dir):
        for file_name in files:
            # Literal suffix test: the pattern '.html$' used before also
            # accepted names such as 'fooxhtml' because '.' matches any
            # character.
            if not file_name.endswith(".html"):
                continue
            counter += 1
            if counter % 100 == 0:
                # NOTE(review): presumably progress reporting; przona is not
                # defined in this file -- confirm.
                przona.squeal(counter)
            file_location = os.path.join(root, file_name)
            web_page_text = read_file(file_location)
            # Put a space after every '>' so that the texts of adjacent
            # elements are not glued together.
            web_page_text_with_spaces = web_page_text.replace(">", "> ")
            soup = BeautifulSoup(web_page_text_with_spaces)
            # Strip target_dir as a literal prefix. Using it as a regular
            # expression lets '.' match any character and removes every
            # occurrence, not only the leading one.
            file_location_relative = file_location
            if file_location.startswith(target_dir):
                file_location_relative = file_location[len(target_dir):]
            paragraphs[file_location_relative] = get_paragraphs(soup)
    przona.squeal(counter)
    return paragraphs

In [5]:
def remove_duplicates(links_in):
    """Resolve ambiguous links by preferring the candidate adjacent to a neighbour.

    Args:
        links_in: list with one list of integer indices per paragraph.

    Returns:
        A new list of the same length. An entry with several candidates is
        reduced to a single index when the entry before it is a single index
        and that index plus one is among the candidates, or when the entry
        after it is a single index and that index minus one is among the
        candidates. All other entries are passed through unchanged.
    """
    links_out = []
    # Forward sweep: continue the sequence started by the previous entry.
    for candidates in links_in:
        if (
            len(candidates) > 1
            and links_out
            and len(links_out[-1]) == 1
            and links_out[-1][0] + 1 in candidates
        ):
            links_out.append([links_out[-1][0] + 1])
        else:
            links_out.append(candidates)
    # Backward sweep: the last entry has no successor, so start one before it.
    for position in reversed(range(len(links_out) - 1)):
        current = links_out[position]
        following = links_out[position + 1]
        if len(current) > 1 and len(following) == 1 and following[0] - 1 in current:
            links_out[position] = [following[0] - 1]
    return links_out

In [6]:
def find_gaps(links, links_old, paragraphs1, paragraphs2):
    """Describe the paragraphs that have no link to the other version.

    Args:
        links: per paragraph of `paragraphs1`, the list of linked indices
            into `paragraphs2`.
        links_old: per paragraph of `paragraphs2`, the list of linked
            indices into `paragraphs1`.
        paragraphs1: paragraph texts indexed like `links`.
        paragraphs2: paragraph texts indexed like `links_old`.

    Returns:
        A string with one line per reported difference, tagged `[REPLACED]`,
        `[INSERTED]`, `[NEW]` or `[REMOVED]`; empty when nothing is reported.
    """
    report = []
    replaced_ids = set()
    last_index = len(links) - 1
    for index, targets in enumerate(links):
        if len(targets) != 0:
            continue
        # True when both neighbours exist and each has exactly one link.
        between_single_links = (
            0 < index < last_index
            and len(links[index - 1]) == 1
            and len(links[index + 1]) == 1
        )
        if between_single_links and links[index - 1][0] == links[index + 1][0] - 2:
            # The neighbours skip exactly one index in paragraphs2: pair
            # this paragraph with the skipped one.
            old_index = links[index - 1][0] + 1
            replaced_ids.add(old_index)
            if not (paragraphs1[index] == "Log in" and paragraphs2[old_index] == "Inloggen"):
                report.append(f"[REPLACED] {paragraphs1[index]} [WAS] {paragraphs2[old_index]}\n")
        elif between_single_links and links[index - 1][0] == links[index + 1][0] - 1:
            # The neighbours are adjacent in paragraphs2: nothing was skipped.
            if paragraphs1[index] != "Zoek":
                report.append(f"[INSERTED] {paragraphs1[index]}\n")
        else:
            report.append(f"[NEW] {paragraphs1[index]}\n")
    for old_index, targets in enumerate(links_old):
        # Paragraphs already reported as the [WAS] side are not repeated.
        if len(targets) == 0 and old_index not in replaced_ids:
            report.append(f"[REMOVED] {paragraphs2[old_index]}\n")
    return "".join(report)

In [7]:
# Extract the paragraphs of every .html page of both crawls, keyed by the
# file location relative to the crawl directory.
# NOTE(review): the suffix 20210315 presumably is the date of the older
# crawl (15 March 2021) -- confirm.
paragraphs = process_files("../data/richtlijnendatabase.nl/richtlijn")
paragraphs_old = process_files("../data/richtlijnendatabase.nl-20210315/richtlijn")

9335


In [9]:
def process(file_name, outfile=None):
    """Print the paragraph differences between both versions of one file.

    Reads the module-level dicts `paragraphs` and `paragraphs_old`.

    Args:
        file_name: key that is looked up in both `paragraphs` and
            `paragraphs_old`.
        outfile: file object to print to. `None` (the default) prints to
            whatever `sys.stdout` is at call time.

    Raises:
        KeyError: if `file_name` is missing from either dict.
    """
    new_paragraphs = paragraphs[file_name]
    old_paragraphs = paragraphs_old[file_name]

    # Index the old paragraphs by text so identical paragraphs are found
    # without comparing every new paragraph with every old one.
    old_positions = {}
    for j, text in enumerate(old_paragraphs):
        old_positions.setdefault(text, []).append(j)

    links = []
    links_old = [[] for _ in old_paragraphs]
    for i, text in enumerate(new_paragraphs):
        # Copy so that equal paragraphs do not share one list object.
        matches = list(old_positions.get(text, ()))
        links.append(matches)
        for j in matches:
            links_old[j].append(i)

    text_out = find_gaps(
        remove_duplicates(links),
        remove_duplicates(links_old),
        new_paragraphs,
        old_paragraphs,
    )
    if text_out != "":
        print(f"{file_name} [{len(new_paragraphs)}] [{len(old_paragraphs)}]", file=outfile)
        print(text_out, file=outfile)

In [10]:
# Write the differences of every file that occurs in both crawls. The
# context manager closes the output file even when process() raises.
with open("csv/text_diff_out.txt", "w") as outfile:
    for file_name in paragraphs:
        if file_name in paragraphs_old:
            process(file_name, outfile=outfile)