#Simiscore-legal
### Comparison between 'sledgehammer' and elaborate (feature-based) approach

In [2]:
# install required dependecies
%%capture
!pip install kshingle
!pip install datasketch

In [3]:
from bs4 import BeautifulSoup
from typing import List
import datasketch
import kshingle

In [4]:
# load example data 
!gdown --id 1UIRfK_5rMqm_s7ijXuzl7RQMWXQIJpYu

with open('metadata.txt', 'r', encoding='utf-8') as ptr:
    test_metadata = ptr.read()

Downloading...
From: https://drive.google.com/uc?id=1UIRfK_5rMqm_s7ijXuzl7RQMWXQIJpYu
To: /content/metadata.txt
  0% 0.00/9.77k [00:00<?, ?B/s]100% 9.77k/9.77k [00:00<00:00, 19.3MB/s]


In [5]:
metadata_soup = BeautifulSoup(test_metadata)

In [6]:
# load example data
def extract_bibl_strings_from_file(html_doc:BeautifulSoup, only_text:bool=False) -> List[List[str]]:
    if only_text:   
        return [ ' '.join([entry.text for entry in  tag.contents if entry != '\n']) for tag in html_doc.find_all('fundstelle')]
    return [str(tag) for tag in html_doc.find_all('fundstelle')]

In [7]:
# one-line bibliographic info is part of the multi-line data under the 'bibl' tag 
def extract_oneline_belege(html_doc:BeautifulSoup)-> List[str]:
    return [tag.text for tag in html_doc.find_all('bibl')]

In [8]:
metadata_oneliner = extract_oneline_belege(metadata_soup)

In [9]:
beleg_list = extract_bibl_strings_from_file(metadata_soup, True)

# 1. 'Sledgehammer' approach:
Treat bibliographic information as string, use k-shingle and minhashing to compute similarity scores.

In [9]:
def sledgehammer_similarity_matrix(belege:List[str], max_k:int=5)-> List[List[float]]:
    minhash_table = []
    for bib_entry in belege:
        shingle_set = kshingle.shingleset_k(bib_entry, max_k)
        minhash = datasketch.MinHash(num_perm=256)
        for shingle in shingle_set:
            minhash.update(shingle.encode("utf-8"))
        minhash_table.append(minhash)
    similarity_matrix = [
                [
                    minhash_table[i].jaccard(minhash_table[j])
                    for j in range(len(minhash_table))
                ]
                for i in range(len(minhash_table))
            ]
    return similarity_matrix

In [10]:
# using long bibliographic information or one-line examples?  
simi_scores_with_full_belege = sledgehammer_similarity_matrix(beleg_list)
simi_scores_with_oneline_bibl = sledgehammer_similarity_matrix(metadata_oneliner)

simi_scores_with_full_belege == simi_scores_with_oneline_bibl

False

# 2. Feature-based approach:


In [11]:
tag_names = {tag.name for tag in metadata_soup.find_all()}
tag_names

{'aufrufdatum',
 'autor',
 'beleg',
 'belegtext',
 'bibl',
 'body',
 'datum',
 'dokument',
 'fundstelle',
 'html',
 'korpus',
 'seite',
 'stichwort',
 'textklasse',
 'titel',
 'url'}

In [None]:
for tag in metadata_soup.find_all(['aufrufdatum', 'autor', 'datum', 'dokument', 'korpus', 'seite', 'textklasse', 'titel', 'url']):
    print(tag, tag.text)

to do:

- urls parsen
- hmtl tags aus bibliographischer Einträgen filtern > welche Tags gibt es?
- Feature extrahieren


In [17]:
def extract_features_from_beleg(beleg, tags_to_extract):
    beleg_soup = BeautifulSoup(beleg)
    for tag in beleg_soup.find_all(tags_to_extract):
        yield (tag.name, tag.text)

In [24]:
dwds_tags = ['aufrufdatum', 'autor','datum','dokument', 'korpus', 'seite', 'textklasse', 'titel', 'url']