* SCB_37_Dash_Inspector_Notebook

* Issue [37](https://github.com/salgo60/SCB-Wikidata/issues/37)
* denna Notebook [SCB_37_Dash_Inspector_Notebook.ipynb](https://github.com/salgo60/SCB-Wikidata/blob/main/notebook/SCB_37_Dash_Inspector_Notebook.ipynb)

In [4]:
import time

from datetime import datetime

now = datetime.now()
timestamp = now.timestamp()

start_time = time.time()
print("Start:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

Start: 2025-11-20 13:40:03


In [6]:
# Notebook: Dash Inspector for Myndighetsregistret
# Läs in ALLA exporterade filer i underkatalogen "myndighetsreg/"
# Undersök vilka typer av bindestreck som förekommer i organisationsnummer
# Gruppera poster efter vilket Unicode-dash-tecken som används

import pandas as pd
import os
import re
from collections import defaultdict

# --- 1. Hitta alla filer i katalogen ---
DIR = 'myndighetsreg'
files = [os.path.join(DIR, f) for f in os.listdir(DIR) if f.lower().endswith('.txt')]
print('Hittade filer:', files)

# --- 2. Funktion: identifiera vilken typ av dash som används ---
DASH_MAP = {
    '-': 'ASCII hyphen (U+002D)',
    '\u2013': 'EN DASH (U+2013)',
    '\u2014': 'EM DASH (U+2014)',
    '\u2212': 'MINUS SIGN (U+2212)',
    '\u2010': 'HYPHEN (U+2010)',
    '\u2011': 'NON-BREAKING HYPHEN (U+2011)',
    '\ufe58': 'SMALL EM DASH (U+FE58)',
    '\u2043': 'HYPHEN BULLET (U+2043)'
}

ALL_DASHES = set(DASH_MAP.keys())


def find_dash_type(orgnr):
    """Returnerar en lista med vilka dash-tecken som förekommer i strängen."""
    if pd.isna(orgnr):
        return []

    s = str(orgnr)
    found = []
    for ch in s:
        if ch in ALL_DASHES:
            found.append(ch)
    return found

# --- 3. Läs in alla filer och samla dash-typer ---
groups = defaultdict(list)  # key = dash type, value = list of rows

for path in files:
    try:
        df = pd.read_csv(path, sep='\t', dtype=str, encoding='utf-8')
    except Exception as e:
        print('Fel vid läsning av', path, e)
        continue

    col = None
    for c in df.columns:
        if 'org' in c.lower():  # heuristik
            col = c
            break

    if not col:
        print('Ingen organisationsnummerkolumn hittad i', path)
        continue

    for i, row in df.iterrows():
        org = row[col]
        dash_types = find_dash_type(org)
        if not dash_types:
            groups['INGET DASH'].append((path, org))
        else:
            for d in dash_types:
                groups[DASH_MAP.get(d, f'UNKNOWN ({d})')].append((path, org))

# --- 4. Sammanställning ---
print("\n=== Sammanfattning av dash-typer ===")
for dash, rows in groups.items():
    print(f"{dash}: {len(rows)} poster")

# --- 5. Skapa en DataFrame för export ---
summary_rows = []
for dash, rows in groups.items():
    for path, org in rows:
        summary_rows.append({'dash_type': dash, 'file': path, 'orgnr': org})

summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv('myndighetsreg/dash_analysis.csv', index=False)
print("\nExporterat: myndighetsreg/dash_analysis.csv")


FileNotFoundError: [Errno 2] No such file or directory: 'myndighetsreg'

In [5]:
 # End timer and calculate duration
end_time = time.time()
elapsed_time = end_time - start_time# Bygg audit-lager för den här etappen

# Print current date and total time
print("Date:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
minutes, seconds = divmod(elapsed_time, 60)
print("Total time elapsed: {:02.0f} minutes {:05.2f} seconds".format(minutes, seconds))


Date: 2025-11-20 13:40:04
Total time elapsed: 00 minutes 00.69 seconds
