<a href="https://colab.research.google.com/github/ronyates47/Gedcom-Utils/blob/main/03_2_3_2025_1250.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas
!pip install python-gedcom
!pip install openpyxl
!pip install xlsxwriter
!pip install mlxtend


Collecting python-gedcom
  Downloading python_gedcom-1.0.0-py2.py3-none-any.whl.metadata (15 kB)
Downloading python_gedcom-1.0.0-py2.py3-none-any.whl (35 kB)
Installing collected packages: python-gedcom
Successfully installed python-gedcom-1.0.0
Collecting xlsxwriter
  Downloading XlsxWriter-3.2.2-py3-none-any.whl.metadata (2.8 kB)
Downloading XlsxWriter-3.2.2-py3-none-any.whl (165 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m165.1/165.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xlsxwriter
Successfully installed xlsxwriter-3.2.2


In [6]:
# 03_2_1-2025_1823

#!/usr/bin/env python
"""
GEDCOM Composite Score Script using:
 - Chunk-based Parallel Processing for Speed
 - Anchor-suppression to avoid individual's own ID
 - **FP-Growth**-based Shared Segment Detection (instead of Apriori),
   which avoids repeated passes over the entire dataset

Exports final CSV/HTML sorted by "Yates DNA Ancestral Line".
"""

import csv
import glob
import logging
import functools
import os
from datetime import datetime
from collections import defaultdict, Counter
import numpy as np
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm

# NEW: MLxtend library for FP-Growth
# Make sure you've done: pip install mlxtend
from mlxtend.frequent_patterns import fpgrowth

# Root logging configuration: INFO level, timestamped "time - LEVEL - message" lines.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

###############################################################################
# Global Variables
###############################################################################
# Overwritten by GedcomDataset.get_gen_person() with the "<Surname><GivenName>"
# string of whichever dataset was queried most recently.
anchor_gen1 = None
# Scratch state for find_parents(): (father_id, mother_id) pairs already recorded.
# process_record_wrapper() resets it before every individual.
visited_pairs = set()
# Scratch state for find_parents(): list of (generation, (father_id, mother_id))
# entries. process_record_wrapper() resets it before every individual.
generation_table = []

###############################################################################
# Utility: chunk generator
###############################################################################
def chunks(lst, n):
    """Lazily produce consecutive slices of ``lst``, each at most ``n`` items long."""
    total = len(lst)
    yield from (lst[offset:offset + n] for offset in range(0, total, n))

###############################################################################
# GedcomDataset
###############################################################################
class GedcomDataset:
    """
    Container for the tag values extracted from one GEDCOM individual record.

    Values are stored per tag in ``extractable_detail``. A tag may be stored
    with a ``None`` value (a GEDCOM line that carries a tag but no text); every
    accessor treats such a value the same as a missing tag, i.e. as ''.
    """

    def __init__(self, gen_person):
        # Record identifier as it appears in the file, e.g. '@I123@'.
        self.gen_person = gen_person
        # tag -> raw value (str or None).
        self.extractable_detail = {}
        # "<Surname><GivenName>" string; populated by get_gen_person().
        self.anchor_gen1 = None

    def add_extractable_detail(self, key, value):
        """Store ``value`` under tag ``key``, replacing any previous value."""
        self.extractable_detail[key] = value

    def _detail(self, key):
        """Return the stored value for ``key`` as a string; '' if missing or None."""
        return self.extractable_detail.get(key) or ''

    def get_gen_person(self):
        """
        Return the record ID without the surrounding '@' characters.

        Side effect: recomputes ``self.anchor_gen1`` from the NAME value
        (surname followed by the first given name, spaces removed) and copies
        it into the module-level ``anchor_gen1``.
        """
        name = self._detail('NAME')
        given_part, _, surname_part = name.partition('/')
        first_name = given_part.split(' ')[0]
        last_name = surname_part.rstrip('/')
        self.anchor_gen1 = last_name.replace(" ", "") + first_name.replace(" ", "")
        global anchor_gen1
        anchor_gen1 = self.anchor_gen1
        return self.gen_person.strip('@')

    def get_anchor_gen1(self):
        """Return the anchor string computed by the last get_gen_person() call."""
        return self.anchor_gen1

    def get_extractable_NPFX(self):
        """Return the raw NPFX value, or '' when absent."""
        return self._detail('NPFX')

    def get_extractable_cm(self):
        """
        Extract cM from NPFX field. If NPFX has a format like "175&someSort**someYDNA",
        the cM is '175'. If it doesn't parse cleanly as an integer, returns blank.
        """
        npfx_value = self._detail('NPFX')
        if '&' in npfx_value:
            cm_value = npfx_value.split('&')[0].strip()
        elif '**' in npfx_value:
            cm_value = npfx_value.split('**')[0].strip()
        else:
            cm_value = npfx_value.strip()
        try:
            int(cm_value)
        except ValueError:
            return ''
        # The validated text is returned as-is (a string, not an int).
        return cm_value

    def get_extractable_sort(self):
        """
        If NPFX has "xxx&sortVal**ydnaVal", returns sortVal. If not found, blank.
        """
        npfx_value = self._detail('NPFX')
        if '&' not in npfx_value:
            return ''
        sort_part = npfx_value.split('&')[1]
        if '**' in sort_part:
            sort_part = sort_part.split('**')[0]
        return sort_part.strip()

    def get_extractable_YDNA(self):
        """
        If NPFX has something like "...**ydnaVal", return ydnaVal. If not found, blank.
        """
        npfx_value = self._detail('NPFX')
        if '**' in npfx_value:
            return npfx_value.split('**')[1].strip()
        return ''

    def get_extractable_FAMC(self):
        """Return the FAMC family ID without '@' characters, or '' when absent."""
        return self._detail('FAMC').strip('@')

###############################################################################
# Gedcom Class
###############################################################################
class Gedcom:
    """
    Line-oriented reader that collects NAME / FAMC / NPFX values for every
    individual (INDI) record of a GEDCOM file.

    After parse_gedcom():
      * ``gedcom_datasets`` holds one GedcomDataset per INDI record;
      * ``filter_pool`` holds the subset with a non-empty NPFX value, further
        restricted to the IDs listed in 'filtered_ids.xlsx' when that file exists.
    """

    def __init__(self, file_name):
        self.file_name = file_name
        self.gedcom_datasets = []
        self.filter_pool = []

    def parse_gedcom(self):
        """
        Read ``self.file_name`` and populate ``gedcom_datasets`` and ``filter_pool``.

        Blank lines and lines that do not start with "<integer level> <tag>" are
        skipped (and counted in a warning) instead of aborting the parse.
        Prints summary counts to stdout.
        """
        current_dataset = None
        npfx_count = 0
        ydna_count = 0
        total_count = 0
        skipped_lines = 0

        with open(self.file_name, 'r', encoding='utf-8-sig') as f:
            for line in f:
                stripped = line.strip()
                parts = stripped.split(' ', 2)
                try:
                    level = int(parts[0])
                    tag = parts[1]
                except (ValueError, IndexError):
                    # Blank line, or text that is not a "<level> <tag> ..." line.
                    if stripped:
                        skipped_lines += 1
                    continue
                value = parts[2] if len(parts) > 2 else None

                if level == 0:
                    if tag.startswith('@') and tag.endswith('@') and value == 'INDI':
                        total_count += 1
                        current_dataset = GedcomDataset(tag)
                        self.gedcom_datasets.append(current_dataset)
                    else:
                        # Any other top-level record ends the current individual;
                        # without this, a later record's "1 NAME" line would
                        # overwrite the previous individual's NAME.
                        current_dataset = None
                elif current_dataset is not None:
                    if level == 1 and tag in ['NAME', 'FAMC']:
                        current_dataset.add_extractable_detail(tag, value)
                    elif level == 2 and tag == 'NPFX':
                        npfx_count += 1
                        current_dataset.add_extractable_detail(tag, value)
                        # value is None when the NPFX line carries no text.
                        if value and '**' in value:
                            ydna_count += 1

        if skipped_lines:
            logger.warning("Skipped %d malformed GEDCOM line(s).", skipped_lines)

        autosomal_count = npfx_count - ydna_count
        print(f"GEDCOM contained {total_count} total records")
        print(f"Records tagged and filtered by NPFX: {npfx_count}")
        print(f"Records with YDNA information: {ydna_count}")
        print(f"Autosomal matches: {autosomal_count}")

        # First-level filter: keep only individuals that carry an NPFX value.
        for ds in self.gedcom_datasets:
            if ds.get_extractable_NPFX():
                self.filter_pool.append(ds)

        # Optional second-level filter
        manual_filter_activated = True
        if manual_filter_activated:
            try:
                df = pd.read_excel('filtered_ids.xlsx')
            except FileNotFoundError:
                logger.warning("filtered_ids.xlsx not found. Skipping second-level manual filter.")
            else:
                # The spreadsheet must have an 'ID' column holding record IDs
                # without '@' (the form returned by get_gen_person()).
                manual_filtered_ids = set(df['ID'])
                self.filter_pool = [
                    d for d in self.filter_pool if d.get_gen_person() in manual_filtered_ids
                ]
                print(f"After manual filter, total records: {len(self.filter_pool)}")
                logger.info(f"After manual filter, total records: {len(self.filter_pool)}")

###############################################################################
# quick_extract_name
###############################################################################
def quick_extract_name(full_text):
    """
    Extract a compact "<Surname><Given>" label from the text of a GEDCOM record.

    Uses the first "1 NAME " line found, either after a newline or at the very
    start of ``full_text``. Returns "UnknownName" when there is no such line.
    When the name contains no '/', the first 10 characters of the whole name
    (spaces removed) are returned. Otherwise the surname part and the given part
    are each cut to 10 characters, stripped of spaces, and concatenated
    surname first.
    """
    name_marker = "\n1 NAME "
    bare_marker = "1 NAME "
    idx = full_text.find(name_marker)
    if idx != -1:
        start = idx + len(name_marker)
    elif full_text.startswith(bare_marker):
        # No leading newline here, so skip only the bare marker; using the
        # newline-prefixed marker's length would drop the name's first character.
        start = len(bare_marker)
    else:
        return "UnknownName"

    end = full_text.find('\n', start)
    if end == -1:
        end = len(full_text)
    name_line = full_text[start:end].strip()
    if '/' not in name_line:
        return name_line[:10].replace(" ", "")
    first_name, last_name = name_line.split('/', 1)
    last_name = last_name.replace("/", "").strip()
    return last_name[:10].replace(" ", "") + first_name[:10].replace(" ", "")

###############################################################################
# Parents, Ancestors
###############################################################################
def find_parents(individual_id, generation, parents_map):
    """
    Walk up the pedigree from ``individual_id`` and record each distinct parent pair.

    Every (father_id, mother_id) pair met for the first time is added to the
    module-level ``visited_pairs`` set and appended to the module-level
    ``generation_table`` as ``(generation, pair)``; the caller is expected to
    reset both before starting a new walk.

    :param individual_id: ID whose parents are looked up in ``parents_map``
    :param generation: generation number stored with this individual's parent pair
    :param parents_map: dict {child_id: (father_id, mother_id)}; either ID may be falsy
    """
    global visited_pairs, generation_table
    if individual_id not in parents_map:
        return
    father_id, mother_id = parents_map[individual_id]
    if not father_id and not mother_id:
        return
    pair = (father_id, mother_id)
    if pair in visited_pairs:
        # Both parents' lines were (or are being) walked when this pair was first
        # recorded, so walking them again cannot add entries. Stopping here avoids
        # repeated traversal of shared ancestors and unbounded recursion on a
        # cyclic parents_map.
        return
    visited_pairs.add(pair)
    generation_table.append((generation, pair))
    if father_id:
        find_parents(father_id, generation + 1, parents_map)
    if mother_id:
        find_parents(mother_id, generation + 1, parents_map)

def find_distant_ancestors(individual_id, parents_map, path=None):
    """
    Enumerate every ancestral path from ``individual_id`` up to an end-of-line ancestor.

    :param individual_id: starting ID
    :param parents_map: dict {child_id: (father_id, mother_id)}; either ID may be falsy
    :param path: IDs already walked (used by the recursion); extended in place
    :return: non-empty list of paths, each a list of IDs beginning with the
             starting individual; father-side paths precede mother-side paths

    A path ends at an individual who is absent from ``parents_map`` or has no
    recorded parent. If ``parents_map`` contains a cycle, the path ends just
    before the ID that would repeat instead of recursing without bound.
    """
    if path is None:
        path = []
    if individual_id in path:
        # Cycle in the data: this ID is already on the current path.
        return [path]
    path.append(individual_id)

    father_id, mother_id = parents_map.get(individual_id, (None, None))
    if not father_id and not mother_id:
        return [path]

    paths = []
    # Each branch gets its own copy of the path so the two sides stay independent.
    if father_id:
        paths.extend(find_distant_ancestors(father_id, parents_map, path[:]))
    if mother_id:
        paths.extend(find_distant_ancestors(mother_id, parents_map, path[:]))
    return paths if paths else [path]

###############################################################################
# filter_ancestral_line
###############################################################################
def filter_ancestral_line(winning_path_ids, generation_table_local, names_map):
    """
    Render the parent pairs that touch the winning path as one string.

    Keeps every (generation, (id_a, id_b)) entry where either ID is in
    ``winning_path_ids``, orders the kept entries by generation (highest
    generation first in the output), and formats each pair as
    "<name_a>&<name_b>" using ``names_map`` ("UnknownName" for unmapped IDs).
    The formatted pairs are joined with "~~~".
    """
    kept = [
        (gen_no, (id_a, id_b))
        for gen_no, (id_a, id_b) in generation_table_local
        if id_a in winning_path_ids or id_b in winning_path_ids
    ]
    ascending = sorted(kept, key=lambda entry: entry[0])

    labels = []
    for _, (id_a, id_b) in reversed(ascending):
        label_a = names_map.get(id_a, "UnknownName")
        label_b = names_map.get(id_b, "UnknownName")
        labels.append("{}&{}".format(label_a, label_b))
    return "~~~".join(labels)

###############################################################################
# process_record_wrapper (parallel)
###############################################################################
def process_record_wrapper(individual_id, gedcom_instance, parents_map, names_map):
    """
    Build one output row for ``individual_id``.

    Steps:
      1. Reset the module-level ``generation_table`` / ``visited_pairs`` and fill
         them via find_parents().
      2. Enumerate all ancestral paths and pick the one with the highest score,
         where each ancestor whose short name contains 'Yates' contributes its
         1-based position on the path. Ties go to the first such path.
      3. Format the parent pairs that touch that path (the individual's own ID
         excluded) with filter_ancestral_line().
      4. Look up the individual's cM and sort values in ``gedcom_instance.filter_pool``.

    :param individual_id: record ID without '@'
    :param gedcom_instance: object exposing ``filter_pool`` (iterable of GedcomDataset)
    :param parents_map: dict {child_id: (father_id, mother_id)}
    :param names_map: dict {record_id: short name}
    :return: [individual_id, sort_value, short_name, cm_value, line_str]
    """
    global generation_table, visited_pairs
    generation_table = []
    visited_pairs = set()

    find_parents(individual_id, 1, parents_map)
    distant_anc_paths = find_distant_ancestors(individual_id, parents_map)

    def yates_score(path):
        # Deeper 'Yates' ancestors weigh more: position 1 is the individual.
        return sum(
            position
            for position, pid in enumerate(path, start=1)
            if 'Yates' in names_map.get(pid, "UnknownName")
        )

    # max() returns the first path with the top score, matching a strict
    # "score > best so far" scan.
    best_path = max(distant_anc_paths, key=yates_score, default=[])

    # remove individual's own ID
    best_path_cleaned = [pid for pid in best_path if pid != individual_id]

    line_str = filter_ancestral_line(set(best_path_cleaned), generation_table, names_map)

    cm_value = ''
    sort_value = ''
    for ds in gedcom_instance.filter_pool:
        if ds.get_gen_person() == individual_id:
            cm_value = ds.get_extractable_cm()
            sort_value = ds.get_extractable_sort()
            break

    short_name = names_map.get(individual_id, "UnknownName")
    # Return columns: ID#, Match to, Name, cM, Yates DNA Ancestral Line
    return [individual_id, sort_value, short_name, cm_value, line_str]

###############################################################################
# FP-Growth approach for frequent itemsets
###############################################################################
def fp_growth_itemsets(transactions, min_support_count=2, min_size=2):
    """
    Find frequent itemsets with FP-Growth (mlxtend) and return their counts.

    :param transactions: list of lists (each a list of items)
    :param min_support_count: e.g., 2 means itemsets must appear at least twice
    :param min_size: smallest itemset size you want to keep
    :return: dict {tuple_of_sorted_items: frequency}, in no particular order

    Returns {} without running FP-Growth when there are no transactions or when
    ``min_support_count`` exceeds the number of transactions (no itemset can
    occur more often than there are transactions).
    """
    n_transactions = len(transactions)
    if n_transactions == 0 or min_support_count > n_transactions:
        # Also keeps the support fraction computed below from exceeding 1.
        return {}

    from mlxtend.preprocessing import TransactionEncoder

    # The support fraction needed so itemsets appear at least "min_support_count" times
    required_support = min_support_count / n_transactions

    # 1) Convert list-of-lists into one-hot encoded DataFrame
    te = TransactionEncoder()
    te_ary = te.fit_transform(transactions)
    df_bool = pd.DataFrame(te_ary, columns=te.columns_)

    # 2) Run fpgrowth
    logger.info("Running FP-Growth with support >= %.4f ...", required_support)
    results = fpgrowth(df_bool, min_support=required_support, use_colnames=True)

    # 3) Keep itemsets of at least min_size and convert the support fraction
    #    back into an occurrence count.
    freq_dict = {
        tuple(sorted(itemset)): int(round(sup_frac * n_transactions))
        for itemset, sup_frac in zip(results["itemsets"], results["support"])
        if len(itemset) >= min_size
    }

    logger.info("FP-Growth found %d itemsets of size >= %d.", len(freq_dict), min_size)
    return freq_dict

###############################################################################
# compute_value_for_line with caching
###############################################################################
@functools.lru_cache(maxsize=5000)
def compute_value_for_line_cached(line_str, sorted_segments_hashable):
    """
    Score one "~~~"-separated ancestral line against the shared segments.

    Segments are tried in the iteration order of ``sorted_segments_hashable``.
    A segment whose items are all still available in the line adds its
    frequency to the score and consumes those items; every item left over at
    the end adds 1. Because matching is greedy, the result depends on the
    order in which segments are supplied.

    :param line_str: ancestral line string; NaN/None/blank yields 0
    :param sorted_segments_hashable: hashable iterable of
        ``(tuple_of_items, frequency)`` pairs (must be hashable for lru_cache)
    :return: integer score
    """
    if pd.isna(line_str) or not line_str.strip():
        return 0

    # One running multiset of unconsumed items, updated in place, instead of
    # recounting the whole line for every segment.
    remaining = Counter(s.strip() for s in line_str.split("~~~") if s.strip())
    value = 0

    for seg, freq in dict(sorted_segments_hashable).items():
        needed = Counter(seg)
        if all(remaining[item] >= count for item, count in needed.items()):
            value += freq
            remaining.subtract(needed)

    # Counts never go negative: a segment is only subtracted when fully available.
    value += sum(remaining.values())
    return value

###############################################################################
# main()
###############################################################################
def main():
    """
    Run the whole pipeline on a GEDCOM file found in the working directory.

    1. Pick a *.ged file and parse it with Gedcom.parse_gedcom().
    2. Re-read the raw file to build ``names_map`` (ID -> short name) and
       ``parents_map`` (child ID -> (father ID, mother ID)).
    3. Compute one row per filtered individual in a process pool.
    4. Mine shared segments of the ancestral lines with FP-Growth.
    5. Score every line ("Value") and derive Z-Score, Robust Z-Score,
       Percentile Rank, Composite Score and Composite Significance.
    6. Export the result, sorted by "Yates DNA Ancestral Line", to CSV and HTML.
    """
    def select_gedcom():
        """Return the alphabetically first *.ged file name, or None if there is none."""
        # glob order depends on the filesystem; sort so "first" is reproducible.
        files = sorted(glob.glob("*.ged"))
        if not files:
            print("No GEDCOM files found.")
            return None
        print("Automatically selecting the first GEDCOM file.")
        return files[0]

    gedcom_file_path = select_gedcom()
    if not gedcom_file_path:
        print("No GEDCOM file selected; exiting.")
        return

    # 1) Parse GEDCOM
    ged = Gedcom(gedcom_file_path)
    ged.parse_gedcom()
    filter_count = len(ged.filter_pool)
    print("Records tagged and filtered by NPFX:", filter_count)

    # 2) Build parents_map, names_map from raw GEDCOM
    # Same encoding as the parser, so a leading BOM is dropped here too.
    with open(gedcom_file_path, 'r', encoding='utf-8-sig') as f:
        raw_data = f.read()

    # Each block is one level-0 record with its leading "0 " removed
    # (except the very first block of the file).
    blocks = raw_data.split('\n0 ')
    all_records = {}
    for blk in blocks:
        blk = blk.strip()
        if not blk:
            continue
        flend = blk.find('\n')
        if flend == -1:
            flend = len(blk)
        first_line = blk[:flend]
        if '@' in first_line:
            start = first_line.find('@') + 1
            end = first_line.find('@', start)
            rec_id = first_line[start:end].strip()
            all_records[rec_id] = blk

    parents_map = {}
    names_map = {}

    for rec_id, txt in all_records.items():
        nm = quick_extract_name("\n" + txt)
        names_map[rec_id] = nm

    # gather families
    families = {}
    for rec_id, txt in all_records.items():
        if 'FAM' in txt[:50]:
            father_idx = txt.find('1 HUSB @')
            if father_idx != -1:
                start = father_idx + len('1 HUSB @')
                end = txt.find('@', start)
                husb_id = txt[start:end]
            else:
                husb_id = None

            wife_idx = txt.find('1 WIFE @')
            if wife_idx != -1:
                start = wife_idx + len('1 WIFE @')
                end = txt.find('@', start)
                wife_id = txt[start:end]
            else:
                wife_id = None

            kids = []
            lines_ = txt.split('\n')
            for ln in lines_:
                if ln.strip().startswith('1 CHIL @'):
                    s2 = ln.strip().split('1 CHIL @')[1]
                    kid_id = s2.split('@')[0]
                    kids.append(kid_id)

            families[rec_id] = (husb_id, wife_id, kids)

    for fam_id, (f_id, m_id, k_list) in families.items():
        for kid in k_list:
            parents_map[kid] = (f_id, m_id)

    # 3) gather ID list
    individual_ids = [d.get_gen_person() for d in ged.filter_pool]
    print(f"Processing {len(individual_ids)} individuals with chunk-based parallel...")

    # 4) chunk-based parallel to build lines
    combined_rows = []
    chunk_size = 20
    max_workers = 10
    logger.info("Starting chunk-based parallel processing with %d workers.", max_workers)

    total_records = len(individual_ids)
    from functools import partial

    # The bound arguments are the same for every chunk, so build the callable once.
    # NOTE(review): each submitted task pickles ``ged`` and both maps for the
    # worker process; this is likely a large share of the run time - confirm by
    # profiling before restructuring.
    func = partial(
        process_record_wrapper,
        gedcom_instance=ged,
        parents_map=parents_map,
        names_map=names_map
    )

    with ProcessPoolExecutor(max_workers=max_workers) as executor, \
            tqdm(total=total_records, desc="Processing individuals") as pbar:
        for chunk in chunks(individual_ids, chunk_size):
            results = list(executor.map(func, chunk))
            combined_rows.extend(results)
            pbar.update(len(chunk))

    # At this point, combined_rows has 5 columns:
    # ["ID#", "Match to", "Name", "cM", "Yates DNA Ancestral Line"]
    columns = ["ID#", "Match to", "Name", "cM", "Yates DNA Ancestral Line"]
    df = pd.DataFrame(combined_rows, columns=columns)

    line_col = "Yates DNA Ancestral Line"
    prefix = "YatesJohn&HydeAlice~~~YatesThomas&WhiteFrances~~~"

    def remove_prefix(line):
        """Drop the fixed leading segment pair from a line that starts with it."""
        return line[len(prefix):] if line.startswith(prefix) else line

    df[line_col] = df[line_col].map(remove_prefix)
    df.index += 1

    # 5) Instead of Apriori, use FP-Growth
    logger.info("Building transaction list for FP-Growth.")
    transactions = []
    for line_str in df[line_col]:
        if pd.isna(line_str) or not line_str.strip():
            transactions.append([])
        else:
            items = [x.strip() for x in line_str.split("~~~") if x.strip()]
            transactions.append(items)

    logger.info("Running FP-Growth with min_support_count=2, min_size=2...")
    freq_itemsets_dict = fp_growth_itemsets(
        transactions,
        min_support_count=2,   # itemsets must appear at least 2 times
        min_size=2             # only itemsets of size >= 2
    )

    # Largest, then most frequent, segments first. A tuple (not a frozenset) is
    # used as the hashable cache key because the scorer matches greedily in
    # iteration order, and a frozenset would discard this ordering.
    shared_segments_hashable = tuple(
        sorted(freq_itemsets_dict.items(), key=lambda x: (len(x[0]), x[1]), reverse=True)
    )

    # 6) Compute "Value"
    logger.info("Computing 'Value' for each row...")
    values = [
        compute_value_for_line_cached(line_str, shared_segments_hashable)
        for line_str in tqdm(df[line_col], total=len(df), desc="Computing Values")
    ]

    df["Value"] = values

    # sort final by "Yates DNA Ancestral Line"
    df.sort_values(by=[line_col], ascending=True, inplace=True)

    # 7) Z-Score, Robust Z-Score, etc.
    df["Value"] = pd.to_numeric(df["Value"], errors="coerce")
    mean_v = df["Value"].mean()
    std_v = df["Value"].std()
    # std() is NaN with fewer than two rows; treat that like zero spread so the
    # scores below do not turn into NaN.
    if std_v == 0 or pd.isna(std_v):
        df["Z-Score"] = 0
    else:
        df["Z-Score"] = (df["Value"] - mean_v) / std_v

    median_v = df["Value"].median()
    mad_v = np.median(np.abs(df["Value"] - median_v))
    if mad_v == 0:
        df["Robust Z-Score"] = 0
    else:
        # 1.4826 scales the MAD to be comparable with a standard deviation
        # under a normal distribution.
        df["Robust Z-Score"] = (df["Value"] - median_v) / (mad_v * 1.4826)

    df["Percentile Rank"] = df["Value"].rank(pct=True) * 100

    df["Composite Score"] = (
        df["Z-Score"].abs() +
        df["Robust Z-Score"].abs() +
        (df["Percentile Rank"] / 100)
    ) / 3
    df["Composite Score"] = df["Composite Score"].round(2)

    def assign_desc(s):
        """Map a composite score to its significance label."""
        if s >= 2.0:
            return "High"
        elif s >= 1.5:
            return "Moderate"
        elif s >= 1.0:
            return "Medium"
        elif s >= 0.5:
            return "Low"
        else:
            return "Very Low"

    df["Composite Significance"] = df["Composite Score"].apply(assign_desc)

    # ------------------------------------------------
    # FINAL COLUMNS:
    # We keep "cM" but drop "Name" from the final export:
    # ------------------------------------------------
    final_cols = [
        "ID#",
        "Match to",
        "cM",
        "Value",
        "Composite Score",
        "Composite Significance",
        "Yates DNA Ancestral Line"
    ]
    df = df[final_cols]

    logger.info("Final DataFrame columns: %s", df.columns.tolist())
    print(df.head(10))

    # 8) Export
    csv_name = "final_combined_df_with_composite_scores.csv"
    df.to_csv(csv_name, index=False)
    logger.info("Exported final DataFrame to '%s'.", csv_name)

    html_name = "HTML_combined_df_with_composite_scores.html"
    css_style = """
    <style>
    table {
      width: 100%;
      border-collapse: collapse;
      margin: 20px 0;
    }
    table, th, td {
      border: 1px solid #333;
    }
    th, td {
      padding: 8px 12px;
      text-align: center;
    }
    th {
      background-color: #f2f2f2;
    }
    /* Left-align the last column */
    td:nth-child(7) {
      text-align: left;
    }
    </style>
    """
    # NOTE(review): escape=False writes cell text (names taken from the GEDCOM
    # file) into the page unescaped; confirm this is intended before publishing
    # output built from files you do not control.
    html_content = css_style + df.to_html(
        index=False,
        columns=final_cols,
        escape=False
    )
    with open(html_name, "w", encoding="utf-8") as f:
        f.write(html_content)
    logger.info("Exported HTML to '%s'.", html_name)

# Run the pipeline only when this file is executed directly (script or notebook
# cell), not when it is imported, e.g. by worker processes that re-import the module.
if __name__ == '__main__':
    main()

# Observed wall-clock run times (records processed -> duration):
#      78 -> 1 min 20 s
#     100 -> 1 min 33 s
#     250 -> 4 min 9 s
#     250 -> 2 min 52 s
#     250 -> 2 min 50 s
#     500 -> 10 min 24 s
#     500 -> 10 min 30 s
#     750 -> 49 min 19 s
#   1,000 -> 1 hr 47 min 49 s

  and should_run_async(code)


Automatically selecting the first GEDCOM file.
GEDCOM contained 58272 total records
Records tagged and filtered by NPFX: 1301
Records with YDNA information: 90
Autosomal matches: 1211
After manual filter, total records: 78
Records tagged and filtered by NPFX: 78
Processing 78 individuals with chunk-based parallel...


Processing individuals: 100%|██████████| 78/78 [00:54<00:00,  1.43it/s]
Computing Values: 100%|██████████| 78/78 [00:14<00:00,  5.21it/s]


       ID#      Match to   cM  Value  Composite Score Composite Significance  \
55  I56154        hatpat   10      7             0.77                    Low   
15  I55749        hatpat  187      4             0.78                    Low   
69  I56904       hatpatm   13     76             0.17               Very Low   
72  I57027        hatpat   40     80             0.20               Very Low   
68  I56895       hatpatm   12     75             0.17               Very Low   
9   I54336        hatpat   20     79             0.18               Very Low   
8   I53819        hatpat   20     76             0.17               Very Low   
58  I56188        hatpat   15    131             0.79                    Low   
10  I54530  yates,ronald   16     76             0.17               Very Low   
11  I54602  yates,ronald   15     76             0.17               Very Low   

                             Yates DNA Ancestral Line  
55  SherrillJohnathan&YatesSarah~~~SherrillJonatho...  
15  Sho