<a href="https://colab.research.google.com/github/ronyates47/Gedcom-Utils/blob/main/01_29_1614_2025_Value_works.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install pandas
!pip install python-gedcom
!pip install openpyxl
!pip install xlsxwriter




In [7]:
# 01_28_2033(works!)_2025_stable

import csv
import glob
from gedcom.element.individual import IndividualElement
from gedcom.parser import Parser
import pandas as pd
from datetime import datetime

###############################################################################
#               Configure these Booleans to Enable/Disable Outputs            #
###############################################################################
GENERATE_MAIN_HTML = False
GENERATE_MINIMAL_HTML = False

anchor_gen1 = None

################################################################################
#                                GedcomDataset Class                           #
################################################################################
class GedcomDataset:
    def __init__(self, gen_person):
        self.gen_person = gen_person
        self.extractable_detail = {}
        self.anchor_gen1 = None  # Initialize anchor_gen1 here

    def add_extractable_detail(self, key, value):
        self.extractable_detail[key] = value

    def get_gen_person(self):
        name = self.extractable_detail.get('NAME', '')
        parts = name.split('/', 1)
        first_name = parts[0].split(' ')[0]
        last_name = parts[1].rstrip('/') if len(parts) > 1 else ""
        self.anchor_gen1 = last_name.replace(" ", "") + first_name.replace(" ", "")
        global anchor_gen1  # Declare that we're using the global variable
        anchor_gen1 = self.anchor_gen1  # Update the global variable
        return self.gen_person.strip('@')

    def get_anchor_gen1(self):
        return self.anchor_gen1

    def get_extractable_NPFX(self):
        return self.extractable_detail.get('NPFX', '')

    def get_extractable_cm(self):
        npfx_value = self.extractable_detail.get('NPFX', '')
        if '&' in npfx_value:
            cm_value = npfx_value.split('&')[0].strip()
        elif '**' in npfx_value:
            cm_value = npfx_value.split('**')[0].strip()
        else:
            cm_value = npfx_value.strip()
        try:
            int(cm_value)
            return cm_value
        except ValueError:
            return ''

    def get_extractable_sort(self):
        npfx_value = self.extractable_detail.get('NPFX', '')
        if '&' in npfx_value:
            sort_part = npfx_value.split('&')[1]
            if '**' in sort_part:
                sort_value = sort_part.split('**')[0].strip()
            else:
                sort_value = sort_part.strip()
            return sort_value
        else:
            return ''

    def get_extractable_YDNA(self):
        npfx_value = self.extractable_detail.get('NPFX', '')
        if '**' in npfx_value:
            ydna_value = npfx_value.split('**')[1].strip()
            return ydna_value
        else:
            return ''

    def get_extractable_FAMC(self):
        return self.extractable_detail.get('FAMC', '').strip('@')


################################################################################
#                           Utility Functions                                  #
################################################################################
def extract_name(record):
    """
    Extracts first and last name from a GEDCOM record.
    Handles missing or malformed names gracefully.
    """
    name_start = record.find('1 NAME ') + 6
    name_end = record.find('\n', name_start)

    if name_start == 5 or name_end == -1:  # Meaning '1 NAME ' was not found
        return "UnknownName"

    name = record[name_start:name_end].strip()

    # Handle cases where no '/' is present in the name
    if '/' not in name:
        return name[:10].replace(" ", "")  # Take first 10 characters as default name

    # Extract first and last name
    first_name, last_name = name.split('/', 1)
    first_name = first_name[:10]  # first 10 chars
    last_name = last_name[:10].rstrip('/')

    return last_name.replace(" ", "") + first_name.replace(" ", "")

name_to_id = {}  # Global dictionary to hold name->ID mapping

################################################################################
#                               Gedcom Class                                   #
################################################################################
class Gedcom:
    def __init__(self, file_name):
        self.file_name = file_name
        self.gedcom_datasets = []
        self.filter_pool = []

    @staticmethod
    def get_standard_name(file_path):
        file_name = file_path.split('/')[-1]
        if '.' in file_name:
            file_name = file_name.rsplit('.', 1)[0]
        standard_name = file_name.replace(' ', '_').lower()
        return standard_name

    def parse_gedcom(self):
        global name_to_id  # we’ll modify name_to_id
        with open(self.file_name, 'r', encoding='utf-8-sig') as f:
            gedcom_lines = f.readlines()

        current_dataset = None
        npfx_count = 0
        ydna_count = 0  # Count YDNA occurrences
        total_count = 0

        for line in gedcom_lines:
            parts = line.strip().split(' ', 2)
            level = int(parts[0])
            tag = parts[1]
            value = parts[2] if len(parts) > 2 else None

            if level == 0 and tag.startswith('@') and tag.endswith('@') and value == 'INDI':
                total_count += 1
                current_dataset = GedcomDataset(tag)
                self.gedcom_datasets.append(current_dataset)

                # Populate name_to_id
                individual_name = current_dataset.get_anchor_gen1()
                individual_id = current_dataset.get_gen_person()
                name_to_id[individual_name] = individual_id

            elif current_dataset is not None:
                if level == 1 and tag in ['NAME', 'FAMC']:
                    current_key = tag
                    current_dataset.add_extractable_detail(current_key, value)

                elif level == 2 and tag == 'NPFX':
                    npfx_count += 1
                    current_dataset.add_extractable_detail(tag, value)
                    if '**' in value:
                        ydna_count += 1  # YDNA found

        autosomal_count = npfx_count - ydna_count

        print(f'GEDCOM contained {total_count} total records')
        print(f'Records tagged and filtered by NPFX: {npfx_count}')
        print(f'Records with YDNA information: {ydna_count}')
        print(f'Autosomal matches: {autosomal_count}')

        # First-level filter: only those with NPFX
        for dataset in self.gedcom_datasets:
            if dataset.get_extractable_NPFX():
                self.filter_pool.append(dataset)

        # Optional second-level filter from an Excel file
        manual_filter_activated = True  # or False
        if manual_filter_activated:
            try:
                df = pd.read_excel('filtered_ids.xlsx')
            except FileNotFoundError:
                print("filtered_ids.xlsx not found. Skipping second-level manual filter.")
            else:
                manual_filtered_ids = set(df['ID'])
                print(f"Manual filter IDs loaded: {len(manual_filtered_ids) - 1}")

                self.filter_pool = [
                    dataset for dataset in self.filter_pool
                    if dataset.get_gen_person() in manual_filtered_ids
                ]
                print(f"After manual filter, total records: {len(self.filter_pool)}")


def input_prime_surname(last_prime_surname=None):
    if last_prime_surname:
        last_name = input(f"Enter prime_surname (default: {last_prime_surname}): ")
        if not last_name:
            last_name = last_prime_surname
    else:
        last_name = input("Enter prime_surname: ")
    return last_name

def select_gedcom_file():
    gedcom_files = glob.glob('*.ged')
    if not gedcom_files:
        print("No GEDCOM files found.")
        return None

    # Just automatically return the first GEDCOM file found
    print("Automatically selecting the first GEDCOM file.")
    return gedcom_files[0]

    # If you want a manual selection, uncomment the below while loop:
    #
    # while True:
    #     for i, file in enumerate(gedcom_files, start=1):
    #         print(f"{i}. {file}")
    #     try:
    #         selected_num = int(input("Enter the number of the GEDCOM file you want to use: "))
    #         if 1 <= selected_num <= len(gedcom_files):
    #             return gedcom_files[selected_num - 1]
    #         else:
    #             print("Invalid number. Please enter a valid number from the list.")
    #     except ValueError:
    #         print("Invalid input. Please enter a valid number.")

################################################################################
#          Execute GEDCOM Parsing & Build Our Filter Pool                      #
################################################################################
gedcom_file_path = select_gedcom_file()
if gedcom_file_path:
    gedcom_instance = Gedcom(gedcom_file_path)
    gedcom_instance.parse_gedcom()

    # Gather individuals (last_name, individual_id) from the filter pool
    individuals = []
    for dataset in gedcom_instance.filter_pool:
        individual_id = dataset.get_gen_person()
        last_name = dataset.get_anchor_gen1()
        individuals.append((last_name, individual_id))

    print(f'Records tagged and filtered by NPFX: {len(individuals)}')

    ################################################################################
    # Function: Extract ID from GEDCOM Record
    ################################################################################
    def extract_id(record):
        """
        Extracts the ID from a GEDCOM record.
        A valid ID is enclosed within '@' symbols.
        """
        id_start = record.find('@') + 1
        id_end = record.find('@', id_start)

        if id_start == 0 or id_end == -1:  # If '@' is missing
            return "UnknownID"

        return record[id_start:id_end].strip()

    # Read the GEDCOM file as raw text, parse out records
    with open(gedcom_file_path, 'r', encoding='utf-8') as file:
        data = file.read()

    data = data.split('\n0 ')  # Split records based on GEDCOM structure
    records = {extract_id(record): record for record in data}

else:
    print("No GEDCOM file selected; exiting.")
    raise SystemExit

################################################################################
#        Functions to Traverse & Score Ancestors, Build Data for DataFrame     #
################################################################################
def has_both_parents(records, mother_id, father_id):
    return mother_id in records and father_id in records

visited_pairs = set()
generation_table = []

def find_parents(individual_id, generation, records):
    if individual_id not in records:
        return
    record = records[individual_id]
    famc_start = record.find('1 FAMC @') + 8
    famc_end = record.find('@', famc_start)
    famc_id = record[famc_start:famc_end]
    if famc_id not in records:
        return

    fam_record = records[famc_id]
    wife_start = fam_record.find('1 WIFE @') + 8
    wife_end = fam_record.find('@', wife_start)
    mother_id = fam_record[wife_start:wife_end]

    husb_start = fam_record.find('1 HUSB @') + 8
    husb_end = fam_record.find('@', husb_start)
    father_id = fam_record[husb_start:husb_end]

    if mother_id and mother_id in records and father_id and father_id in records:
        parent_pair = (father_id, mother_id)
        if parent_pair not in visited_pairs:
            visited_pairs.add(parent_pair)
            generation_table.append((generation, parent_pair))

    if mother_id:
        find_parents(mother_id, generation + 1, records)
    if father_id:
        find_parents(father_id, generation + 1, records)

def find_distant_ancestors(individual_id, records, path=None):
    path = path if path is not None else []
    path.append(individual_id)

    if individual_id not in records:
        return [path]

    record = records[individual_id]
    famc_start = record.find('1 FAMC @') + 8
    famc_end = record.find('@', famc_start)
    famc_id = record[famc_start:famc_end]

    if famc_id not in records:
        return [path]

    fam_record = records[famc_id]
    wife_start = fam_record.find('1 WIFE @') + 8
    wife_end = fam_record.find('@', wife_start)
    mother_id = fam_record[wife_start:wife_end]

    husb_start = fam_record.find('1 HUSB @') + 8
    husb_end = fam_record.find('@', husb_start)
    father_id = fam_record[husb_start:husb_end]

    if father_id is None and mother_id is None:
        return [path]

    paths = []
    if father_id:
        new_path = list(path)
        paths.extend(find_distant_ancestors(father_id, records, new_path))
    if mother_id:
        new_path = list(path)
        paths.extend(find_distant_ancestors(mother_id, records, new_path))

    return paths

def calculate_score(distant_ancestors_paths, records):
    name_paths = []
    for path in distant_ancestors_paths:
        name_path = [extract_name(records.get(id, '')) for id in path]
        name_paths.append(name_path)

    path_scores = {}
    for idx, name_path in enumerate(name_paths):
        score = 0
        for generation, name in enumerate(name_path):
            if 'Yates' in name:
                score += 1 * (generation + 1)
        path_scores[idx] = score

    if path_scores:
        winning_path_index = max(path_scores, key=path_scores.get)
        winning_path_score = path_scores[winning_path_index]
        winning_path_names = name_paths[winning_path_index]
        winning_path_ids = distant_ancestors_paths[winning_path_index]
    else:
        winning_path_index = None
        winning_path_score = 0
        winning_path_names = []
        winning_path_ids = []

    return winning_path_score, winning_path_names, winning_path_ids

def filter_ancestral_line(winning_path_ids, generation_table):
    matching_table = []
    for generation, pair in generation_table:
        id1, id2 = pair
        if id1 in winning_path_ids or id2 in winning_path_ids:
            matching_table.append((generation, pair))
    return matching_table

def process_individual(individual_id, gedcom_instance, records):
    global generation_table
    global visited_pairs
    global anchor_gen1  # We'll update anchor_gen1 if found

    generation_table = []
    visited_pairs = set()

    # Build generation_table, visited_pairs
    find_parents(individual_id, 1, records)

    # All possible ancestor paths for that ID
    distant_ancestors_paths = find_distant_ancestors(individual_id, records)
    winning_path_score, winning_path_names, winning_path_ids = calculate_score(distant_ancestors_paths, records)
    filtered_ancestral_line = filter_ancestral_line(winning_path_ids, generation_table)
    filtered_ancestral_line.sort(key=lambda x: x[0])

    filtered_ancestral_line_names = []

    # Gather more info from the dataset
    for dataset in gedcom_instance.filter_pool:
        if dataset.get_gen_person() == individual_id:
            cm_value = dataset.get_extractable_cm()
            sort_value = dataset.get_extractable_sort()
            ydna_value = dataset.get_extractable_YDNA()
            anchor_gen1 = dataset.get_anchor_gen1()
            break
    else:
        cm_value = ''
        sort_value = ''
        ydna_value = ''
        anchor_gen1 = None

    # Build ancestral line (exclude anchor_gen1 itself)
    for generation, pair in filtered_ancestral_line:
        name_pair = [extract_name(records.get(id, '')) for id in pair]
        formatted_name_pair = f"{name_pair[0]}&{name_pair[1]}"
        filtered_ancestral_line_names.append(formatted_name_pair)

    # Reverse order
    filtered_ancestral_line_names.reverse()
    filtered_ancestral_line_str = "~~~".join(filtered_ancestral_line_names)

    # Check we did not accidentally include anchor_gen1
    if anchor_gen1 in filtered_ancestral_line_names:
        raise ValueError(
            f"anchor_gen1 ({anchor_gen1}) was mistakenly included in the ancestral line."
        )

    individual_data = {
        'cM': cm_value,
        'Sort': sort_value,
        'YDNA': ydna_value,
        'Filtered Ancestral Line': filtered_ancestral_line_str
    }

    return individual_data, filtered_ancestral_line_str

################################################################################
#         Build Rows for DataFrame from the Filter Pool                        #
################################################################################
combined_df_rows = []
for dataset in gedcom_instance.filter_pool:
    individual_id = dataset.get_gen_person()
    visited_pairs.clear()
    generation_table = []

    individual_data, filtered_ancestral_line_str = process_individual(
        individual_id, gedcom_instance, records
    )
    cm = individual_data["cM"]
    sort = individual_data["Sort"]
    individual_name = extract_name(records.get(individual_id, ""))

    combined_df_rows.append(
        [individual_id, sort, individual_name, cm, filtered_ancestral_line_str]
    )

################################################################################
#       NO NEED TO MODIFY ABOVE THIS SECTION 28-1-2025                         #
################################################################################

import pandas as pd
import csv
from datetime import datetime

################################################################################
#       Create and Populate Main DataFrame (combined_df)
################################################################################
columns = ["ID#", "Match to", "Name", "cM", "Yates DNA Ancestral Line"]
combined_df = pd.DataFrame(combined_df_rows, columns=columns)

# Initialize the value_store dictionary
value_store = {}

# Populate value_store with data from the DataFrame, including a placeholder for 'Value'
for _, row in combined_df.iterrows():
    value_store[row["ID#"]] = {
        "Match to": row["Match to"],
        "Name": row["Name"],
        "cM": row["cM"],
        "Yates DNA Ancestral Line": row["Yates DNA Ancestral Line"],
        "Value": None  # Placeholder for 'Value'
    }

################################################################################
#       Remove miscellaneous Distant ancestors (combined_df)
################################################################################
def remove_prefix(row):
    ancestral_line = row["Yates DNA Ancestral Line"]
    prefix_to_remove = "YatesJohn&HydeAlice~~~YatesThomas&WhiteFrances~~~"
    if ancestral_line.startswith(prefix_to_remove):
        row["Yates DNA Ancestral Line"] = ancestral_line[len(prefix_to_remove):]
    return row

combined_df = combined_df.apply(remove_prefix, axis=1)

# Order and clean up columns
ordered_columns = ["ID#", "Match to", "Name", "cM", "Yates DNA Ancestral Line"]
combined_df = combined_df[ordered_columns]
combined_df.index += 1
combined_df.sort_values(by=["Match to", "Yates DNA Ancestral Line"], ascending=[False, True], inplace=True)

import pandas as pd
import csv
from datetime import datetime
from collections import defaultdict

################################################################################
#       Segmentation and Frequency Analysis
################################################################################
def parse_line_to_pairs(line, delimiter="~~~"):
    """ Splits a given ancestral line into named pairs based on the delimiter. """
    return line.strip().split(delimiter)

def find_largest_common_prefix_at_least_two_pairs(lines_as_pairs):
    """
    Finds the largest common prefix that appears in at least two lines.
    Returns the longest shared segment and the indices of lines sharing it.
    """
    if not lines_as_pairs:
        return [], []

    segment_map = defaultdict(set)

    # Build a mapping of segment -> line indices
    for idx, pairs in enumerate(lines_as_pairs):
        for start in range(len(pairs)):
            for end in range(start + 2, len(pairs) + 1):  # Minimum length = 2
                segment = "~~~".join(pairs[start:end])
                segment_map[segment].add(idx)

    # Filter to segments shared by at least two lines
    shared_segments = {seg: idx_set for seg, idx_set in segment_map.items() if len(idx_set) >= 2}

    if not shared_segments:
        return [], []

    # Sort by segment length (descending) and frequency (descending)
    sorted_segments = sorted(
        shared_segments.items(),
        key=lambda x: (-len(x[0].split("~~~")), -len(x[1]))
    )

    # Select the most significant segment
    seg_str, idx_set = sorted_segments[0]
    return seg_str.split("~~~"), list(idx_set)

def iterative_segmentation_min_length_2(lines):
    """
    Performs iterative segmentation by extracting the longest common prefix
    that appears in at least two lines, until no such prefixes remain.
    """
    lines_as_pairs = [parse_line_to_pairs(line) for line in lines]
    segments = []

    while True:
        prefix_list, shared_indices = find_largest_common_prefix_at_least_two_pairs(lines_as_pairs)
        if not prefix_list or len(shared_indices) < 2:
            break  # No more valid shared prefixes

        seg_str = "~~~".join(prefix_list)
        segments.append((seg_str, set(shared_indices)))

        # Remove this segment from all affected lines
        prefix_len = len(prefix_list)
        for idx in shared_indices:
            lines_as_pairs[idx] = lines_as_pairs[idx][prefix_len:]

    # Process leftover segments
    leftover_map = {}
    for i, pairs in enumerate(lines_as_pairs):
        if pairs:
            leftover_str = "~~~".join(pairs)
            leftover_map.setdefault(leftover_str, set()).add(i)

    for leftover_str, idx_set in leftover_map.items():
        segments.append((leftover_str, idx_set))

    return segments, lines_as_pairs

def build_presence_absence(segments, original_lines):
    """
    Creates a presence/absence matrix for discovered segments.
    Each row represents an ancestral line, and columns represent discovered segments.
    """
    from_substring_sets = []
    for line in original_lines:
        pairs = parse_line_to_pairs(line)
        sset = set("~~~".join(pairs[i:j]) for i in range(len(pairs)) for j in range(i + 1, len(pairs) + 1))
        from_substring_sets.append(sset)

    seg_strings = [seg[0] for seg in segments]
    presence_matrix = []
    for sub_set in from_substring_sets:
        row = [1 if seg_str in sub_set else 0 for seg_str in seg_strings]
        presence_matrix.append(row)

    return pd.DataFrame(presence_matrix, columns=seg_strings)

################################################################################
#       Main Processing
################################################################################
# Extract all ancestral lines
all_lines = combined_df["Yates DNA Ancestral Line"].tolist()

# Perform segmentation
segments_found, leftover_info = iterative_segmentation_min_length_2(all_lines)

# Create a DataFrame for segment frequency analysis
seg_df = pd.DataFrame(
    [(s, len(idx_set)) for s, idx_set in segments_found],
    columns=["Segment", "Frequency"]
).sort_values(by="Frequency", ascending=False)

# Build presence/absence matrix
presence_df = build_presence_absence(segments_found, all_lines)

# Calculate line values based on segment presence
line_values = presence_df.sum(axis=1)
combined_df["Value"] = line_values

# Create mapping of ID# to Value
id_to_value_map = combined_df.set_index("ID#")["Value"].to_dict()

################################################################################
# Generate Debugging Report and Populate value_store (Print to Console)
################################################################################
def generate_debugging_report_and_update_store(record_ids, ancestral_lines, segments, value_store):
    """
    Print a debugging report directly to the console and update value_store with correct 'TOTAL' values.
    """
    print("\n")  # One line break before printing

    # Print header
    print("ID\tLine\tSegment\tFrequency\tContribution")

    # Iterate through each record and print rows
    for record_id, line in zip(record_ids, ancestral_lines):
        total_contribution = 0
        for segment, line_indices in segments:
            if record_ids.index(record_id) in line_indices:
                frequency = len(line_indices)
                contribution = frequency
                total_contribution += contribution
                print(f"{record_id}\t{line}\t{segment}\t{frequency}\t{contribution}")

        # Print the TOTAL row for each record
        print(f"{record_id}\t{line}\tTOTAL\t\t{total_contribution}")

        # Update value_store with the correct 'Value'
        if record_id in value_store:
            value_store[record_id]["Value"] = total_contribution
        else:
            print(f"WARNING: ID {record_id} not found in value_store")

    print("\n")  # One line break after printing

################################################################################
# Generate the Debugging Report and Update value_store
################################################################################
generate_debugging_report_and_update_store(
    record_ids=combined_df["ID#"].tolist(),
    ancestral_lines=all_lines,
    segments=segments_found,
    value_store=value_store
)

################################################################################
# FINAL CHECK: Ensure value_store is updated correctly
################################################################################
print("\nFinal Verification of value_store (AFTER update):")
for id_, data in value_store.items():
    print(f"ID#: {id_}, Match to: {data['Match to']}, Name: {data['Name']}, cM: {data['cM']}, Value: {data['Value']}")

# Ensure combined_df is updated as well
combined_df["Value"] = combined_df["ID#"].map(lambda id_: value_store[id_]["Value"] if id_ in value_store else None)

# Print final DataFrame verification
print("\nFinal combined_df Verification:")
print(combined_df[["ID#", "Value"]].head(10))

################################################################################
# Export Final Data
################################################################################
combined_df.to_csv("final_combined_df.csv", index=False)
seg_df.to_csv("segments_discovered.csv", index=False)
presence_df.to_csv("segments_presence_absence.csv", index=False)

print("\nFinal CSV Outputs Saved:")
print(" - final_combined_df.csv")
print(" - segments_discovered.csv")
print(" - segments_presence_absence.csv")


Automatically selecting the first GEDCOM file.
GEDCOM contained 58271 total records
Records tagged and filtered by NPFX: 1301
Records with YDNA information: 76
Autosomal matches: 1225
Manual filter IDs loaded: 5
After manual filter, total records: 6
Records tagged and filtered by NPFX: 6


ID	Line	Segment	Frequency	Contribution
I54840	YatesWilliam&ThornburyAnne~~~YatesWilliam&HenryJane~~~YatesJohn&BoardMaryPolly~~~YatesJames&JohnsonCharity~~~CoffmanAbraham&YatesCatherine~~~CoffmanWilliamAa&CainSarahThom~~~CoffmanJohnAlber&AmmonsFannieMae~~~CoffmanJohnAlber&DavisEdnaLilli~~~WilsonSearching&CoffmanDaughter	YatesWilliam&ThornburyAnne~~~YatesWilliam&HenryJane~~~YatesJohn&BoardMaryPolly~~~YatesJames&JohnsonCharity~~~CoffmanAbraham&YatesCatherine	2	2
I54840	YatesWilliam&ThornburyAnne~~~YatesWilliam&HenryJane~~~YatesJohn&BoardMaryPolly~~~YatesJames&JohnsonCharity~~~CoffmanAbraham&YatesCatherine~~~CoffmanWilliamAa&CainSarahThom~~~CoffmanJohnAlber&AmmonsFannieMae~~~CoffmanJohnAlber&DavisEdnaLil