<a href="https://colab.research.google.com/github/ronyates47/Gedcom-Utils/blob/main/Ztest__01_30_1711_2025_HTML%2BValue_works.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
!pip install pandas
!pip install python-gedcom
!pip install openpyxl
!pip install xlsxwriter




In [15]:
# Ztest_01_28_2033(works!)_2025_stable

# Standard Libraries
import csv
import glob
from datetime import datetime

# GEDCOM Parsing
from gedcom.element.individual import IndividualElement
from gedcom.parser import Parser

# Data Processing
import pandas as pd
from openpyxl import load_workbook
from openpyxl.styles import Alignment

anchor_gen1 = None

################################################################################
#                                GedcomDataset Class                           #
################################################################################
class GedcomDataset:
    def __init__(self, gen_person):
        self.gen_person = gen_person
        self.extractable_detail = {}
        self.anchor_gen1 = None  # Initialize anchor_gen1 here

    def add_extractable_detail(self, key, value):
        self.extractable_detail[key] = value

    def get_gen_person(self):
        name = self.extractable_detail.get('NAME', '')
        parts = name.split('/', 1)
        first_name = parts[0].split(' ')[0]
        last_name = parts[1].rstrip('/') if len(parts) > 1 else ""
        self.anchor_gen1 = last_name.replace(" ", "") + first_name.replace(" ", "")
        global anchor_gen1  # Declare that we're using the global variable
        anchor_gen1 = self.anchor_gen1  # Update the global variable
        return self.gen_person.strip('@')

    def get_anchor_gen1(self):
        return self.anchor_gen1

    def get_extractable_NPFX(self):
        return self.extractable_detail.get('NPFX', '')

    def get_extractable_cm(self):
        npfx_value = self.extractable_detail.get('NPFX', '')
        if '&' in npfx_value:
            cm_value = npfx_value.split('&')[0].strip()
        elif '**' in npfx_value:
            cm_value = npfx_value.split('**')[0].strip()
        else:
            cm_value = npfx_value.strip()
        try:
            int(cm_value)
            return cm_value
        except ValueError:
            return ''

    def get_extractable_sort(self):
        npfx_value = self.extractable_detail.get('NPFX', '')
        if '&' in npfx_value:
            sort_part = npfx_value.split('&')[1]
            if '**' in sort_part:
                sort_value = sort_part.split('**')[0].strip()
            else:
                sort_value = sort_part.strip()
            return sort_value
        else:
            return ''

    def get_extractable_YDNA(self):
        npfx_value = self.extractable_detail.get('NPFX', '')
        if '**' in npfx_value:
            ydna_value = npfx_value.split('**')[1].strip()
            return ydna_value
        else:
            return ''

    def get_extractable_FAMC(self):
        return self.extractable_detail.get('FAMC', '').strip('@')


################################################################################
#                           Utility Functions                                  #
################################################################################
def extract_name(record):
    """
    Extracts first and last name from a GEDCOM record.
    Handles missing or malformed names gracefully.
    """
    name_start = record.find('1 NAME ') + 6
    name_end = record.find('\n', name_start)

    if name_start == 5 or name_end == -1:  # Meaning '1 NAME ' was not found
        return "UnknownName"

    name = record[name_start:name_end].strip()

    # Handle cases where no '/' is present in the name
    if '/' not in name:
        return name[:10].replace(" ", "")  # Take first 10 characters as default name

    # Extract first and last name
    first_name, last_name = name.split('/', 1)
    first_name = first_name[:10]  # first 10 chars
    last_name = last_name[:10].rstrip('/')

    return last_name.replace(" ", "") + first_name.replace(" ", "")

name_to_id = {}  # Global dictionary to hold name->ID mapping

################################################################################
#                               Gedcom Class                                   #
################################################################################
class Gedcom:
    def __init__(self, file_name):
        self.file_name = file_name
        self.gedcom_datasets = []
        self.filter_pool = []

    @staticmethod
    def get_standard_name(file_path):
        file_name = file_path.split('/')[-1]
        if '.' in file_name:
            file_name = file_name.rsplit('.', 1)[0]
        standard_name = file_name.replace(' ', '_').lower()
        return standard_name

    def parse_gedcom(self):
        global name_to_id  # we’ll modify name_to_id
        with open(self.file_name, 'r', encoding='utf-8-sig') as f:
            gedcom_lines = f.readlines()

        current_dataset = None
        npfx_count = 0
        ydna_count = 0  # Count YDNA occurrences
        total_count = 0

        for line in gedcom_lines:
            parts = line.strip().split(' ', 2)
            level = int(parts[0])
            tag = parts[1]
            value = parts[2] if len(parts) > 2 else None

            if level == 0 and tag.startswith('@') and tag.endswith('@') and value == 'INDI':
                total_count += 1
                current_dataset = GedcomDataset(tag)
                self.gedcom_datasets.append(current_dataset)

                # Populate name_to_id
                individual_name = current_dataset.get_anchor_gen1()
                individual_id = current_dataset.get_gen_person()
                name_to_id[individual_name] = individual_id

            elif current_dataset is not None:
                if level == 1 and tag in ['NAME', 'FAMC']:
                    current_key = tag
                    current_dataset.add_extractable_detail(current_key, value)

                elif level == 2 and tag == 'NPFX':
                    npfx_count += 1
                    current_dataset.add_extractable_detail(tag, value)
                    if '**' in value:
                        ydna_count += 1  # YDNA found

        autosomal_count = npfx_count - ydna_count

        print(f'GEDCOM contained {total_count} total records')
        print(f'Records tagged and filtered by NPFX: {npfx_count}')
        print(f'Records with YDNA information: {ydna_count}')
        print(f'Autosomal matches: {autosomal_count}')

        # First-level filter: only those with NPFX
        for dataset in self.gedcom_datasets:
            if dataset.get_extractable_NPFX():
                self.filter_pool.append(dataset)

        # Optional second-level filter from an Excel file
        manual_filter_activated = True  # or False
        if manual_filter_activated:
            try:
                df = pd.read_excel('filtered_ids.xlsx')
            except FileNotFoundError:
                print("filtered_ids.xlsx not found. Skipping second-level manual filter.")
            else:
                manual_filtered_ids = set(df['ID'])
                print(f"Manual filter IDs loaded: {len(manual_filtered_ids) - 1}")

                self.filter_pool = [
                    dataset for dataset in self.filter_pool
                    if dataset.get_gen_person() in manual_filtered_ids
                ]
                print(f"After manual filter, total records: {len(self.filter_pool)}")


def input_prime_surname(last_prime_surname=None):
    if last_prime_surname:
        last_name = input(f"Enter prime_surname (default: {last_prime_surname}): ")
        if not last_name:
            last_name = last_prime_surname
    else:
        last_name = input("Enter prime_surname: ")
    return last_name

def select_gedcom_file():
    gedcom_files = glob.glob('*.ged')
    if not gedcom_files:
        print("No GEDCOM files found.")
        return None

    # Just automatically return the first GEDCOM file found
    print("Automatically selecting the first GEDCOM file.")
    return gedcom_files[0]

    # If you want a manual selection, uncomment the below while loop:
    #
    # while True:
    #     for i, file in enumerate(gedcom_files, start=1):
    #         print(f"{i}. {file}")
    #     try:
    #         selected_num = int(input("Enter the number of the GEDCOM file you want to use: "))
    #         if 1 <= selected_num <= len(gedcom_files):
    #             return gedcom_files[selected_num - 1]
    #         else:
    #             print("Invalid number. Please enter a valid number from the list.")
    #     except ValueError:
    #         print("Invalid input. Please enter a valid number.")

################################################################################
#          Execute GEDCOM Parsing & Build Our Filter Pool                      #
################################################################################
gedcom_file_path = select_gedcom_file()
if gedcom_file_path:
    gedcom_instance = Gedcom(gedcom_file_path)
    gedcom_instance.parse_gedcom()

    # Gather individuals (last_name, individual_id) from the filter pool
    individuals = []
    for dataset in gedcom_instance.filter_pool:
        individual_id = dataset.get_gen_person()
        last_name = dataset.get_anchor_gen1()
        individuals.append((last_name, individual_id))

    print(f'Records tagged and filtered by NPFX: {len(individuals)}')

    ################################################################################
    # Function: Extract ID from GEDCOM Record
    ################################################################################
    def extract_id(record):
        """
        Extracts the ID from a GEDCOM record.
        A valid ID is enclosed within '@' symbols.
        """
        id_start = record.find('@') + 1
        id_end = record.find('@', id_start)

        if id_start == 0 or id_end == -1:  # If '@' is missing
            return "UnknownID"

        return record[id_start:id_end].strip()

    # Read the GEDCOM file as raw text, parse out records
    with open(gedcom_file_path, 'r', encoding='utf-8') as file:
        data = file.read()

    data = data.split('\n0 ')  # Split records based on GEDCOM structure
    records = {extract_id(record): record for record in data}

else:
    print("No GEDCOM file selected; exiting.")
    raise SystemExit

################################################################################
#        Functions to Traverse & Score Ancestors, Build Data for DataFrame     #
################################################################################
def has_both_parents(records, mother_id, father_id):
    return mother_id in records and father_id in records

visited_pairs = set()
generation_table = []

def find_parents(individual_id, generation, records):
    if individual_id not in records:
        return
    record = records[individual_id]
    famc_start = record.find('1 FAMC @') + 8
    famc_end = record.find('@', famc_start)
    famc_id = record[famc_start:famc_end]
    if famc_id not in records:
        return

    fam_record = records[famc_id]
    wife_start = fam_record.find('1 WIFE @') + 8
    wife_end = fam_record.find('@', wife_start)
    mother_id = fam_record[wife_start:wife_end]

    husb_start = fam_record.find('1 HUSB @') + 8
    husb_end = fam_record.find('@', husb_start)
    father_id = fam_record[husb_start:husb_end]

    if mother_id and mother_id in records and father_id and father_id in records:
        parent_pair = (father_id, mother_id)
        if parent_pair not in visited_pairs:
            visited_pairs.add(parent_pair)
            generation_table.append((generation, parent_pair))

    if mother_id:
        find_parents(mother_id, generation + 1, records)
    if father_id:
        find_parents(father_id, generation + 1, records)

def find_distant_ancestors(individual_id, records, path=None):
    path = path if path is not None else []
    path.append(individual_id)

    if individual_id not in records:
        return [path]

    record = records[individual_id]
    famc_start = record.find('1 FAMC @') + 8
    famc_end = record.find('@', famc_start)
    famc_id = record[famc_start:famc_end]

    if famc_id not in records:
        return [path]

    fam_record = records[famc_id]
    wife_start = fam_record.find('1 WIFE @') + 8
    wife_end = fam_record.find('@', wife_start)
    mother_id = fam_record[wife_start:wife_end]

    husb_start = fam_record.find('1 HUSB @') + 8
    husb_end = fam_record.find('@', husb_start)
    father_id = fam_record[husb_start:husb_end]

    if father_id is None and mother_id is None:
        return [path]

    paths = []
    if father_id:
        new_path = list(path)
        paths.extend(find_distant_ancestors(father_id, records, new_path))
    if mother_id:
        new_path = list(path)
        paths.extend(find_distant_ancestors(mother_id, records, new_path))

    return paths

def calculate_score(distant_ancestors_paths, records):
    name_paths = []
    for path in distant_ancestors_paths:
        name_path = [extract_name(records.get(id, '')) for id in path]
        name_paths.append(name_path)

    path_scores = {}
    for idx, name_path in enumerate(name_paths):
        score = 0
        for generation, name in enumerate(name_path):
            if 'Yates' in name:
                score += 1 * (generation + 1)
        path_scores[idx] = score

    if path_scores:
        winning_path_index = max(path_scores, key=path_scores.get)
        winning_path_score = path_scores[winning_path_index]
        winning_path_names = name_paths[winning_path_index]
        winning_path_ids = distant_ancestors_paths[winning_path_index]
    else:
        winning_path_index = None
        winning_path_score = 0
        winning_path_names = []
        winning_path_ids = []

    return winning_path_score, winning_path_names, winning_path_ids

def filter_ancestral_line(winning_path_ids, generation_table):
    matching_table = []
    for generation, pair in generation_table:
        id1, id2 = pair
        if id1 in winning_path_ids or id2 in winning_path_ids:
            matching_table.append((generation, pair))
    return matching_table

def process_individual(individual_id, gedcom_instance, records):
    global generation_table
    global visited_pairs
    global anchor_gen1  # We'll update anchor_gen1 if found

    generation_table = []
    visited_pairs = set()

    # Build generation_table, visited_pairs
    find_parents(individual_id, 1, records)

    # All possible ancestor paths for that ID
    distant_ancestors_paths = find_distant_ancestors(individual_id, records)
    winning_path_score, winning_path_names, winning_path_ids = calculate_score(distant_ancestors_paths, records)
    filtered_ancestral_line = filter_ancestral_line(winning_path_ids, generation_table)
    filtered_ancestral_line.sort(key=lambda x: x[0])

    filtered_ancestral_line_names = []

    # Gather more info from the dataset
    for dataset in gedcom_instance.filter_pool:
        if dataset.get_gen_person() == individual_id:
            cm_value = dataset.get_extractable_cm()
            sort_value = dataset.get_extractable_sort()
            ydna_value = dataset.get_extractable_YDNA()
            anchor_gen1 = dataset.get_anchor_gen1()
            break
    else:
        cm_value = ''
        sort_value = ''
        ydna_value = ''
        anchor_gen1 = None

    # Build ancestral line (exclude anchor_gen1 itself)
    for generation, pair in filtered_ancestral_line:
        name_pair = [extract_name(records.get(id, '')) for id in pair]
        formatted_name_pair = f"{name_pair[0]}&{name_pair[1]}"
        filtered_ancestral_line_names.append(formatted_name_pair)

    # Reverse order
    filtered_ancestral_line_names.reverse()
    filtered_ancestral_line_str = "~~~".join(filtered_ancestral_line_names)

    # Check we did not accidentally include anchor_gen1
    if anchor_gen1 in filtered_ancestral_line_names:
        raise ValueError(
            f"anchor_gen1 ({anchor_gen1}) was mistakenly included in the ancestral line."
        )

    individual_data = {
        'cM': cm_value,
        'Sort': sort_value,
        'YDNA': ydna_value,
        'Filtered Ancestral Line': filtered_ancestral_line_str
    }

    return individual_data, filtered_ancestral_line_str

################################################################################
#         Build Rows for DataFrame from the Filter Pool                        #
################################################################################
combined_df_rows = []
for dataset in gedcom_instance.filter_pool:
    individual_id = dataset.get_gen_person()
    visited_pairs.clear()
    generation_table = []

    individual_data, filtered_ancestral_line_str = process_individual(
        individual_id, gedcom_instance, records
    )
    cm = individual_data["cM"]
    sort = individual_data["Sort"]
    individual_name = extract_name(records.get(individual_id, ""))

    combined_df_rows.append(
        [individual_id, sort, individual_name, cm, filtered_ancestral_line_str]
    )

################################################################################
#       NO NEED TO MODIFY ABOVE THIS SECTION 28-1-2025                         #
################################################################################

import pandas as pd
import csv
from datetime import datetime

################################################################################
#       Create and Populate Main DataFrame (combined_df)
################################################################################
columns = ["ID#", "Match to", "Name", "cM", "Yates DNA Ancestral Line"]
combined_df = pd.DataFrame(combined_df_rows, columns=columns)

# Initialize the value_store dictionary
value_store = {}

# Populate value_store with data from the DataFrame, including a placeholder for 'Value'
for _, row in combined_df.iterrows():
    value_store[row["ID#"]] = {
        "Match to": row["Match to"],
        "Name": row["Name"],
        "cM": row["cM"],
        "Yates DNA Ancestral Line": row["Yates DNA Ancestral Line"],
        "Value": None  # Placeholder for 'Value'
    }

################################################################################
#       Remove miscellaneous Distant ancestors (combined_df)
################################################################################
def remove_prefix(row):
    ancestral_line = row["Yates DNA Ancestral Line"]
    prefix_to_remove = "YatesJohn&HydeAlice~~~YatesThomas&WhiteFrances~~~"
    if ancestral_line.startswith(prefix_to_remove):
        row["Yates DNA Ancestral Line"] = ancestral_line[len(prefix_to_remove):]
    return row

combined_df = combined_df.apply(remove_prefix, axis=1)

# Order and clean up columns
ordered_columns = ["ID#", "Match to", "Name", "cM", "Yates DNA Ancestral Line"]
combined_df = combined_df[ordered_columns]
combined_df.index += 1
combined_df.sort_values(by=["Match to", "Yates DNA Ancestral Line"], ascending=[False, True], inplace=True)

import pandas as pd
import csv
from datetime import datetime
from collections import defaultdict

import pandas as pd
from collections import defaultdict
from itertools import combinations

import pandas as pd
from collections import defaultdict
from itertools import combinations

import pandas as pd
from collections import defaultdict
from itertools import combinations

import pandas as pd
from collections import defaultdict
from itertools import combinations

################################################################################
#       Segmentation and Frequency Analysis
################################################################################
def parse_line_to_pairs(line, delimiter="~~~"):
    """Splits a given ancestral line into named pairs based on the delimiter."""
    return line.strip().split(delimiter)

def identify_all_shared_segments(df, ancestral_col, min_shared=2, min_size=2):
    """
    Identify all possible shared segments (groups of min_size ancestral lines) across the dataset.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the dataset.
    - ancestral_col (str): The column name containing ancestral lines.
    - min_shared (int): Minimum number of records a segment must appear in to be considered shared.
    - min_size (int): Minimum number of ancestral lines in a segment.

    Returns:
    - shared_segments_sorted (dict): Dictionary with shared segment tuples as keys and their frequencies as values.
    """
    segment_counts = defaultdict(int)

    # Iterate over each record
    for idx, row in df.iterrows():
        # Ensure the ancestral_col exists and is not NaN
        if ancestral_col not in row or pd.isna(row[ancestral_col]):
            continue  # Skip if the column is missing or NaN

        lines = [line.strip() for line in row[ancestral_col].split('~~~') if line.strip()]

        # Generate all possible combinations of lines with size >= min_size
        for size in range(min_size, len(lines)+1):
            for subset in combinations(sorted(lines), size):
                segment_counts[subset] += 1

    # Filter segments that meet the minimum sharing threshold
    shared_segments = {segment: count for segment, count in segment_counts.items() if count >= min_shared}

    # Sort segments by size descending to prioritize larger segments
    shared_segments_sorted = dict(sorted(shared_segments.items(), key=lambda x: len(x[0]), reverse=True))

    return shared_segments_sorted

def calculate_value_multiple_segments_refined(df, ancestral_col, shared_segments):
    """
    Calculate the 'Revised_Value' metric for each record based on multiple shared segments.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the dataset.
    - ancestral_col (str): The column name containing ancestral lines.
    - shared_segments (dict): Dictionary of shared segments and their frequencies.

    Returns:
    - df (pd.DataFrame): DataFrame with an updated 'Revised_Value' column.
    """
    # Sort shared segments by size descending to prioritize larger segments
    sorted_segments = sorted(shared_segments.items(), key=lambda x: len(x[0]), reverse=True)

    # Initialize/Reinitialize Revised_Value column
    df['Revised_Value'] = 0

    # Iterate over each record
    for idx, row in df.iterrows():
        # Ensure the ancestral_col exists and is not NaN
        if ancestral_col not in row or pd.isna(row[ancestral_col]):
            df.at[idx, 'Revised_Value'] = 0
            continue  # Skip if the column is missing or NaN

        lines = [line.strip() for line in row[ancestral_col].split('~~~') if line.strip()]
        value = 0
        lines_copy = lines.copy()

        for segment, freq in sorted_segments:
            segment_list = list(segment)
            # Check if all lines in the segment are present in the record
            if all(line in lines_copy for line in segment_list):
                # Add to value: frequency
                value += freq
                # Remove these lines to prevent double-counting
                for line in segment_list:
                    lines_copy.remove(line)

        # Add 1 for each remaining unique ancestral line
        value += len(lines_copy)

        # Assign the calculated value
        df.at[idx, 'Revised_Value'] = value

    return df

################################################################################
#       Main Processing
################################################################################
# Example: Replace this with your actual DataFrame loading method
# For instance, if you're reading from a CSV:
# combined_df = pd.read_csv("your_data.csv")

# For demonstration, let's create a sample DataFrame similar to your structure
data = {
    'ID#': ['I52816', 'I47130', 'I52836', 'I52807', 'I52810', 'I52559', 'I52802', 'I52798'],
    'Surname': ['leedon', 'leedon', 'leedon', 'leedon', 'leedon', 'yates,timothyj', 'leedon', 'leedon'],
    'Person': ['DucksonPam', 'CrabtreeChadEdmun', 'RileySandra', 'RileyRonldPaul', 'LewallenDonaldChr', 'LeeDonnaStel', 'HoukW', 'HoukJeffrey'],
    'Number': [116, 25, 63, 173, 154, 12, 369, 448],
    'Value': [3, 3, 3, 3, 3, 1, 2, 2],
    'Yates DNA Ancestral Line': [
        'YatesWilliam&ParkerSally~~~YatesLevi&CooleyRebecca~~~YatesAmbroseDu&CooleyElizabeth~~~LeeGeorgeTra&YatesLarcena~~~LeeAlbertEls&LauderbackGoldieIre~~~WheelerRobertJef&YatesRosaAlafa~~~RileyAlveEdwin&WheelerWinifred',
        'YatesWilliam&ParkerSally~~~YatesLevi&CooleyRebecca~~~YatesAmbroseDu&CooleyElizabeth~~~WheelerRobertJef&YatesRosaAlafa~~~RileyAlveEdwin&WheelerWinifred~~~RileyRonaldRay&ShafferShirleyR',
        'YatesWilliam&ParkerSally~~~YatesLevi&CooleyRebecca~~~YatesAmbroseDu&CooleyElizabeth~~~LeeGeorgeTra&YatesLarcena~~~LeeAlbertEls&LauderbackGoldieIre~~~WheelerRobertJef&YatesRosaAlafa~~~RileyAlveEdwin&WheelerWinifred',
        'YatesWilliam&ParkerSally~~~YatesLevi&CooleyRebecca~~~YatesAmbroseDu&CooleyElizabeth~~~LeeGeorgeTra&YatesLarcena~~~LeeAlbertEls&LauderbackGoldieIre~~~WheelerRobertJef&YatesRosaAlafa~~~RileyAlveEdwin&WheelerWinifred~~~RileyAlvinPaul&KellyKathleeA',
        'YatesWilliam&ParkerSally~~~YatesLevi&CooleyRebecca~~~YatesAmbroseDu&CooleyElizabeth~~~LeeGeorgeTra&YatesLarcena~~~LeeAlbertEls&LauderbackGoldieIre~~~WheelerRobertJef&YatesRosaAlafa~~~McMullenRalphDona&WheelerDorothyKa',
        'YatesWilliam&ParkerSally~~~YatesLevi&CooleyRebecca~~~YatesAmbroseDu&CooleyElizabeth~~~LeeGeorgeTra&YatesLarcena~~~LeeAlbertEls&LauderbackGoldieIre~~~AdamsJohnHenry&LeeVioletEli',
        'YatesWilliam&ParkerSally~~~YatesLevi&CooleyRebecca~~~YatesAmbroseDu&CooleyElizabeth~~~LeeGeorgeTra&YatesLarcena~~~AdamsJohnHenry&LeeVioletEli~~~HoukGeorgeEdw&AdamsShirley',
        'YatesWilliam&ParkerSally~~~YatesLevi&CooleyRebecca~~~YatesAmbroseDu&CooleyElizabeth~~~LeeGeorgeTra&YatesLarcena~~~AdamsJohnHenry&LeeVioletEli~~~HoukGeorgeEdw&AdamsShirley'
    ]
}

combined_df = pd.DataFrame(data)

# Define the ancestral lines column name
ancestral_col = "Yates DNA Ancestral Line"

# Reorder columns so "Value" appears before "Yates DNA Ancestral Line"
ordered_columns = ["ID#", "Surname", "Person", "Number", "Value", "Yates DNA Ancestral Line"]
# Check if all ordered_columns exist in combined_df
missing_columns = set(ordered_columns) - set(combined_df.columns)
if missing_columns:
    print(f"Error: The following columns are missing in the DataFrame: {missing_columns}")
else:
    combined_df = combined_df[ordered_columns]
    print("Columns reordered successfully.")

# Perform segmentation to identify shared segments
shared_segments_found = identify_all_shared_segments(combined_df, ancestral_col, min_shared=2, min_size=2)

# Create a DataFrame for segment frequency analysis
seg_df = pd.DataFrame(
    [("~~~".join(seg), freq) for seg, freq in shared_segments_found.items()],
    columns=["Segment", "Frequency"]
).sort_values(by="Frequency", ascending=False)

# Display shared segments and their frequencies
print("\nAll Shared Segments and Frequencies:")
print(seg_df)

# Calculate 'Revised_Value' for each record based on shared segments
df_final = calculate_value_multiple_segments_refined(combined_df, ancestral_col, shared_segments_found)

# Display the final DataFrame with 'Revised_Value'
print("\nFinal DataFrame with Revised_Value:")
print(df_final[['ID#', 'Surname', 'Person', 'Value', 'Revised_Value']])

# Create mapping of ID# to Revised_Value
id_to_value_map = df_final.set_index("ID#")["Revised_Value"].to_dict()

# Optionally, display the mapping
print("\nMapping of ID# to Revised_Value:")
print(id_to_value_map)


################################################################################
# Update value_store Without Printing the Debugging Report
################################################################################
def update_value_store(record_ids, ancestral_lines, segments, value_store):
    """
    Updates value_store with correct 'TOTAL' values for each ID.
    """
    for record_id, line in zip(record_ids, ancestral_lines):
        total_contribution = 0  # Reset total per record
        for segment, freq in segments.items():
            # Check if the segment is present in the ancestral lines
            if all(s in line for s in segment):
                total_contribution += freq

        # Update value_store with the correct 'Value'
        if record_id in value_store:
            value_store[record_id]["Value"] = total_contribution
        else:
            print(f"WARNING: ID {record_id} not found in value_store")

################################################################################
# Initialize value_store
################################################################################
# Create value_store as a dictionary with ID# as keys and relevant information as values
value_store = combined_df.set_index("ID#").T.to_dict()

################################################################################
# Execute Value Calculation
################################################################################
# Extract ancestral lines
all_lines = combined_df["Yates DNA Ancestral Line"].tolist()
# Extract segments_found as shared_segments_found
segments_found = shared_segments_found

update_value_store(
    record_ids=combined_df["ID#"].tolist(),
    ancestral_lines=all_lines,
    segments=segments_found,
    value_store=value_store
)



################################################################################
# Ensure combined_df is updated before HTML output
################################################################################
combined_df["Value"] = combined_df["ID#"].map(lambda id_: value_store[id_]["Value"] if id_ in value_store else None)

# Create hotlink function
def create_hotlink(row):
    """
    Generates an HTML hyperlink for the ID# column.
    """
    url_base = "https://yates.one-name.net/tng/verticalchart.php?personID="
    additional_params = "&tree=tree1&parentset=0&display=vertical&generations=15"

    if pd.notnull(row["ID#"]):
        return f'<a href="{url_base}{row["ID#"]}{additional_params}">{row["ID#"]}</a>'
    return ""  # Return empty if no valid ID#

# Apply the hotlink function and replace "ID#" with clickable links
if "ID#" in combined_df.columns:
    combined_df["ID#"] = combined_df.apply(create_hotlink, axis=1)
else:
    print("Error: 'ID#' column not found in DataFrame")

# Reorder columns so "Value" appears before "Yates DNA Ancestral Line"
#ordered_columns = ["ID#", "Match to", "Name", "cM", "Value", "Yates DNA Ancestral Line"]
#combined_df = combined_df[ordered_columns]

# Sort dataset by "Yates DNA Ancestral Line" in descending order
combined_df.sort_values(by="Yates DNA Ancestral Line", ascending=False, inplace=True)

###############################################################################
#               Configure these Booleans to Enable/Disable Outputs            #
###############################################################################
GENERATE_MAIN_HTML = True
GENERATE_MINIMAL_HTML = False  # ✅ Ensure this variable is declared!

################################################################################
#      Conditional Output: MAIN HTML (Use Boolean Switch)
################################################################################
if GENERATE_MAIN_HTML:
    current_datetime = datetime.now().strftime("%Y-%m-%d_%H%M%S")
    output_html_path = f"htmloutput_{current_datetime}.html"

    # CSS for styling
    css_style = """
    <style>
    table, th, td {
      border: 1px solid black;
      border-collapse: collapse;
    }
    th {
      background-color: #f2f2f2;
      text-align: center;
    }
    td {
      text-align: center;
    }
    td:nth-child(6) {  /* Yates DNA Ancestral Line column */
      text-align: left;
    }
    </style>
    """

    # Generate the HTML table
    html_main = css_style + combined_df.to_html(
        index=False,
        classes="sortable",
        escape=False,
        border=0  # Let CSS handle borders
    )

    # Ensure "Yates DNA Ancestral Line" remains left-aligned
    html_main = html_main.replace(
        "<th>Yates DNA Ancestral Line</th>",
        '<th style="text-align:left;">Yates DNA Ancestral Line</th>'
    )

    # Save HTML to file
    with open(output_html_path, "w", encoding="utf-8") as f:
        f.write(html_main)

################################################################################
#      Minimal 2-Column HTML Output (Optional)
################################################################################
def generate_html_output_with_headers(dict_data):
    """
    Creates an HTML table containing only:
      - 'Match to'
      - 'Yates DNA Ancestral Line'
    sorted DESC by 'Yates DNA Ancestral Line'.
    """
    sorted_data = sorted(dict_data, key=lambda x: x.get("Yates DNA Ancestral Line", ""), reverse=True)

    html_head = """
<style>
table, th, td {
  border: 1px solid black;
  border-collapse: collapse;
}
th {
  background-color: #f2f2f2;
  text-align: left;
}
</style>
<table>
  <tr>
    <th style="text-align:left;">Match to</th>
    <th style="text-align:left;">Yates DNA Ancestral Line</th>
  </tr>
"""
    html_body = []
    for row in sorted_data:
        match_to = row.get("Match to", "")
        yates_line = row.get("Yates DNA Ancestral Line", "")
        html_body.append(f"  <tr><td>{match_to}</td><td>{yates_line}</td></tr>")

    return "\n".join([html_head] + html_body + ["</table>"])

if GENERATE_MINIMAL_HTML:
    records_for_html = combined_df[["Match to", "Yates DNA Ancestral Line"]].to_dict(orient="records")
    minimal_html_code = generate_html_output_with_headers(records_for_html)

    current_datetime = datetime.now().strftime("%Y-%m-%d_%H%M%S")
    minimal_html_path = f"minimal_htmloutput_{current_datetime}.html"

    with open(minimal_html_path, "w", encoding="utf-8") as f:
        f.write(minimal_html_code)

################################################################################
# Export Final Data
################################################################################
#combined_df.to_csv("final_combined_df.csv", index=False)
#seg_df.to_csv("segments_discovered.csv", index=False)
#presence_df.to_csv("segments_presence_absence.csv", index=False)




Automatically selecting the first GEDCOM file.
GEDCOM contained 58271 total records
Records tagged and filtered by NPFX: 1301
Records with YDNA information: 76
Autosomal matches: 1225
Manual filter IDs loaded: 7
After manual filter, total records: 8
Records tagged and filtered by NPFX: 8
Columns reordered successfully.

All Shared Segments and Frequencies:
                                               Segment  Frequency
156  YatesLevi&CooleyRebecca~~~YatesWilliam&ParkerS...          8
154  YatesAmbroseDu&CooleyElizabeth~~~YatesLevi&Coo...          8
155  YatesAmbroseDu&CooleyElizabeth~~~YatesWilliam&...          8
119  YatesAmbroseDu&CooleyElizabeth~~~YatesLevi&Coo...          8
109  LeeGeorgeTra&YatesLarcena~~~YatesLevi&CooleyRe...          7
..                                                 ...        ...
130  HoukGeorgeEdw&AdamsShirley~~~LeeGeorgeTra&Yate...          2
131  HoukGeorgeEdw&AdamsShirley~~~LeeGeorgeTra&Yate...          2
132  HoukGeorgeEdw&AdamsShirley~~~LeeGeorgeTra&