<a href="https://colab.research.google.com/github/ronyates47/Gedcom-Utils/blob/main/03mini_high(excellent)_01_31_2132_2025_stable.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas
!pip install python-gedcom
!pip install openpyxl
!pip install xlsxwriter


Collecting python-gedcom
  Downloading python_gedcom-1.0.0-py2.py3-none-any.whl.metadata (15 kB)
Downloading python_gedcom-1.0.0-py2.py3-none-any.whl (35 kB)
Installing collected packages: python-gedcom
Successfully installed python-gedcom-1.0.0
Collecting xlsxwriter
  Downloading XlsxWriter-3.2.2-py3-none-any.whl.metadata (2.8 kB)
Downloading XlsxWriter-3.2.2-py3-none-any.whl (165 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m165.1/165.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xlsxwriter
Successfully installed xlsxwriter-3.2.2


In [9]:
# 03mini-high_01_31_2029_2025_stable

# Standard Libraries
import csv
import glob
from datetime import datetime

# GEDCOM Parsing
from gedcom.element.individual import IndividualElement
from gedcom.parser import Parser

# Data Processing
import pandas as pd
from openpyxl import load_workbook
from openpyxl.styles import Alignment

anchor_gen1 = None

################################################################################
#                                GedcomDataset Class                           #
################################################################################
class GedcomDataset:
    def __init__(self, gen_person):
        self.gen_person = gen_person
        self.extractable_detail = {}
        self.anchor_gen1 = None  # Initialize anchor_gen1 here

    def add_extractable_detail(self, key, value):
        self.extractable_detail[key] = value

    def get_gen_person(self):
        name = self.extractable_detail.get('NAME', '')
        parts = name.split('/', 1)
        first_name = parts[0].split(' ')[0]
        last_name = parts[1].rstrip('/') if len(parts) > 1 else ""
        self.anchor_gen1 = last_name.replace(" ", "") + first_name.replace(" ", "")
        global anchor_gen1  # Declare that we're using the global variable
        anchor_gen1 = self.anchor_gen1  # Update the global variable
        return self.gen_person.strip('@')

    def get_anchor_gen1(self):
        return self.anchor_gen1

    def get_extractable_NPFX(self):
        return self.extractable_detail.get('NPFX', '')

    def get_extractable_cm(self):
        npfx_value = self.extractable_detail.get('NPFX', '')
        if '&' in npfx_value:
            cm_value = npfx_value.split('&')[0].strip()
        elif '**' in npfx_value:
            cm_value = npfx_value.split('**')[0].strip()
        else:
            cm_value = npfx_value.strip()
        try:
            int(cm_value)
            return cm_value
        except ValueError:
            return ''

    def get_extractable_sort(self):
        npfx_value = self.extractable_detail.get('NPFX', '')
        if '&' in npfx_value:
            sort_part = npfx_value.split('&')[1]
            if '**' in sort_part:
                sort_value = sort_part.split('**')[0].strip()
            else:
                sort_value = sort_part.strip()
            return sort_value
        else:
            return ''

    def get_extractable_YDNA(self):
        npfx_value = self.extractable_detail.get('NPFX', '')
        if '**' in npfx_value:
            ydna_value = npfx_value.split('**')[1].strip()
            return ydna_value
        else:
            return ''

    def get_extractable_FAMC(self):
        return self.extractable_detail.get('FAMC', '').strip('@')


################################################################################
#                           Utility Functions                                  #
################################################################################
def extract_name(record):
    """
    Extracts first and last name from a GEDCOM record.
    Handles missing or malformed names gracefully.
    """
    name_start = record.find('1 NAME ') + 6
    name_end = record.find('\n', name_start)
    if name_start == 5 or name_end == -1:  # Meaning '1 NAME ' was not found
        return "UnknownName"
    name = record[name_start:name_end].strip()
    # Handle cases where no '/' is present in the name
    if '/' not in name:
        return name[:10].replace(" ", "")  # Take first 10 characters as default name
    # Extract first and last name
    first_name, last_name = name.split('/', 1)
    first_name = first_name[:10]  # first 10 chars
    last_name = last_name[:10].rstrip('/')
    return last_name.replace(" ", "") + first_name.replace(" ", "")

name_to_id = {}  # Global dictionary to hold name->ID mapping

################################################################################
#                               Gedcom Class                                   #
################################################################################
class Gedcom:
    def __init__(self, file_name):
        self.file_name = file_name
        self.gedcom_datasets = []
        self.filter_pool = []

    @staticmethod
    def get_standard_name(file_path):
        file_name = file_path.split('/')[-1]
        if '.' in file_name:
            file_name = file_name.rsplit('.', 1)[0]
        standard_name = file_name.replace(' ', '_').lower()
        return standard_name

    def parse_gedcom(self):
        global name_to_id  # we’ll modify name_to_id
        with open(self.file_name, 'r', encoding='utf-8-sig') as f:
            gedcom_lines = f.readlines()

        current_dataset = None
        npfx_count = 0
        ydna_count = 0  # Count YDNA occurrences
        total_count = 0

        for line in gedcom_lines:
            parts = line.strip().split(' ', 2)
            level = int(parts[0])
            tag = parts[1]
            value = parts[2] if len(parts) > 2 else None

            if level == 0 and tag.startswith('@') and tag.endswith('@') and value == 'INDI':
                total_count += 1
                current_dataset = GedcomDataset(tag)
                self.gedcom_datasets.append(current_dataset)
                # Populate name_to_id
                individual_name = current_dataset.get_anchor_gen1()
                individual_id = current_dataset.get_gen_person()
                name_to_id[individual_name] = individual_id
            elif current_dataset is not None:
                if level == 1 and tag in ['NAME', 'FAMC']:
                    current_key = tag
                    current_dataset.add_extractable_detail(current_key, value)
                elif level == 2 and tag == 'NPFX':
                    npfx_count += 1
                    current_dataset.add_extractable_detail(tag, value)
                    if '**' in value:
                        ydna_count += 1  # YDNA found

        autosomal_count = npfx_count - ydna_count
        print(f'GEDCOM contained {total_count} total records')
        print(f'Records tagged and filtered by NPFX: {npfx_count}')
        print(f'Records with YDNA information: {ydna_count}')
        print(f'Autosomal matches: {autosomal_count}')

        # First-level filter: only those with NPFX
        for dataset in self.gedcom_datasets:
            if dataset.get_extractable_NPFX():
                self.filter_pool.append(dataset)

        # Optional second-level filter from an Excel file
        manual_filter_activated = True  # or False
        if manual_filter_activated:
            try:
                df = pd.read_excel('filtered_ids.xlsx')
            except FileNotFoundError:
                print("filtered_ids.xlsx not found. Skipping second-level manual filter.")
            else:
                manual_filtered_ids = set(df['ID'])
                print(f"Manual filter IDs loaded: {len(manual_filtered_ids) - 1}")
                self.filter_pool = [
                    dataset for dataset in self.filter_pool
                    if dataset.get_gen_person() in manual_filtered_ids
                ]
                print(f"After manual filter, total records: {len(self.filter_pool)}")


def input_prime_surname(last_prime_surname=None):
    if last_prime_surname:
        last_name = input(f"Enter prime_surname (default: {last_prime_surname}): ")
        if not last_name:
            last_name = last_prime_surname
    else:
        last_name = input("Enter prime_surname: ")
    return last_name

def select_gedcom_file():
    gedcom_files = glob.glob('*.ged')
    if not gedcom_files:
        print("No GEDCOM files found.")
        return None
    # Automatically select the first GEDCOM file found.
    print("Automatically selecting the first GEDCOM file.")
    return gedcom_files[0]

################################################################################
#          Execute GEDCOM Parsing & Build Our Filter Pool                      #
################################################################################
gedcom_file_path = select_gedcom_file()
if gedcom_file_path:
    gedcom_instance = Gedcom(gedcom_file_path)
    gedcom_instance.parse_gedcom()
    # Gather individuals (last_name, individual_id) from the filter pool
    individuals = []
    for dataset in gedcom_instance.filter_pool:
        individual_id = dataset.get_gen_person()
        last_name = dataset.get_anchor_gen1()
        individuals.append((last_name, individual_id))
    print(f'Records tagged and filtered by NPFX: {len(individuals)}')

    ################################################################################
    # Function: Extract ID from GEDCOM Record
    ################################################################################
    def extract_id(record):
        """
        Extracts the ID from a GEDCOM record.
        A valid ID is enclosed within '@' symbols.
        """
        id_start = record.find('@') + 1
        id_end = record.find('@', id_start)
        if id_start == 0 or id_end == -1:
            return "UnknownID"
        return record[id_start:id_end].strip()

    # Read the GEDCOM file as raw text and split into records
    with open(gedcom_file_path, 'r', encoding='utf-8') as file:
        data = file.read()
    data = data.split('\n0 ')
    records = {extract_id(record): record for record in data}
else:
    print("No GEDCOM file selected; exiting.")
    raise SystemExit

################################################################################
#        Functions to Traverse & Score Ancestors, Build Data for DataFrame     #
################################################################################
def has_both_parents(records, mother_id, father_id):
    return mother_id in records and father_id in records

visited_pairs = set()
generation_table = []

def find_parents(individual_id, generation, records):
    if individual_id not in records:
        return
    record = records[individual_id]
    famc_start = record.find('1 FAMC @') + 8
    famc_end = record.find('@', famc_start)
    famc_id = record[famc_start:famc_end]
    if famc_id not in records:
        return
    fam_record = records[famc_id]
    wife_start = fam_record.find('1 WIFE @') + 8
    wife_end = fam_record.find('@', wife_start)
    mother_id = fam_record[wife_start:wife_end]
    husb_start = fam_record.find('1 HUSB @') + 8
    husb_end = fam_record.find('@', husb_start)
    father_id = fam_record[husb_start:husb_end]
    if mother_id and mother_id in records and father_id and father_id in records:
        parent_pair = (father_id, mother_id)
        if parent_pair not in visited_pairs:
            visited_pairs.add(parent_pair)
            generation_table.append((generation, parent_pair))
    if mother_id:
        find_parents(mother_id, generation + 1, records)
    if father_id:
        find_parents(father_id, generation + 1, records)

def find_distant_ancestors(individual_id, records, path=None):
    path = path if path is not None else []
    path.append(individual_id)
    if individual_id not in records:
        return [path]
    record = records[individual_id]
    famc_start = record.find('1 FAMC @') + 8
    famc_end = record.find('@', famc_start)
    famc_id = record[famc_start:famc_end]
    if famc_id not in records:
        return [path]
    fam_record = records[famc_id]
    wife_start = fam_record.find('1 WIFE @') + 8
    wife_end = fam_record.find('@', wife_start)
    mother_id = fam_record[wife_start:wife_end]
    husb_start = fam_record.find('1 HUSB @') + 8
    husb_end = fam_record.find('@', husb_start)
    father_id = fam_record[husb_start:husb_end]
    if father_id is None and mother_id is None:
        return [path]
    paths = []
    if father_id:
        new_path = list(path)
        paths.extend(find_distant_ancestors(father_id, records, new_path))
    if mother_id:
        new_path = list(path)
        paths.extend(find_distant_ancestors(mother_id, records, new_path))
    return paths

def calculate_score(distant_ancestors_paths, records):
    name_paths = []
    for path in distant_ancestors_paths:
        name_path = [extract_name(records.get(id, '')) for id in path]
        name_paths.append(name_path)
    path_scores = {}
    for idx, name_path in enumerate(name_paths):
        score = 0
        for generation, name in enumerate(name_path):
            if 'Yates' in name:
                score += 1 * (generation + 1)
        path_scores[idx] = score
    if path_scores:
        winning_path_index = max(path_scores, key=path_scores.get)
        winning_path_score = path_scores[winning_path_index]
        winning_path_names = name_paths[winning_path_index]
        winning_path_ids = distant_ancestors_paths[winning_path_index]
    else:
        winning_path_index = None
        winning_path_score = 0
        winning_path_names = []
        winning_path_ids = []
    return winning_path_score, winning_path_names, winning_path_ids

def filter_ancestral_line(winning_path_ids, generation_table):
    matching_table = []
    for generation, pair in generation_table:
        id1, id2 = pair
        if id1 in winning_path_ids or id2 in winning_path_ids:
            matching_table.append((generation, pair))
    return matching_table

def process_individual(individual_id, gedcom_instance, records):
    global generation_table
    global visited_pairs
    global anchor_gen1  # We'll update anchor_gen1 if found

    generation_table = []
    visited_pairs = set()
    # Build generation_table and visited_pairs
    find_parents(individual_id, 1, records)
    distant_ancestors_paths = find_distant_ancestors(individual_id, records)
    winning_path_score, winning_path_names, winning_path_ids = calculate_score(distant_ancestors_paths, records)
    filtered_ancestral_line = filter_ancestral_line(winning_path_ids, generation_table)
    filtered_ancestral_line.sort(key=lambda x: x[0])
    filtered_ancestral_line_names = []

    # Gather more info from the dataset
    for dataset in gedcom_instance.filter_pool:
        if dataset.get_gen_person() == individual_id:
            cm_value = dataset.get_extractable_cm()
            sort_value = dataset.get_extractable_sort()
            ydna_value = dataset.get_extractable_YDNA()
            anchor_gen1 = dataset.get_anchor_gen1()
            break
    else:
        cm_value = ''
        sort_value = ''
        ydna_value = ''
        anchor_gen1 = None

    # Build ancestral line (exclude anchor_gen1 itself)
    for generation, pair in filtered_ancestral_line:
        name_pair = [extract_name(records.get(id, '')) for id in pair]
        formatted_name_pair = f"{name_pair[0]}&{name_pair[1]}"
        filtered_ancestral_line_names.append(formatted_name_pair)
    filtered_ancestral_line_names.reverse()
    filtered_ancestral_line_str = "~~~".join(filtered_ancestral_line_names)
    if anchor_gen1 in filtered_ancestral_line_names:
        raise ValueError(f"anchor_gen1 ({anchor_gen1}) was mistakenly included in the ancestral line.")
    individual_data = {
        'cM': cm_value,
        'Sort': sort_value,
        'YDNA': ydna_value,
        'Filtered Ancestral Line': filtered_ancestral_line_str
    }
    return individual_data, filtered_ancestral_line_str

################################################################################
#         Build Rows for DataFrame from the Filter Pool                        #
################################################################################
combined_df_rows = []
for dataset in gedcom_instance.filter_pool:
    individual_id = dataset.get_gen_person()
    visited_pairs.clear()
    generation_table = []
    individual_data, filtered_ancestral_line_str = process_individual(
        individual_id, gedcom_instance, records
    )
    cm = individual_data["cM"]
    sort = individual_data["Sort"]
    individual_name = extract_name(records.get(individual_id, ""))
    combined_df_rows.append(
        [individual_id, sort, individual_name, cm, filtered_ancestral_line_str]
    )

################################################################################
#       NO NEED TO MODIFY ABOVE THIS SECTION 28-1-2025                         #
################################################################################

# Create and Populate Main DataFrame (combined_df) using all desired columns
columns = ["ID#", "Match to", "Name", "cM", "Yates DNA Ancestral Line"]
combined_df = pd.DataFrame(combined_df_rows, columns=columns)

print("\nDEBUG: 481-Initial DataFrame created from combined_df_rows")
print("Columns:", combined_df.columns.tolist())
print(combined_df.head(5))

# Initialize the value_store dictionary (if needed)
value_store = {}
for _, row in combined_df.iterrows():
    value_store[row["ID#"]] = {
        "Match to": row["Match to"],
        "Name": row["Name"],
        "cM": row["cM"],
        "Yates DNA Ancestral Line": row["Yates DNA Ancestral Line"],
        "Value": None  # Placeholder for 'Value'
    }

################################################################################
#       Remove miscellaneous Distant ancestors (combined_df)
################################################################################
def remove_prefix(row):
    ancestral_line = row["Yates DNA Ancestral Line"]
    prefix_to_remove = "YatesJohn&HydeAlice~~~YatesThomas&WhiteFrances~~~"
    if ancestral_line.startswith(prefix_to_remove):
        row["Yates DNA Ancestral Line"] = ancestral_line[len(prefix_to_remove):]
    return row

combined_df = combined_df.apply(remove_prefix, axis=1)
print("\nDEBUG: 517-DataFrame after removing prefix")
print("Columns:", combined_df.columns.tolist())
print(combined_df.head(5))

# Order and clean up columns (do not drop 'Match to' and 'Name')
ordered_columns = ["ID#", "Match to", "Name", "cM", "Yates DNA Ancestral Line"]
combined_df = combined_df[ordered_columns]
combined_df.index += 1
combined_df.sort_values(by=["Match to", "Yates DNA Ancestral Line"], ascending=[False, True], inplace=True)
print("\nDEBUG: 530-DataFrame after reordering and sorting")
print("Columns:", combined_df.columns.tolist())
print(combined_df.head(5))

import numpy as np
from collections import defaultdict
from itertools import combinations

################################################################################
# Ensure Required Columns Exist in Memory (for processing)
################################################################################
expected_columns = ["ID#", "Value", "Yates DNA Ancestral Line"]
for col in expected_columns:
    if col not in combined_df.columns:
        if col == "Value":
            combined_df[col] = 0
        else:
            combined_df[col] = "Unknown"
print("✅ Required columns for processing are now present in combined_df.")

################################################################################
#       Segmentation and Frequency Analysis
################################################################################
def parse_line_to_pairs(line, delimiter="~~~"):
    return line.strip().split(delimiter) if pd.notna(line) else []

def identify_all_shared_segments(df, ancestral_col, min_shared=2, min_size=2):
    segment_counts = defaultdict(int)
    for _, row in df.iterrows():
        if pd.isna(row[ancestral_col]):
            continue
        lines = [line.strip() for line in row[ancestral_col].split('~~~') if line.strip()]
        for size in range(min_size, len(lines) + 1):
            for subset in combinations(sorted(lines), size):
                segment_counts[subset] += 1
    shared_segments = {segment: count for segment, count in segment_counts.items() if count >= min_shared}
    return dict(sorted(shared_segments.items(), key=lambda x: len(x[0]), reverse=True))

def calculate_value(df, ancestral_col, shared_segments):
    sorted_segments = sorted(shared_segments.items(), key=lambda x: len(x[0]), reverse=True)
    df["Value"] = 0
    for idx, row in df.iterrows():
        if pd.isna(row[ancestral_col]):
            df.at[idx, "Value"] = 0
            continue
        lines = [line.strip() for line in row[ancestral_col].split('~~~') if line.strip()]
        value = 0
        lines_copy = lines.copy()
        for segment, freq in sorted_segments:
            segment_list = list(segment)
            if all(line in lines_copy for line in segment_list):
                value += freq
                for line in segment_list:
                    lines_copy.remove(line)
        value += len(lines_copy)
        df.at[idx, "Value"] = value
    return df

################################################################################
#       Main Processing for Segmentation & Value Calculation
################################################################################
# Create a separate DataFrame for processing so that we do not lose "Match to" and "Name"
df_process = combined_df[["ID#", "Yates DNA Ancestral Line"]].copy()
df_process["Value"] = 0
print("\nDEBUG: Processing DataFrame (df_process) created for segmentation")
print("Columns:", df_process.columns.tolist())
print(df_process.head(5))

ancestral_col = "Yates DNA Ancestral Line"
shared_segments_found = identify_all_shared_segments(df_process, ancestral_col, min_shared=2, min_size=2)
seg_df = pd.DataFrame(
    [("~~~".join(seg), freq) for seg, freq in shared_segments_found.items()],
    columns=["Segment", "Frequency"]
).sort_values(by="Frequency", ascending=False)
print("\n✅ All Shared Segments and Frequencies:")
print(seg_df)

df_process = calculate_value(df_process, ancestral_col, shared_segments_found)
print("\n✅ Final Processed DataFrame with Value (df_process):")
print(df_process[["ID#", "Value"]])
id_to_value_map = df_process.set_index("ID#")["Value"].to_dict()
print("\n✅ Mapping of ID# to Value:")
print(id_to_value_map)

# Merge the calculated "Value" column back into combined_df.
merged_df = combined_df.merge(df_process[["ID#", "Value"]], on="ID#", how="left", suffixes=("", "_new"))
if "Value" not in merged_df.columns and "Value_new" in merged_df.columns:
    merged_df.rename(columns={"Value_new": "Value"}, inplace=True)
else:
    # If both exist, choose the one from the processing DataFrame.
    if "Value_new" in merged_df.columns:
        merged_df["Value"] = merged_df["Value_new"]
        merged_df.drop(columns=["Value_new"], inplace=True)
combined_df = merged_df

combined_df.sort_values(by="Yates DNA Ancestral Line", ascending=False, inplace=True)
print("\n✅ Processing Complete! 🚀")
print("Combined DataFrame now (first 5 rows):")
print(combined_df.head(5))

################################################################################
#       Generate HTML Output (Optional)
################################################################################
GENERATE_HTML = False  # Set to True if HTML output is needed
if GENERATE_HTML:
    current_datetime = datetime.now().strftime("%Y-%m-%d_%H%M%S")
    output_html_path = f"html_output_{current_datetime}.html"
    css_style = """
    <style>
    table, th, td {
      border: 1px solid black;
      border-collapse: collapse;
    }
    th {
      background-color: #f2f2f2;
      text-align: center;
    }
    td {
      text-align: center;
    }
    td:nth-child(6) {
      text-align: left;
    }
    </style>
    """
    html_content = css_style + combined_df.to_html(index=False, escape=False, border=0)
    with open(output_html_path, "w", encoding="utf-8") as f:
        f.write(html_content)
    print(f"✅ HTML output generated: {output_html_path}")

################################################################################
# Export Final Data (Optional)
################################################################################
EXPORT_CSV = False  # Change to True if needed
if EXPORT_CSV:
    combined_df.to_csv("final_combined_df.csv", index=False)
    seg_df.to_csv("segments_discovered.csv", index=False)
print("✅ Data processing complete!")

################################################################################
# Detailed Report Generation for Segment Breakdown
################################################################################
sorted_segments = sorted(shared_segments_found.items(), key=lambda x: len(x[0]), reverse=True)
report_data = []
for idx, row in combined_df.iterrows():
    individual_id = row["ID#"]
    ancestral_line = row["Yates DNA Ancestral Line"]
    if pd.isna(ancestral_line) or ancestral_line.strip() == "":
        continue
    segments = [seg.strip() for seg in ancestral_line.split("~~~") if seg.strip()]
    segments_copy = segments.copy()
    cumulative_value = 0
    breakdown_entries = []
    for segment_tuple, freq in sorted_segments:
        segment_list = list(segment_tuple)
        if all(seg in segments_copy for seg in segment_list):
            cumulative_value += freq
            breakdown_entries.append({
                "Individual ID": individual_id,
                "Segment": "&".join(segment_list),
                "Segment Type": "Shared",
                "Frequency": freq,
                "Contribution": freq,
                "Cumulative Value": cumulative_value
            })
            for seg in segment_list:
                segments_copy.remove(seg)
    for leftover in segments_copy:
        cumulative_value += 1
        breakdown_entries.append({
            "Individual ID": individual_id,
            "Segment": leftover,
            "Segment Type": "Leftover",
            "Frequency": 1,
            "Contribution": 1,
            "Cumulative Value": cumulative_value
        })
    breakdown_entries.append({
        "Individual ID": individual_id,
        "Segment": "Final Calculated Value",
        "Segment Type": "",
        "Frequency": "",
        "Contribution": "",
        "Cumulative Value": cumulative_value
    })
    report_data.extend(breakdown_entries)
    report_data.append({
        "Individual ID": "",
        "Segment": "",
        "Segment Type": "",
        "Frequency": "",
        "Contribution": "",
        "Cumulative Value": ""
    })
report_df = pd.DataFrame(report_data, columns=[
    "Individual ID", "Segment", "Segment Type", "Frequency", "Contribution", "Cumulative Value"
])
report_csv_filename = "detailed_segments_report.csv"
report_df.to_csv(report_csv_filename, index=False)
print(f"Detailed segment breakdown report has been exported to '{report_csv_filename}'.")

################################################################################
# Calculate Additional Scores: Standard Z-Score, Robust Z-Score, and Percentile Rank
################################################################################
combined_df["Value"] = pd.to_numeric(combined_df["Value"], errors='coerce')
mean_value = combined_df["Value"].mean()
std_value = combined_df["Value"].std()
combined_df["Z-Score"] = (combined_df["Value"] - mean_value) / std_value
median_value = combined_df["Value"].median()
mad_value = np.median(np.abs(combined_df["Value"] - median_value))
if mad_value == 0:
    combined_df["Robust Z-Score"] = 0
else:
    combined_df["Robust Z-Score"] = (combined_df["Value"] - median_value) / (mad_value * 1.4826)
combined_df["Percentile Rank"] = combined_df["Value"].rank(pct=True) * 100

################################################################################
# Repair DataFrame Columns: Set to "ID#", "Match to", "Name", "Value",
# "Z-Score", "Robust Z-Score", "Percentile Rank", "Yates DNA Ancestral Line"
################################################################################
final_columns = ["ID#", "Match to", "Name", "Value", "Z-Score", "Robust Z-Score", "Percentile Rank", "Yates DNA Ancestral Line"]
for col in final_columns:
    if col not in combined_df.columns:
        combined_df[col] = None
combined_df = combined_df[final_columns]
print("\nDEBUG: Final DataFrame after repairing columns")
print("Columns:", combined_df.columns.tolist())
print(combined_df.head(10))

################################################################################
# Export the final DataFrame with the additional scores to CSV
################################################################################
output_filename = "final_combined_df_with_scores.csv"
combined_df.to_csv(output_filename, index=False)
print(f"Final DataFrame with Z-Scores, Robust Z-Scores, and Percentile Ranks exported to '{output_filename}'.")




Automatically selecting the first GEDCOM file.
GEDCOM contained 58271 total records
Records tagged and filtered by NPFX: 1301
Records with YDNA information: 76
Autosomal matches: 1225
Manual filter IDs loaded: 7
After manual filter, total records: 8
Records tagged and filtered by NPFX: 8

DEBUG: 481-Initial DataFrame created from combined_df_rows
Columns: ['ID#', 'Match to', 'Name', 'cM', 'Yates DNA Ancestral Line']
      ID#        Match to               Name   cM  \
0  I47130          leedon  CrabtreeChadEdmun   25   
1  I52559  yates,timothyj       LeeDonnaStel   12   
2  I52798          leedon        HoukJeffrey  448   
3  I52802          leedon              HoukW  369   
4  I52807          leedon     RileyRonldPaul  173   

                            Yates DNA Ancestral Line  
0  YatesWilliam&ParkerSally~~~YatesLevi&CooleyReb...  
1  YatesWilliam&ParkerSally~~~YatesLevi&CooleyReb...  
2  YatesWilliam&ParkerSally~~~YatesLevi&CooleyReb...  
3  YatesWilliam&ParkerSally~~~YatesLevi&C