<a href="https://colab.research.google.com/github/ronyates47/Gedcom-Utils/blob/main/a_Surname_GEDCOM_Utility_stable_v_230812_1041_hrs_EXP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas
!pip install python-gedcom

Collecting python-gedcom
  Downloading python_gedcom-1.0.0-py2.py3-none-any.whl (35 kB)
Installing collected packages: python-gedcom
Successfully installed python-gedcom-1.0.0


In [7]:
# Base script-stable-selects GEDCOM-correct output

import glob
from gedcom.element.individual import IndividualElement
from gedcom.parser import Parser
import pandas as pd

class Gedcom:
    def __init__(self, file_name):
        self.file_name = file_name
        self.gedcom_datasets = []
        self.filter_pool = []

    @staticmethod
    def get_standard_name(file_path):
        file_name = file_path.split("/")[-1]
        if "." in file_name:
            file_name = file_name.rsplit(".", 1)[0]
        standard_name = file_name.replace(" ", "_").lower()
        return standard_name

def select_gedcom_file():  # (THIS BIT MAKES USER PICK THE GEDCOM FROM COLAB CONTENTS, ANY NAME CAN BE USED)
    gedcom_files = glob.glob("*.ged")
    if not gedcom_files:
        print("No GEDCOM files found in the Colab contents.")
        return None

    print("List of GEDCOM files:")
    for i, file_name in enumerate(gedcom_files, 1):
        print(f"{i}. {file_name}")

    while True:
        try:
            selected_num = int(
                input("Enter the number of the GEDCOM file you want to use: ")
            )
            if 1 <= selected_num <= len(gedcom_files):
                return gedcom_files[selected_num - 1]
            else:
                print("Invalid number. Please enter a valid number from the list.")
        except ValueError:
            print("Invalid input. Please enter a valid number.")

# Call the function to let the user select the GEDCOM file
gedcom_file_path = select_gedcom_file()
if gedcom_file_path:
    # Use the selected GEDCOM file path to create an instance of the Gedcom class
    gedcom_instance = Gedcom(gedcom_file_path)


# Initialize the last_prime_surname variable
last_prime_surname = None

# Print the value of prime_surname to the console


def input_prime_surname(
    last_prime_surname=None,
):  # (THIS BIT DEFINES, SURNAME USING AND THEREAFTER PRIME_VALUE TO SELECT DNA_LINE)
    if last_prime_surname:
        last_name = input(f"Enter prime_surname (default: {last_prime_surname}): ")
        if not last_name:
            last_name = last_prime_surname
    else:
        last_name = input("Enter prime_surname: ")
    return last_name


# Call the function to let the user input prime_surname
prime_surname = input_prime_surname(last_prime_surname)

# Store the value of prime_surname for later use
last_prime_surname = prime_surname




def parse_gedcom(
        self,
    ):  # (THIS BIT FILTERS PREFIX IN OR OUT TO CREAT SMALLLER POOL)
        with open(self.file_name, "r", encoding="utf-8-sig") as f:
            gedcom_lines = f.readlines()

        current_dataset = None
        npfx_count = 0
        total_count = 0
        for line in gedcom_lines:
            parts = line.strip().split(" ", 2)
            level = int(parts[0])
            tag = parts[1]
            value = parts[2] if len(parts) > 2 else None

            if (
                level == 0
                and tag.startswith("@")
                and tag.endswith("@")
                and value == "INDI"
            ):
                total_count += 1
                current_dataset = GedcomDataset(tag)
                self.gedcom_datasets.append(current_dataset)
            elif current_dataset is not None:
                if level == 1 and tag in ["NAME", "FAMC"]:
                    current_key = tag
                    current_dataset.add_extractable_detail(current_key, value)
                elif level == 2 and tag == "NPFX":
                    npfx_count += 1
                    current_dataset.add_extractable_detail(tag, value)

                print(f"Found {total_count} total records")  # (MAY NOT NEED)

        for dataset in self.gedcom_datasets:
            if dataset.get_extractable_NPFX():
                self.filter_pool.append(dataset)


class GedcomDataset:  # (THIS BIT WORKS ONLY ON FILTERED RECORDS, CREATE THE RECORD NAME, cM AND SORT)
    def __init__(self, gen_person):
        self.gen_person = gen_person
        self.extractable_detail = {}

    def add_extractable_detail(self, key, value):
        self.extractable_detail[key] = value

    def get_gen_person(self):
        name = self.extractable_detail.get("NAME", "")
        first_name, last_name = name.split("/", 1)
        first_name = first_name.split(" ")[0]
        last_name = last_name.rstrip("/")
        self.anchor_gen1 = last_name.replace(" ", "") + first_name.replace(" ", "")
        return self.gen_person.strip("@")

    def get_anchor_gen1(self):
        return self.anchor_gen1

    def get_extractable_NPFX(self):
        return self.extractable_detail.get("NPFX", "")

    def get_extractable_cm(self):
        npfx_value = self.extractable_detail.get("NPFX", "")
        if "&" in npfx_value:
            cm_value = npfx_value.split("&")[0].strip()
        else:
            cm_value = npfx_value.strip()
        try:
            int(cm_value)
            return cm_value
        except ValueError:
            return "error"

    def get_extractable_sort(self):
        npfx_value = self.extractable_detail.get("NPFX", "")
        if "&" in npfx_value:
            sort_value = npfx_value.split("&")[1].strip()
            return sort_value
        else:
            return ""

    def get_extractable_FAMC(self):
        return self.extractable_detail.get("FAMC", "").strip("@")


# Function definitions
def extract_id(record):
    id_start = record.find("@") + 1
    id_end = record.find("@", id_start)
    return record[id_start:id_end]


def extract_name(
    record,
):  # (THIS BIT DEFINES THE NUMBER OF CHARACTERS SHOWN IN FIRST NAME, MANAGES INTIALS AND MIDDLE NAMES)
    name_start = record.find("1 NAME ") + 6
    name_end = record.find("\n", name_start)
    name = record[name_start:name_end]
    first_name, last_name = name.split("/", 1)
    first_name = first_name[
        :10
    ]  # Use slicing syntax to extract the first 10 characters of the first_name variable
    last_name = last_name[:10].rstrip(
        "/"
    )  # Use slicing syntax to extract the first 10 characters of the last_name variable
    return last_name.replace(" ", "") + first_name.replace(" ", "")


def find_parents(
    individual_id, generation, records
):  # (THIS BIT BEGINS BUILDING ANCESTRAL LINE FOR EACH RECORD)
    if individual_id not in records:
        return
    record = records[individual_id]
    famc_start = record.find("1 FAMC @") + 8
    famc_end = record.find("@", famc_start)
    famc_id = record[famc_start:famc_end]
    if famc_id not in records:
        return

    fam_record = records[famc_id]
    wife_start = fam_record.find("1 WIFE @") + 8
    wife_end = fam_record.find("@", wife_start)
    mother_id = fam_record[wife_start:wife_end]

    husb_start = fam_record.find("1 HUSB @") + 8
    husb_end = fam_record.find("@", husb_start)
    father_id = fam_record[husb_start:husb_end]

    if mother_id and mother_id in records:
        mother_record = records[mother_id]
        mother_name = extract_name(mother_record)
    else:
        mother_name = None

    if father_id and father_id in records:
        father_record = records[father_id]
        father_name = extract_name(father_record)
    else:
        father_name = None

    if mother_name is not None and father_name is not None:
        parent_pair = father_name + "&" + mother_name
        if parent_pair not in visited_pairs:
            visited_pairs.add(parent_pair)
            if has_both_parents(records, mother_id, father_id):
                generation_table.loc[len(generation_table)] = [generation, parent_pair]

    if mother_id:
        find_parents(mother_id, generation + 1, records)

    if father_id:
        find_parents(father_id, generation + 1, records)


def has_both_parents(records, mother_id, father_id):
    return mother_id in records and father_id in records


visited_pairs = set()


def process_individual(
    individual_id, gedcom_instance
):  # (THIS BIT DEFINES PARENT PAIR AS A GENERATION AND CREATE A TABLE)
    global generation_table
    generation_table = pd.DataFrame(columns=["Generation", "Parent Pair"])
    global visited_pairs
    visited_pairs = set()

    find_parents(individual_id, 1, records)

    # Concatenate the value of the anchor_gen1 attribute to the beginning of the generation table #(THIS BIT MAKES RECORD PERSON THE 1ST GENERATION)
    for dataset in gedcom_instance.filter_pool:
        if dataset.get_gen_person() == individual_id:
            anchor_gen1 = dataset.get_anchor_gen1()
            generation_table.loc[0] = [1, anchor_gen1]
            break

    generation_table = generation_table.sort_values(
        "Generation", ascending=False
    ).reset_index(drop=True)

# (list of tuples, each tuple contains an individual ID & corresponding individual data returned by the process_individual function)

individuals = []
for dataset in gedcom_instance.filter_pool:
    individual_id = dataset.get_gen_person()
    individual_data = process_individual(individual_id, gedcom_instance)
    individuals.append((individual_id, individual_data))

    individual_data = {}
    for dataset in gedcom_instance.filter_pool:
        if dataset.get_gen_person() == individual_id:
            individual_data["cM"] = dataset.get_extractable_cm()
            individual_data["Sort"] = dataset.get_extractable_sort()
            break

    individual_data["Parent Pairs A10"] = "|".join(
        [f"{row['Parent Pair']}" for index, row in generation_table[:10].iterrows()]
    )
    return individual_data

    # Initialize the list of individuals
    individuals = []

    # Iterate over the filter_pool list and add each individual's last name and ID to the individuals list
    for dataset in gedcom_instance.filter_pool:
        individual_id = dataset.get_gen_person()
        last_name = dataset.get_anchor_gen1()
        individuals.append((last_name, individual_id))

    print(f"Total records found: {len(individuals)}")

    # Read the GEDCOM file and split it into individual and family records
    with open(gedcom_file_path, "r") as file:
        data = file.read()
    data = data.split("\n0 ")
    records = {extract_id(record): record for record in data}

    # Global variables
    generation_table = None
    visited_pairs = None


# Run the process_individual function for each individual in the individuals list and create a combined DataFrame
combined_df_rows = []
for name, individual_id in individuals:
    individual_data = process_individual(individual_id, gedcom_instance)
    cm = individual_data["cM"]
    sort = individual_data["Sort"]
    parent_pairs_a10 = individual_data["Parent Pairs A10"]
    most_distant_ancestor = generation_table.iloc[0][
        "Parent Pair"
    ]  # Get the most distant ancestor from the last row of generation_table
    combined_df_rows.append(
        [individual_id, name, sort, cm, most_distant_ancestor, parent_pairs_a10]
    )

combined_df = pd.DataFrame(
    combined_df_rows,
    columns=[
        "ID#",
        "Name",
        "Match to",
        "cM",
        "Most Distant Ancestor",
        "Ancestral Line A10",
    ],
)

# Function to create hotlinks
def create_hotlink(row):
    url_base = "https://yates.one-name.net/tng/verticalchart.php?personID="
    person_id = row["ID#"]
    hotlink = f'<a href="{url_base}{person_id}&tree=tree1&parentset=0&display=vertical&generations=8" target="_blank">{person_id}</a>'
    return hotlink


# Apply the hotlink function to create the 'LUN#' column
combined_df["LUN#"] = combined_df.apply(lambda row: create_hotlink(row), axis=1)

# Change the order of the columns
combined_df = combined_df[
    [
        "ID#",
        "Name",
        "Match to",
        "cM",
        "Most Distant Ancestor",
        "LUN#",
        "Ancestral Line A10",
    ]
]

# Adjust index to start from 1 instead of 0
combined_df.index = combined_df.index + 1

# Print all records from the DataFrame
print(combined_df)

# Export the combined_df DataFrame to an Excel file
combined_df.to_excel("/content/output.xlsx", index=False)

List of GEDCOM files:
1. dna_generations.ged
Enter the number of the GEDCOM file you want to use: 1
Enter prime_surname: yates


SyntaxError: ignored