<a href="https://colab.research.google.com/github/ronyates47/Gedcom-Utils/blob/main/2023_0916_1258_hrs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
# total script

import glob
from gedcom.element.individual import IndividualElement
from gedcom.parser import Parser
import pandas as pd
import os

# Module-level state shared by the functions defined further down.
# A `global` statement only has an effect inside a function body, so plain
# assignments are all that is needed at the top level.
name = None
cM = None
sort = None

# Traversal bookkeeping. find_parents_new() reads and updates visited_pairs.
find_parents_new_counter = 0
last_pair_counter = 0
visited_pairs = set()

# Surname used to score ancestral branches.
last_prime_surname = 'Yates'

def select_gedcom_file():
    """Pick the GEDCOM file to process from the current working directory.

    Lists every ``*.ged`` file found and auto-selects the first one in sorted
    order. ``glob.glob`` returns names in arbitrary order, so sorting keeps
    the selection stable from run to run.

    Returns:
        The selected file name, or ``None`` when no ``*.ged`` file exists.
    """
    gedcom_files = sorted(glob.glob('*.ged'))
    if not gedcom_files:
        print("No GEDCOM files found in the Colab contents.")
        return None

    print("List of GEDCOM files:")
    for i, file_name in enumerate(gedcom_files, 1):
        print(f"{i}. {file_name}")

    # Auto-select the first file (alphabetically)
    print("Auto-selected the first GEDCOM file.")
    return gedcom_files[0]

def extract_id(record):
    """Return the first ``@ID@`` cross-reference found in a GEDCOM record.

    Args:
        record: Text of one GEDCOM record.

    Returns:
        The alphanumeric ID without the surrounding ``@`` characters, or
        ``None`` when the text contains no such ID.
    """
    # Imported locally so the helper does not depend on a module-level import.
    import re

    match = re.search(r'@([A-Za-z0-9]+)@', record)
    return match.group(1) if match else None

# Gedcom Class Definition
class Gedcom:
    """Minimal GEDCOM reader that collects individuals and their NPFX data.

    Attributes are populated on construction:
        file_name: Path of the GEDCOM file that was parsed.
        gedcom_datasets: One GedcomDataset per ``INDI`` record, in file order.
        filter_pool: The subset of gedcom_datasets with a non-empty NPFX value.
    """

    def __init__(self, file_name):
        self.file_name = file_name
        self.gedcom_datasets = []
        self.filter_pool = []
        self.parse_gedcom()
        self.filter_individuals()

    def parse_gedcom(self):
        """Parse ``self.file_name`` and rebuild ``self.gedcom_datasets``.

        The list is rebuilt from scratch, so calling this method again does
        not duplicate individuals.
        """
        with open(self.file_name, 'r', encoding='utf-8-sig') as f:
            gedcom_lines = f.readlines()

        self.gedcom_datasets = []
        current_dataset = None
        for line in gedcom_lines:
            parts = line.strip().split(' ', 2)
            if len(parts) < 2:
                # Blank line or a line with no tag: nothing to extract.
                continue
            try:
                level = int(parts[0])
            except ValueError:
                # Not a "<level> <tag> ..." line; skip it instead of aborting the parse.
                continue
            tag = parts[1]
            value = parts[2] if len(parts) > 2 else None

            if level == 0:
                if tag.startswith('@') and tag.endswith('@') and value == 'INDI':
                    current_dataset = GedcomDataset(tag.strip('@'))
                    self.gedcom_datasets.append(current_dataset)
                else:
                    # Any other level-0 record ends the current individual, so
                    # its level-1 lines (e.g. a NAME under REPO or SUBM) are not
                    # attributed to the previous person.
                    current_dataset = None
            elif current_dataset is not None:
                if level == 1 and tag in ['NAME', 'FAMC', 'cm', 'sort']:
                    current_dataset.add_extractable_detail(tag, value)
                elif level == 2 and tag == 'NPFX':
                    current_dataset.add_extractable_detail(tag, value)

    def filter_individuals(self):
        """Rebuild ``self.filter_pool`` with the datasets that carry an NPFX value."""
        self.filter_pool = [
            dataset for dataset in self.gedcom_datasets
            if dataset.get_extractable_NPFX()
        ]


class GedcomDataset:
    """Details extracted for one GEDCOM individual.

    ``extractable_detail`` maps GEDCOM tags (e.g. ``NAME``, ``FAMC``, ``NPFX``)
    to the raw text that followed the tag. The NPFX value is read as
    ``"<cM> & <sort>"``: the part before the first ``&`` is the cM figure and
    the part after it is the sort key.
    """

    def __init__(self, individual_id):
        self.individual_id = individual_id
        self.extractable_detail = {}
        self.anchor_gen1 = None
        self.ancestral_line = []

    def get_anchor_gen1(self):
        """Return the anchor string set by get_gen_person(), or None before that call."""
        return self.anchor_gen1

    def add_extractable_detail(self, key, value):
        """Store ``value`` under ``key``, replacing any previous value."""
        self.extractable_detail[key] = value

    def _npfx(self):
        """Return the NPFX text, treating a missing or empty value as ''."""
        return self.extractable_detail.get('NPFX') or ''

    def get_gen_person(self):
        """Build the ``<Surname><FirstGivenName>`` anchor and return the individual ID.

        Side effects: stores the anchor on ``self.anchor_gen1`` and also writes
        the module-level ``name`` and ``anchor_gen1`` variables (kept for
        backward compatibility with the rest of the script).

        A NAME without a ``/surname/`` part, or with no value at all, yields an
        anchor built from the given name only instead of raising.
        """
        global name, anchor_gen1
        name = self.extractable_detail.get('NAME') or ''
        given_part, _, surname_part = name.partition('/')
        first_name = given_part.split(' ')[0]
        last_name = surname_part.rstrip('/')
        anchor_gen1 = last_name.replace(" ", "") + first_name.replace(" ", "")
        self.anchor_gen1 = anchor_gen1

        return self.individual_id.strip('@')

    def get_extractable_cm(self):
        """Return the cM part of NPFX as a string, or 'error' if it is not an integer."""
        cm_value = self._npfx().split('&')[0].strip()
        try:
            int(cm_value)
            return cm_value
        except ValueError:
            return 'error'

    def get_extractable_sort(self):
        """Return the text between the first and second '&' of NPFX, or '' if there is no '&'."""
        segments = self._npfx().split('&')
        return segments[1].strip() if len(segments) > 1 else ''

    def get_extractable_NPFX(self):
        """Return the NPFX text ('' when absent)."""
        return self._npfx()

def input_prime_surname(last_prime_surname=None):
    """Resolve the prime surname and publish it as the module-level ``surname``.

    Args:
        last_prime_surname: Surname to use; any falsy value selects "Unknown".

    Returns:
        The resolved surname (the same value written to the global ``surname``).
    """
    global surname
    surname = last_prime_surname if last_prime_surname else "Unknown"
    return surname

# Resolve the surname used to score branches. last_prime_surname is assigned
# near the top of the script and is deliberately NOT reset here: resetting it
# to None would make input_prime_surname() fall back to "Unknown".
prime_surname = input_prime_surname(last_prime_surname)

# Store the value of prime_surname for later use
last_prime_surname = prime_surname

gedcom_file_path = select_gedcom_file()
if not gedcom_file_path:
    # Everything below needs a parsed file; fail with a clear message instead
    # of a NameError further down.
    raise FileNotFoundError("No GEDCOM (*.ged) file found in the working directory.")

# Gedcom.__init__ already parses the file and builds filter_pool, so no
# additional parse_gedcom() call is needed.
gedcom_instance = Gedcom(gedcom_file_path)

# (anchor name, individual ID) for every individual that carries an NPFX value.
individuals = []
for dataset in gedcom_instance.filter_pool:
    individual_id = dataset.get_gen_person()
    last_name = dataset.anchor_gen1
    individuals.append((last_name, individual_id))

# Read the GEDCOM file again as raw text and split it into level-0 records.
# utf-8-sig matches the encoding Gedcom.parse_gedcom uses for the same file.
with open(gedcom_file_path, 'r', encoding='utf-8-sig') as file:
    data = file.read()
data = data.split('\n0 ')

# Map each record's ID to its raw text (records without an ID share the key None).
records = {extract_id(record): record for record in data}

# State filled in by find_parents_new()
parent_pairs = []
children = {}
ancestral_lines = {}
last_pair_counter = 0
find_parents_new_counter = 0

import re  # Importing the regex module

def extract_name(record):
    """Return the text following the first ``1 NAME `` in ``record``.

    The captured text runs to the end of that line. Returns ``None`` when the
    record has no such entry.
    """
    found = re.search(r'1 NAME (.+)', record)
    if found is None:
        return None
    return found.group(1)

def has_both_parents(records, mother_id, father_id):
    """Return True only when both ``mother_id`` and ``father_id`` are keys of ``records``."""
    return mother_id in records and father_id in records

def _pointer_after(record, marker):
    """Return the ID that follows ``marker`` in ``record``, or '' when it is absent."""
    start = record.find(marker)
    if start == -1:
        return ''
    start += len(marker)
    end = record.find('@', start)
    if end == -1:
        return ''
    return record[start:end]


def find_parents_new(individual_id, generation, records, parent_pairs, children, ancestral_lines, current_line=None, last_pairs=None):
    """Walk up the ancestry of ``individual_id``, recording parent pairs.

    Args:
        individual_id: ID of the person whose parents are looked up.
        generation: Generation number assigned to this person's parents.
        records: Dict mapping record IDs to raw GEDCOM record text.
        parent_pairs: List that receives ``(generation, "father&mother")`` for
            each pair seen for the first time whose members both have a FAMC
            entry.
        children: Unused by this function; kept for call compatibility.
        ancestral_lines: Dict that receives, for the same pairs, a copy of the
            line of pairs leading from the starting individual to that pair.
        current_line: Line being built during recursion; callers normally omit it.
        last_pairs: Set that receives ``"mother&father"`` for pairs where
            neither parent has a FAMC entry. When omitted, a throwaway set is
            used.

    Reads and updates the module-level ``visited_pairs`` set.
    """
    if current_line is None:
        current_line = []
    if last_pairs is None:
        last_pairs = set()

    if individual_id not in records:
        return

    # A missing tag gives '' rather than a slice of unrelated text.
    famc_id = _pointer_after(records[individual_id], '1 FAMC @')
    if not famc_id or famc_id not in records:
        return

    fam_record = records[famc_id]
    mother_id = _pointer_after(fam_record, '1 WIFE @')
    father_id = _pointer_after(fam_record, '1 HUSB @')

    mother_has_parents = mother_id in records and '1 FAMC' in records[mother_id]
    father_has_parents = father_id in records and '1 FAMC' in records[father_id]

    mother_name = extract_name(records[mother_id]) if mother_id in records else None
    father_name = extract_name(records[father_id]) if father_id in records else None

    appended = False
    if mother_name and father_name:
        parent_pair = f"{father_name}&{mother_name}"
        current_line.append((generation, parent_pair))
        appended = True
        if parent_pair not in visited_pairs:
            visited_pairs.add(parent_pair)
            if mother_has_parents and father_has_parents:
                parent_pairs.append((generation, parent_pair))
                ancestral_lines[parent_pair] = list(current_line)

    if not mother_has_parents and not father_has_parents and mother_id and father_id:
        last_pairs.add(f"{extract_name(records.get(mother_id, ''))}&{extract_name(records.get(father_id, ''))}")

    if mother_id:
        find_parents_new(mother_id, generation + 1, records, parent_pairs, children, ancestral_lines, current_line, last_pairs)

    if father_id:
        find_parents_new(father_id, generation + 1, records, parent_pairs, children, ancestral_lines, current_line, last_pairs)

    # Only undo what this call added; popping unconditionally would remove a
    # descendant's entry from the shared line.
    if appended:
        current_line.pop()

# Trace the ancestry once and collect the "last pairs" (heads of branch).
# NOTE(review): individual_id is whatever value the preceding top-level code
# left behind, i.e. only one individual is traced here - confirm this is intended.
# The traversal is run a single time: repeating it with the same arguments and
# re-reading the same file produces the same records, parent_pairs,
# ancestral_lines and last_pairs.
last_pairs = set()
find_parents_new(individual_id, 1, records, parent_pairs, children, ancestral_lines, last_pairs=last_pairs)

# Print the total number of last pairs found
print(f'{len(last_pairs)} last pairs were found')

# Make every last pair a key of ancestral_lines (with an empty line if it has none yet)
for last_pair in last_pairs:
    if last_pair not in ancestral_lines:
        ancestral_lines[last_pair] = []

# Convert the last_pairs set to a DataFrame and export it to Excel
last_pairs_df = pd.DataFrame(list(last_pairs), columns=['Last Pair'])
last_pairs_df.to_excel('last_pairs.xlsx', index=False)
print(last_pairs_df)

# Report each ancestral-line key that is also a last pair
for parent_pair in ancestral_lines.keys():
    if parent_pair in last_pairs:
        print(f"Branch Head of line {parent_pair}:")

def process_individual_new(individual_id, gedcom_instance):
    """Collect the cM, sort key, anchor name and ancestral lines of one individual.

    Args:
        individual_id: ID of the individual to process.
        gedcom_instance: Object whose ``filter_pool`` holds the datasets to search.

    Returns:
        Dict with keys ``'cM'``, ``'Sort'``, ``'anchor_gen1'`` and
        ``'Last Pairs'`` (the ancestral lines found for this individual, each
        prefixed with ``(1, anchor_gen1)`` when an anchor is known). ``'cM'``
        and ``'Sort'`` are ``''`` and ``'anchor_gen1'`` is ``None`` when no
        dataset in ``filter_pool`` matches ``individual_id``.

    Side effects: resets the module-level ``generation_table`` and
    ``visited_pairs``; reads the module-level ``records`` dict.
    """
    global generation_table, visited_pairs
    generation_table = pd.DataFrame(columns=['Generation', 'Parent Pair'])
    visited_pairs = set()

    # Defaults keep the result shape stable when no dataset matches.
    individual_data = {'cM': '', 'Sort': ''}

    parent_pairs = []
    children = {}
    ancestral_lines = {}

    # find_parents_new() needs the id -> raw record text mapping (`records`);
    # the list gedcom_instance.gedcom_datasets never contains an ID string, so
    # passing it made the traversal return immediately. A fresh set is used for
    # last_pairs so the call does not depend on what the global currently holds.
    find_parents_new(individual_id, 1, records, parent_pairs, children, ancestral_lines, last_pairs=set())

    anchor_gen1 = None
    for dataset in gedcom_instance.filter_pool:
        if dataset.get_gen_person() == individual_id:
            anchor_gen1 = dataset.get_anchor_gen1()
            individual_data['cM'] = dataset.get_extractable_cm()
            individual_data['Sort'] = dataset.get_extractable_sort()
            break

    # Add anchor_gen1 to the beginning of each ancestral line
    if anchor_gen1 is not None:
        for ancestral_line in ancestral_lines.values():
            ancestral_line.insert(0, (1, anchor_gen1))

    individual_data['Last Pairs'] = ancestral_lines
    individual_data['anchor_gen1'] = anchor_gen1

    return individual_data

# Process a single individual: individual_id is whatever value the earlier
# top-level code last assigned to that name.
individual_data = process_individual_new(individual_id, gedcom_instance)
# NOTE(review): this rebinds last_pairs to the object stored under
# 'Last Pairs' (process_individual_new stores its ancestral_lines dict there),
# replacing whatever last_pairs held before. process_individual_new reads the
# global last_pairs on later calls - confirm the rebinding is intended.
last_pairs = individual_data['Last Pairs']

def trace_children(individual_id, parent_pairs, children, visited=None):
    """Collect the entries of ``children`` reachable from ``individual_id``.

    The individual is first mapped to a name: the first entry of
    ``parent_pairs`` whose text contains ``"lastpair"`` is split on ``&`` and
    the first of its two leading names whose ``children`` list contains
    ``individual_id`` is used. That name's children are then gathered, followed
    recursively by each child's own result.

    Args:
        individual_id: ID to start from.
        parent_pairs: Iterable of ``(generation, pair_text)`` tuples.
        children: Dict mapping a name to a list of child IDs.
        visited: IDs already processed; callers normally omit it.

    Returns:
        List of child IDs (may contain repeats); empty when the individual was
        already visited or no matching name is found.
    """
    if visited is None:
        visited = set()

    # Stop the recursion for individuals that were already handled.
    if individual_id in visited:
        return []
    visited.add(individual_id)

    # Find the name under which this individual is listed as a child.
    individual_name = None
    for pair in parent_pairs:
        if "lastpair" in pair[1]:
            # .get() tolerates names that are not keys of `children`, and the
            # slice tolerates pair text without an '&'.
            for candidate in pair[1].split('&')[:2]:
                if individual_id in children.get(candidate, ()):
                    individual_name = candidate
                    break
            break

    individual_children = []
    if individual_name in children:
        individual_children.extend(children[individual_name])
        for child_id in children[individual_name]:
            individual_children.extend(trace_children(child_id, parent_pairs, children, visited))

    return individual_children

# Collect descendants for the current individual_id.
# NOTE(review): in the code visible here `children` is only ever an empty dict,
# in which case trace_children() returns [] - confirm whether it is populated elsewhere.
descendants = trace_children(individual_id, parent_pairs, children)

import pandas as pd  # redundant re-import; pandas is already imported at the top of the script

# Surname to score branches against: the module-level `surname` assigned by
# input_prime_surname().
target_surname = surname

# Running branch number, used to build each branch's unique reference
branch_sub_counter = 0

# Per-branch score details, keyed by unique reference
branch_surname_count = {}
formatted_lines_list = []  # every formatted ancestral line, in iteration order

# Best score seen so far and the reference of the branch that achieved it
highest_worth = 0
highest_worth_reference = None

# Formatted ancestral line for each unique reference
formatted_lines = {}



# Score every ancestral line and remember the best one
for last_pair, ancestral_line in ancestral_lines.items():

    # Increment the branch sub counter
    branch_sub_counter += 1

    # Create a unique reference for this branch
    unique_reference = f"branch line number: {branch_sub_counter}"

    # Initialize counters for this specific branch
    head_count = 0
    descendant_count = 0
    generations = len(ancestral_line)  # number of entries in the line; used as the tiebreaker

    # The key (last_pair) mentioning the target surname is worth 100 points
    if target_surname in last_pair:
        head_count += 1

    # Iterate the line back to front
    reversed_ancestral_line = reversed(ancestral_line)

    # Join the pair texts with '|', dropping the generation numbers
    formatted_ancestral_line = "|".join([pair for gen, pair in reversed_ancestral_line])

    # Add the formatted line to the list
    formatted_lines_list.append(formatted_ancestral_line)

    # Store this formatted_ancestral_line in our dictionary
    formatted_lines[unique_reference] = formatted_ancestral_line

    # NOTE(review): this tests exactly the same condition as the head check
    # above, so descendant_count always equals head_count and total_worth can
    # only be 0 or 101 - confirm whether the pairs inside ancestral_line were
    # meant to be counted here instead.
    if target_surname in last_pair:  # same condition as the head check
            descendant_count += 1  # Increment the counter for 'Descendant'

    # Calculate the total worth for this branch
    total_worth = (head_count * 100) + descendant_count

    # A branch wins with a strictly higher worth, or with an equal worth and
    # more entries than the current best branch
    if total_worth > highest_worth or (total_worth == highest_worth and generations > branch_surname_count.get(highest_worth_reference, {}).get('Generations', 0)):
        highest_worth = total_worth
        highest_worth_reference = unique_reference

    # Store the counts in our dictionary
    branch_surname_count[unique_reference] = {'Head': head_count, 'Descendant': descendant_count, 'Total Worth': total_worth, 'Generations': generations}

# After the loop, highest_worth_reference is the reference of the winning
# branch, or None when no branch qualified.

# Use the winning branch's formatted line as dnaline, with a default when there is none
if highest_worth_reference in formatted_lines:
    dnaline = formatted_lines[highest_worth_reference]
else:
    dnaline = "No Ancestral Line Found"  # Default value

# NOTE(review): dnaline_dict is created here and receives a single entry, for
# the current individual_id only - confirm whether one entry per individual
# was intended.
dnaline_dict = {}

dnaline_dict[individual_id] = dnaline  # save dnaline for the current individual_id

import pandas as pd  # redundant re-import; pandas is already imported at the top of the script

# Rows of the final table, one list per individual
combined_df_rows = []


# Build one row per (name, individual_id) entry of `individuals`.
# The loop rebinds the module-level names `name`, `individual_id`,
# `individual_data`, `cM`, `sort` and `anchor_gen1` on every iteration.
for name, individual_id in individuals:
    individual_data = process_individual_new(individual_id, gedcom_instance)
    cM = individual_data['cM']
    sort = individual_data['Sort']
    anchor_gen1 = individual_data['anchor_gen1']
    # NOTE(review): IDs missing from dnaline_dict fall back to "N/A"; check
    # that dnaline_dict has been filled for every individual listed here.
    dnaline_value = dnaline_dict.get(individual_id, "N/A")  # safe lookup with a default
    # The first '|'-separated element of the line is reported as the most distant ancestor
    most_distant_ancestor = dnaline_value.split('|')[0] if dnaline_value != "N/A" else "N/A"
    combined_df_rows.append([individual_id, anchor_gen1, sort, cM, most_distant_ancestor, dnaline_value])

# Create the DataFrame
combined_df = pd.DataFrame(combined_df_rows, columns=['ID#', 'Name', 'Match to', 'cM', 'most_distant_ancestor', 'dnaline'])

# Make the displayed index start at 1 instead of 0
combined_df.index = combined_df.index + 1

print(combined_df)

# Export to Excel (disabled)
#combined_df.to_excel('/content/output.xlsx', index=False)


List of GEDCOM files:
1. yates-one-name-study.ged
Auto-selected the first GEDCOM file.
4 last pairs were found
                              Last Pair
0       Joan /Ashendon/&Richard /Yates/
1     Christian /Bourne/&William /Tapp/
2  Meredith /Adkins/&Richard /Shockley/
3     Elinor /Brown/&Richard /Warfield/
Branch Head of line Joan /Ashendon/&Richard /Yates/:
Branch Head of line Christian /Bourne/&William /Tapp/:
Branch Head of line Meredith /Adkins/&Richard /Shockley/:
Branch Head of line Elinor /Brown/&Richard /Warfield/:
        ID#                Name         Match to  cM  \
1    I13817        YatesWilliam      yates,johnh  11   
2    I21743          HuntKelsey    yates,robertd  15   
3    I23678       JohnsonDonald    yates,andreal  53   
4    I26925            HudsonJL    yates,andreal  27   
5    I31861         ChurchDebra  yates,patricial  19   
..      ...                 ...              ...  ..   
371  I50494       BeaversLaNell     yates,ronald  16   
372  I50511       St

In [12]:
!pip install pandas
!pip install python-gedcom
!pip install openpyxl



In [13]:
# Build the DataFrame from ancestral_lines (one row per parent pair) before
# exporting it; ancestral_lines_df is not defined anywhere else in the notebook.
ancestral_lines_df = pd.DataFrame(list(ancestral_lines.items()), columns=['Parent Pair', 'Ancestral Line'])

# Write DataFrame to an Excel file
ancestral_lines_df.to_excel("ancestral_lines.xlsx", index=False, engine='openpyxl')
