<a href="https://colab.research.google.com/github/ronyates47/Gedcom-Utils/blob/main/a_Surname_Utility_stable_v_230826_1328_hrs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
#!pip install pandas
#!pip install python-gedcom
# Base script-stable-selects GEDCOM-correct output

import glob
from gedcom.element.individual import IndividualElement
from gedcom.parser import Parser
import pandas as pd
import os


# Initialize the last_prime_surname variable
last_prime_surname = None

class Gedcom:
    def __init__(self, file_name):
        self.file_name = file_name
        self.gedcom_datasets = []
        self.filter_pool = []
        self.parse_gedcom()
        self.filter_individuals()

    @staticmethod
    def get_standard_name(file_path):
        file_name = file_path.split('/')[-1]
        if '.' in file_name:
            file_name = file_name.rsplit('.', 1)[0]
        standard_name = file_name.replace(' ', '_').lower()
        return standard_name

    def parse_gedcom(self):
        with open(self.file_name, 'r', encoding='utf-8-sig') as f:
            gedcom_lines = f.readlines()

        current_dataset = None
        npfx_count = 0
        total_count = 0
        for line in gedcom_lines:
            parts = line.strip().split(' ', 2)
            level = int(parts[0])
            tag = parts[1]
            value = parts[2] if len(parts) > 2 else None

            if level == 0 and tag.startswith('@') and tag.endswith('@') and value == 'INDI':
                total_count += 1
                current_dataset = GedcomDataset(tag.strip('@'))
                self.gedcom_datasets.append(current_dataset)
            elif current_dataset is not None:
                if level == 1 and tag in ['NAME', 'FAMC']:
                    current_key = tag
                    current_dataset.add_extractable_detail(current_key, value)
                elif level == 2 and tag == 'NPFX':
                    npfx_count += 1
                    current_dataset.add_extractable_detail(tag, value)

    def filter_individuals(self):
        # Check if the excel file exists
        if os.path.exists('/content/shortged.xlsx'):
            # Read the excel file
            df = pd.read_excel('/content/shortged.xlsx')
            # Get the full names from column B as a list
            allowed_full_names = df.iloc[:, 1].tolist()

            # Filter the gedcom datasets to only include individuals with allowed full names
            self.gedcom_datasets = [dataset for dataset in self.gedcom_datasets if any(allowed_name in dataset.get_full_name() for allowed_name in allowed_full_names)]

            # Print the number of individuals that passed the filtering process
            print('Records Moved into short_pool:', len(self.gedcom_datasets))


        for dataset in self.gedcom_datasets:
            if dataset.get_extractable_NPFX():
                self.filter_pool.append(dataset)


class GedcomDataset:

    def __init__(self, individual_id):
        self.individual_id = individual_id
        self.extractable_detail = {}
        self.ancestral_line = []  # Initialize the ancestral_line attribute as an empty list


    def add_extractable_detail(self, key, value):
        self.extractable_detail[key] = value

    def get_gen_person(self):
        name = self.extractable_detail.get('NAME', '')
        first_name, last_name = name.split('/', 1)
        first_name = first_name.split(' ')[0]
        last_name = last_name.rstrip('/')
        self.anchor_gen1 = last_name.replace(" ", "") + first_name.replace(" ", "")
        return self.individual_id.strip('@')


    def get_anchor_gen1(self):
        return self.anchor_gen1

    def get_extractable_NPFX(self):
        return self.extractable_detail.get('NPFX', '')

    def get_extractable_cm(self):
        npfx_value = self.extractable_detail.get('NPFX', '')
        if '&' in npfx_value:
            cm_value = npfx_value.split('&')[0].strip()
        else:
            cm_value = npfx_value.strip()
        try:
            int(cm_value)
            return cm_value
        except ValueError:
            return 'error'

    def get_extractable_sort(self):
        npfx_value = self.extractable_detail.get('NPFX', '')
        if '&' in npfx_value:
            sort_value = npfx_value.split('&')[1].strip()
            return sort_value
        else:
            return ''

    def get_full_name(self):
        name = self.extractable_detail.get('NAME', '')
        if '/' in name:
            first_name, last_name = name.split('/', 1)
            first_name = first_name.strip()
            last_name = last_name.rstrip('/')
            full_name = last_name.replace(" ", "") + first_name.replace(" ", "")
        else:
            full_name = name.replace(" ", "")
        return full_name


    def get_extractable_FAMC(self):
        return self.extractable_detail.get('FAMC', '').strip('@')

    def find_ancestors(self, individual_id, records):
        if individual_id not in records:
            return
        record = records[individual_id]
        famc_start = record.find('1 FAMC @') + 8
        famc_end = record.find('@', famc_start)
        famc_id = record[famc_start:famc_end]
        if famc_id not in records:
            return

        fam_record = records[famc_id]
        wife_start = fam_record.find('1 WIFE @') + 8
        wife_end = fam_record.find('@', wife_start)
        mother_id = fam_record[wife_start:wife_end]

        husb_start = fam_record.find('1 HUSB @') + 8
        husb_end = fam_record.find('@', husb_start)
        father_id = fam_record[husb_start:husb_end]

        if mother_id and mother_id in records:
            mother_record = records[mother_id]
            mother_name = extract_name(mother_record)
            self.ancestral_line.append(mother_name)
            self.find_ancestors(mother_id, records)

        if father_id and father_id in records:
            father_record = records[father_id]
            father_name = extract_name(father_record)
            self.ancestral_line.append(father_name)
            self.find_ancestors(father_id, records)

def select_gedcom_file():
    gedcom_files = glob.glob('*.ged')
    if not gedcom_files:
        print("No GEDCOM files found in the Colab contents.")
        return None

    print("List of GEDCOM files:")
    for i, file_name in enumerate(gedcom_files, 1):
        print(f"{i}. {file_name}")

    while True:
        try:
            selected_num = int(input("Enter the number of the GEDCOM file you want to use: "))
            if 1 <= selected_num <= len(gedcom_files):
                return gedcom_files[selected_num - 1]
            else:
                print("Invalid number. Please enter a valid number from the list.")
        except ValueError:
            print("Invalid input. Please enter a valid number.")

# Function definitions
def extract_id(record):
    id_start = record.find('@') + 1
    id_end = record.find('@', id_start)
    return record[id_start:id_end]

def extract_name(record):
    name_start = record.find('1 NAME ') + 6
    name_end = record.find('\n', name_start)
    name = record[name_start:name_end]
    first_name, last_name = name.split('/', 1)
    first_name = first_name[:10] # Use slicing syntax to extract the first 10 characters of the first_name variable
    last_name = last_name[:10].rstrip('/') # Use slicing syntax to extract the first 10 characters of the last_name variable
    return last_name.replace(" ", "") + first_name.replace(" ", "")

def find_parents(individual_id, generation, records):
    if individual_id not in records:
        return
    record = records[individual_id]
    famc_start = record.find('1 FAMC @') + 8
    famc_end = record.find('@', famc_start)
    famc_id = record[famc_start:famc_end]
    if famc_id not in records:
        return

    fam_record = records[famc_id]
    wife_start = fam_record.find('1 WIFE @') + 8
    wife_end = fam_record.find('@', wife_start)
    mother_id = fam_record[wife_start:wife_end]

    husb_start = fam_record.find('1 HUSB @') + 8
    husb_end = fam_record.find('@', husb_start)
    father_id = fam_record[husb_start:husb_end]

    if mother_id and mother_id in records:
        mother_record = records[mother_id]
        mother_name = extract_name(mother_record)
    else:
        mother_name = None

    if father_id and father_id in records:
        father_record = records[father_id]
        father_name = extract_name(father_record)
    else:
        father_name = None

    if mother_name is not None and father_name is not None:
        parent_pair = father_name + "&" + mother_name
        if parent_pair not in visited_pairs:
            visited_pairs.add(parent_pair)
            if has_both_parents(records, mother_id, father_id):
                generation_table.loc[len(generation_table)] = [generation, parent_pair]

    if mother_id:
        find_parents(mother_id, generation + 1, records)

    if father_id:
        find_parents(father_id, generation + 1, records)

def has_both_parents(records, mother_id, father_id):
    return mother_id in records and father_id in records

visited_pairs = set()

def process_individual(individual_id, gedcom_instance):
    global generation_table
    generation_table = pd.DataFrame(columns=['Generation', 'Parent Pair'])
    global visited_pairs
    visited_pairs = set()

    find_parents(individual_id, 1, records)

    # Concatenate the value of the anchor_gen1 attribute to the beginning of the generation table
    for dataset in gedcom_instance.filter_pool:
        if dataset.get_gen_person() == individual_id:
            # Truncate the ancestral line of this individual
#            dataset.truncate_ancestral_line(prime_surname)
            anchor_gen1 = dataset.get_anchor_gen1()
            generation_table.loc[0] = [1, anchor_gen1]
            break

    generation_table = generation_table.sort_values('Generation', ascending=False).reset_index(drop=True)

    individual_data = {}
    for dataset in gedcom_instance.filter_pool:
        if dataset.get_gen_person() == individual_id:
            individual_data['cM'] = dataset.get_extractable_cm()
            individual_data['Sort'] = dataset.get_extractable_sort()
            break

    individual_data['Parent Pairs A10'] = '|'.join([f"{row['Parent Pair']}" for index, row in generation_table[:10].iterrows()])
    return individual_data

# Print the value of prime_surname to the console

def input_prime_surname(last_prime_surname=None):
    if last_prime_surname:
        last_name = input(f"Enter prime_surname (default: {last_prime_surname}): ")
        if not last_name:
            last_name = last_prime_surname
    else:
        last_name = input("Enter prime_surname: ")
    return last_name

# Call the function to let the user input prime_surname
prime_surname = input_prime_surname(last_prime_surname)

# Store the value of prime_surname for later use
last_prime_surname = prime_surname

# Call the function to let the user select the GEDCOM file
gedcom_file_path = select_gedcom_file()
if gedcom_file_path:
    # Use the selected GEDCOM file path to create an instance of the Gedcom class
    gedcom_instance = Gedcom(gedcom_file_path)
    gedcom_instance.parse_gedcom()

# Initialize the list of individuals
    individuals = []

    # Iterate over the filter_pool list and add each individual's last name and ID to the individuals list
    for dataset in gedcom_instance.filter_pool:
        individual_id = dataset.get_gen_person()
        last_name = dataset.get_anchor_gen1()
        individuals.append((last_name, individual_id))

    print(f'Records Moved into Samaller Pile {len(individuals)}')

    # Read the GEDCOM file and split it into individual and family records
    with open(gedcom_file_path, 'r') as file:
        data = file.read()
    data = data.split('\n0 ')
    records = {extract_id(record): record for record in data}
#*****************************************************************************************************
#NEW BELOW*****************************************************************************************************
def find_parents_new(individual_id, generation, records, parent_pairs, children, index=0):
    # Increment the find_parents_new_counter each time the function is called
    global find_parents_new_counter
    find_parents_new_counter += 1

    if individual_id not in records:
        return
    record = records[individual_id]
    famc_start = record.find('1 FAMC @') + 8
    famc_end = record.find('@', famc_start)
    famc_id = record[famc_start:famc_end]
    if famc_id not in records:
        return

    fam_record = records[famc_id]
    wife_start = fam_record.find('1 WIFE @') + 8
    wife_end = fam_record.find('@', wife_start)
    mother_id = fam_record[wife_start:wife_end]

    husb_start = fam_record.find('1 HUSB @') + 8
    husb_end = fam_record.find('@', husb_start)
    father_id = fam_record[husb_start:husb_end]

    if mother_id and mother_id in records:
        mother_record = records[mother_id]
        mother_name = extract_name(mother_record)
        if mother_name not in children:
            children[mother_name] = []
        children[mother_name].append(individual_id)
    else:
        mother_name = None

    if father_id and father_id in records:
        father_record = records[father_id]
        father_name = extract_name(father_record)
        if father_name not in children:
            children[father_name] = []
        children[father_name].append(individual_id)
    else:
        father_name = None

    if mother_name is not None and father_name is not None:
        parent_pair = father_name + "&" + mother_name
        if parent_pair not in visited_pairs:
            visited_pairs.add(parent_pair)
            if has_both_parents(records, mother_id, father_id):
                parent_pairs.append((generation, parent_pair, index))

                # Increment the last_pair_counter each time a last pair is found
                global last_pair_counter
                last_pair_counter += 1

                # Add "lastpair" label to the last pair of parents that were found
                parent_pairs[-1] = (parent_pairs[-1][0], parent_pairs[-1][1] + " (lastpair)")

    if mother_id:
        find_parents_new(mother_id, generation + 1, records, parent_pairs, children, index)

    if father_id:
        find_parents_new(father_id, generation + 1, records, parent_pairs, children, index)


# Call the find_parents_new function
find_parents_new(individual_id, 1, records, parent_pairs, children)

# Count the total number of parent pairs found
parent_pair_count = len(parent_pairs)
print(f'{parent_pair_count} parent pairs were found')

# Count the number of last pairs found
last_pair_count = 0
for pair in parent_pairs:
    if "lastpair" in pair[1]:
        last_pair_count += 1
print(f'{last_pair_count} last pairs were found')

# Print the results
print(f'find_parents_new was called {find_parents_new_counter} times')

def process_individual_new(individual_id, gedcom_instance):
    global generation_table
    generation_table = pd.DataFrame(columns=['Generation', 'Parent Pair'])
    global visited_pairs
    visited_pairs = set()

    parent_pairs = []
    children = {}
    find_parents_new(individual_id, 1, records, parent_pairs, children)

    # Concatenate the value of the anchor_gen1 attribute to the beginning of the generation table
    for dataset in gedcom_instance.filter_pool:
        if dataset.get_gen_person() == individual_id:
            # Truncate the ancestral line of this individual
#            dataset.truncate_ancestral_line(prime_surname)
            anchor_gen1 = dataset.get_anchor_gen1()
            parent_pairs.insert(0, (1, anchor_gen1))
            break

    parent_pairs = sorted(parent_pairs, key=lambda x: x[0], reverse=True)

    individual_data = {}
    for dataset in gedcom_instance.filter_pool:
        if dataset.get_gen_person() == individual_id:
            individual_data['cM'] = dataset.get_extractable_cm()
            individual_data['Sort'] = dataset.get_extractable_sort()
            break

#    individual_data['Parent Pairs A10_new'] = '|'.join([f"{pair[1]}" for pair in parent_pairs[:10]])

    last_pairs = {}
    counter = 1
    for pair in parent_pairs:
        if "lastpair" in pair[1]:
            last_pairs[f'Last Pair {counter}'] = pair
            counter += 1

    individual_data['Last Pairs'] = last_pairs

    return individual_data

individual_data = process_individual_new(individual_id, gedcom_instance)
last_pairs = individual_data['Last Pairs']

for key, value in last_pairs.items():
    print(f'{key}: {value[1]}', end='\t')

#NEW ABOVE*****************************************************************************************************
#*****************************************************************************************************
    # Global variables
    generation_table = None
    visited_pairs = None


#*****************************************************************************************************
#NEW BELOW*****************************************************************************************************
# Run the process_individual_new function for each individual in the individuals list and create a combined DataFrame

# Adjust index to start from 1 instead of 0
combined_df_new.index = combined_df_new.index + 1

print("\n")
print("\n")


# Print all records from the DataFrame
print(combined_df_new)

# Export the combined_df DataFrame to an Excel file
combined_df_new.to_excel('/content/output.xlsx', index=False)

Enter prime_surname: yates
List of GEDCOM files:
1. yates-one-name-study.ged
Enter the number of the GEDCOM file you want to use: 1
Records Moved into short_pool: 1
Records Moved into Samaller Pile 1
72 parent pairs were found
12 last pairs were found
find_parents_new was called 75 times
Last Pair 1: KunkelJohannMi&SammerAnnaCath (lastpair)	Last Pair 2: MoserJohannGe&EbertAnnaEva (lastpair)	Last Pair 3: MoserJohannMa&KunkelMargareth (lastpair)	Last Pair 4: RoperJohn&ClayMaryObei (lastpair)	Last Pair 5: MoserGeorgFre&LieserMariaBar (lastpair)	Last Pair 6: RoperJamesDav&YatesSarahAnn (lastpair)	Last Pair 7: HoffmanSamuelH.&MoserMariaBar (lastpair)	Last Pair 8: RoperJamesC.&O'NeilMary (lastpair)	Last Pair 9: RoperElijah&HoffmanCatherine (lastpair)	Last Pair 10: RoperDavidAsb&EpleyLettie (lastpair)	Last Pair 11: RoperJohnBynu&UptonSarah (lastpair)	Last Pair 12: HudsonLonnie&RoperMyrtleRo (lastpair)	



      ID#      Name       Match to  cM          Most Distant Ancestor  \
8  I26925  Huds

In [2]:
!pip install pandas
!pip install python-gedcom

Collecting python-gedcom
  Downloading python_gedcom-1.0.0-py2.py3-none-any.whl (35 kB)
Installing collected packages: python-gedcom
Successfully installed python-gedcom-1.0.0
