<a href="https://colab.research.google.com/github/ronyates47/Gedcom-Utils/blob/main/2023_0911_2019_multi_cell.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas
!pip install python-gedcom

Collecting python-gedcom
  Downloading python_gedcom-1.0.0-py2.py3-none-any.whl (35 kB)
Installing collected packages: python-gedcom
Successfully installed python-gedcom-1.0.0


In [2]:
#Cell 1: Import Libraries
import glob
from gedcom.element.individual import IndividualElement
from gedcom.parser import Parser
import pandas as pd
import os




In [17]:
#Cell 2:Initialize Global Variables

# Counters and the de-duplication set shared by the traversal cells
find_parents_new_counter, last_pair_counter = 0, 0
visited_pairs = set()

# Surname offered as the default answer at the prime_surname prompt
last_prime_surname = 'Yates'




In [4]:
# Cell 3: Define Gedcom Class

import os
import pandas as pd

# Adding filter_individuals method to the Gedcom class
# Gedcom Class Definition
class Gedcom:
    """Parse a GEDCOM file into per-individual datasets and shortlist them.

    Construction reads ``file_name`` immediately (``parse_gedcom``) and then
    applies the optional spreadsheet shortlist (``filter_individuals``).

    Attributes:
        file_name: Path of the GEDCOM file that is parsed.
        shortlist_path: Spreadsheet whose first column lists the individual
            IDs to keep; when the file does not exist no filtering happens.
        gedcom_datasets: One ``GedcomDataset`` per INDI record (reduced to
            the shortlisted IDs when the spreadsheet exists).
        filter_pool: Shortlisted datasets that carry a non-empty NPFX value.
    """

    def __init__(self, file_name, shortlist_path='/content/shortged.xlsx'):
        self.file_name = file_name
        self.shortlist_path = shortlist_path
        self.gedcom_datasets = []
        self.filter_pool = []
        self.parse_gedcom()
        self.filter_individuals()

    def parse_gedcom(self):
        """Read the GEDCOM file and collect NAME, FAMC and NPFX per individual.

        Each call appends to ``gedcom_datasets``; the constructor already calls
        it once, so calling it again adds every individual a second time.
        """
        with open(self.file_name, 'r', encoding='utf-8-sig') as f:
            gedcom_lines = f.readlines()

        current_dataset = None
        for line in gedcom_lines:
            parts = line.strip().split(' ', 2)
            # Blank lines and lines without "<level> <tag>" cannot be parsed
            if len(parts) < 2:
                continue
            try:
                level = int(parts[0])
            except ValueError:
                continue
            tag = parts[1]
            value = parts[2] if len(parts) > 2 else None

            if level == 0:
                if tag.startswith('@') and tag.endswith('@') and value == 'INDI':
                    current_dataset = GedcomDataset(tag.strip('@'))
                    self.gedcom_datasets.append(current_dataset)
                else:
                    # Any other level-0 record ends the current individual, so
                    # its sub-tags (e.g. a NAME under REPO) are not attributed
                    # to the previous person.
                    current_dataset = None
            elif current_dataset is not None:
                if level == 1 and tag in ['NAME', 'FAMC']:
                    current_dataset.add_extractable_detail(tag, value)
                elif level == 2 and tag == 'NPFX':
                    current_dataset.add_extractable_detail(tag, value)

    def filter_individuals(self):
        """Keep only shortlisted individuals and reset their ancestral lines.

        When ``shortlist_path`` exists, ``gedcom_datasets`` is reduced to the
        IDs found in the first spreadsheet column, and every remaining dataset
        with a non-empty NPFX value is added to ``filter_pool``.
        """
        if os.path.exists(self.shortlist_path):
            df = pd.read_excel(self.shortlist_path)
            # Cast to str first so numeric ID columns do not break .str.strip()
            allowed_individual_ids = df.iloc[:, 0].dropna().astype(str).str.strip().tolist()
            print("Cleaned Allowed IDs:", allowed_individual_ids)  # Debugging line

            allowed_lookup = set(allowed_individual_ids)  # O(1) membership tests
            self.gedcom_datasets = [
                dataset for dataset in self.gedcom_datasets
                if dataset.individual_id.strip() in allowed_lookup
            ]
            print('Records Moved into short_pool:', len(self.gedcom_datasets))  # Debugging line

            for dataset in self.gedcom_datasets:
                if dataset.get_extractable_NPFX():
                    self.filter_pool.append(dataset)

        # Debug print before
        print("Before clearing ancestral_line:")
        for dataset in self.gedcom_datasets:
            # getattr: tolerate dataset objects that never defined the attribute
            print(getattr(dataset, 'ancestral_line', []))

        # Clear any previous ancestral lines
        for dataset in self.gedcom_datasets:
            dataset.ancestral_line = []

        # Debug print after
        print("After clearing ancestral_line:")
        for dataset in self.gedcom_datasets:
            print(dataset.ancestral_line)

class GedcomDataset:
    """Details extracted for one individual (INDI record) of a GEDCOM file.

    Attributes:
        individual_id: Identifier of the individual.
        extractable_detail: Mapping of GEDCOM tag -> value for this person.
        anchor_gen1: Anchor name for generation 1; starts as None.
        ancestral_line: List reserved for the person's ancestral line; it is
            read and reset by ``Gedcom.filter_individuals``, so it must exist
            on every instance.
    """

    def __init__(self, individual_id):
        self.individual_id = individual_id
        self.extractable_detail = {}
        self.anchor_gen1 = None
        self.ancestral_line = []

    def add_extractable_detail(self, key, value):
        """Store ``value`` under ``key``, replacing any earlier value."""
        self.extractable_detail[key] = value

    def get_extractable_NPFX(self):
        """Return the stored NPFX value, or '' when none was recorded."""
        return self.extractable_detail.get('NPFX', '')


In [5]:
# Cell 6: Define GedcomDataset Class

# Combined Cell 6 & 7

class GedcomDataset:
    """Details extracted for one individual (INDI record) of a GEDCOM file.

    Attributes:
        individual_id: Identifier of the individual.
        extractable_detail: Mapping of GEDCOM tag -> value for this person.
        anchor_gen1: Anchor name for generation 1. Nothing in this class
            assigns it, so it stays None unless a caller sets it.
        ancestral_line: List reserved for the person's ancestral line.
    """

    def __init__(self, individual_id):
        self.individual_id = individual_id
        self.extractable_detail = {}
        self.anchor_gen1 = None
        self.ancestral_line = []

    def add_extractable_detail(self, key, value):
        """Store ``value`` under ``key``, replacing any earlier value."""
        self.extractable_detail[key] = value

    def get_anchor_gen1(self):
        """Return the ``anchor_gen1`` attribute (None until assigned)."""
        return self.anchor_gen1

    def get_gen_person(self):
        """Return this individual's identifier."""
        return self.individual_id

    def get_extractable_NPFX(self):
        """Return the stored NPFX value, or '' when none was recorded."""
        return self.extractable_detail.get('NPFX', '')

    def get_extractable_cm(self):
        """Return the part of NPFX before '&' if it parses as an int.

        Returns the stripped text (not the int) on success and the string
        'error' when the text is missing or is not an integer.
        """
        # `or ''` also covers an NPFX tag that was stored with a None value
        npfx_value = self.extractable_detail.get('NPFX') or ''
        cm_value = npfx_value.split('&')[0].strip()
        try:
            int(cm_value)
        except ValueError:
            return 'error'
        return cm_value

    def get_extractable_sort(self):
        """Return the stripped text after the first '&' in NPFX, else ''."""
        npfx_value = self.extractable_detail.get('NPFX') or ''
        if '&' not in npfx_value:
            return ''
        return npfx_value.split('&')[1].strip()


In [6]:
# Cell 8: Functions and Global Variable

# Declare anchor_gen1 as a global variable
# Module-level holder for the anchor name; a `global` statement is only
# needed inside functions that assign to it, not here at module scope.
anchor_gen1 = None

# Function to get the individual's identifier
def get_gen_person(self):
    """Return the individual's id (without '@') and refresh ``anchor_gen1``.

    Side effect: the module-level ``anchor_gen1`` is set to the surname
    followed by the first given-name token, with spaces removed. A NAME
    without a '/surname/' part, or no NAME at all, yields an empty surname
    instead of raising.
    """
    global anchor_gen1
    # `or ''` covers both a missing NAME and a NAME stored with a None value
    name = self.extractable_detail.get('NAME') or ''
    # partition never fails to unpack, unlike split('/', 1) on a slash-less name
    given_part, _, surname_part = name.partition('/')
    first_name = given_part.split(' ')[0]
    last_name = surname_part.rstrip('/')
    anchor_gen1 = last_name.replace(" ", "") + first_name.replace(" ", "")
    return self.individual_id.strip('@')

# Function to get the anchor_gen1
def get_anchor_gen1(self):
    """Return the ``anchor_gen1`` attribute stored on ``self``."""
    anchor = self.anchor_gen1
    return anchor

# Function to get NPFX (Prefix Information)
def get_extractable_NPFX(self):
    """Return the NPFX entry of ``self.extractable_detail`` ('' if absent)."""
    details = self.extractable_detail
    prefix = details.get('NPFX', '')
    return prefix



In [7]:
# Cell 9: More Functions for Extractable Data

# Function to get the centimorgan value
def get_extractable_cm(self):
    """Return the centimorgan text held in NPFX, or 'error' if not an int.

    The value is the part of NPFX before the first '&' (or the whole value
    when there is no '&'), stripped of surrounding whitespace. The text itself
    is returned, not the converted integer.
    """
    # `or ''` also covers an NPFX tag stored with a None value (a tag line
    # with no value), which would otherwise fail on the '&' handling
    npfx_value = self.extractable_detail.get('NPFX') or ''
    # split('&')[0] is the whole string when no '&' is present
    cm_value = npfx_value.split('&')[0].strip()
    try:
        int(cm_value)
    except ValueError:
        return 'error'
    return cm_value

# Function to get the sort value
def get_extractable_sort(self):
    """Return the stripped text after the first '&' in NPFX, or ''.

    '' is returned when NPFX is missing, empty, stored as None, or has no '&'.
    """
    # `or ''` keeps a None-valued NPFX from failing the membership test
    npfx_value = self.extractable_detail.get('NPFX') or ''
    if '&' not in npfx_value:
        return ''
    return npfx_value.split('&')[1].strip()


In [8]:
# Cell 10: More Functions for Extractable Data (continued)

# Function to get full name
def get_full_name(self):
    """Return NAME as surname + given names with every space removed.

    For 'Given Names /Surname/' the result is 'SurnameGivenNames'. A NAME
    without '/' is returned with its spaces removed; a missing or None NAME
    gives ''.
    """
    # `or ''` covers a NAME tag that was stored with a None value
    name = self.extractable_detail.get('NAME') or ''
    if '/' not in name:
        return name.replace(" ", "")
    first_name, last_name = name.split('/', 1)
    first_name = first_name.strip()
    last_name = last_name.rstrip('/')
    return last_name.replace(" ", "") + first_name.replace(" ", "")

# Function to get FAMC (Family ID)
def get_extractable_FAMC(self):
    """Return the FAMC family id without its '@' delimiters ('' if absent)."""
    # `or ''` covers a FAMC tag stored with a None value, where .strip would fail
    famc_value = self.extractable_detail.get('FAMC') or ''
    return famc_value.strip('@')


In [9]:
# Cell 11: Functions for Finding Ancestors

# Function to find ancestors recursively
def find_ancestors(self, individual_id, records):
    """Append the names of all known ancestors to ``self.ancestral_line``.

    Args:
        individual_id: Key of the person in ``records``.
        records: Mapping of record id -> raw GEDCOM record text, for both
            individuals and families.

    The mother's branch is walked before the father's. Names are produced by
    the module-level ``extract_name`` and recursion goes through
    ``self.find_ancestors``. Returns None.
    """
    def _pointer(text, tag):
        # Return the id inside "1 <tag> @id@", or '' when the tag is absent.
        # (Adding an offset to a failed find() would slice unrelated text.)
        marker = f'1 {tag} @'
        start = text.find(marker)
        if start == -1:
            return ''
        start += len(marker)
        end = text.find('@', start)
        return text[start:end] if end != -1 else ''

    if individual_id not in records:
        return

    famc_id = _pointer(records[individual_id], 'FAMC')
    if not famc_id or famc_id not in records:
        return

    fam_record = records[famc_id]
    mother_id = _pointer(fam_record, 'WIFE')
    father_id = _pointer(fam_record, 'HUSB')

    # Mother first, then father - same order as the appended names
    for parent_id in (mother_id, father_id):
        if parent_id and parent_id in records:
            self.ancestral_line.append(extract_name(records[parent_id]))
            self.find_ancestors(parent_id, records)



In [10]:
# Cell 12: Function to Select GEDCOM File

import glob  # This is for file listing

# Function to automatically select a GEDCOM file
def select_gedcom_file():
    """Pick a GEDCOM file from the current directory without prompting.

    Lists every ``*.ged`` file in the working directory and returns the first
    one in sorted order, so the choice is the same on every run (glob itself
    returns names in arbitrary order). Returns None when no file is found.
    """
    gedcom_files = sorted(glob.glob('*.ged'))

    if not gedcom_files:
        print("No GEDCOM files found in the Colab contents.")
        return None

    print("List of GEDCOM files:")
    for i, file_name in enumerate(gedcom_files, 1):
        print(f"{i}. {file_name}")

    # Automatically select the first file in the list
    return gedcom_files[0]



In [11]:
# Cell #13: Function Definitions

# Function to extract ID from a record
def extract_id(record):
    """Return the text between the first two '@' characters of ``record``."""
    opening = record.find('@')
    closing = record.find('@', opening + 1)
    return record[opening + 1:closing]

# Function to extract and format the name from a record
def extract_name(record):
    """Return 'SurnameGiven' built from the first ``1 NAME`` line of a record.

    Both parts are cut to 10 characters before spaces are removed. Returns ''
    when the record has no ``1 NAME`` line; a name without '/surname/' yields
    just the given part.
    """
    marker_at = record.find('1 NAME ')
    if marker_at == -1:
        return ''
    # Quirk kept for compatible output: the slice begins on the space before
    # the name, so the 10-character cap on the given part includes that space.
    name_start = marker_at + 6
    name_end = record.find('\n', name_start)
    if name_end == -1:
        # NAME is the last line of the record: keep its final character
        name_end = len(record)
    name = record[name_start:name_end]
    # partition never fails to unpack, unlike split('/', 1) on a slash-less name
    first_name, _, last_name = name.partition('/')
    first_name = first_name[:10]
    last_name = last_name[:10].rstrip('/')
    return last_name.replace(" ", "") + first_name.replace(" ", "")

# Function to find parents of an individual and record them
def find_parents(individual_id, generation, records, _lineage=()):
    """Record each newly seen parent pair of ``individual_id`` and recurse.

    Args:
        individual_id: Key of the person in ``records``.
        generation: Generation number stored with this person's parent pair.
        records: Mapping of record id -> raw GEDCOM record text.
        _lineage: Internal. Ids on the current descent path, used to stop
            when a record loops back onto its own descendant.

    Side effects: adds 'Father&Mother' to the global ``visited_pairs`` and
    appends a ``[generation, pair]`` row to the global ``generation_table``
    the first time a pair is seen. Returns None.
    """
    def _pointer(text, tag):
        # Return the id inside "1 <tag> @id@", or '' when the tag is absent.
        # (Adding an offset to a failed find() would slice unrelated text.)
        marker = f'1 {tag} @'
        start = text.find(marker)
        if start == -1:
            return ''
        start += len(marker)
        end = text.find('@', start)
        return text[start:end] if end != -1 else ''

    if individual_id not in records or individual_id in _lineage:
        return

    famc_id = _pointer(records[individual_id], 'FAMC')
    if not famc_id or famc_id not in records:
        return

    fam_record = records[famc_id]
    mother_id = _pointer(fam_record, 'WIFE')
    father_id = _pointer(fam_record, 'HUSB')

    mother_name = None
    if mother_id and mother_id in records:
        mother_name = extract_name(records[mother_id])

    father_name = None
    if father_id and father_id in records:
        father_name = extract_name(records[father_id])

    if mother_name is not None and father_name is not None:
        parent_pair = father_name + "&" + mother_name
        if parent_pair not in visited_pairs:
            visited_pairs.add(parent_pair)
            if has_both_parents(records, mother_id, father_id):
                generation_table.loc[len(generation_table)] = [generation, parent_pair]

    lineage = _lineage + (individual_id,)

    if mother_id:
        find_parents(mother_id, generation + 1, records, lineage)

    if father_id:
        find_parents(father_id, generation + 1, records, lineage)

# Function to check if both parents exist
def has_both_parents(records, mother_id, father_id):
    """Return True only when both parent ids are keys of ``records``."""
    return all(parent_id in records for parent_id in (mother_id, father_id))


In [20]:
# cell 14

visited_pairs = set()

def process_individual(individual_id, gedcom_instance):
    """Build the generation table for one individual and summarise it.

    Args:
        individual_id: Id of the person whose parents are traced.
        gedcom_instance: Object whose ``filter_pool`` holds the datasets to
            look the individual up in.

    Returns:
        dict with 'Parent Pairs A10' (the first 10 table entries, highest
        generation first, joined by '|') and, when the individual is found in
        ``filter_pool``, 'cM' and 'Sort'.

    Side effects: rebinds the globals ``generation_table`` and
    ``visited_pairs``; reads the global ``records``.
    """
    global generation_table
    global visited_pairs
    generation_table = pd.DataFrame(columns=['Generation', 'Parent Pair'])
    visited_pairs = set()

    find_parents(individual_id, 1, records)

    # Locate this individual's dataset once; it supplies the anchor, cM and Sort
    matching_dataset = None
    for dataset in gedcom_instance.filter_pool:
        if dataset.get_gen_person() == individual_id:
            matching_dataset = dataset
            break

    if matching_dataset is not None:
        anchor_gen1 = matching_dataset.get_anchor_gen1()
        if anchor_gen1 is not None:
            # Put the anchor in front of the traced pairs. Assigning to .loc[0]
            # would replace the first traced pair instead of preceding it.
            rows = [[1, anchor_gen1]] + generation_table.values.tolist()
            generation_table = pd.DataFrame(rows, columns=['Generation', 'Parent Pair'])

    # Stable sort keeps rows of the same generation in their traced order
    generation_table = generation_table.sort_values(
        'Generation', ascending=False, kind='stable'
    ).reset_index(drop=True)

    individual_data = {}
    if matching_dataset is not None:
        individual_data['cM'] = matching_dataset.get_extractable_cm()
        individual_data['Sort'] = matching_dataset.get_extractable_sort()

    individual_data['Parent Pairs A10'] = '|'.join(
        f"{pair}" for pair in generation_table['Parent Pair'].head(10)
    )
    return individual_data

def input_prime_surname(last_prime_surname=None):
    """Prompt for the prime surname and return the answer.

    When ``last_prime_surname`` is given it is shown as the default and used
    if the user enters nothing. The answer is also stored in the global
    ``surname``.
    """
    global surname
    if not last_prime_surname:
        last_name = input("Enter prime_surname: ")
    else:
        typed = input(f"Enter prime_surname (default: {last_prime_surname}): ")
        last_name = typed if typed else last_prime_surname
    surname = last_name
    return last_name

# Call the function to let the user input prime_surname
prime_surname = input_prime_surname(last_prime_surname)

# Remember the answer so re-running this cell offers it as the default
last_prime_surname = prime_surname

gedcom_file_path = select_gedcom_file()

if gedcom_file_path:
    # Gedcom.__init__ already parses and filters the file. Calling
    # parse_gedcom() again here would append every individual a second time.
    gedcom_instance = Gedcom(gedcom_file_path)

    # (anchor name, individual id) for every dataset in the filter pool.
    # NOTE: `individual_id` stays bound to the last id after the loop, and
    # later cells read that module-level value.
    individuals = []
    for dataset in gedcom_instance.filter_pool:
        individual_id = dataset.get_gen_person()
        last_name = dataset.get_anchor_gen1()
        individuals.append((last_name, individual_id))

    print(f'Records Moved into Samaller Pile {len(individuals)}')

    # Read the GEDCOM file and split it into individual and family records.
    # Same encoding as Gedcom.parse_gedcom, so a BOM or non-ASCII names are
    # handled identically in both passes.
    with open(gedcom_file_path, 'r', encoding='utf-8-sig') as file:
        data = file.read()
    data = data.split('\n0 ')
    records = {extract_id(record): record for record in data}


Enter prime_surname (default: Yates): 
List of GEDCOM files:
1. yates-one-name-study.ged
Cleaned Allowed IDs: ['I42478']
Records Moved into short_pool: 1
Before clearing ancestral_line:
[]
After clearing ancestral_line:
[]
Records Moved into Samaller Pile 1


In [14]:
# cell 15

# Counters (each was previously assigned more than once in this cell)
last_pair_counter = 0
find_parents_new_counter = 0

# Accumulators passed to find_parents_new
parent_pairs = []      # (generation, 'Father&Mother') tuples
children = {}          # parent name -> list of child ids
ancestral_lines = {}   # parent pair -> list of (generation, pair) down to it

def find_parents_new(individual_id, generation, records, parent_pairs, children, ancestral_lines, current_line=None, _lineage=()):
    """Trace parent pairs upwards and record the line leading to each pair.

    Args:
        individual_id: Key of the person in ``records``.
        generation: Generation number stored with this person's parent pair.
        records: Mapping of record id -> raw GEDCOM record text.
        parent_pairs: List extended with ``(generation, 'Father&Mother')``
            for each pair seen for the first time.
        children: Dict extended with parent name -> list of child ids.
        ancestral_lines: Dict filled with pair -> copy of the path of
            ``(generation, pair)`` tuples from the start person to that pair.
        current_line: Path shared by the recursion; callers normally omit it.
        _lineage: Internal. Ids on the current descent path, used to stop
            when a record loops back onto its own descendant.

    Also adds every new pair to the global ``visited_pairs``. Returns None.
    """
    def _pointer(text, tag):
        # Return the id inside "1 <tag> @id@", or '' when the tag is absent.
        # (Adding an offset to a failed find() would slice unrelated text.)
        marker = f'1 {tag} @'
        start = text.find(marker)
        if start == -1:
            return ''
        start += len(marker)
        end = text.find('@', start)
        return text[start:end] if end != -1 else ''

    if current_line is None:
        current_line = []

    if individual_id not in records or individual_id in _lineage:
        return

    famc_id = _pointer(records[individual_id], 'FAMC')
    if not famc_id or famc_id not in records:
        return

    fam_record = records[famc_id]
    mother_id = _pointer(fam_record, 'WIFE')
    father_id = _pointer(fam_record, 'HUSB')

    mother_name = None
    if mother_id and mother_id in records:
        mother_name = extract_name(records[mother_id])
        children.setdefault(mother_name, []).append(individual_id)

    father_name = None
    if father_id and father_id in records:
        father_name = extract_name(records[father_id])
        children.setdefault(father_name, []).append(individual_id)

    pushed = False  # did THIS call add an entry to current_line?
    if mother_name is not None and father_name is not None:
        parent_pair = father_name + "&" + mother_name
        current_line.append((generation, parent_pair))
        pushed = True
        if parent_pair not in visited_pairs:
            visited_pairs.add(parent_pair)
            if has_both_parents(records, mother_id, father_id):
                parent_pairs.append((generation, parent_pair))
                ancestral_lines[parent_pair] = list(current_line)

    lineage = _lineage + (individual_id,)

    if mother_id:
        find_parents_new(mother_id, generation + 1, records, parent_pairs, children, ancestral_lines, current_line, lineage)

    if father_id:
        find_parents_new(father_id, generation + 1, records, parent_pairs, children, ancestral_lines, current_line, lineage)

    # Undo only our own append. Popping unconditionally would remove the
    # caller's entry whenever this person has just one known parent, which
    # truncates the lines recorded afterwards for the other branch.
    if pushed:
        current_line.pop()

In [21]:
# cell 16

# Start from clean traversal state so re-running this cell gives the same
# result. find_parents_new skips every pair already in visited_pairs, so a
# leftover set from an earlier run would yield no ancestral lines at all.
visited_pairs = set()
parent_pairs = []
children = {}
ancestral_lines = {}
find_parents_new(individual_id, 1, records, parent_pairs, children, ancestral_lines)

for last_pair, ancestral_line in ancestral_lines.items():
    print(f'Head of Branch line (100 each surname) {last_pair}:')
    for generation, parent_pair in ancestral_line:
        print(f'  Descendants of this branch line (1 each surname) {generation}: {parent_pair}')

# Count the total number of parent pairs found
parent_pair_count = len(parent_pairs)
print(f'{parent_pair_count} parent pairs were found')

# Count the last pairs (one ancestral line is stored per pair); both report
# lines are kept so the cell output is unchanged
last_pair_count = len(ancestral_lines)
print(f'{last_pair_count} (20230908-1438) last pairs were found')
print(f'{last_pair_count} last pairs were found')


def process_individual_new(individual_id, gedcom_instance):
    """Trace one individual with ``find_parents_new`` and return the lines.

    Rebinds the globals ``generation_table`` (to an empty two-column frame)
    and ``visited_pairs`` (to an empty set) before tracing. When the global
    ``anchor_gen1`` is not None, ``(1, anchor_gen1)`` is put at the front of
    every ancestral line. ``gedcom_instance`` is accepted but not used.

    Returns:
        dict with the single key 'Last Pairs' mapping each parent pair to its
        ancestral line.
    """
    global generation_table, visited_pairs
    generation_table = pd.DataFrame(columns=['Generation', 'Parent Pair'])
    visited_pairs = set()

    parent_pairs, children, ancestral_lines = [], {}, {}
    find_parents_new(individual_id, 1, records, parent_pairs, children, ancestral_lines)

    # Prefix every line with the anchor, when one has been set
    for line in ancestral_lines.values():
        if anchor_gen1 is None:
            continue
        line.insert(0, (1, anchor_gen1))

    return {'Last Pairs': ancestral_lines}


# Trace the current module-level `individual_id` through the wrapper and keep
# the pair -> ancestral-line mapping it returns under 'Last Pairs'.
individual_data = process_individual_new(individual_id, gedcom_instance)
last_pairs = individual_data['Last Pairs']



def trace_children(individual_id, parent_pairs, children, visited=None):
    """Collect the children recorded under an individual, recursively.

    Args:
        individual_id: Id to start from.
        parent_pairs: Iterable of ``(generation, 'Father&Mother')`` tuples.
            Only the first pair whose text contains "lastpair" is examined.
        children: Mapping of parent name -> list of child ids.
        visited: Set of ids already handled; shared across the recursion.

    Returns:
        List of child ids; empty when the individual was already visited or
        no name could be resolved for it.
    """
    individual_children = []

    if visited is None:
        visited = set()

    # Stop the recursion for individuals that were already handled
    if individual_id in visited:
        return individual_children
    visited.add(individual_id)

    # Resolve the name under which this individual is listed as a child
    individual_name = None
    for pair in parent_pairs:
        if "lastpair" in pair[1]:
            # Father's name first, then mother's. .get avoids a KeyError for
            # a name that has no entry in `children`.
            for candidate in pair[1].split('&')[:2]:
                if individual_id in children.get(candidate, ()):
                    individual_name = candidate
                    break
            break

    if individual_name in children:
        individual_children.extend(children[individual_name])

        # Recursively find the children of the individual's children
        for child_id in children[individual_name]:
            individual_children.extend(
                trace_children(child_id, parent_pairs, children, visited)
            )

    return individual_children

descendants = trace_children(individual_id, parent_pairs, children)
print(f'The ancestors of individual {individual_id} are: {descendants}')

# Surname entered at the prompt; branches are scored by how often it appears
target_surname = surname

# Running number used to label each branch line
branch_sub_counter = 0

# Per-branch score details, keyed by the branch label
branch_surname_count = {}
formatted_lines_list = []  # Every formatted ancestral line, for the Excel export

# Best branch seen so far
highest_worth = 0
highest_worth_reference = None

# Formatted ancestral line for each branch label
formatted_lines = {}

for last_pair, ancestral_line in ancestral_lines.items():
    branch_sub_counter += 1

    # The last entry of the line is the head of the branch
    last_pair_in_line = ancestral_line[-1][1] if ancestral_line else None

    unique_reference = f"branch line number: {branch_sub_counter}"
    generations = len(ancestral_line)  # Used as the tie-breaker

    # 100 points when the head pair carries the surname ...
    head_count = 1 if target_surname in last_pair else 0
    # ... plus 1 point for EVERY pair on the line that carries it. (Testing
    # only the variable left over from the print loop counted at most one.)
    descendant_count = sum(1 for _, pair in ancestral_line if target_surname in pair)

    # Oldest pair first, newest last
    formatted_ancestral_line = "|".join(pair for _, pair in reversed(ancestral_line))
    formatted_lines_list.append(formatted_ancestral_line)
    formatted_lines[unique_reference] = formatted_ancestral_line

    print(f'Head of Branch line (100 each surname) {last_pair_in_line} ({unique_reference}):')
    for generation, parent_pair in ancestral_line:
        print(f'  Descendants of this branch line (1 each surname) {generation}: {parent_pair}')
    print(f"  Formatted Ancestral Line: {formatted_ancestral_line}")

    total_worth = (head_count * 100) + descendant_count

    # Higher worth wins; on equal worth the line with more generations wins
    if total_worth > highest_worth or (total_worth == highest_worth and generations > branch_surname_count.get(highest_worth_reference, {}).get('Generations', 0)):
        highest_worth = total_worth
        highest_worth_reference = unique_reference

    branch_surname_count[unique_reference] = {'Head': head_count, 'Descendant': descendant_count, 'Total Worth': total_worth, 'Generations': generations}

# After the loop, print out the counts per branch line
for branch, counts in branch_surname_count.items():
    print(f"{branch} - 'Head' occurrences: {counts['Head']}, 'Descendant' occurrences: {counts['Descendant']}, 'Total Worth': {counts['Total Worth']}, 'Generations': {counts['Generations']}")

# Assign the formatted line of the best branch to dnaline. With no ancestral
# lines there is no best branch, so report that instead of failing on the lookup.
if highest_worth_reference is None:
    dnaline = None
    print("No ancestral lines were found, so no DNA line could be selected.")
else:
    dnaline = formatted_lines[highest_worth_reference]
    print(f"The branch with the highest worth is {highest_worth_reference}, and its DNA line is {dnaline}")

# Write every formatted line to an Excel file
df = pd.DataFrame(formatted_lines_list, columns=['Formatted Ancestral Lines'])
df.to_excel('Formatted_Ancestral_Lines.xlsx', index=False)

Head of Branch line (100 each surname) RoperGaryScot&RobersonNancyMel:
  Descendants of this branch line (1 each surname) 1: RoperGaryScot&RobersonNancyMel
Head of Branch line (100 each surname) RobersonYatesEdm&HillNancyYou:
  Descendants of this branch line (1 each surname) 1: RoperGaryScot&RobersonNancyMel
  Descendants of this branch line (1 each surname) 2: RobersonYatesEdm&HillNancyYou
Head of Branch line (100 each surname) RobersonJohnBost&McDadeClara:
  Descendants of this branch line (1 each surname) 1: RoperGaryScot&RobersonNancyMel
  Descendants of this branch line (1 each surname) 2: RobersonYatesEdm&HillNancyYou
  Descendants of this branch line (1 each surname) 3: RobersonJohnBost&McDadeClara
Head of Branch line (100 each surname) RobersonJamesRun&MorganElizabeth:
  Descendants of this branch line (1 each surname) 1: RoperGaryScot&RobersonNancyMel
  Descendants of this branch line (1 each surname) 2: RobersonYatesEdm&HillNancyYou
  Descendants of this branch line (1 each 