<a href="https://colab.research.google.com/github/ronyates47/Gedcom-Utils/blob/main/a_last_chance_2023_0920_1514_error_free.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
#!pip install pandas
#!pip install python-gedcom


import glob
from gedcom.element.individual import IndividualElement
from gedcom.parser import Parser
import pandas as pd
import os

# Initialize your global variables
individual = None
individual_id = None
anchor_gen1 = None
find_parents_new_counter = 0
last_pair_counter = 0
visited_pairs = set()

last_prime_surname = None

def select_gedcom_file():
    """
    Select the GEDCOM file from the user.

    Returns:
        The path to the selected GEDCOM file.
    """

    gedcom_file_path = filedialog.askopenfilename(
        filetypes=[('GEDCOM Files', '*.ged')],
        initialdir=os.getcwd()
    )
    return gedcom_file_path

class Gedcom:
    def __init__(self, file_name):
        self.file_name = file_name
        self.gedcom_lines = []  # Initialize it here
        self.gedcom_datasets = []
        self.filter_pool = []

        # Call the private _parse_gedcom() method
        self._parse_gedcom()
        self.parse_individuals()
        self.filter_individuals()

    def _parse_gedcom(self):
        with open(self.file_name, 'r', encoding='utf-8-sig') as f:
            self.gedcom_lines = f.readlines()  # Populate the class attribute here

    def parse_individuals(self):
        # Existing code initializing current_dataset, npfx_count, total_count
        current_dataset = None
        npfx_count = 0
        total_count = 0

        for line in self.gedcom_lines:  # Use it here
            parts = line.strip().split(' ', 2)
            level = int(parts[0])
            tag = parts[1]
            value = parts[2] if len(parts) > 2 else None

            if level == 0 and tag.startswith('@') and tag.endswith('@') and value == 'INDI':
                total_count += 1
                current_dataset = GedcomDataset(tag.strip('@'))
                self.gedcom_datasets.append(current_dataset)
            elif current_dataset is not None:
                if level == 1 and tag in ['NAME', 'FAMC']:
                    current_key = tag
                    current_dataset.add_extractable_detail(current_key, value)
                elif level == 2 and tag == 'NPFX':
                    npfx_count += 1

                # Additional check for None value
                    if value is None:
                        print(f"Warning: Found None value for NPFX in dataset {current_dataset}.")
                    else:
                        current_dataset.add_extractable_detail(tag, value)

    def filter_individuals(self):
        # Check if the excel file exists
        if os.path.exists('/content/shortged.xlsx'):
            # Read the excel file
            df = pd.read_excel('/content/shortged.xlsx')
            # Get the individual_ids from column A as a list
            allowed_individual_ids = df.iloc[:, 0].tolist()

            # Filter the gedcom datasets to only include individuals with allowed individual_ids
            self.gedcom_datasets = [dataset for dataset in self.gedcom_datasets if dataset.individual_id in allowed_individual_ids]

            # Print the number of individuals that passed the filtering process
            print('Records Moved into short_pool:', len(self.gedcom_datasets))

        # Initializing the list of individuals
        self.individuals = []  # Making it an attribute of the class so it can be accessed elsewhere in the class

        # Filling the 'individuals' list with individual_id and names
        for dataset in self.gedcom_datasets:
            individual_id = dataset.individual_id  # Assuming you get the id this way
            name = dataset.get_gen_person()  # Here, I'm assuming that get_gen_person() returns the name
            self.individuals.append((name, individual_id))

    @staticmethod
    def get_standard_name(file_path):
        file_name = file_path.split('/')[-1]
        if '.' in file_name:
            file_name = file_name.rsplit('.', 1)[0]
        standard_name = file_name.replace(' ', '_').lower()
        return standard_name

      # Ensure the class IndividualRecord is well defined as you've shown
class IndividualRecord:
    def __init__(self, id, cm, sort):
        self.id = id
        self.cm = cm
        self.sort = sort

# Your GedcomDataset class should now include the new methods
class GedcomDataset:
    def __init__(self, individual_id):
        self.individual_id = individual_id
        self.extractable_detail = {}
        self.anchor_gen1 = None  # Initialize the anchor_gen1 attribute as None
        self.ancestral_line = []  # Initialize the ancestral_line attribute as an empty list

    def add_extractable_detail(self, key, value):
        self.extractable_detail[key] = value

    def get_anchor_gen1(self):
        return self.anchor_gen1

    def get_gen_person(self):
        return self.individual_id

    def get_extractable_cm(self):
        return self.extractable_detail.get('cm', '')

    def get_extractable_sort(self):
        return self.extractable_detail.get('sort', '')

    def get_extractable_NPFX(self):
        return self.extractable_detail.get('NPFX', '')

    def get_gen_person(self):
        name = self.extractable_detail.get('NAME', '')
        first_name, last_name = name.split('/', 1)
        first_name = first_name.split(' ')[0]
        last_name = last_name.rstrip('/')
        self.anchor_gen1 = last_name.replace(" ", "") + first_name.replace(" ", "")  # Store the value
        self.individual_id = self.individual_id.strip('@')
        return self.anchor_gen1  # Return the updated value

    def get_anchor_gen1(self):
        return self.anchor_gen1

    def get_extractable_NPFX(self):
        return self.extractable_detail.get('NPFX', '')

    def get_extractable_cm(self):
        npfx_value = self.extractable_detail.get('NPFX', '')
        if '&' in npfx_value:
            cm_value = npfx_value.split('&')[0].strip()
        else:
            cm_value = npfx_value.strip()
        try:
            int(cm_value)
            return cm_value
        except ValueError:
            return 'error'

    def get_extractable_sort(self):
        npfx_value = self.extractable_detail.get('NPFX', '')
        if '&' in npfx_value:
            sort_value = npfx_value.split('&')[1].strip()
            return sort_value
        else:
            return ''

    def get_full_name(self):
        name = self.extractable_detail.get('NAME', '')
        if '/' in name:
            first_name, last_name = name.split('/', 1)
            first_name = first_name.strip()
            last_name = last_name.rstrip('/')
            full_name = last_name.replace(" ", "") + first_name.replace(" ", "")
        else:
            full_name = name.replace(" ", "")
        return full_name

    def get_extractable_FAMC(self):
        return self.extractable_detail.get('FAMC', '').strip('@')

    def find_ancestors(self, individual_id, records):
        if individual_id not in records:
            return
        record = records[individual_id]
        famc_start = record.find('1 FAMC @') + 8
        famc_end = record.find('@', famc_start)
        famc_id = record[famc_start:famc_end]
        if famc_id not in records:
            return

        fam_record = records[famc_id]
        wife_end = fam_record.find('@', wife_start)
        mother_id = fam_record[wife_start:wife_end]

        husb_start = fam_record.find('1 HUSB @') + 8
        husb_end = fam_record.find('@', husb_start)
        father_id = fam_record[husb_start:husb_end]

        if mother_id and mother_id in records:
            mother_record = records[mother_id]
            mother_name = extract_name(mother_record)
            self.ancestral_line.append(mother_name)
            self.find_ancestors(mother_id, records)

        if father_id and father_id in records:
            father_record = records[father_id]
            father_name = extract_name(father_record)
            self.ancestral_line.append(father_name)
            self.find_ancestors(father_id, records)

def select_gedcom_file():
    gedcom_files = glob.glob('*.ged')
    if not gedcom_files:
        print("No GEDCOM files found in the Colab contents.")
        return None

    print("List of GEDCOM files:")
    for i, file_name in enumerate(gedcom_files, 1):
        print(f"{i}. {file_name}")

    selected_num = 1
    return gedcom_files[selected_num - 1]

#We set the value of selected_num to 1.
gedcom_file_path = select_gedcom_file()
if gedcom_file_path is None:
    print("No GEDCOM file selected. Exiting.")
    exit()

# Function definitions
def extract_id(record):
  """Extracts the individual ID from a GEDCOM record.

  Args:
    record: A GEDCOM record.

  Returns:
    The individual ID, or None if the record does not contain an individual ID.
  """

  parts = record.split('\n')
  for part in parts:
    if part.startswith('@INDI@'):
      return part[7:11]
  return None

def extract_name(record):
  """Extracts the name of the individual from a GEDCOM record.

  Args:
    record: A GEDCOM record.

  Returns:
    The individual's name, or None if the record does not contain the individual's name.
  """

  parts = record.split('\n')
  for part in parts:
    if part.startswith('NAME '):
      return part[5:]
  return None

# Now, read the GEDCOM file and split it into individual and family records
with open(gedcom_file_path, 'r') as file:
  data = file.read()
  data = data.split('\n0 ')
  records = {extract_id(record): record for record in data}

gedcom_instance = Gedcom(gedcom_file_path)
gedcom_instance.filter_individuals()

def find_parents(individual_id, generation, records):
    if individual_id not in records:
        return
    record = records[individual_id]

    print(f"record is: {record}")  # Debugging statement

    famc_start = record.find('1 FAMC @') + 8
    famc_end = record.find('@', famc_start)
    famc_id = record[famc_start:famc_end]

    if famc_id not in records:
        return

    fam_record = records[famc_id]

    print(f"fam_record is: {fam_record}")  # Debugging statement

    try:
        wife_start = fam_record.find('1 WIFE @') + 8
        wife_end = fam_record.find('@', wife_start)
        mother_id = fam_record[wife_start:wife_end]
    except AttributeError:
        print("AttributeError at wife section.")

    try:
        husb_start = fam_record.find('1 HUSB @') + 8
        husb_end = fam_record.find('@', husb_start)
        father_id = fam_record[husb_start:husb_end]
    except AttributeError:
        print("AttributeError at husband section.")
    if mother_id and mother_id in records:
        mother_record = records[mother_id]
        mother_name = extract_name(mother_record)
    else:
        mother_name = None

    if father_id and father_id in records:
        father_record = records[father_id]
        father_name = extract_name(father_record)
    else:
        father_name = None

    if mother_name is not None and father_name is not None:
        parent_pair = father_name + "&" + mother_name
        if parent_pair not in visited_pairs:
            visited_pairs.add(parent_pair)
            if has_both_parents(records, mother_id, father_id):
                generation_table.loc[len(generation_table)] = [generation, parent_pair]

    if mother_id:
        find_parents(mother_id, generation + 1, records)

    if father_id:
        find_parents(father_id, generation + 1, records)

def has_both_parents(records, mother_id, father_id):
    return mother_id in records and father_id in records

visited_pairs = set()

import gedcom

def input_prime_surname(last_prime_surname="Yates"):  # Set default surname to "Yates"
    global surname  # Declare surname as a global variable
    if last_prime_surname:
        last_name = input(f"Enter prime_surname (default: {last_prime_surname}): ")
        if not last_name:
            last_name = last_prime_surname
    else:
        last_name = input("Enter prime_surname: ")
    surname = last_name  # Assign the value of last_name to the global surname variable
    return last_name

# Call the function to let the user input prime_surname
prime_surname = input_prime_surname()

# Store the value of prime_surname for later use
last_prime_surname = prime_surname

last_pairs_dict = {}  # This dictionary will hold the last_pair for each individual_id

# Initialize parent_pairs, children, and ancestral_lines
parent_pairs = []
children = {}
ancestral_lines = {}

def find_parents_new(individual_id, generation, records, parent_pairs, children, ancestral_lines):
    global last_pairs_dict  # Declare as global so we can update it
    current_line = []

    if individual_id not in records:
        print("individual_id not in records")
        return

    record = records[individual_id]
    famc_start = record.find('1 FAMC @') + 8
    famc_end = record.find('@', famc_start)
    famc_id = record[famc_start:famc_end]

    if famc_id not in records:
        return

    fam_record = records[famc_id]
    wife_start = fam_record.find('1 WIFE @') + 8
    wife_end = fam_record.find('@', wife_start)
    mother_id = fam_record[wife_start:wife_end]

    husb_start = fam_record.find('1 HUSB @') + 8
    husb_end = fam_record.find('@', husb_start)
    father_id = fam_record[husb_start:husb_end]

    # Last pair check - begin
    if mother_id not in records and father_id not in records:
        last_pair = "None&None"
        last_pairs_dict[individual_id] = last_pair
        return
    # Last pair check - end

    if mother_id and mother_id in records:
        mother_record = records[mother_id]
        mother_name = extract_name(mother_record)
        if mother_name not in children:
            children[mother_name] = []
        children[mother_name].append(individual_id)
    else:
        mother_name = None

    if father_id and father_id in records:
        father_record = records[father_id]
        father_name = extract_name(father_record)
        if father_name not in children:
            children[father_name] = []
        children[father_name].append(individual_id)
    else:
        father_name = None

    if mother_name and father_name:
        parent_pair = father_name + "&" + mother_name
        current_line.append((generation, parent_pair))
        if parent_pair not in visited_pairs:
            visited_pairs.add(parent_pair)
            if has_both_parents(records, mother_id, father_id):
                parent_pairs.append((generation, parent_pair))
                ancestral_lines[parent_pair] = list(current_line)

    if mother_id:
        find_parents_new(mother_id, generation + 1, records, parent_pairs, children, ancestral_lines)

    if father_id:
        find_parents_new(father_id, generation + 1, records, parent_pairs, children, ancestral_lines)

    current_line.pop()

ancestral_lines = {}
find_parents_new(individual_id, 1, records, parent_pairs, children, ancestral_lines)

for last_pair, ancestral_line in ancestral_lines.items():
    print(f'Ancestral line for last pair {last_pair}:')
    for generation, parent_pair in ancestral_line:
        print(f'  Generation {generation}: {parent_pair}')

def process_individual_new(individual_id):
    global generation_table  # DataFrame to hold generation and parent pair info
    global visited_pairs     # Set to hold visited parent pairs
    global individual        # Individual GEDCOM element

    # Initialize globals
    generation_table = pd.DataFrame(columns=['Generation', 'Parent Pair'])
    visited_pairs = set()

    # Local variables
    parent_pairs = []
    children = {}
    ancestral_lines = {}

    find_parents_new(individual_id, 1, records, parent_pairs, children, ancestral_lines)

    # Add anchor_gen1 to the beginning of each ancestral line
    for last_pair, ancestral_line in ancestral_lines.items():
        # Check if anchor_gen1 has been assigned a value
        if anchor_gen1 is not None:
            ancestral_line.insert(0, (1, anchor_gen1))

    individual_data = {}
    individual_data['Last Pairs'] = ancestral_lines

    return individual_data

individual_data = process_individual_new(individual_id)
last_pairs = individual_data['Last Pairs']

def trace_children(individual_id, parent_pairs, children, visited=None):
    # Create a list to store the children of the individual
    individual_children = []

    # Create a set to keep track of visited individuals
    if visited is None:
        visited = set()

    # Check if the individual has already been visited
    if individual_id in visited:
        # If the individual has already been visited, stop the recursion and return an empty list
        return individual_children

    # Add the individual to the set of visited individuals
    visited.add(individual_id)

    # Find the name of the individual
    individual_name = None
    for pair in parent_pairs:
        if "lastpair" in pair[1]:
            names = pair[1].split('&')
            if individual_id in children[names[0]]:
                individual_name = names[0]
            elif individual_id in children[names[1]]:
                individual_name = names[1]
            break

    # Check if the individual has any children
    if individual_name in children:
        # Add the children of the individual to the list
        individual_children.extend(children[individual_name])

        # Recursively find the children of the individual's children
        for child_id in children[individual_name]:
            child_children = trace_children(child_id, parent_pairs, children, visited)
            individual_children.extend(child_children)

    return individual_children

descendants = trace_children(individual_id, parent_pairs, children)
print(f'The ancestors of individual {individual_id} are: {descendants}')

# user enters the surname=target surname
target_surname = surname

# Define a scoring function that counts the number of occurrences of the target surname in an ancestral line
def score_ancestral_line(ancestral_line_list):
    score = 0
    for line in map(str, ancestral_line_list):
        if target_surname in line:
            score += 1
    return score

# Initialize an empty dictionary to store the scores
scores = {}

# Convert the tuples into a list
ancestral_lines_list = list(ancestral_lines.items())

# Compute and store the scores for each last pair
for last_pair, ancestral_line in ancestral_lines_list:
    scores[last_pair] = score_ancestral_line(ancestral_line)

# Find the final-line with the highest score
if scores:
    final_pair = max(scores, key=scores.get)
else:
    print("Scores dictionary is empty. Cannot find the maximum.")
    final_pair = None

if final_pair is not None:
    final_score = scores[final_pair]
else:
    print("Final pair is None. Cannot find the final score.")
    final_score = None

# Print the last pair and score for each ancestral line
#for last_pair, score in scores.items():
#    print(f"Last pair: {last_pair} (score: {score})")

# Print the final-line with a designation of 'final-line' to the highest score
# print(f"dnaline: {final_pair} (score: {final_score})")

# Initialize a dictionary to store the cumulative scores for each branch line
cumulative_scores = {}

# Calculate and store the cumulative scores for each branch line and generation
for last_pair, ancestral_line in ancestral_lines_list:
    cumulative_score = 0
    for generation, line in enumerate(ancestral_line, start=1):
        generation_score = score_ancestral_line([line])
        cumulative_score += generation_score
        cumulative_scores.setdefault(line, 0)  # Initialize cumulative score for line if not exists
        cumulative_scores[line] += cumulative_score

# Print the cumulative scores for each branch line and generation
for line, cumulative_score in cumulative_scores.items():
    print(f"Ancestral line: {line}, Cumulative Score = {cumulative_score}")

# Find the final-line with the highest cumulative score
if cumulative_scores:  # Checks if the dictionary is not empty
    final_line = max(cumulative_scores, key=cumulative_scores.get)
    final_cumulative_score = cumulative_scores[final_line]
else:
    print("Cumulative scores are empty. Cannot find the final line or score.")
    final_line = None
    final_cumulative_score = None

if cumulative_scores:  # Checks if the dictionary is not empty
    final_line = max(cumulative_scores, key=cumulative_scores.get)
    final_cumulative_score = cumulative_scores[final_line]
else:
    print("Cumulative scores are empty. Cannot find the final line or score.")
    final_line = None
    final_cumulative_score = None

if final_line is not None:
    final_cumulative_score = cumulative_scores[final_line]
else:
    print("Final line is None. Cannot access cumulative_scores[final_line].")
    final_cumulative_score = None

# Print the final-line with the highest cumulative score
print(f"dnaline: {final_line}, Cumulative Score = {final_cumulative_score}")

# Run the process_individual function for each individual in the individuals list and create a combined DataFrame
combined_df_rows = []

for name, individual_id in gedcom_instance.individuals:
    individual_data = process_individual_new(individual_id)

    cm = individual_data.get('cM', None)  # Get 'cM' from dictionary, default to None if not found
    sort = individual_data.get('Sort', None)  # Get 'Sort' from dictionary, default to None if not found
    parent_pairs_a10 = individual_data.get('Last Pairs', {})  # Get 'Last Pairs' from dictionary, default to empty dict if not found

    if cm is not None and sort is not None:  # Check if both 'cM' and 'Sort' are not None before proceeding
        most_distant_ancestor = generation_table.iloc[0]['Parent Pair']  # Get the most distant ancestor from the last row of generation_table
        combined_df_rows.append([individual_id, name, sort, cm, most_distant_ancestor, parent_pairs_a10])

combined_df = pd.DataFrame(combined_df_rows, columns=['ID#', 'Name', 'Match to', 'cM', 'Most Distant Ancestor', 'Ancestral Line A10'])


List of GEDCOM files:
1. yates-one-name-study.ged
Records Moved into short_pool: 1
Records Moved into short_pool: 1
Enter prime_surname (default: Yates): 
The ancestors of individual None are: []
Scores dictionary is empty. Cannot find the maximum.
Final pair is None. Cannot find the final score.
Cumulative scores are empty. Cannot find the final line or score.
Cumulative scores are empty. Cannot find the final line or score.
Final line is None. Cannot access cumulative_scores[final_line].
dnaline: None, Cumulative Score = None
individual_id not in records


In [2]:
!pip install pandas
!pip install python-gedcom

Collecting python-gedcom
  Downloading python_gedcom-1.0.0-py2.py3-none-any.whl (35 kB)
Installing collected packages: python-gedcom
Successfully installed python-gedcom-1.0.0
