<a href="https://colab.research.google.com/github/ronyates47/Gedcom-Utils/blob/main/rebuild_2023_0905_2019.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary packages (Uncomment if not installed)
# !pip install pandas
# !pip install python-gedcom

# Import required libraries
import glob
from gedcom.element.individual import IndividualElement
from gedcom.parser import Parser
import pandas as pd
import os

# Function to select GEDCOM file
def select_gedcom_file():
    """Pick a GEDCOM file from the current working directory.

    Every ``*.ged`` file found is listed on stdout, then the first one is
    returned without prompting. The candidates are sorted by name so that
    "first" means the same file on every run; ``glob.glob`` itself makes no
    promise about the order of its results.

    Returns:
        The chosen file name, or ``None`` when no ``*.ged`` file exists.
    """
    gedcom_files = sorted(glob.glob('*.ged'))
    if not gedcom_files:
        print("No GEDCOM files found.")
        return None

    print("List of GEDCOM files:")
    for i, file_name in enumerate(gedcom_files, 1):
        print(f"{i}. {file_name}")

    # Automatically select the first GEDCOM file
    return gedcom_files[0]

# Locate and parse the GEDCOM file.
# Both names get a default first so that later statements can still refer to
# them when no GEDCOM file is found.
gedcom_instance = None
all_individual_records = []

gedcom_file_path = select_gedcom_file()
if gedcom_file_path:
    gedcom_parser = Parser()
    gedcom_parser.parse_file(gedcom_file_path)
    gedcom_instance = gedcom_parser  # Now gedcom_instance holds your initialized GEDCOM data

    # The records are NOT extracted here: parse_gedcom_to_individual_records
    # and IndividualRecord are defined further down the file, so calling the
    # parser function at this point raises NameError on a top-to-bottom run.
    # The "__main__" section below performs the extraction once both exist.
else:
    print("Exiting as no GEDCOM file was selected.")


def input_prime_surname(default_surname="Yates"):
    """Prompt the user for the prime surname.

    The answer is stripped of surrounding whitespace. An empty answer, or a
    run with no stdin to read from (``EOFError``), selects *default_surname*.

    Args:
        default_surname: Surname used when the user enters nothing.

    Returns:
        The chosen surname. The same value is also stored in the module-level
        name ``prime_surname``, which the rest of the script reads.
    """
    global prime_surname  # Declare prime_surname as global
    try:
        entered = input(f"Enter prime_surname (default: {default_surname}): ").strip()
    except EOFError:
        # Nothing to read from (non-interactive run): behave like an empty answer.
        entered = ""
    prime_surname = entered or default_surname
    return prime_surname

# Ask for the surname once, at start-up. The function stores the answer in
# the module-level name `prime_surname`.
input_prime_surname()

# `prime_surname` is read further down, where it is copied into `target_surname`.

class IndividualRecord:
    """One individual taken from the GEDCOM data.

    Every instance registers itself in ``all_records`` under its
    ``person_number`` and bumps ``record_count``, so records can be looked up
    later with ``find_by_person_number``.
    """

    all_records = {}  # person_number -> IndividualRecord, shared by all instances
    record_count = 0  # how many IndividualRecord objects have been constructed

    def __init__(self, person_number, name, NPFX, FAMC, anchor_gen1=None):
        owner = self.__class__
        owner.record_count += 1
        self.person_number = person_number
        self.name = name
        self.NPFX = NPFX
        self.FAMC = FAMC
        self.anchor_gen1 = anchor_gen1
        # Register last, once the instance is fully populated
        owner.all_records[person_number] = self

    @classmethod
    def find_by_person_number(cls, person_number):
        """Return the record registered under *person_number*, or None."""
        return cls.all_records.get(person_number)

    # Method to get anchor_gen1
    def get_anchor_gen1(self):
        return self.anchor_gen1

    # Method to set anchor_gen1
    def set_anchor_gen1(self, value):
        self.anchor_gen1 = value

    def get_cm(self):
        """Return the part of NPFX before the first '&' when it is an integer.

        The value is returned as the stripped string, not as an int. Returns
        'error' when NPFX is None or that part does not parse as an integer.
        """
        raw = self.NPFX
        if raw is None:
            return 'error'

        # Without an '&' the first split piece is the whole string
        head = raw.split('&')[0].strip()
        try:
            int(head)
        except ValueError:
            return 'error'
        return head

    def get_sort(self):
        """Return the stripped NPFX text between the first and second '&'.

        Returns '' when NPFX holds no '&', and None when NPFX itself is None.
        """
        raw = self.NPFX
        if raw is None:
            return None

        pieces = raw.split('&')
        return pieces[1].strip() if len(pieces) > 1 else ''

# Function to parse GEDCOM data into individual records
# (it has to be defined before the first statement that calls it)

def parse_gedcom_to_individual_records(gedcom_instance):
    """Build one IndividualRecord per individual in the parsed GEDCOM data.

    For each ``IndividualElement`` the pointer and name are read from the
    element, and the values of its ``NPFX``, ``FAMC`` and ``anchor_gen1``
    tags are collected. ``NPFX`` is looked for directly under the individual
    and also under the individual's ``NAME`` element; a direct ``NPFX`` wins
    when both are present.

    Args:
        gedcom_instance: Parser object exposing ``get_element_list()``.

    Returns:
        List of the IndividualRecord objects created, in element order.
        Creating a record also registers it in ``IndividualRecord.all_records``.
    """
    individual_records = []

    for element in gedcom_instance.get_element_list():
        if not isinstance(element, IndividualElement):
            continue

        person_number = element.get_pointer()
        name = element.get_name()

        # Tags that are absent from the individual stay None
        NPFX, FAMC, anchor_gen1 = None, None, None

        for child in element.get_child_elements():
            tag = child.get_tag()
            if tag == "NPFX":
                NPFX = child.get_value()
            elif tag == "NAME" and NPFX is None:
                # GEDCOM files normally carry NPFX as a sub-line of NAME, so
                # it is not among the individual's direct children.
                for name_piece in child.get_child_elements():
                    if name_piece.get_tag() == "NPFX":
                        NPFX = name_piece.get_value()
                        break
            elif tag == "FAMC":
                FAMC = child.get_value()
            elif tag == "anchor_gen1":
                anchor_gen1 = child.get_value()

        new_record = IndividualRecord(person_number, name, NPFX, FAMC, anchor_gen1)
        # Without this append the function always returned an empty list
        individual_records.append(new_record)

    return individual_records





# Initialize pools
short_pool = []
filter_pool = []


# Function to filter individuals based on Excel file and NPFX
def filter_individuals(all_individual_records, excel_path='/content/shortged.xlsx'):
    """Refill ``short_pool`` and ``filter_pool`` from *all_individual_records*.

    Both module-level pools are emptied first. A record whose person number
    (with the '@' signs removed) appears in the first column of the Excel
    sheet goes into both pools. Any other record with a truthy ``NPFX`` goes
    into ``filter_pool`` only. Every record placed in ``filter_pool`` gets
    ``cm`` and ``sort`` attributes, because later code reads them from each
    member of that pool.

    Args:
        all_individual_records: Iterable of records exposing
            ``person_number``, ``NPFX``, ``get_cm()`` and ``get_sort()``.
        excel_path: Spreadsheet whose first column lists the allowed ids.
            When the file does not exist, no id is treated as allowed.

    Returns:
        None. The results are left in ``short_pool`` and ``filter_pool``.
    """
    # Clear the pools
    short_pool.clear()
    filter_pool.clear()

    # Stays empty when there is no Excel file, so the loop below still works
    allowed_individual_ids = set()
    if os.path.exists(excel_path):
        df = pd.read_excel(excel_path)
        allowed_individual_ids = set(df.iloc[:, 0].astype(str))

    for record in all_individual_records:
        # Strip '@' symbols from the GEDCOM person_number for comparison
        stripped_person_number = record.person_number.replace('@', '')

        if stripped_person_number in allowed_individual_ids:
            short_pool.append(record)
        elif not record.NPFX:
            # Neither Excel-allowed nor carrying an NPFX: not selected
            continue

        filter_pool.append(record)
        record.cm = record.get_cm()
        record.sort = record.get_sort()


# Fill short_pool / filter_pool from the records loaded above.
# NOTE(review): the "__main__" section below parses and filters again, so this
# call looks redundant when the file is run as a script — confirm before removing.
filter_individuals(all_individual_records)


# Main part of the script: pick a GEDCOM file, parse it, fill the pools and
# print a short summary of what was selected.
if __name__ == "__main__":
    gedcom_file_path = select_gedcom_file()
    if not gedcom_file_path:
        print("Exiting as no GEDCOM file was selected.")
    else:
        gedcom_parser = Parser()
        gedcom_parser.parse_file(gedcom_file_path)
        gedcom_instance = gedcom_parser

        # Turn the parsed elements into records and show the resulting list
        all_individual_records = parse_gedcom_to_individual_records(gedcom_instance)
        print(all_individual_records)

        # Sort the records into short_pool / filter_pool
        filter_individuals(all_individual_records)

        # Pool sizes, for debugging
        print("Records Moved into short_pool:", len(short_pool))
        print("Records Moved into filter_pool:", len(filter_pool))

        # Attribute dump of one sample record, for debugging
        if short_pool:
            print("Attributes of first record in short_pool:", vars(short_pool[0]))



# One plain dictionary per record in filter_pool.
# CM and Sort are computed here with get_cm() / get_sort() rather than read
# from record.cm / record.sort: filter_individuals assigns those attributes
# only to records matched through the Excel sheet, so records that entered
# filter_pool through the NPFX branch do not have them.
individual_dicts = [
    {
        'Person Number': record.person_number,
        'Anchor Gen1': record.get_anchor_gen1(),
        'CM': record.get_cm(),
        'Sort': record.get_sort(),
        'NPFX': record.NPFX,
        'FAMC': record.FAMC,
    }
    for record in filter_pool
]



# NEW ANCESTRAL SECTION STARTS HERE

import gedcom


def _family_member_id(fam_record, tag):
    """Return the id between the '@' signs of the ``1 <tag> @...@`` line, or ''."""
    marker = f'1 {tag} @'
    marker_at = fam_record.find(marker)
    if marker_at == -1:
        # No such line in this family record
        return ''
    start = marker_at + len(marker)
    end = fam_record.find('@', start)
    return fam_record[start:end] if end != -1 else ''


def find_parents(person_number, generation, records, parent_pairs, children, ancestral_lines, current_line=None):
    """Recursively collect the parent pairs above *person_number*.

    The family entry is looked up in *records* under the person's ``FAMC``
    value and searched as text for its ``1 WIFE @..@`` / ``1 HUSB @..@`` lines.
    When both parents are found, the pair ``"father&mother"`` is pushed onto
    *current_line*; the first time a pair is seen (and ``has_both_parents``
    agrees) it is recorded in *parent_pairs* and *ancestral_lines*. The
    function then recurses into the mother and the father.

    NOTE(review): ``extract_name``, ``has_both_parents`` and ``visited_pairs``
    are not defined in this function; ``visited_pairs`` is created by
    ``process_individual``. Family entries must be strings and must live in
    *records* alongside the individuals — confirm the callers pass such a
    mapping.

    Args:
        person_number: Id of the individual whose parents are wanted.
        generation: Generation number recorded with this person's parents.
        records: Mapping consulted for individuals and for family entries.
        parent_pairs: List extended in place with ``(generation, pair)``.
        children: Dict, parent name -> list of child ids, extended in place.
        ancestral_lines: Dict, pair -> copy of the line that led to it.
        current_line: Line built so far; callers normally leave it unset.

    Returns:
        None. All results are written into the containers passed in.
    """
    if current_line is None:
        current_line = []

    if person_number not in records:
        return

    record = IndividualRecord.find_by_person_number(person_number)
    if record is None:
        # Present in `records` but never registered as an IndividualRecord
        return
    famc_id = record.FAMC  # Directly accessing the FAMC attribute of the IndividualRecord

    if famc_id not in records:
        return

    fam_record = records[famc_id]
    # A missing WIFE/HUSB line yields '' instead of a slice taken at a bogus offset
    mother_id = _family_member_id(fam_record, 'WIFE')
    father_id = _family_member_id(fam_record, 'HUSB')

    if mother_id and mother_id in records:
        mother_name = extract_name(records[mother_id])
        children.setdefault(mother_name, []).append(person_number)
    else:
        mother_name = None

    if father_id and father_id in records:
        father_name = extract_name(records[father_id])
        children.setdefault(father_name, []).append(person_number)
    else:
        father_name = None

    pushed = False
    if mother_name and father_name:
        parent_pair = father_name + "&" + mother_name
        current_line.append((generation, parent_pair))
        pushed = True
        if parent_pair not in visited_pairs:
            visited_pairs.add(parent_pair)
            if has_both_parents(records, mother_id, father_id):
                parent_pairs.append((generation, parent_pair))
                ancestral_lines[parent_pair] = list(current_line)

    if mother_id:
        find_parents(mother_id, generation + 1, records, parent_pairs, children, ancestral_lines, current_line)

    if father_id:
        find_parents(father_id, generation + 1, records, parent_pairs, children, ancestral_lines, current_line)

    # Undo only our own push; popping unconditionally either removed a
    # caller's entry or raised IndexError on an empty line.
    if pushed:
        current_line.pop()


# Walk the ancestry of every record that was selected into filter_pool.
# (The placeholder assignment `filter_pool = [...]` that stood here replaced
# the real pool with a list holding only the Ellipsis object.)
#
# The containers are created up front so that the statements further down
# can still use them when filter_pool is empty.
person_number = None
ancestral_lines = {}
parent_pairs = []
children = {}

for record in filter_pool:
    person_number = record.person_number

    # Each individual starts with fresh containers, so after the loop they
    # hold the results for the last individual only.
    ancestral_lines = {}
    parent_pairs = []
    children = {}

    find_parents(person_number, 1, IndividualRecord.all_records, parent_pairs, children, ancestral_lines)

for last_pair, ancestral_line in ancestral_lines.items():
    print(f'Ancestral line for last pair {last_pair}:')
    for generation, parent_pair in ancestral_line:
        print(f'  Generation {generation}: {parent_pair}')

def process_individual(person_number, gedcom_instance):
    """Collect the per-individual summary for *person_number*.

    Rebinds the module-level ``generation_table`` (a fresh DataFrame with the
    columns 'Generation' and 'Parent Pair') and ``visited_pairs`` (a fresh
    set), runs ``find_parents`` for the individual, and puts the individual's
    ``anchor_gen1`` into the table as generation 1 when it is set.

    NOTE(review): the parent pairs gathered by ``find_parents`` are not copied
    into ``generation_table``, so the table holds at most the anchor row —
    confirm whether that is intended.

    Args:
        person_number: Key of the individual in ``IndividualRecord.all_records``.
        gedcom_instance: Accepted for the callers' sake; not used here.

    Returns:
        Dict with the keys 'cM', 'Sort', 'Parent Pairs A10' and 'Last Pairs'.
        'cM' and 'Sort' are 'Not found' when the individual is unknown.
    """
    global generation_table
    global visited_pairs

    generation_table = pd.DataFrame(columns=['Generation', 'Parent Pair'])
    visited_pairs = set()

    parent_pairs = []
    children = {}
    ancestral_lines = {}

    find_parents(person_number, 1, IndividualRecord.all_records, parent_pairs, children, ancestral_lines)

    # Looked up once and reused for everything below
    individual_record = IndividualRecord.all_records.get(person_number, None)

    # For the generation table
    anchor_gen1 = individual_record.anchor_gen1 if individual_record else None
    if anchor_gen1 is not None:
        generation_table.loc[0] = [1, anchor_gen1]

    generation_table = generation_table.sort_values('Generation', ascending=False).reset_index(drop=True)

    # For individual data
    individual_data = {}
    if individual_record is not None:
        individual_data['cM'] = individual_record.get_cm()
        individual_data['Sort'] = individual_record.get_sort()
    else:
        individual_data['cM'] = 'Not found'  # Or any other default value
        individual_data['Sort'] = 'Not found'  # Or any other default value

    # Set for every individual; it used to sit inside the "not found" branch
    # and was therefore missing whenever the record existed.
    individual_data['Parent Pairs A10'] = '|'.join(
        str(pair) for pair in generation_table['Parent Pair'].head(10)
    )

    # Add anchor_gen1 to the beginning of each ancestral line
    if anchor_gen1 is not None:
        for ancestral_line in ancestral_lines.values():
            ancestral_line.insert(0, (1, anchor_gen1))

    # Ensure 'Last Pairs' is always present in individual_data
    individual_data['Last Pairs'] = "lastofline"

    return individual_data


def trace_children(person_number, parent_pairs, children, visited=None):
    """Collect child ids reachable from *person_number* through *children*.

    The first entry of *parent_pairs* whose second item contains "lastpair"
    is split on '&' into two names. If *person_number* is listed in
    ``children`` under one of those names, that name's children are returned,
    followed by whatever the same search yields for each of them.

    Args:
        person_number: Id to start from.
        parent_pairs: Sequence of ``(generation, "name&name")`` entries.
        children: Dict mapping a name to a list of child ids.
        visited: Ids already handled; shared across the recursion so that an
            id is expanded only once.

    Returns:
        List of child ids; empty when the id was already visited or no name
        was matched.
    """
    seen = set() if visited is None else visited

    # An id that was handled before contributes nothing more
    if person_number in seen:
        return []
    seen.add(person_number)

    # Only the first "lastpair" entry is ever consulted
    matched_name = None
    marker_pair = next((pair for pair in parent_pairs if "lastpair" in pair[1]), None)
    if marker_pair is not None:
        names = marker_pair[1].split('&')
        if person_number in children[names[0]]:
            matched_name = names[0]
        elif person_number in children[names[1]]:
            matched_name = names[1]

    if matched_name not in children:
        return []

    # Direct children first, then the results for each of them in turn
    collected = list(children[matched_name])
    for child_id in children[matched_name]:
        collected.extend(trace_children(child_id, parent_pairs, children, seen))
    return collected

# trace_children returns the child ids it collected from the `children` map,
# so the message reports them as descendants rather than as ancestors.
descendants = trace_children(person_number, parent_pairs, children)
print(f'The descendants of individual {person_number} are: {descendants}')

# The surname entered at start-up is the one the scoring below looks for
target_surname = prime_surname

# Define a scoring function that counts the number of occurrences of the target surname in an ancestral line
def score_ancestral_line(ancestral_line_list, surname=None):
    """Count the entries of an ancestral line that mention a surname.

    Each entry is converted with ``str()`` and scores one point when the
    surname occurs anywhere in that text; an entry scores at most once.

    Args:
        ancestral_line_list: Iterable of entries (for example
            ``(generation, "father&mother")`` tuples).
        surname: Surname to look for. When omitted, the module-level
            ``target_surname`` is used.

    Returns:
        Number of entries containing the surname.
    """
    if surname is None:
        surname = target_surname
    return sum(1 for entry in ancestral_line_list if surname in str(entry))


# Snapshot the ancestral lines as (last pair, line) tuples
ancestral_lines_list = list(ancestral_lines.items())

# Score each line and file the result under its last pair
scores = {}
for last_pair, ancestral_line in ancestral_lines_list:
    scores[last_pair] = score_ancestral_line(ancestral_line)

# Pick the best-scoring pair; with nothing scored there is no winner
if not scores:
    print("No scores available to find max.")
    final_pair = None
    final_score = None
else:
    final_pair, final_score = max(scores.items(), key=lambda item: item[1])

# ... (rest of your code)


# Print the last pair and score for each ancestral line
#for last_pair, score in scores.items():
#    print(f"Last pair: {last_pair} (score: {score})")

# Print the final-line with a designation of 'final-line' to the highest score
# print(f"dnaline: {final_pair} (score: {final_score})")



# Initialize a dictionary to store the cumulative scores for each branch line
cumulative_scores = {}

# Walk every ancestral line, keeping a running total of the per-entry scores;
# each entry's dictionary slot is increased by the running total reached at it.
for last_pair, ancestral_line in ancestral_lines_list:
    cumulative_score = 0
    for generation, line in enumerate(ancestral_line, start=1):
        generation_score = score_ancestral_line([line])
        cumulative_score += generation_score
        cumulative_scores.setdefault(line, 0)  # Initialize cumulative score for line if not exists
        cumulative_scores[line] += cumulative_score

# Print the cumulative scores for each branch line and generation
for line, cumulative_score in cumulative_scores.items():
    print(f"Ancestral line: {line}, Cumulative Score = {cumulative_score}")

# Check if the cumulative_scores dictionary is empty
if cumulative_scores:
    # Find the final-line with the highest cumulative score
    final_line = max(cumulative_scores, key=cumulative_scores.get)
    final_cumulative_score = cumulative_scores[final_line]

    # Print the final-line with the highest cumulative score
    print(f"dnaline: {final_line}, Cumulative Score = {final_cumulative_score}")
else:
    print("No cumulative scores available to find max.")
    final_line = None
    final_cumulative_score = None

# Report the record count on every run. The call was indented, which made it
# part of the `else:` branch above, so it only ran when there were no
# cumulative scores.
# NOTE(review): print_record_count is not defined in the visible part of this
# file — confirm it exists before this statement is reached.
print_record_count()

# (name, person_number) for every individual that was registered while parsing
individuals = [(record.name, record.person_number) for record in IndividualRecord.all_records.values()]

# Build one row per individual. The row is assembled inside the loop: with the
# `if` and the `append` placed after the loop, only the last individual ever
# reached the DataFrame.
combined_df_rows = []
for name, person_number in individuals:
    # process_individual rebuilds the module-level generation_table for this person
    individual_data = process_individual(person_number, gedcom_instance)

    record = IndividualRecord.find_by_person_number(person_number)
    cm = record.get_cm()
    sort = record.get_sort()
    parent_pairs_a10 = record.get_anchor_gen1()

    if not generation_table.empty:
        # The table is sorted by 'Generation' descending, so row 0 is the
        # highest generation; its column is named 'Parent Pair'.
        most_distant_ancestor = generation_table.iloc[0]['Parent Pair']
    else:
        most_distant_ancestor = 'Unknown'

    combined_df_rows.append([person_number, name, sort, cm, most_distant_ancestor, parent_pairs_a10])

# Create DataFrame
combined_df = pd.DataFrame(combined_df_rows, columns=['ID#', 'Name', 'Match to', 'cM', 'Most Distant Ancestor', 'Ancestral Line A10'])

# Turn individual_dicts (one dictionary per filter_pool record, built above)
# into a DataFrame; the dictionary keys become the column names.
df = pd.DataFrame(individual_dicts)

# Optionally, save the DataFrame to a CSV file
# df.to_csv('filter_pool_records.csv', index=False)

# Show the first 5 rows (DataFrame.head() defaults to 5)
print(df.head())





List of GEDCOM files:
1. yates-one-name-study.ged
Enter prime_surname (default: Yates): 
List of GEDCOM files:
1. yates-one-name-study.ged
Records Moved into short_pool: 0
Records Moved into filter_pool: 0
The ancestors of individual Ellipsis are: []
No scores available to find max.
No cumulative scores available to find max.
Total records parsed: 0
Empty DataFrame
Columns: []
Index: []


In [None]:
!pip install pandas
!pip install python-gedcom

Collecting python-gedcom
  Downloading python_gedcom-1.0.0-py2.py3-none-any.whl (35 kB)
Installing collected packages: python-gedcom
Successfully installed python-gedcom-1.0.0
