<a href="https://colab.research.google.com/github/ronyates47/Gedcom-Utils/blob/main/2023_0930_1320_hrs_Yates_final_formatting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
import openpyxl
import glob
import os
from gedcom.element.individual import IndividualElement
from gedcom.parser import Parser
import pandas as pd

class Gedcom:
    def __init__(self, file_name):
        self.file_name = file_name
        self.records = {}  # Initialize records attribute here
        self.gedcom_datasets = []  # Keep this if other parts of your code depend on it

    @staticmethod
    def get_standard_name(file_path):
        file_name = file_path.split('/')[-1]
        if '.' in file_name:
            file_name = file_name.rsplit('.', 1)[0]
        standard_name = file_name.replace(' ', '_').lower()
        return standard_name

    def parse_gedcom(self):
        with open(self.file_name, 'r', encoding='utf-8-sig') as f:
            gedcom_lines = f.readlines()

        current_dataset = None
        for line in gedcom_lines:
            parts = line.strip().split(' ', 2)
            level = int(parts[0])
            tag = parts[1]
            value = parts[2] if len(parts) > 2 else None

            if level == 0 and tag.startswith('@') and tag.endswith('@') and value == 'INDI':
                current_dataset = GedcomDataset(tag)
                self.records[tag] = current_dataset
                self.gedcom_datasets.append(current_dataset)  # Add to the list of datasets
            elif current_dataset is not None:
                if level == 1 and tag in ['NAME', 'FAMC']:
                    current_dataset.add_extractable_detail(tag, value)
                elif level == 2 and tag == 'NPFX':
                    current_dataset.add_extractable_detail(tag, value)

#        print(f"47-GEDCOM contained {len(self.records)} total records")

    def apply_xlsx_filter(self, xlsx_file_path):  # BACKWARDS CONVERSION TO EASE xlsx MANUAL FILTER OF ID's
        if not os.path.exists(xlsx_file_path):
            print("xlsx file not found. Proceeding without filtering by xlsx.")
            return

        df = pd.read_excel(xlsx_file_path)  # Corrected this line
        # Add '@' to the beginning and end of each ID to match the GEDCOM format
        filtered_ids = set(f"@{x}@" for x in df['ID'])

        self.records = {k: v for k, v in self.records.items() if k in filtered_ids}

        print(f'Records manually xlsx filtered: {len(self.records)}')

    def apply_npfx_filter(self):
        filtered_records = {k: v for k, v in self.records.items() if v.get_extractable_NPFX()}
        # Update self.records to only contain the filtered records
        self.records = filtered_records
        print(f'66-Records tagged and filtered by NPFX: {len(self.records)}')
        print(f"64-'self.records' contains {len(self.records)} total records")

def extract_id(record):  # Moved outside of the class so it's a standalone function
    id_start = record.find('@') + 1
    id_end = record.find('@', id_start)
    return record[id_start:id_end]

def extract_name(gedcom_dataset_instance):# Moved outside of the class so it's a standalone function
    record = gedcom_dataset_instance.extractable_detail.get('NAME', '')
    name_start = record.find('1 NAME ') + 6
    name_end = record.find('\n', name_start)
    name = record[name_start:name_end]

    if '/' not in name:
#        print(f"78-Invalid name format for record: {record[:50]}...")  # Print the first 100 characters of the problematic record for debugging
        return None  # or you could return a default value like "UnknownUnknown"

    first_name, last_name = name.split('/', 1)
    first_name = first_name[:10]  # Use slicing syntax to extract the first 10 characters of the first_name variable
    last_name = last_name[:10].rstrip('/')  # Use slicing syntax to extract the first 10 characters of the last_name variable
    return last_name.replace(" ", "") + first_name.replace(" ", "")

def has_both_parents(records, mother_id, father_id):   # Moved outside of the class so it's a standalone function
    return mother_id in records and father_id in records

class GedcomDataset:
    def __init__(self, individual_id):
        self.individual_id = individual_id
        self.extractable_detail = {}
        self.anchor_gen1 = None

    def add_extractable_detail(self, key, value):
#        print(f"96-Adding key: {key}, 97-value: {value}")  # This will print out the key and value being added
        self.extractable_detail[key] = value

    def get_gen_person(self):
        return self.individual_id.strip('@')

    def get_extractable_NPFX(self):
        return self.extractable_detail.get('NPFX', '')

    def get_extractable_cm(self):
        npfx_value = self.extractable_detail.get('NPFX', '')
        if '&' in npfx_value:
            cm_value = npfx_value.split('&')[0].strip()
        else:
            cm_value = npfx_value.strip()
        try:
            int(cm_value)
            return cm_value
        except ValueError:
            return 'error'

    def get_extractable_sort(self):
        npfx_value = self.extractable_detail.get('NPFX', '')
        if '&' in npfx_value:
            sort_value = npfx_value.split('&')[1].strip()
            return sort_value
        else:
            return ''

    def get_extractable_FAMC(self):
        return self.extractable_detail.get('FAMC', '').strip('@')

    def get_extractable_FAMS(self):
        return self.extractable_detail.get('FAMS', '').strip('@')

    def get_extractable_WIFE(self):
        return self.extractable_detail.get('WIFE', '').strip('@')

    def get_extractable_HUSB(self):
        return self.extractable_detail.get('HUSB', '').strip('@')

    def get_anchor_gen1(self):
        name = self.extractable_detail.get('NAME', '')
        if '/' in name:
            first_name, last_name = name.split('/', 1)
            first_name = first_name.split(' ')[0]
            last_name = last_name.rstrip('/')
            self.anchor_gen1 = last_name.replace(" ", "") + first_name.replace(" ", "")
            return self.anchor_gen1
        else:
            print(f"Warning: No '/' found in NAME tag for individual {self.get_gen_person()}")
            return ''

def select_gedcom_file():
    gedcom_files = glob.glob('*.ged')
    if not gedcom_files:
        print("No GEDCOM files found.")
        return None

    print("Automatically selecting the first GEDCOM file.")
    return gedcom_files[0]

gedcom_file_path = select_gedcom_file()

if gedcom_file_path:
    gedcom_instance = Gedcom(gedcom_file_path)
    gedcom_instance.parse_gedcom()

    # Apply filters
    gedcom_instance.apply_xlsx_filter("/content/filtered_ids.xlsx")  # Replace with your actual xlsx file path
    gedcom_instance.apply_npfx_filter()

    individuals = []    # Initialize the list of individuals

    with open(gedcom_file_path, 'r') as file:    # Reading the GEDCOM file and parsing records
        data = file.read()
    data = data.split('\n0 ')
    records = {extract_id(record): record for record in data}

    # Create the filtered_records dictionary after self.records has been populated (DO WE STILL NEED THIS)
    filtered_records = {dataset.get_gen_person(): records.get(dataset.get_gen_person(), "") for dataset in gedcom_instance.records.values()}

# Include the sanitize_id function here
def sanitize_id(raw_id):
    return raw_id.replace('@', '')

# Your existing find_parents function
def find_parents(individual_id, generation, records):
    if individual_id not in records:
        return
    record = records[individual_id]

    famc_id = record.get_extractable_FAMC()
    if not famc_id:
        return

    fam_record = records.get(famc_id, None)
    if not fam_record:
        return

    mother_id = fam_record.get_extractable_WIFE()
    father_id = fam_record.get_extractable_HUSB()

    if mother_id and mother_id in records:
        mother_record = records[mother_id]
        mother_name = extract_name(mother_record)
    else:
        mother_name = None

    if father_id and father_id in records:
        father_record = records[father_id]
        father_name = extract_name(father_record)
    else:
        father_name = None

    if mother_name is not None and father_name is not None:
        parent_pair = father_name + "&" + mother_name
        if parent_pair not in visited_pairs:
            visited_pairs.add(parent_pair)
            if has_both_parents(records, mother_id, father_id):
                generation_table.loc[len(generation_table)] = [generation, parent_pair]

    if mother_id:
        find_parents(mother_id, generation + 1, records)

    if father_id:
        find_parents(father_id, generation + 1, records)


def get_formatted_name(male_individual_id, female_individual_id, records):
    male_record = records.get(male_individual_id, "")
    female_record = records.get(female_individual_id, "")

    male_name = extract_name(male_record)
    female_name = extract_name(female_record)

    if male_name is None or female_name is None:
        print(f"Could not extract name for either male ID: {male_individual_id} or female ID: {female_individual_id}")
        return None  # or return a default value

    return male_name + "&" + female_name

most_distant_ancestors = []  # New list to collect most distant ancestors

def sanitize_id(raw_id):
    return raw_id.replace('@', '')

def find_distant_ancestors(individual_id, records, path=None, spouses=None, most_distant_ancestors=None):
    sanitized_id = sanitize_id(individual_id)  # Sanitize the ID here
    original_id = f"@{sanitized_id}@"  # Create original ID by adding '@' back

    if path is None:
        path = [original_id]  # Use the original ID with '@'

    # ... (rest of your existing code remains the same)

        path = [individual_id]
    if spouses is None:
        spouses = {}
    if most_distant_ancestors is None:
        most_distant_ancestors = []

    record = records.get(individual_id, None)
    if not record:
        print(f"Record not found for {individual_id}")
        return most_distant_ancestors, spouses, []  # Assuming you want an empty list as the third value

    famc_id = extract_famc_id(record)

    # Sanitizing and Checking IDs
    if sanitize_id(famc_id) not in map(sanitize_id, records.keys()):
        print(f"353-Warning: famc_id {famc_id} not found in records")
        return most_distant_ancestors, spouses, []  # Assuming you want an empty list as the third value

    famc_id = extract_famc_id(record)
    print(f"Debug: famc_id for {individual_id} is {famc_id}")  # Debug print

    if famc_id in records:
        mother_id, father_id = extract_parent_ids(records[famc_id])
        print(f"Debug: mother_id = {mother_id}, father_id = {father_id}")  # Debug print


    if famc_id in records:
        mother_id, father_id = extract_parent_ids(records[famc_id])

        if not mother_id and not father_id:
            most_distant_ancestors.append(path)

        if father_id:
            new_path = list(path) + [father_id]
            find_distant_ancestors(father_id, records, new_path, spouses, most_distant_ancestors)

        if mother_id:
            new_path = list(path) + [mother_id]
            find_distant_ancestors(mother_id, records, new_path, spouses, most_distant_ancestors)

    else:
        most_distant_ancestors.append(path)

    return most_distant_ancestors, spouses, []  # Assuming you want an empty list as the third value

def extract_famc_id(gedcom_dataset_instance):
    return gedcom_dataset_instance.get_extractable_FAMC()

def extract_fams_id(gedcom_dataset_instance):
    return gedcom_dataset_instance.get_extractable_FAMS()

def extract_parent_ids(gedcom_dataset_instance):
    mother_id = gedcom_dataset_instance.get_extractable_WIFE()
    father_id = gedcom_dataset_instance.get_extractable_HUSB()
    return mother_id, father_id

#for individual_id in gedcom_instance.records.keys():  # Loop through the filtered IDs.
#    distant_ancestors_paths, spouses, most_distant_ancestors = find_distant_ancestors(
#        individual_id, gedcom_instance.records
#    )
#    print(f"291-Most Distant Ancestors for {individual_id}: {most_distant_ancestors}")

#for individual_id, dataset in gedcom_instance.records.items():
#    print("286-Before accessing record")  # Debug print
#    record = gedcom_instance.records.get(individual_id, "")
#    print("288-After accessing record")  # Debug print
#    print(f"281-Type of record: {type(record)}, Content: {record}")

def extract_fam_id(gedcom_dataset_instance):
    return gedcom_dataset_instance.get_extractable_FAMC()

def extract_parent_ids(fam_record):
    wife_start = fam_record.find('1 WIFE @') + 8
    wife_end = fam_record.find('@', wife_start)
    mother_id = fam_record[wife_start:wife_end]

    husb_start = fam_record.find('1 HUSB @') + 8
    husb_end = fam_record.find('@', husb_start)
    father_id = fam_record[husb_start:husb_end]

    return mother_id, father_id

def extract_spouse_id(fam_record, individual_record):
    if '1 HUSB @' in individual_record:
        spouse_start = fam_record.find('1 WIFE @') + 8
    else:
        spouse_start = fam_record.find('1 HUSB @') + 8
    spouse_end = fam_record.find('@', spouse_start)
    return fam_record[spouse_start:spouse_end]

# Your existing for loops
for individual_id in gedcom_instance.records.keys():  # Loop through the filtered IDs.
    sanitized_id = sanitize_id(individual_id)  # Sanitize ID
    distant_ancestors_paths, spouses, most_distant_ancestors = find_distant_ancestors(
        sanitized_id, gedcom_instance.records  # Use sanitized ID
    )
    print(f"319-Distant Ancestors Paths for {individual_id}: {distant_ancestors_paths}")


individual_and_spouse = {}  # Create a dictionary to track individual and their spouse.


# Instead of using individual_id_to_search, loop through the filtered IDs.
for individual_id in gedcom_instance.records.keys():
    distant_ancestors_paths, spouses, most_distant_ancestors = find_distant_ancestors(individual_id, gedcom_instance.records)
#    print(f"352-Distant Ancestors Paths for {individual_id}: {distant_ancestors_paths}")
    print(f"353-Most Distant Ancestors for {individual_id}: {most_distant_ancestors}")

indexed_paths = {i: path for i, path in enumerate(distant_ancestors_paths)}

# Assuming `spouses` is a dictionary where the key is the individual ID and the value is the spouse ID
spouses = {}  # Populate this based on your function that finds distant ancestors

name_paths = []

for path in distant_ancestors_paths:
    name_path_with_spouse = []
    for id in path:  # This should be a single ID, not a list
        individual_name = extract_name(records.get(id, GedcomDataset('')))
        spouse_id = spouses.get(id, None)
        spouse_name = extract_name(records.get(spouse_id, '')) if spouse_id else "unknown"
        name_path_with_spouse.append(f"{individual_name}&{spouse_name}")
    name_paths.append(name_path_with_spouse)

for idx, name_path in enumerate(name_paths):  # Check for the surname 'Yates' in the paths
    has_yates = any("Yates" in name for name in name_path)
    if has_yates:
        print(f"Path index: {idx}, Contains Yates: {has_yates}, Names: {name_path}")

path_scores = {}  # Initialize an empty dictionary to store the scores of each path

for idx, name_path in enumerate(name_paths): # Loop through the paths and apply the scoring scheme
    score = 0
    for generation, name in enumerate(name_path):
        if 'Yates' in name:
            score += 1 * (generation + 1)  # Add 1 to generation because index starts at 0
    path_scores[idx] = score     # Store the score of this path

# Find the index of the path with the highest score
#winning_path_index = max(path_scores, key=path_scores.get)
#winning_path_score = path_scores[winning_path_index]
#winning_path_names = name_paths[winning_path_index]

# Print the winning path to the console
#print(f"High Branch score was {winning_path_score}.")    #The winning path is at index {winning_path_index}
#print(f"Names in the winning path: {winning_path_names}")

#winning_path_names.reverse() # Reversing the list for highest generation first

# Adding spouses (Replace 'unknown' with actual spouse names if you have them)
#spouse_names = ['unknown'] * len(winning_path_names)  # Replace with real names or keep as 'unknown'

# Combine names and spouses in your specified format
#formatted_names = [f"{name}&{spouse}" for name, spouse in zip(winning_path_names, spouse_names)]

#final_output = "|".join(formatted_names)# Join all the formatted names with '|'

#print("Selected DNA Branch:", final_output)

visited_pairs = set()

def process_individual(individual_id, gedcom_instance):
    global generation_table
    generation_table = pd.DataFrame(columns=['Generation', 'Parent Pair'])
    global visited_pairs
    visited_pairs = set()

    find_parents(individual_id, 1, gedcom_instance.records)  # Changed to get records from gedcom_instance

    for dataset in gedcom_instance.records.values():  # Changed to get records from gedcom_instance
        if dataset.get_gen_person() == individual_id:
            anchor_gen1 = dataset.get_anchor_gen1()
            generation_table.loc[0] = [1, anchor_gen1]
            break

    generation_table = generation_table.sort_values('Generation', ascending=False).reset_index(drop=True)

    individual_data = {}
    for dataset in gedcom_instance.records.values():  # Changed to get records from gedcom_instance
        if dataset.get_gen_person() == individual_id:
            individual_data['cM'] = dataset.get_extractable_cm()
            individual_data['Sort'] = dataset.get_extractable_sort()
            break

    individual_data['Parent Pairs A10'] = '|'.join([f"{row['Parent Pair']}" for index, row in generation_table[:10].iterrows()])

#    print(f"Returning data for {individual_id}: {individual_data}")  # Debug print
    return individual_data


import pandas as pd

# Global variables
generation_table = None
visited_pairs = set()

# Run the process_individual function for each individual in the filtered_individuals list and create a combined DataFrame
combined_df_rows = []
for individual_id, dataset in gedcom_instance.records.items():
    name = extract_name(dataset)  # Corrected line
    individual_data = process_individual(individual_id, gedcom_instance)
#    print("Debugging individual_data:", individual_data)  # This will print out the contents
    cm = individual_data.get('cM', 'Default Value')  # This will prevent KeyError
    sort = individual_data.get('Sort', 'Default Sort')
    parent_pairs_a10 = individual_data.get('Parent Pairs A10', 'Default Parent Pairs A10')
    # most_distant_ancestor = generation_table.iloc[0]['Parent Pair']  # Get the most distant ancestor from the last row of generation_table
    combined_df_rows.append([individual_id, name, sort, cm, parent_pairs_a10]) #, most_distant_ancestor

combined_df = pd.DataFrame(combined_df_rows, columns=['ID#', 'Name', 'Match to', 'cM', 'Ancestral Line A10'])


def create_hotlink(row):# Function to create hotlinks
    url_base = "https://yates.one-name.net/tng/verticalchart.php?personID="
    person_id = row['ID#']
    hotlink = f'<a href="{url_base}{person_id}&tree=tree1&parentset=0&display=vertical&generations=8" target="_blank">{person_id}</a>'
    return hotlink

combined_df['LUN#'] = combined_df.apply(lambda row: create_hotlink(row), axis=1) # Apply the hotlink function to create the 'LUN#' column
# Change the order of the columns
combined_df = combined_df[['ID#', 'Name', 'Match to', 'cM', 'LUN#', 'Ancestral Line A10']] # 'Most Distant Ancestor',
# Adjust index to start from 1 instead of 0
combined_df.index = combined_df.index + 1
# Print all records from the DataFrame
print(combined_df)
# Export the combined_df DataFrame to an Excel file
combined_df.to_excel('/content/output.xlsx', index=False)


Automatically selecting the first GEDCOM file.
xlsx file not found. Proceeding without filtering by xlsx.
66-Records tagged and filtered by NPFX: 375
64-'self.records' contains 375 total records
Record not found for I13817
319-Distant Ancestors Paths for @I13817@: []
Record not found for I21743
319-Distant Ancestors Paths for @I21743@: []
Record not found for I23678
319-Distant Ancestors Paths for @I23678@: []
Record not found for I26925
319-Distant Ancestors Paths for @I26925@: []
Record not found for I31861
319-Distant Ancestors Paths for @I31861@: []
Record not found for I40190
319-Distant Ancestors Paths for @I40190@: []
Record not found for I42478
319-Distant Ancestors Paths for @I42478@: []
Record not found for I44570
319-Distant Ancestors Paths for @I44570@: []
Record not found for I44883
319-Distant Ancestors Paths for @I44883@: []
Record not found for I44894
319-Distant Ancestors Paths for @I44894@: []
Record not found for I44935
319-Distant Ancestors Paths for @I44935@: []
Re

In [24]:
!pip install pandas
!pip install python-gedcom
!pip install openpyxl

