<a href="https://colab.research.google.com/github/ronyates47/Gedcom-Utils/blob/main/A_version_4_with_mission_success_2023_1001_1500.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [53]:
import csv
import glob
from gedcom.element.individual import IndividualElement
from gedcom.parser import Parser
import pandas as pd

class GedcomDataset:
    """Details collected for one GEDCOM individual (INDI) record.

    ``gen_person`` is the record's cross-reference tag as read from the file
    (for example ``@I123@``).  ``extractable_detail`` maps GEDCOM tags
    (``NAME``, ``NPFX``, ``FAMC``) to the raw value found on that line; a
    value may be ``None`` when the line carried a tag but no text.
    """

    def __init__(self, gen_person):
        self.gen_person = gen_person
        self.extractable_detail = {}
        # Cached "SurnameFirstname" key; filled by get_gen_person() or,
        # lazily, by get_anchor_gen1().
        self.anchor_gen1 = None

    def add_extractable_detail(self, key, value):
        """Store (or overwrite) the raw value for a GEDCOM tag."""
        self.extractable_detail[key] = value

    def _detail(self, key):
        """Return the stored value for ``key`` as a string ('' if absent/None)."""
        return self.extractable_detail.get(key) or ''

    def _build_anchor(self):
        """Build the surname + first-given-name key from the NAME value.

        A NAME such as ``Sarah Ann /Yates/`` yields ``YatesSarah``.  A NAME
        without surname slashes is treated as having an empty surname instead
        of raising.
        """
        first_name, _, last_name = self._detail('NAME').partition('/')
        first_name = first_name.split(' ')[0]
        last_name = last_name.rstrip('/')
        return last_name.replace(" ", "") + first_name.replace(" ", "")

    def get_gen_person(self):
        """Return the individual ID without ``@`` and refresh the anchor key."""
        self.anchor_gen1 = self._build_anchor()
        return self.gen_person.strip('@')

    def get_anchor_gen1(self):
        """Return the surname + first-name key, computing it on first use."""
        if self.anchor_gen1 is None:
            self.anchor_gen1 = self._build_anchor()
        return self.anchor_gen1

    def get_extractable_NPFX(self):
        """Return the raw NPFX value, or '' when none was recorded."""
        return self._detail('NPFX')

    def get_extractable_cm(self):
        """Return the part of NPFX before ``&`` if it is an integer, else 'error'."""
        cm_value = self._detail('NPFX').split('&')[0].strip()
        try:
            int(cm_value)
        except ValueError:
            return 'error'
        return cm_value

    def get_extractable_sort(self):
        """Return the NPFX segment after the first ``&``, or '' if there is none."""
        npfx_value = self._detail('NPFX')
        if '&' not in npfx_value:
            return ''
        return npfx_value.split('&')[1].strip()

    def get_extractable_FAMC(self):
        """Return the FAMC family pointer without ``@`` ('' if absent)."""
        return self._detail('FAMC').strip('@')

# Function definitions
def extract_id(record):
    """Return the text between the first two ``@`` characters of ``record``.

    The slice bounds come straight from ``str.find``, so a record with fewer
    than two ``@`` characters yields whatever that slice produces rather
    than an error.
    """
    opening = record.find('@')
    closing = record.find('@', opening + 1)
    return record[opening + 1:closing]

def extract_name(record):
    """Return a compact ``SurnameGivenname`` key from a record's ``1 NAME`` line.

    Args:
        record: Raw text of one GEDCOM record.

    Returns:
        The surname (text after the first ``/``, capped at 10 characters,
        trailing ``/`` removed) followed by the given-name part (capped at
        10 characters), with all spaces removed.  Returns '' when the record
        has no ``1 NAME`` line; a name without surname slashes is treated as
        having an empty surname.
    """
    tag_at = record.find('1 NAME ')
    if tag_at == -1:
        return ''
    # The slice starts on the space that follows "NAME" (offset 6, not 7), so
    # the 10-character cap below counts that space.  Kept as-is so the
    # generated keys stay the same as before.
    name_start = tag_at + 6
    name_end = record.find('\n', name_start)
    if name_end == -1:
        # NAME is the last line of the record: take everything that is left.
        name_end = len(record)
    name = record[name_start:name_end]
    first_name, _, last_name = name.partition('/')
    first_name = first_name[:10]
    last_name = last_name[:10].rstrip('/')
    return last_name.replace(" ", "") + first_name.replace(" ", "")

class Gedcom:
    """Reads a GEDCOM file and keeps the individuals that carry an NPFX tag.

    After ``parse_gedcom`` runs, ``gedcom_datasets`` holds one
    ``GedcomDataset`` per INDI record and ``filter_pool`` holds the subset
    that passed the NPFX filter (and, optionally, the manual ID filter).
    """

    def __init__(self, file_name):
        self.file_name = file_name
        self.gedcom_datasets = []
        self.filter_pool = []

    @staticmethod
    def get_standard_name(file_path):
        """Return the file's base name, lower-cased, spaces as ``_``, no extension."""
        file_name = file_path.split('/')[-1]
        if '.' in file_name:
            file_name = file_name.rsplit('.', 1)[0]
        return file_name.replace(' ', '_').lower()

    def parse_gedcom(self, manual_filter_activated=True,
                     manual_filter_path='filtered_ids.xlsx'):
        """Parse the file and populate ``gedcom_datasets`` and ``filter_pool``.

        Args:
            manual_filter_activated: When true (the default, as before), keep
                only individuals whose ID appears in the spreadsheet.
            manual_filter_path: Excel file with an ``ID`` column listing the
                individual IDs to keep.

        Raises:
            OSError: If the GEDCOM file (or the spreadsheet) cannot be read.
            KeyError: If the spreadsheet has no ``ID`` column.
        """
        # Start from a clean slate so calling this twice does not duplicate rows.
        self.gedcom_datasets = []
        self.filter_pool = []

        current_dataset = None
        npfx_count = 0
        total_count = 0
        with open(self.file_name, 'r', encoding='utf-8-sig') as f:
            for line in f:
                parts = line.strip().split(' ', 2)
                # Skip blank or malformed lines instead of crashing on int().
                if len(parts) < 2 or not parts[0].isdecimal():
                    continue
                level = int(parts[0])
                tag = parts[1]
                value = parts[2] if len(parts) > 2 else None

                if level == 0:
                    if tag.startswith('@') and tag.endswith('@') and value == 'INDI':
                        total_count += 1
                        current_dataset = GedcomDataset(tag)
                        self.gedcom_datasets.append(current_dataset)
                    else:
                        # Any other level-0 record ends the current individual;
                        # otherwise a "1 NAME" under e.g. SUBM or REPO would
                        # overwrite the previous individual's name.
                        current_dataset = None
                elif current_dataset is not None:
                    if level == 1 and tag in ('NAME', 'FAMC'):
                        current_dataset.add_extractable_detail(tag, value)
                    elif level == 2 and tag == 'NPFX':
                        npfx_count += 1
                        current_dataset.add_extractable_detail(tag, value)

        print(f'GEDCOM contained {total_count} total records')
        print(f'Records tagged and filtered by NPFX: {npfx_count}')

        # First level of filtering: keep individuals with a non-empty NPFX.
        self.filter_pool = [
            dataset for dataset in self.gedcom_datasets
            if dataset.get_extractable_NPFX()
        ]

        # Second level of filtering: manual ID list from an Excel sheet.
        if manual_filter_activated:
            import pandas as pd  # local import keeps the class usable without pandas when the filter is off
            df = pd.read_excel(manual_filter_path)
            manual_filtered_ids = set(df['ID'])
            print(f"Manual filter IDs loaded: {len(manual_filtered_ids)}")
            self.filter_pool = [
                dataset for dataset in self.filter_pool
                if dataset.get_gen_person() in manual_filtered_ids
            ]
            print(f"After manual filter, total records: {len(self.filter_pool)}")

def input_prime_surname(last_prime_surname=None):
    """Prompt for the prime surname and return what the user typed.

    When ``last_prime_surname`` is given it is shown as the default and is
    returned if the user just presses Enter.
    """
    if not last_prime_surname:
        return input("Enter prime_surname: ")

    typed = input(f"Enter prime_surname (default: {last_prime_surname}): ")
    return typed if typed else last_prime_surname

def select_gedcom_file():
    """Return the first ``*.ged`` file in the current directory, or ``None``.

    The candidates are sorted by name so that "first" is deterministic;
    ``glob.glob`` itself returns files in arbitrary order.  Prints a short
    status message either way.
    """
    gedcom_files = sorted(glob.glob('*.ged'))
    if not gedcom_files:
        print("No GEDCOM files found.")
        return None

    print("Automatically selecting the first GEDCOM file.")
    return gedcom_files[0]

gedcom_file_path = select_gedcom_file()  # Pick the GEDCOM file to process
if gedcom_file_path:
    # Parse the selected file; this fills gedcom_instance.filter_pool.
    gedcom_instance = Gedcom(gedcom_file_path)
    gedcom_instance.parse_gedcom()

    individuals = []  # (anchor name, individual ID) for every filtered record

    for dataset in gedcom_instance.filter_pool:
        # get_gen_person() must run first: it also computes the anchor name.
        individual_id = dataset.get_gen_person()
        last_name = dataset.get_anchor_gen1()
        individuals.append((last_name, individual_id))

    print(f'Records tagged and filtered by NPFX: {len(individuals)}')

    # Re-read the file as raw text and index every level-0 record by its ID.
    # Use the same encoding as Gedcom.parse_gedcom so names decode identically
    # and a leading BOM is dropped.
    with open(gedcom_file_path, 'r', encoding='utf-8-sig') as file:
        data = file.read()
    data = data.split('\n0 ')
    records = {extract_id(record): record for record in data}




def has_both_parents(records, mother_id, father_id):
    """Return True only when both parent IDs are keys of ``records``."""
    return all(parent_id in records for parent_id in (mother_id, father_id))

def find_parents(individual_id, generation, records, _lineage=()):
    """Walk up the tree from ``individual_id`` and record each parent couple.

    For every family in which both the husband and the wife are present in
    ``records``, a row ``[generation, "Father&Mother"]`` is appended to the
    module-level ``generation_table`` unless that pair string is already in
    the module-level ``visited_pairs`` set.  The walk then continues through
    the mother and the father with ``generation + 1``.

    Args:
        individual_id: ID (without ``@``) of the person to start from.
        generation: Generation number recorded for this person's parents.
        records: Mapping of record ID to raw GEDCOM record text.
        _lineage: Internal; IDs already on the current line of ascent, used
            to stop on circular parent links instead of recursing forever.
    """
    def pointer_after(text, marker):
        # Return the ID that follows ``marker`` (up to the closing '@'), or
        # None when the tag is absent.  The previous "find(...) + 8"
        # arithmetic turned a missing tag into offset 7 and sliced garbage.
        at = text.find(marker)
        if at == -1:
            return None
        start = at + len(marker)
        end = text.find('@', start)
        if end == -1:
            return None
        return text[start:end] or None

    if individual_id not in records or individual_id in _lineage:
        return

    famc_id = pointer_after(records[individual_id], '\n1 FAMC @')
    if famc_id not in records:
        return

    fam_record = records[famc_id]
    mother_id = pointer_after(fam_record, '\n1 WIFE @')
    father_id = pointer_after(fam_record, '\n1 HUSB @')

    if mother_id in records and father_id in records:
        mother_name = extract_name(records[mother_id])
        father_name = extract_name(records[father_id])
        parent_pair = father_name + "&" + mother_name
        if parent_pair not in visited_pairs:
            visited_pairs.add(parent_pair)
            generation_table.loc[len(generation_table)] = [generation, parent_pair]

    lineage = _lineage + (individual_id,)
    for parent_id in (mother_id, father_id):  # mother first, as before
        if parent_id in records:
            find_parents(parent_id, generation + 1, records, lineage)

def extract_name(record):
    """Return a compact ``SurnameGivenname`` key from a record's ``1 NAME`` line.

    NOTE: this definition replaces the earlier ``extract_name`` defined in
    this file; it is the one in effect for code that runs after this point.

    Args:
        record: Raw text of one GEDCOM record.

    Returns:
        The surname (text after the first ``/``, capped at 10 characters,
        trailing ``/`` removed) followed by the given-name part (capped at
        10 characters), with all spaces removed.  Returns '' when the record
        has no ``1 NAME`` line; a name without surname slashes is treated as
        having an empty surname.
    """
    tag_at = record.find('1 NAME ')
    if tag_at == -1:
        return ''
    # The slice starts on the space that follows "NAME" (offset 6, not 7), so
    # the 10-character cap below counts that space.  Kept as-is so the
    # generated keys stay the same as before.
    name_start = tag_at + 6
    name_end = record.find('\n', name_start)
    if name_end == -1:
        # NAME is the last line of the record: take everything that is left.
        name_end = len(record)
    name = record[name_start:name_end]
    first_name, _, last_name = name.partition('/')
    first_name = first_name[:10]
    last_name = last_name[:10].rstrip('/')
    return last_name.replace(" ", "") + first_name.replace(" ", "")

def find_distant_ancestors(individual_id, records, path=None):
    """Return every line of ascent from ``individual_id`` to a terminal ancestor.

    Args:
        individual_id: ID (without ``@``) of the person to start from.
        records: Mapping of record ID to raw GEDCOM record text.
        path: IDs already walked to reach this person (youngest first).  The
            list is not modified.

    Returns:
        A list of paths; each path is a list of IDs that starts with ``path``
        and ends at a person with no known parent in ``records``.  The
        father's lines come before the mother's.  Returns ``[]`` when
        ``individual_id`` itself is not in ``records``.
    """
    def pointer_after(text, marker):
        # Return the ID that follows ``marker`` (up to the closing '@'), or
        # None when the tag is absent.
        at = text.find(marker)
        if at == -1:
            return None
        start = at + len(marker)
        end = text.find('@', start)
        if end == -1:
            return None
        return text[start:end] or None

    if individual_id not in records:
        return []

    walked = list(path) if path else []
    if individual_id in walked:
        # Circular parent link in the data: end the line here rather than
        # recursing forever.
        return [walked]
    lineage = walked + [individual_id]

    famc_id = pointer_after(records[individual_id], '\n1 FAMC @')
    if famc_id not in records:
        return [lineage]

    fam_record = records[famc_id]
    paths = []
    for marker in ('\n1 HUSB @', '\n1 WIFE @'):  # father first, then mother
        parent_id = pointer_after(fam_record, marker)
        if parent_id in records:
            paths.extend(find_distant_ancestors(parent_id, records, lineage))

    # A family record with no resolvable parent ends the line at this person;
    # previously such a lineage was dropped from the result altogether.
    return paths or [lineage]

filtered_datasets = gedcom_instance.filter_pool

# Default so the steps below still run when the filter pool is empty.
distant_ancestors_paths = []

for dataset in filtered_datasets:
    individual_id = dataset.get_gen_person()
    print(f"Processing individual with ID: {individual_id}")

    # NOTE(review): this is overwritten on every iteration, so only the LAST
    # individual's paths feed the scoring below — confirm that is intended.
    distant_ancestors_paths = find_distant_ancestors(individual_id, records)
    print("Distant Ancestors Paths:", distant_ancestors_paths)

indexed_paths = dict(enumerate(distant_ancestors_paths))

# Translate each path of IDs into a path of compact names.
name_paths = [
    [extract_name(records.get(person_id, '')) for person_id in path]
    for path in distant_ancestors_paths
]

for idx, name_path in enumerate(name_paths):  # Report paths that contain the surname 'Yates'
    has_yates = any("Yates" in name for name in name_path)
    if has_yates:
        print(f"Path index: {idx}, Contains Yates: {has_yates}, Names: {name_path}")

# Score each path: every 'Yates' name contributes its 1-based position in the
# path, so occurrences further up the tree weigh more.
path_scores = {
    idx: sum(
        generation + 1
        for generation, name in enumerate(name_path)
        if 'Yates' in name
    )
    for idx, name_path in enumerate(name_paths)
}

# Pick the highest-scoring path; with no paths at all fall back to empty
# results instead of letting max() raise on an empty dict.
if path_scores:
    winning_path_index = max(path_scores, key=path_scores.get)
    winning_path_score = path_scores[winning_path_index]
    winning_path_names = name_paths[winning_path_index]
else:
    winning_path_index = None
    winning_path_score = 0
    winning_path_names = []

# Print the winning score to the console
print(f"High Branch score was {winning_path_score}.")

# Highest generation first.  Build a reversed copy so the entry stored in
# name_paths keeps its original order.
winning_path_names = winning_path_names[::-1]

# Spouses are not looked up yet; every name is paired with 'unknown'.
spouse_names = ['unknown'] * len(winning_path_names)

# Combine names and spouses as "name&spouse"
formatted_names = [f"{name}&{spouse}" for name, spouse in zip(winning_path_names, spouse_names)]

# Join all the formatted names with '|'
final_output = "|".join(formatted_names)

print("Selected DNA Branch:", final_output)

visited_pairs = set()

def process_individual(individual_id, gedcom_instance):
    """Collect the summary fields for one filtered individual.

    Resets the module-level ``generation_table`` and ``visited_pairs``, lets
    ``find_parents`` fill the table from the module-level ``records``, and
    returns a dict with the keys ``cM``, ``Sort``, ``Parent Pairs A10`` and
    ``High Branch Score``.  ``cM`` and ``Sort`` are only present when the
    individual is found in ``gedcom_instance.filter_pool``.
    """
    global generation_table, visited_pairs
    generation_table = pd.DataFrame(columns=['Generation', 'Parent Pair'])
    visited_pairs = set()

    find_parents(individual_id, 1, records)

    # Locate this individual's dataset once; None when it is not in the pool.
    own_dataset = next(
        (candidate for candidate in gedcom_instance.filter_pool
         if candidate.get_gen_person() == individual_id),
        None,
    )

    if own_dataset is not None:
        # NOTE(review): row 0 is also the first row find_parents wrote, so
        # this replaces that parent pair with the anchor name — confirm intended.
        generation_table.loc[0] = [1, own_dataset.get_anchor_gen1()]

    generation_table = generation_table.sort_values('Generation', ascending=False).reset_index(drop=True)

    individual_data = {}
    if own_dataset is not None:
        individual_data['cM'] = own_dataset.get_extractable_cm()
        individual_data['Sort'] = own_dataset.get_extractable_sort()

    # Keep the table entries that occur (as whitespace-separated tokens)
    # somewhere inside the module-level final_output string.
    kept_pairs = []
    for _, table_row in generation_table.iterrows():
        pair_text = table_row['Parent Pair']
        if any(token in final_output for token in pair_text.split()):
            kept_pairs.append(pair_text)

    individual_data['Parent Pairs A10'] = '|'.join(kept_pairs)

    # Placeholder value: the score is not computed per individual here.
    winning_path_score = 100
    individual_data['High Branch Score'] = winning_path_score
    return individual_data

import pandas as pd

# Module-level state shared with find_parents / process_individual
generation_table = None
visited_pairs = set()


# (anchor name, individual ID) for every dataset that passed the filters
filtered_individuals = [(dataset.get_anchor_gen1(), dataset.get_gen_person()) for dataset in gedcom_instance.filter_pool]

# One output row per filtered individual
combined_df_rows = []
for name, individual_id in filtered_individuals:
    individual_data = process_individual(individual_id, gedcom_instance)
    cm, sort, parent_pairs_a10, high_branch_score = (
        individual_data[field]
        for field in ('cM', 'Sort', 'Parent Pairs A10', 'High Branch Score')
    )
    combined_df_rows.append([individual_id, name, sort, cm, high_branch_score, parent_pairs_a10])

def create_hotlink(row):
    """Return an HTML anchor to the vertical chart page for ``row['ID#']``.

    The link text is the ID itself and the link opens in a new tab.
    """
    url_base = "https://yates.one-name.net/tng/verticalchart.php?personID="
    person_id = row['ID#']
    chart_url = f"{url_base}{person_id}&tree=tree1&parentset=0&display=vertical&generations=8"
    return f'<a href="{chart_url}" target="_blank">{person_id}</a>'

# Turn the collected rows (including 'High Branch Score') into a DataFrame
combined_df = pd.DataFrame(
    combined_df_rows,
    columns=['ID#', 'Name', 'Match to', 'cM', 'High Branch Score', 'Ancestral Line A10'],
)

# Add a clickable link per row, built from the row's ID
combined_df['LUN#'] = combined_df.apply(create_hotlink, axis=1)

# Put the link column just before the ancestral line
combined_df = combined_df[['ID#', 'Name', 'Match to', 'cM', 'High Branch Score', 'LUN#', 'Ancestral Line A10']]

# Number rows from 1 instead of 0
combined_df.index = combined_df.index + 1

# Show every record
print(combined_df)

# Write the table to Excel (the index is not written)
combined_df.to_excel('/content/output.xlsx', index=False)

# Rebuild the list of combined DataFrame rows.
# NOTE(review): this runs after the DataFrame above was built and exported,
# and nothing visible here consumes the new list — confirm it is still needed.
combined_df_rows = []

for name, individual_id in filtered_individuals:
    individual_data = process_individual(individual_id, gedcom_instance)

    cm = individual_data['cM']
    sort = individual_data['Sort']
    parent_pairs_a10 = individual_data['Parent Pairs A10']
    # Read the score from this individual's data.  With this line commented
    # out, every row silently reused whatever value an earlier loop left in
    # high_branch_score (or raised NameError if none had run).
    high_branch_score = individual_data['High Branch Score']

    combined_df_rows.append([individual_id, name, sort, cm, high_branch_score, parent_pairs_a10])

# Debug Print
#print(f"Final combined_df_rows: {combined_df_rows}")
#print("Contents of filter_pool:", gedcom_instance.filter_pool)
#print("Contents of filtered_individuals:", filtered_individuals)



Automatically selecting the first GEDCOM file.
GEDCOM contained 50776 total records
Records tagged and filtered by NPFX: 375
Manual filter IDs loaded: 1
After manual filter, total records: 1
Records tagged and filtered by NPFX: 1
Processing individual with ID: I50522
Distant Ancestors Paths: [['I50522', 'I50520', 'I50521'], ['I50522', 'I50519', 'I50517', 'I50516'], ['I50522', 'I50519', 'I50517', 'I50515', 'I50514'], ['I50522', 'I50519', 'I50517', 'I50515', 'I50513', 'I44015'], ['I50522', 'I50519', 'I50517', 'I50515', 'I50513', 'I44014', 'I21091', 'I21089', 'I18899', 'I11851', 'I11866', 'I9912', 'I9914', 'I11798', 'I24380', 'I24382', 'I24502', 'I47548', 'I47549'], ['I50522', 'I50519', 'I50517', 'I50515', 'I50513', 'I44014', 'I21091', 'I21089', 'I18899', 'I11851', 'I11866', 'I9912', 'I9914', 'I11798', 'I24380', 'I24382', 'I24502', 'I47548', 'I47550'], ['I50522', 'I50519', 'I50517', 'I50515', 'I50513', 'I44014', 'I21091', 'I21089', 'I18899', 'I11851', 'I11866', 'I9912', 'I9914', 'I11798',

In [30]:
!pip install pandas
!pip install python-gedcom
!pip install openpyxl



In [None]:
Automatically selecting the first GEDCOM file.
GEDCOM contained 50776 total records
Records tagged and filtered by NPFX: 375
Distant Ancestors Paths: [['I42478', 'I42477', 'I42472', 'I30885', 'I30138', 'I36688', 'I22477', 'I11759', 'I11763'], ['I42478', 'I42477', 'I42472', 'I30885', 'I30138', 'I36688', 'I22477', 'I11759', 'I22476'], ['I42478', 'I42477', 'I42472', 'I30885', 'I30138', 'I36688', 'I22477', 'I11760'], ['I42478', 'I42477', 'I42472', 'I30885', 'I30138', 'I36688', 'I29542'], ['I42478', 'I42477', 'I42472', 'I30885', 'I30138', 'I36689', 'I37249'], ['I42478', 'I42477', 'I42472', 'I30885', 'I30138', 'I36689', 'I37248', 'I54', 'I42186', 'I42256'], ['I42478', 'I42477', 'I42472', 'I30885', 'I30138', 'I36689', 'I37248', 'I54', 'I42186', 'I42257'], ['I42478', 'I42477', 'I42472', 'I30885', 'I30138', 'I36689', 'I37248', 'I54', 'I42187', 'I42249'], ['I42478', 'I42477', 'I42472', 'I30885', 'I30138', 'I36689', 'I37248', 'I54', 'I42187', 'I42250'], ['I42478', 'I42477', 'I42472', 'I30885', 'I30138', 'I36689', 'I37248', 'I55'], ['I42478', 'I42477', 'I42472', 'I30885', 'I30139'], ['I42478', 'I42477', 'I42472', 'I34836'], ['I42478', 'I42477', 'I42473'], ['I42478', 'I44838', 'I42474', 'I44832', 'I44834', 'I44836'], ['I42478', 'I44838', 'I42474', 'I44832', 'I44834', 'I44837'], ['I42478', 'I44838', 'I42474', 'I44832', 'I44835'], ['I42478', 'I44838', 'I42474', 'I44833'], ['I42478', 'I44838', 'I42476']]
Path index: 2, Contains Yates: True, Names: ['RoperHollyNic', 'RoperGaryScot', 'RoperFloydSpe', 'RoperKelly', 'RoperDavidAsb', 'RoperElijah', 'RoperJamesC.', 'YatesSarahAnn']
Path index: 13, Contains Yates: True, Names: ['RoperHollyNic', 'RobersonNancyMel', 'RobersonYatesEdm', 'RobersonJohnBost', 'RobersonJamesRun', 'RobersonWilliamI']
Path index: 14, Contains Yates: True, Names: ['RoperHollyNic', 'RobersonNancyMel', 'RobersonYatesEdm', 'RobersonJohnBost', 'RobersonJamesRun', 'NanneyMyra']
Path index: 15, Contains Yates: True, Names: ['RoperHollyNic', 'RobersonNancyMel', 'RobersonYatesEdm', 'RobersonJohnBost', 'MorganElizabeth']
Path index: 16, Contains Yates: True, Names: ['RoperHollyNic', 'RobersonNancyMel', 'RobersonYatesEdm', 'McDadeClara']
High Branch score was 8.
Selected DNA Branch: YatesSarahAnn&unknown|RoperJamesC.&unknown|RoperElijah&unknown|RoperDavidAsb&unknown|RoperKelly&unknown|RoperFloydSpe&unknown|RoperGaryScot&unknown|RoperHollyNic&unknown
User additional manual filter: 1
Returning data for I50522: {'cM': '17', 'Sort': 'yatesronald', 'Parent Pairs A10': 'YatesRichard&AshendonJoan|TichborneJohn&MartinMargaret|YatesJohn&HydeAlice|TichborneNicholas&WhiteAnne|YatesThomas&WhiteFrances|TichborneNicholas&RytheElizabeth|YatesFrancis&TichborneJane|SnowNickolas&HopkinsConstance|WhiteRichard&WestonKatherine|TettershalGeorge&BiggsElizabeth'}
      ID#                Name     Match to  cM      Most Distant Ancestor  \
1  I50522  FehrenbacherAngela  yatesronald  17  YatesRichard&AshendonJoan

                                                LUN#  \
1  <a href="https://yates.one-name.net/tng/vertic...

                                  Ancestral Line A10
1  YatesRichard&AshendonJoan|TichborneJohn&Martin...