<a href="https://colab.research.google.com/github/ronyates47/Gedcom-Utils/blob/main/A_v_18_Yates_Primary_2023_gedcom.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import csv
import glob
from gedcom.element.individual import IndividualElement
from gedcom.parser import Parser
import pandas as pd

class GedcomDataset:
    """Details collected for a single GEDCOM individual (INDI) record.

    The instance stores the raw cross-reference tag of the individual
    (e.g. ``'@I123@'``) plus a dictionary of tag -> value pairs that the
    parser registers through :meth:`add_extractable_detail` (``NAME``,
    ``FAMC`` and ``NPFX`` in this file).
    """

    def __init__(self, gen_person):
        self.gen_person = gen_person  # raw xref tag, including the '@' delimiters
        self.extractable_detail = {}  # tag -> value; a value may be None for a bare tag
        # Initialised here so get_anchor_gen1() is safe before get_gen_person().
        self.anchor_gen1 = ''

    def add_extractable_detail(self, key, value):
        """Store ``value`` under ``key``, replacing any earlier value."""
        self.extractable_detail[key] = value

    def _detail(self, key):
        """Return the stored value for ``key`` as a string ('' if missing or None)."""
        return self.extractable_detail.get(key) or ''

    def _build_anchor_gen1(self):
        """Build the surname + first-given-name key from the NAME detail.

        ``'John Robert /Smith/'`` becomes ``'SmithJohn'``; a name without
        slashes contributes only its first word.
        """
        parts = self._detail('NAME').split('/', 1)
        first_name = parts[0].split(' ')[0]
        last_name = parts[1].rstrip('/') if len(parts) > 1 else ""
        return last_name.replace(" ", "") + first_name.replace(" ", "")

    def get_gen_person(self):
        """Return the individual's id without '@' and refresh ``anchor_gen1``."""
        self.anchor_gen1 = self._build_anchor_gen1()
        return self.gen_person.strip('@')

    def get_anchor_gen1(self):
        """Return the surname + first-given-name key for this individual.

        The key is recomputed from the current NAME detail, so this method no
        longer depends on get_gen_person() having been called first.
        """
        self.anchor_gen1 = self._build_anchor_gen1()
        return self.anchor_gen1

    def get_extractable_NPFX(self):
        """Return the NPFX value, or '' when none was recorded."""
        return self._detail('NPFX')

    def get_extractable_cm(self):
        """Return the part of NPFX before '&' if it parses as an int, else 'error'.

        The value is returned as the original (stripped) string, not as an int.
        """
        cm_value = self._detail('NPFX').split('&')[0].strip()
        try:
            int(cm_value)
        except ValueError:
            return 'error'
        return cm_value

    def get_extractable_sort(self):
        """Return the text between the first and second '&' of NPFX, or ''."""
        npfx_value = self._detail('NPFX')
        if '&' not in npfx_value:
            return ''
        return npfx_value.split('&')[1].strip()

    def get_extractable_FAMC(self):
        """Return the FAMC family id without '@' delimiters ('' if absent)."""
        return self._detail('FAMC').strip('@')

# Function definitions
def extract_id(record):
    """Return the text between the first two '@' characters of ``record``.

    When the record has no '@' at all, or no closing '@', the result is the
    remaining text minus its last character (the historical fallback that
    existing record keys rely on).
    """
    _, opener, after_opener = record.partition('@')
    # Without an opening '@' the search starts from the beginning of the record.
    candidate = after_opener if opener else record
    closing = candidate.find('@')
    return candidate[:closing]

def extract_name(record):
    """Build a compact 'SurnameGiven' key from a raw GEDCOM record string.

    The text of the first ``1 NAME`` line is split at the first '/': the part
    before it is the given-name portion, the part after it the surname. Each
    portion is cut to 10 characters and stripped of spaces, then joined as
    surname + given names.

    Returns '' when the record has no ``1 NAME`` line (this includes the ''
    placeholder callers pass for unknown ids). A name without surname slashes
    yields the given-name portion only.

    NOTE: an identical definition appears later in this file and is the one
    in effect at runtime; keep the two in sync or remove one of them.
    """
    marker = '1 NAME'
    tag_pos = record.find(marker + ' ')
    if tag_pos == -1:
        return ''
    # The slice deliberately starts on the separator space, so that space
    # counts toward the 10-character cap below (effectively 9 visible
    # characters). Kept as-is so generated names stay stable.
    name_start = tag_pos + len(marker)
    name_end = record.find('\n', name_start)
    if name_end == -1:
        # NAME is the last line and has no trailing newline.
        name_end = len(record)
    name = record[name_start:name_end]
    first_name, _, last_name = name.partition('/')
    first_name = first_name[:10]
    last_name = last_name[:10].rstrip('/')
    return last_name.replace(" ", "") + first_name.replace(" ", "")

class Gedcom:
    """Reads a GEDCOM file and collects the individuals that carry an NPFX tag.

    Attributes:
        file_name: path of the GEDCOM file to read.
        gedcom_datasets: one GedcomDataset per ``0 @...@ INDI`` record found.
        filter_pool: the subset of ``gedcom_datasets`` kept after filtering.
    """

    def __init__(self, file_name):
        self.file_name = file_name
        self.gedcom_datasets = []
        self.filter_pool = []

    @staticmethod
    def get_standard_name(file_path):
        """Return the file's base name without extension, lower-cased, spaces as '_'.

        Only '/' is treated as a path separator.
        """
        file_name = file_path.split('/')[-1]
        if '.' in file_name:
            file_name = file_name.rsplit('.', 1)[0]
        return file_name.replace(' ', '_').lower()

    def parse_gedcom(self, manual_filter_activated=True, manual_filter_path='filtered_ids.xlsx'):
        """Parse ``self.file_name`` and fill ``gedcom_datasets`` / ``filter_pool``.

        For every individual the level-1 ``NAME`` and ``FAMC`` values and the
        level-2 ``NPFX`` value are recorded. ``filter_pool`` then receives the
        individuals with a non-empty NPFX and, when the manual filter is active
        and its workbook exists, only those whose id appears in the workbook's
        ``ID`` column. Progress counts are printed.

        Args:
            manual_filter_activated: apply the second, spreadsheet-based filter.
            manual_filter_path: Excel workbook holding the ``ID`` column.
        """
        current_dataset = None
        npfx_count = 0
        total_count = 0

        with open(self.file_name, 'r', encoding='utf-8-sig') as f:
            for line in f:
                parts = line.strip().split(' ', 2)
                if len(parts) < 2:
                    continue  # blank line or a line without a tag
                try:
                    level = int(parts[0])
                except ValueError:
                    continue  # not a "<level> <tag> ..." line
                tag = parts[1]
                value = parts[2] if len(parts) > 2 else None

                if level == 0:
                    if tag.startswith('@') and tag.endswith('@') and value == 'INDI':
                        total_count += 1
                        current_dataset = GedcomDataset(tag)
                        self.gedcom_datasets.append(current_dataset)
                    else:
                        # A different record (FAM, SUBM, REPO, TRLR, ...) starts here.
                        # Its sub-lines, e.g. a submitter's "1 NAME", must not be
                        # attributed to the previous individual.
                        current_dataset = None
                elif current_dataset is not None:
                    if level == 1 and tag in ['NAME', 'FAMC']:
                        current_dataset.add_extractable_detail(tag, value)
                    elif level == 2 and tag == 'NPFX':
                        npfx_count += 1
                        current_dataset.add_extractable_detail(tag, value)

        print(f'GEDCOM contained {total_count} total records')
        print(f'Records tagged and filtered by NPFX: {npfx_count}')

        # First level of filtering: keep individuals that have an NPFX value.
        for dataset in self.gedcom_datasets:
            if dataset.get_extractable_NPFX():
                self.filter_pool.append(dataset)

        # Second level of filtering: optional manual id list from an Excel sheet.
        if manual_filter_activated:
            try:
                df = pd.read_excel(manual_filter_path)
            except FileNotFoundError:
                print(f"{manual_filter_path} not found. Skipping second-level manual filter.")
            else:
                manual_filtered_ids = set(df['ID'])
                # NOTE(review): the "- 1" under-reports the number of distinct ids by
                # one; read_excel already consumes the header row. Kept unchanged
                # pending confirmation that it is not compensating for a blank row.
                print(f"Manual filter IDs loaded: {len(manual_filtered_ids) - 1}")

                self.filter_pool = [
                    dataset for dataset in self.filter_pool
                    if dataset.get_gen_person() in manual_filtered_ids
                ]
                print(f"After manual filter, total records: {len(self.filter_pool)}")

def input_prime_surname(last_prime_surname=None):
    """Prompt for the prime surname and return the answer.

    When ``last_prime_surname`` is given it is shown as the default and is
    returned if the user just presses Enter.
    """
    if not last_prime_surname:
        return input("Enter prime_surname: ")

    answer = input(f"Enter prime_surname (default: {last_prime_surname}): ")
    return answer or last_prime_surname

def select_gedcom_file():
    """Return the GEDCOM file to process, or None when there is none.

    Looks for ``*.ged`` files in the current working directory and picks the
    alphabetically first one. glob itself returns matches in arbitrary order,
    so the list is sorted to make the choice reproducible. A message is
    printed in either case.
    """
    gedcom_files = sorted(glob.glob('*.ged'))
    if not gedcom_files:
        print("No GEDCOM files found.")
        return None

    print("Automatically selecting the first GEDCOM file.")
    return gedcom_files[0]

gedcom_file_path = select_gedcom_file()  # Pick the GEDCOM file to process
if not gedcom_file_path:
    # Everything below needs gedcom_instance and records; stop with a clear
    # message instead of failing later with an unrelated NameError.
    raise FileNotFoundError("No GEDCOM (*.ged) file found in the working directory.")

# Parse the selected file; this fills gedcom_instance.filter_pool.
gedcom_instance = Gedcom(gedcom_file_path)
gedcom_instance.parse_gedcom()

individuals = []  # (anchor name, individual id) for every filtered individual

for dataset in gedcom_instance.filter_pool:
    # get_gen_person() is called first on purpose: it also refreshes the anchor name.
    individual_id = dataset.get_gen_person()
    last_name = dataset.get_anchor_gen1()
    individuals.append((last_name, individual_id))

# Read the raw file again and index every level-0 record by its xref id.
# Same encoding as parse_gedcom, so a UTF-8 BOM does not end up in the first
# record and the result does not depend on the platform's default encoding.
with open(gedcom_file_path, 'r', encoding='utf-8-sig') as file:
    data = file.read()
data = data.split('\n0 ')
records = {extract_id(record): record for record in data}

def has_both_parents(records, mother_id, father_id):
    """Return True only when both parent ids are present in ``records``."""
    return all(parent_id in records for parent_id in (mother_id, father_id))

def find_parents(individual_id, generation, records):
    """Walk up the tree from ``individual_id`` and record every parent couple.

    For each family in which the current person is a child and both the
    husband and wife records exist, the pair ``'<father>&<mother>'`` (names
    from extract_name) is appended once to the module-level
    ``generation_table`` with its generation number, and remembered in the
    module-level ``visited_pairs`` set. The walk then continues with the
    mother and the father at ``generation + 1``.

    Args:
        individual_id: xref id (without '@') of the person to start from.
        generation: generation number to assign to this person's parents.
        records: mapping of xref id -> raw GEDCOM record text.
    """

    def _xref_after(record, marker):
        """Return the id following ``marker`` (e.g. '1 FAMC @'), or '' if absent."""
        start = record.find(marker)
        if start == -1:
            return ''
        start += len(marker)
        end = record.find('@', start)
        return record[start:end] if end != -1 else ''

    if individual_id not in records:
        return

    famc_id = _xref_after(records[individual_id], '1 FAMC @')
    if not famc_id or famc_id not in records:
        return

    fam_record = records[famc_id]
    mother_id = _xref_after(fam_record, '1 WIFE @')
    father_id = _xref_after(fam_record, '1 HUSB @')

    mother_name = extract_name(records[mother_id]) if mother_id and mother_id in records else None
    father_name = extract_name(records[father_id]) if father_id and father_id in records else None

    # Both names being set already implies both parent records exist.
    if mother_name is not None and father_name is not None:
        parent_pair = father_name + "&" + mother_name
        if parent_pair not in visited_pairs:
            visited_pairs.add(parent_pair)
            generation_table.loc[len(generation_table)] = [generation, parent_pair]

    if mother_id:
        find_parents(mother_id, generation + 1, records)

    if father_id:
        find_parents(father_id, generation + 1, records)

def extract_name(record):
    """Build a compact 'SurnameGiven' key from a raw GEDCOM record string.

    The text of the first ``1 NAME`` line is split at the first '/': the part
    before it is the given-name portion, the part after it the surname. Each
    portion is cut to 10 characters and stripped of spaces, then joined as
    surname + given names.

    Returns '' when the record has no ``1 NAME`` line (this includes the ''
    placeholder callers pass for unknown ids). A name without surname slashes
    yields the given-name portion only.

    NOTE: this re-definition shadows the extract_name defined earlier in this
    file; keep the two in sync or remove one of them.
    """
    marker = '1 NAME'
    tag_pos = record.find(marker + ' ')
    if tag_pos == -1:
        return ''
    # The slice deliberately starts on the separator space, so that space
    # counts toward the 10-character cap below (effectively 9 visible
    # characters). Kept as-is so generated names stay stable.
    name_start = tag_pos + len(marker)
    name_end = record.find('\n', name_start)
    if name_end == -1:
        # NAME is the last line and has no trailing newline.
        name_end = len(record)
    name = record[name_start:name_end]
    first_name, _, last_name = name.partition('/')
    first_name = first_name[:10]
    last_name = last_name[:10].rstrip('/')
    return last_name.replace(" ", "") + first_name.replace(" ", "")

def find_distant_ancestors(individual_id, records, path=None):
    """Return every ancestral line that starts at ``individual_id``.

    Each line is a list of xref ids, beginning with the starting individual
    and ending at the most distant ancestor reachable through FAMC -> HUSB /
    WIFE links. Father lines are listed before mother lines.

    A line ends at a person who has no FAMC link, whose family record is
    missing, or whose family names no parent that exists in ``records``.
    Parents already on the current line are skipped so that cyclic data
    cannot recurse forever.

    Args:
        individual_id: xref id (without '@') to start from.
        records: mapping of xref id -> raw GEDCOM record text.
        path: ids already on the line (used by the recursion); not modified.

    Returns:
        A list of id lists; ``[]`` when ``individual_id`` is not in ``records``.
    """

    def _linked_xref(record, marker):
        """Return the id following ``marker`` (e.g. '1 HUSB @'), or '' if absent."""
        start = record.find(marker)
        if start == -1:
            return ''
        start += len(marker)
        end = record.find('@', start)
        return record[start:end] if end != -1 else ''

    # Work on a copy so sibling branches and the caller's list stay untouched.
    path = list(path or []) + [individual_id]

    if individual_id not in records:
        return []

    famc_id = _linked_xref(records[individual_id], '1 FAMC @')
    if not famc_id or famc_id not in records:
        return [path]

    fam_record = records[famc_id]
    candidate_ids = (
        _linked_xref(fam_record, '1 HUSB @'),  # father first, as before
        _linked_xref(fam_record, '1 WIFE @'),
    )
    parent_ids = [
        parent_id for parent_id in candidate_ids
        if parent_id and parent_id in records and parent_id not in path
    ]

    if not parent_ids:
        # No usable parent: this person is the end of the line.
        return [path]

    paths = []
    for parent_id in parent_ids:
        paths.extend(find_distant_ancestors(parent_id, records, path))
    return paths
# Alias for the datasets that parse_gedcom() kept in filter_pool; the loops below iterate over it.
filtered_datasets = gedcom_instance.filter_pool

# Additional Function to isolate score calculation logic
def calculate_score(distant_ancestors_paths, records, prime_surname='Yates'):
    """Pick the ancestral line that carries the prime surname deepest and most often.

    Every id in every path is translated to a name with extract_name. A path
    scores ``position + 1`` (position counted from the start of the path) for
    each name that contains ``prime_surname``; the highest-scoring path wins,
    the earliest one on a tie.

    Args:
        distant_ancestors_paths: list of id lists, e.g. from find_distant_ancestors.
        records: mapping of xref id -> raw GEDCOM record text.
        prime_surname: substring looked for in each name (case-sensitive).

    Returns:
        ``(winning_score, winning_names)``; ``(0, [])`` when there are no paths.
    """
    name_paths = [
        [extract_name(records.get(person_id, '')) for person_id in path]
        for path in distant_ancestors_paths
    ]
    if not name_paths:
        return 0, []

    path_scores = [
        sum(position + 1 for position, name in enumerate(name_path) if prime_surname in name)
        for name_path in name_paths
    ]
    # max() returns the first index among equal scores, matching path order.
    winner = max(range(len(path_scores)), key=path_scores.__getitem__)
    return path_scores[winner], name_paths[winner]

# Preliminary pass: for every filtered individual, find the winning ancestral
# line and format it as "name&322|name&322|...".
# NOTE(review): every variable set here is overwritten on each iteration, and a
# later loop in this file recomputes final_output with "~" as separator, so
# this pass may be redundant; confirm before removing it.
for dataset in filtered_datasets:
    individual_id = dataset.get_gen_person()
    distant_ancestors_paths = find_distant_ancestors(individual_id, records)

    # calculate_score already translates ids to names, so no separate
    # name_paths list is built here any more.
    winning_path_score, winning_path_names = calculate_score(distant_ancestors_paths, records)

    # Most distant ancestor first.
    winning_path_names.reverse()

    # '322' is the fixed placeholder written in the spouse position of each entry.
    spouse_names = ['322'] * len(winning_path_names)
    formatted_names = [f"{name}&{spouse}" for name, spouse in zip(winning_path_names, spouse_names)]
    final_output = "|".join(formatted_names)

# Per-individual summary: cM / Sort values plus the filtered ancestral line.
def process_individual(individual_id, gedcom_instance, records):
    """Collect the report fields for one individual.

    Resets the module-level ``generation_table`` and ``visited_pairs``, fills
    the table via find_parents, determines the winning ancestral line via
    find_distant_ancestors / calculate_score, and keeps the parent pairs in
    which at least one partner lies on that winning line.

    Args:
        individual_id: xref id (without '@') of the individual.
        gedcom_instance: object whose ``filter_pool`` holds the dataset objects;
            the individual's cM / Sort / anchor name are read from there.
        records: mapping of xref id -> raw GEDCOM record text used for the
            ancestry walk. If something else is passed (e.g. the dataset list),
            the walk finds nothing and the ancestral line comes back empty.

    Returns:
        dict with 'Filtered Ancestral Line' and, when the individual is found in
        ``gedcom_instance.filter_pool``, 'cM' and 'Sort'.
    """
    global generation_table
    global visited_pairs

    # Fresh state for every individual.
    generation_table = pd.DataFrame(columns=['Generation', 'Parent Pair'])
    visited_pairs = set()

    find_parents(individual_id, 1, records)
    distant_ancestors_paths = find_distant_ancestors(individual_id, records)
    _, winning_path_names = calculate_score(distant_ancestors_paths, records)

    individual_data = {}
    # Dataset objects live in filter_pool, not in ``records``; one scan serves
    # both the anchor row and the cM / Sort values.
    for dataset in gedcom_instance.filter_pool:
        if dataset.get_gen_person() == individual_id:
            # NOTE(review): row 0 is replaced by the individual's own anchor name,
            # which overwrites the first parent pair find_parents stored there.
            # Kept as in the original; confirm that this is intended.
            generation_table.loc[0] = [1, dataset.get_anchor_gen1()]
            individual_data['cM'] = dataset.get_extractable_cm()
            individual_data['Sort'] = dataset.get_extractable_sort()
            break

    # Names on the winning line, taken from this call's own result instead of
    # the module-level final_output string (whose separator no longer matched).
    selected_dna_branch_parents = set(winning_path_names)

    # Later rows of the same generation replace earlier ones.
    filtered_parent_pairs = {}
    for generation, parent_pair in zip(generation_table['Generation'], generation_table['Parent Pair']):
        if any(parent in selected_dna_branch_parents for parent in parent_pair.split('&')):
            filtered_parent_pairs[generation] = parent_pair

    # Generation numbers are dropped; only the pairs are joined.
    individual_data['Filtered Ancestral Line'] = '|'.join(filtered_parent_pairs.values())

    return individual_data


#print(final_output)
#filtered_parent_pairs = [row['370-Parent Pair'] for index, row in generation_table.iterrows() if any(parent in final_output for parent in row['Parent Pair'].split())]
#print(filtered_parent_pairs)
#for index, row in generation_table.iterrows():
#    print("373-Parent Pair in generation_table:", row['Parent Pair'].split())
#    print("374-Parent in final_output:", [parent for parent in row['Parent Pair'].split() if parent in final_output])

import pandas as pd

# Global state shared with find_parents / process_individual.
visited_pairs = set()

# (anchor name, id) for every filtered individual. Relies on gedcom_instance
# from the loading step; anchor names were populated when the ids were first read.
filtered_individuals = [(dataset.get_anchor_gen1(), dataset.get_gen_person()) for dataset in gedcom_instance.filter_pool]

combined_df_rows = []  # one [id, sort, name, cM, ancestral line] entry per individual

# Main loop: build one report row per filtered individual.
for dataset in filtered_datasets:
    individual_id = dataset.get_gen_person()
    individual_name = extract_name(records.get(individual_id, ''))

    distant_ancestors_paths = find_distant_ancestors(individual_id, records)
    # calculate_score already translates ids to names, so no separate
    # name_paths list is built here any more.
    winning_path_score, winning_path_names = calculate_score(distant_ancestors_paths, records)

    winning_path_names.reverse()    # Most distant ancestor first
    final_output = "~".join(winning_path_names)

    # NOTE(review): the dataset list is passed where process_individual names its
    # parameter ``records``; only 'cM' and 'Sort' of the result are used here.
    individual_data = process_individual(individual_id, gedcom_instance, gedcom_instance.filter_pool)
    cm = individual_data['cM']
    sort = individual_data['Sort']

    # 'sort' comes before 'individual_name' to match the column order used below.
    combined_df_rows.append([individual_id, sort, individual_name, cm, final_output])

# Switch for rendering ids as HTML links in the report
USE_HOTLINK = True  # Set this to False if you don't want to use hotlinks

def create_hotlink(row):
    """Return the row's 'ID#' value, wrapped in an HTML link when USE_HOTLINK is on.

    The link points at the vertical chart page for that person id.
    """
    person_id = row['ID#']
    if not USE_HOTLINK:
        return person_id

    url_base = "https://yates.one-name.net/tng/verticalchart.php?personID="
    additional_params = "&tree=tree1&parentset=0&display=vertical&generations=15"
    hotlink_url = url_base + f'{person_id}' + additional_params
    return f'<a href="{hotlink_url}">{person_id}</a>'

# Assemble the report table; there is intentionally no 'Score' column.
combined_df = pd.DataFrame(combined_df_rows, columns=['ID#', 'Match to', 'Name', 'cM', 'Yates DNA Ancestral Line'])

# Build the link column row by row. DataFrame.apply(axis=1) returns a DataFrame
# instead of a Series when the frame is empty, which made this assignment fail
# whenever no individual passed the filters.
combined_df['LUN#'] = [create_hotlink(row) for _, row in combined_df.iterrows()]

ordered_columns = ['ID#', 'Match to', 'Name', 'cM', 'LUN#', 'Yates DNA Ancestral Line']
combined_df = combined_df[ordered_columns]

# Number rows from 1 instead of 0.
combined_df.index = combined_df.index + 1

# 'Match to' descending, then ancestral line ascending.
combined_df = combined_df.sort_values(by=['Match to', 'Yates DNA Ancestral Line'], ascending=[False, True])

print(combined_df)

# NOTE(review): absolute Colab path; outside Colab this directory may not exist.
combined_df.to_excel('/content/output.xlsx', index=False)



Automatically selecting the first GEDCOM file.
GEDCOM contained 50938 total records
Records tagged and filtered by NPFX: 400
filtered_ids.xlsx not found. Skipping second-level manual filter.
        ID#       Match to              Name   cM  \
165  I47241  yatesjamesrob   StanleyMiltonTh   16   
170  I47306  yatesjamesrob      MyersDianaJo   15   
179  I47416  yatesjamesrob  StrattonJudithMa   17   
386  I50806  yatesjamesrob   SummerfielJason  260   
174  I47350  yatesjamesrob       LaxTinaMari   13   
..      ...            ...               ...  ...   
365  I50364  addison,david  HumphriesBrandon   18   
364  I50355  addison,david     HunterSamuelF   19   
362  I50309  addison,david     YatesDonaldFr   13   
366  I50369  addison,david    MillerMyraBell   13   
367  I50375  addison,david      MyersRichard   15   

                                                  LUN#  \
165  <a href="https://yates.one-name.net/tng/vertic...   
170  <a href="https://yates.one-name.net/tng/vertic...  

In [1]:
!pip install pandas
!pip install python-gedcom
!pip install openpyxl

Collecting python-gedcom
  Downloading python_gedcom-1.0.0-py2.py3-none-any.whl (35 kB)
Installing collected packages: python-gedcom
Successfully installed python-gedcom-1.0.0
