<a href="https://colab.research.google.com/github/ronyates47/Gedcom-Utils/blob/main/rebuild_2023_09032108_2008exp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
#!pip install pandas
#!pip install python-gedcom
# Base script-stable-selects GEDCOM-correct output

import glob
from gedcom.element.individual import IndividualElement
from gedcom.parser import Parser
import pandas as pd
import os

# Module-level state, re-initialised whenever this script runs:
# two counters starting at zero and an empty set of visited parent pairs.
find_parents_new_counter = last_pair_counter = 0
visited_pairs = set()

def select_gedcom_file():
    """Return the first ``*.ged`` file found in the current directory.

    Every match is printed as a numbered list; the first entry is then
    chosen automatically, without prompting.

    Returns:
        The chosen file name, or ``None`` when no ``*.ged`` file is found.
    """
    candidates = glob.glob('*.ged')
    if len(candidates) == 0:
        print("No GEDCOM files found in the Colab contents.")
        return None

    print("List of GEDCOM files:")
    position = 0
    for candidate in candidates:
        position += 1
        print(f"{position}. {candidate}")

    # No interactive choice: the first listed file is always used.
    return candidates[0]

# In this modified version of the function, the while loop that prompted the user to enter a number has been removed. Instead, selected_num is fixed at 1, so the first GEDCOM file in the list is always selected and returned.

#This modification will cause the function to always select the first GEDCOM file in the list, without prompting the user to enter a number. However, if there are multiple GEDCOM files in the list, the function will still display all available options to the user.

# Module-level placeholder for the most recently entered prime surname.
# It starts as None and is overwritten further down with the value returned
# by input_prime_surname().
last_prime_surname = None

def input_prime_surname(last_prime_surname="Yates"):  # Set default surname to "Yates"
    """Prompt for the prime surname and return the answer.

    Args:
        last_prime_surname: Fallback used when the user just presses Enter.
            When it is empty/None the prompt shows no default and the raw
            answer is returned as typed.

    Returns:
        The surname entered, or the fallback for an empty answer.

    Side effects:
        Stores the result in the module-level variable ``surname``.
    """
    global surname  # the result is also published at module level
    if not last_prime_surname:
        answer = input("Enter prime_surname: ")
    else:
        # An empty answer is falsy, so ``or`` substitutes the default.
        answer = input(f"Enter prime_surname (default: {last_prime_surname}): ") or last_prime_surname
    surname = answer
    return answer

# Prompt once for the prime surname and keep the answer under both
# module-level names: prime_surname and last_prime_surname.
last_prime_surname = prime_surname = input_prime_surname()

class Gedcom:
    """Reads a GEDCOM file line by line and collects its individual records.

    Attributes:
        file_name: Path of the GEDCOM file to read.
        gedcom_datasets: One ``GedcomDataset`` per ``INDI`` record found.
        filter_pool: The datasets whose ``NPFX`` value is non-empty.
        total_records: Number of level-0 lines seen by ``parse_gedcom``.
        total_individuals: Number of ``INDI`` records seen by ``parse_gedcom``.
        total_families: Number of level-0 ``FAM`` records seen by ``parse_gedcom``.
    """

    def __init__(self, file_name):
        self.file_name = file_name
        self.gedcom_datasets = []
        self.filter_pool = []
        self.total_records = 0
        self.total_individuals = 0
        self.total_families = 0

    @staticmethod
    def get_standard_name(file_path):
        """Return the base name of *file_path* without its last extension,
        lower-cased and with spaces replaced by underscores.

        Only ``/`` is treated as a path separator.
        """
        file_name = file_path.split('/')[-1]
        if '.' in file_name:
            file_name = file_name.rsplit('.', 1)[0]
        return file_name.replace(' ', '_').lower()

    def parse_gedcom(self):
        """Scan the file and rebuild ``gedcom_datasets`` and ``filter_pool``.

        For every ``0 @xref@ INDI`` record a ``GedcomDataset`` is created; its
        level-1 ``NAME``/``FAMC`` and level-2 ``NPFX`` values are stored on it.
        Lines belonging to any other level-0 record are ignored, so a ``NAME``
        under e.g. a submitter record can no longer overwrite the name of the
        individual that happened to precede it.

        Raises:
            Whatever ``open`` or ``Parser.parse_file`` raise for an unreadable
            or rejected file.
        """
        # NOTE(review): the tree built by python-gedcom is never read below;
        # this pass only has an effect if parse_file rejects the file.
        # Kept so that behaviour on malformed files is unchanged - confirm
        # whether it is still wanted.
        parser = Parser()
        parser.parse_file(self.file_name)

        # Start clean so that parsing twice does not duplicate datasets.
        self.gedcom_datasets = []
        self.filter_pool = []
        total_records = 0
        total_individuals = 0
        total_families = 0

        current_dataset = None
        with open(self.file_name, 'r', encoding='utf-8-sig') as f:
            for line in f:
                parts = line.strip().split(' ', 2)
                if len(parts) < 2:
                    continue  # blank line or a lone token: nothing to read
                try:
                    level = int(parts[0])
                except ValueError:
                    continue  # not a "<level> <tag> ..." line
                tag = parts[1]
                value = parts[2] if len(parts) > 2 else None

                if level == 0:
                    total_records += 1
                    is_xref = tag.startswith('@') and tag.endswith('@')
                    if is_xref and value == 'INDI':
                        total_individuals += 1
                        current_dataset = GedcomDataset(tag)
                        self.gedcom_datasets.append(current_dataset)
                    else:
                        if value == 'FAM':
                            total_families += 1
                        # Leaving the individual: stop attributing details.
                        current_dataset = None
                elif current_dataset is not None:
                    if level == 1 and tag in ('NAME', 'FAMC'):
                        current_dataset.add_extractable_detail(tag, value)
                    elif level == 2 and tag == 'NPFX':
                        current_dataset.add_extractable_detail(tag, value)

        self.filter_pool = [
            dataset for dataset in self.gedcom_datasets
            if dataset.get_extractable_NPFX()
        ]
        self.total_records = total_records
        self.total_individuals = total_individuals
        self.total_families = total_families

        print(f'Parsed {total_individuals} total records') # does print

    def filter_individuals(self):
        """Keep only the datasets whose id matches the first column of
        ``/content/shortged.xlsx``.

        When the workbook does not exist a message is printed and
        ``gedcom_datasets`` is left untouched. ``filter_pool`` is not changed
        by this method.
        """
        excel_path = '/content/shortged.xlsx'
        if not os.path.exists(excel_path):
            print("Excel file '/content/shortged.xlsx' not found.")
            return

        df = pd.read_excel(excel_path)

        # Cast to string just to be safe
        allowed_individual_ids = df.iloc[:, 0].astype(str).tolist()
        print("Allowed IDs:", allowed_individual_ids)  # Debugging line

        # NOTE(review): this is a substring ("contains") test against the raw
        # '@...@' id, so an allowed value 'I1' also keeps '@I10@' - confirm
        # that this looseness is intended.
        self.gedcom_datasets = [
            dataset for dataset in self.gedcom_datasets
            if any(allowed_id in dataset.individual_id
                   for allowed_id in allowed_individual_ids)
        ]

        # Debug: After filtering
        print("After filtering:", [dataset.individual_id for dataset in self.gedcom_datasets][:5])  # Print first 5 for checking

        print('Records Moved into short_pool:', len(self.gedcom_datasets))


class GedcomDataset:
    """Details collected for one GEDCOM individual record.

    Attributes:
        individual_id: The id the dataset was created with (e.g. ``'@I123@'``).
        extractable_detail: Mapping of tag/key to the raw value stored through
            ``add_extractable_detail``.
        anchor_gen1: Surname followed by the first given name, without
            spaces; empty until computed.
    """

    def __init__(self, gen_person):
        self.individual_id = gen_person  # Initialize with gen_person or however you like
        self.extractable_detail = {}
        self.anchor_gen1 = ""  # Initialize to an empty string

    def add_extractable_detail(self, key, value):
        """Store *value* under *key*, replacing any previous value."""
        self.extractable_detail[key] = value

    def _text(self, key):
        """Return the stored value for *key*, or '' when missing or None."""
        return self.extractable_detail.get(key) or ''

    def set_anchor_gen1(self):
        """Compute ``anchor_gen1`` from the stored ``NAME`` value.

        The surname is the text between the first two slashes; the given
        name is the first word before the first slash. A name without
        slashes, or no name at all, no longer raises: the surname part is
        simply empty.
        """
        name = self._text('NAME')
        given, _, rest = name.partition('/')
        # Anything after the closing slash (e.g. a suffix) is not surname.
        last_name = rest.split('/', 1)[0]
        words = given.split()
        first_name = words[0] if words else ''
        self.anchor_gen1 = last_name.replace(" ", "") + first_name.replace(" ", "")

    def get_anchor_gen1(self):
        """Return ``anchor_gen1``, computing it from ``NAME`` on first use."""
        if not self.anchor_gen1 and self._text('NAME'):
            self.set_anchor_gen1()
        return self.anchor_gen1

    def get_individual_id(self):  # Renamed this method for clarity
        """Return the id without its surrounding '@' characters."""
        return self.individual_id.strip('@')

    def get_extractable_cm(self):
        """Return the cM value.

        An explicitly stored ``'cm'`` detail wins. Otherwise the value is
        the part of ``NPFX`` before ``'&'``: returned as a string when it is
        an integer, ``'error'`` when it is not, ``''`` when there is no NPFX.
        """
        if 'cm' in self.extractable_detail:
            return self.extractable_detail['cm']
        npfx_value = self._text('NPFX')
        if not npfx_value:
            return ''
        cm_value = npfx_value.split('&')[0].strip()
        try:
            int(cm_value)
        except ValueError:
            return 'error'
        return cm_value

    def get_extractable_sort(self):
        """Return the sort value.

        An explicitly stored ``'sort'`` detail wins. Otherwise it is the part
        of ``NPFX`` between the first and second ``'&'``, or ``''`` when
        ``NPFX`` has no ``'&'``.
        """
        if 'sort' in self.extractable_detail:
            return self.extractable_detail['sort']
        npfx_value = self._text('NPFX')
        if '&' in npfx_value:
            return npfx_value.split('&')[1].strip()
        return ''

    def get_extractable_NPFX(self):
        """Return the stored ``NPFX`` value ('' when none was stored)."""
        return self.extractable_detail.get('NPFX', '')

    def get_extractable_FAMC(self):
        """Return the ``FAMC`` family id without '@' ('' when missing or None)."""
        return self._text('FAMC').strip('@')

class IndividualRecord:
    """Plain holder for an individual's id together with its cm and sort values."""

    def __init__(self, id, cm, sort):
        # Parameter names are part of the constructor signature and are kept.
        self.id, self.cm, self.sort = id, cm, sort

# Module-level anchor name, initially None.
# A ``global`` declaration only matters inside a function, so a plain
# assignment is all that is needed at module level.
anchor_gen1 = None

def get_gen_person(self):
    """Return ``(id, anchor)`` for a dataset-like object.

    *self* is any object exposing ``individual_id`` (str),
    ``extractable_detail`` (dict) and a writable ``anchor_gen1``.

    The anchor is the surname (text after the first '/', trailing '/'
    removed) followed by the first word of the given name, both without
    spaces. A NAME without '/' or a missing NAME yields an empty surname
    instead of raising.

    Side effects:
        Stores the anchor in ``self.anchor_gen1``.
    """
    name = self.extractable_detail.get('NAME') or ''
    first_name, _, last_name = name.partition('/')
    first_name = first_name.split(' ')[0]
    last_name = last_name.rstrip('/')
    anchor_gen1_local = last_name.replace(" ", "") + first_name.replace(" ", "")
    self.anchor_gen1 = anchor_gen1_local  # Store it in an instance variable if needed
    return self.individual_id.strip('@'), anchor_gen1_local


# The helpers below take the same kind of dataset-like object as
# get_gen_person. They are defined at module level so they can be called.

def get_anchor_gen1(self):
    """Return the anchor currently stored on the object."""
    return self.anchor_gen1


def get_extractable_NPFX(self):
    """Return the stored ``NPFX`` value ('' when none was stored)."""
    return self.extractable_detail.get('NPFX', '')


def get_extractable_cm(self):
    """Return the part of ``NPFX`` before '&' when it is an integer.

    Returns the string ``'error'`` when that part is not an integer,
    which includes a missing or empty ``NPFX``.
    """
    npfx_value = self.extractable_detail.get('NPFX', '')
    if '&' in npfx_value:
        cm_value = npfx_value.split('&')[0].strip()
    else:
        cm_value = npfx_value.strip()
    try:
        int(cm_value)
        return cm_value
    except ValueError:
        return 'error'


def get_extractable_sort(self):
    """Return the part of ``NPFX`` after the first '&', or '' without '&'."""
    npfx_value = self.extractable_detail.get('NPFX', '')
    if '&' in npfx_value:
        sort_value = npfx_value.split('&')[1].strip()
        return sort_value
    else:
        return ''


def get_full_name(self):
    """Return surname followed by all given names, with spaces removed.

    A NAME without '/' is returned as-is with spaces removed.
    """
    name = self.extractable_detail.get('NAME', '')
    if '/' in name:
        first_name, last_name = name.split('/', 1)
        first_name = first_name.strip()
        last_name = last_name.rstrip('/')
        full_name = last_name.replace(" ", "") + first_name.replace(" ", "")
    else:
        full_name = name.replace(" ", "")
    return full_name


def get_extractable_FAMC(self):
    """Return the stored ``FAMC`` family id without '@' characters."""
    return self.extractable_detail.get('FAMC', '').strip('@')




# Function definitions
def extract_id(record):
    """Return the text between the first two '@' characters of *record*.

    Returns '' when *record* does not contain a pair of '@' characters,
    rather than an arbitrary slice of the record.
    """
    _, opener, rest = record.partition('@')
    identifier, closer, _ = rest.partition('@')
    if not (opener and closer):
        return ''
    return identifier

def extract_name(record):
    """Return ``surname + given`` built from the first ``1 NAME`` line.

    The surname is the text between the first two slashes and the given
    part is the text before the first slash; each is cut to its first 10
    characters and then has its spaces removed.

    Returns '' when *record* has no ``1 NAME `` line. A name without
    slashes yields only the given part. The NAME line may be the last line
    of the record without a trailing newline.
    """
    marker = '1 NAME '
    marker_at = record.find(marker)
    if marker_at == -1:
        return ''
    # Skip the whole marker (including its trailing space) so the 10
    # character limit applies to the name itself.
    name = record[marker_at + len(marker):].split('\n', 1)[0].strip()
    first_name, _, rest = name.partition('/')
    last_name = rest.split('/', 1)[0]  # ignore anything after the closing slash
    first_name = first_name[:10]
    last_name = last_name[:10]
    return last_name.replace(" ", "") + first_name.replace(" ", "")

def _xref_after(record, marker):
    """Return the id following *marker* up to the next '@', or '' if absent."""
    start = record.find(marker)
    if start == -1:
        return ''
    start += len(marker)
    end = record.find('@', start)
    return record[start:end] if end != -1 else ''


def find_parents(individual_id, generation, records, parent_pairs, children, ancestral_lines, current_line=None):
    """Walk up the tree from *individual_id*, recording parent pairs.

    Args:
        individual_id: Key of the starting record in *records*.
        generation: Generation number given to this individual's parents.
        records: Mapping of id to raw record text.
        parent_pairs: List extended with ``(generation, "Father&Mother")``
            for each pair not seen before.
        children: Mapping of parent name to a list of child ids; extended
            in place.
        ancestral_lines: Mapping of pair to a copy of the line of pairs
            leading to it; extended in place.
        current_line: Pairs on the path walked so far. On return it holds
            exactly what it held on entry.

    Pairs already present in the module-level ``visited_pairs`` set are not
    recorded again. Returns ``None``.
    """
    if current_line is None:
        current_line = []

    if individual_id not in records:
        return

    famc_id = _xref_after(records[individual_id], '1 FAMC @')
    if famc_id not in records:
        return

    fam_record = records[famc_id]
    mother_id = _xref_after(fam_record, '1 WIFE @')
    father_id = _xref_after(fam_record, '1 HUSB @')

    mother_name = None
    if mother_id and mother_id in records:
        mother_name = extract_name(records[mother_id])
        children.setdefault(mother_name, []).append(individual_id)

    father_name = None
    if father_id and father_id in records:
        father_name = extract_name(records[father_id])
        children.setdefault(father_name, []).append(individual_id)

    pushed = False
    if mother_name and father_name:
        parent_pair = father_name + "&" + mother_name
        current_line.append((generation, parent_pair))
        pushed = True
        if parent_pair not in visited_pairs:
            visited_pairs.add(parent_pair)
            if has_both_parents(records, mother_id, father_id):
                parent_pairs.append((generation, parent_pair))
                ancestral_lines[parent_pair] = list(current_line)

    if mother_id:
        find_parents(mother_id, generation + 1, records, parent_pairs, children, ancestral_lines, current_line)

    if father_id:
        find_parents(father_id, generation + 1, records, parent_pairs, children, ancestral_lines, current_line)

    # Only undo our own append; popping unconditionally would fail on an
    # empty line or discard an entry that belongs to a descendant's level.
    if pushed:
        current_line.pop()

def process_individual(individual_id, gedcom_instance):
    """Collect the ancestral lines of one individual.

    Resets the module-level ``generation_table`` (to an empty frame with
    'Generation' and 'Parent Pair' columns) and ``visited_pairs`` (to an
    empty set), then runs ``find_parents`` from generation 1 using the
    module-level ``records``. When the module-level ``anchor_gen1`` is not
    None, ``(1, anchor_gen1)`` is inserted at the front of every line.

    Args:
        individual_id: Id handed to ``find_parents``.
        gedcom_instance: Accepted but not used by this function.

    Returns:
        ``{'Last Pairs': ancestral_lines}``.
    """
    global generation_table, visited_pairs
    generation_table = pd.DataFrame(columns=['Generation', 'Parent Pair'])
    visited_pairs = set()

    parent_pairs, children, ancestral_lines = [], {}, {}
    find_parents(individual_id, 1, records, parent_pairs, children, ancestral_lines)

    # Prefix each collected line with the anchor, if one has been set.
    if anchor_gen1 is not None:
        for ancestral_line in ancestral_lines.values():
            ancestral_line.insert(0, (1, anchor_gen1))

    return {'Last Pairs': ancestral_lines}


# NOTE(review): in the visible file, individual_id and gedcom_instance are
# first assigned further down (in the setup section after process_dataset),
# so running the cell top to bottom raises NameError on this call.
# TODO: move this call below the setup section, or remove it.
individual_data = process_individual(individual_id, gedcom_instance)
# The ancestral lines are returned under the 'Last Pairs' key.
last_pairs = individual_data['Last Pairs']

#for key, value in last_pairs.items():
#    print(f'{key}: {value[1]}', end='\t')


def trace_children(individual_id, parent_pairs, children, visited=None):
    """Collect ids reachable through the ``children`` mapping.

    The first entry of *parent_pairs* whose pair text contains "lastpair"
    is split on '&'; the first of its two names that lists *individual_id*
    in ``children`` is used. That name's ids are returned, followed by the
    result of recursing into each of them.

    Args:
        individual_id: Id to start from.
        parent_pairs: Sequence of ``(generation, "NameA&NameB")`` tuples.
        children: Mapping of name to a list of ids.
        visited: Ids already processed; guards against endless recursion.

    Returns:
        A list of ids (possibly empty). Names missing from *children* and
        pairs without '&' are treated as "no match" instead of raising.
    """
    if visited is None:
        visited = set()

    # An id seen before contributes nothing further.
    if individual_id in visited:
        return []
    visited.add(individual_id)

    individual_name = None
    for pair in parent_pairs:
        if "lastpair" in pair[1]:
            for name in pair[1].split('&')[:2]:
                if individual_id in children.get(name, ()):
                    individual_name = name
                    break
            break  # only the first "lastpair" entry is consulted

    individual_children = []
    if individual_name in children:
        direct = children[individual_name]
        individual_children.extend(direct)
        for child_id in direct:
            individual_children.extend(
                trace_children(child_id, parent_pairs, children, visited)
            )

    return individual_children

# NOTE(review): in the visible file, parent_pairs and children are only bound
# as locals inside process_individual, so this module-level call looks like it
# would raise NameError - TODO confirm and pass the real containers.
descendants = trace_children(individual_id, parent_pairs, children)
# NOTE(review): the message says "ancestors" although the value comes from
# trace_children and is stored in `descendants` - confirm the intended wording.
print(f'The ancestors of individual {individual_id} are: {descendants}')


# Define a scoring function that counts the number of occurrences of the target surname in an ancestral line
def score_ancestral_line(ancestral_line_list):
    """Return how many entries contain the module-level ``target_surname``.

    Each entry is converted with ``str`` before the substring test, so
    tuples and other objects are matched on their printed form.
    """
    return sum(1 for entry in ancestral_line_list if target_surname in str(entry))


# Score every ancestral line, first per last pair and then cumulatively.
# NOTE(review): in the visible file `ancestral_lines` is only bound as a local
# inside process_individual (the module-level result is `last_pairs`), and
# `target_surname`, read by score_ancestral_line, is not assigned anywhere -
# TODO confirm where these are meant to come from.

# Initialize an empty dictionary to store the scores
scores = {}

# Materialise the (last_pair, ancestral_line) items as a list
ancestral_lines_list = list(ancestral_lines.items())


# Compute and store the scores for each last pair
for last_pair, ancestral_line in ancestral_lines_list:
    scores[last_pair] = score_ancestral_line(ancestral_line)


# Find the final-line with the highest score
# (max() raises ValueError when `scores` is empty)
final_pair = max(scores, key=scores.get)
final_score = scores[final_pair]

# Print the last pair and score for each ancestral line
#for last_pair, score in scores.items():
#    print(f"Last pair: {last_pair} (score: {score})")

# Print the final-line with a designation of 'final-line' to the highest score
# print(f"dnaline: {final_pair} (score: {final_score})")

# SCRIPT STABLE DNALINE AS OUTPUT-DONOR SCRIPT---DONOR SCRIPT---DONOR SCRIPT

# Initialize a dictionary to store the cumulative scores for each branch line
cumulative_scores = {}

# Calculate and store the cumulative scores for each branch line and generation.
# Keys of cumulative_scores are the individual entries of each ancestral line;
# an entry that appears in several lines accumulates across all of them.
for last_pair, ancestral_line in ancestral_lines_list:
    cumulative_score = 0
    for generation, line in enumerate(ancestral_line, start=1):
        generation_score = score_ancestral_line([line])
        cumulative_score += generation_score
        cumulative_scores.setdefault(line, 0)  # Initialize cumulative score for line if not exists
        cumulative_scores[line] += cumulative_score

# Print the cumulative scores for each branch line and generation
for line, cumulative_score in cumulative_scores.items():
    print(f"Ancestral line: {line}, Cumulative Score = {cumulative_score}")

# Find the final-line with the highest cumulative score
# (max() raises ValueError when `cumulative_scores` is empty)
final_line = max(cumulative_scores, key=cumulative_scores.get)
final_cumulative_score = cumulative_scores[final_line]

# Print the final-line with the highest cumulative score
print(f"dnaline: {final_line}, Cumulative Score = {final_cumulative_score}")

def has_both_parents(records, mother_id, father_id):
    """Return True only when both *mother_id* and *father_id* are in *records*."""
    return all(parent_id in records for parent_id in (mother_id, father_id))

# Re-initialise the module-level set of parent pairs that find_parents
# consults and extends while walking the tree.
visited_pairs = set()

def find_parents(individual_id, generation, records, parent_pairs, children, ancestral_lines, current_line=None):
    """Build a summary dict ('cM', 'Sort', 'Parent Pairs A10') for an individual.

    NOTE(review): this definition rebinds the name ``find_parents`` and so
    replaces the earlier function of the same name in this file. As written
    it cannot run to completion:

    * the call to ``find_parents`` below passes three positional arguments,
      while ``parent_pairs``, ``children`` and ``ancestral_lines`` have no
      defaults, so it raises TypeError;
    * ``generation_table`` is assigned inside this function without a
      ``global`` statement, which makes it a local that is read before it is
      assigned;
    * ``individual`` is not a parameter and is not assigned anywhere in the
      visible file; ``gedcom_instance`` is read from module scope.

    The returned keys are the ones the final loop of the script reads from
    ``process_individual``; presumably this body was meant to live under that
    name - TODO confirm and restructure.
    """
    if current_line is None:
        current_line = []

    # NOTE(review): three arguments for a six-parameter signature (see docstring).
    find_parents(individual_id, 1, records)

    # Concatenate the value of the anchor_gen1 attribute to the beginning of the generation table
    for dataset in gedcom_instance.filter_pool:
        if dataset.get_individual_id() == individual_id:
            anchor_gen1 = dataset.get_anchor_gen1()
            generation_table.loc[0] = [1, anchor_gen1]
            break

    # Highest generation first, with a fresh 0-based index.
    generation_table = generation_table.sort_values('Generation', ascending=False).reset_index(drop=True)

    individual_data = {}
    individual_data['cM'] = individual.get_extractable_cm()
    individual_data['Sort'] = individual.get_extractable_sort()

    # Parent pairs of the first ten rows, joined with '|'.
    individual_data['Parent Pairs A10'] = '|'.join([f"{row['Parent Pair']}" for index, row in generation_table[:10].iterrows()])
    return individual_data

# Define the function before calling it
def process_dataset(filter_pool, individual_id, generation_table):
    """Return the cM and sort values of the dataset matching *individual_id*.

    Args:
        filter_pool: Iterable of objects providing ``get_individual_id()``,
            ``get_extractable_cm()`` and ``get_extractable_sort()``.
        individual_id: Id to look for (compared with ``get_individual_id()``).
        generation_table: Accepted for call compatibility; not used.

    Returns:
        ``{'cM': ..., 'Sort': ...}`` for the first matching dataset, or an
        empty dict when none matches.
    """
    for dataset in filter_pool:
        if dataset.get_individual_id() == individual_id:
            return {
                'cM': dataset.get_extractable_cm(),
                'Sort': dataset.get_extractable_sort(),
            }
    return {}

# Initialize the Gedcom class and populate gedcom_datasets
gedcom_file_path = select_gedcom_file()
if not gedcom_file_path:
    # Everything below needs gedcom_instance; stop with a clear message
    # instead of failing later with NameError.
    raise SystemExit("No GEDCOM file available; nothing to process.")

gedcom_instance = Gedcom(gedcom_file_path)
gedcom_instance.parse_gedcom()

# Filter individuals
gedcom_instance.filter_individuals()

# You should define or fetch individual_id and generation_table somewhere here
# NOTE(review): both values are still placeholders, so the lookup below
# cannot match a real dataset until they are replaced.
individual_id = "some_id_here"  # Replace with actual value
generation_table = "some_table_here"  # Replace with actual value

# Now, you can call the function and store the result
result = process_dataset(gedcom_instance.filter_pool, individual_id, generation_table)

# Gather one (anchor name, id) tuple per filtered dataset.
# Each dataset is added exactly once, so the total printed below and the
# number of rows processed afterwards match the number of datasets.
individuals = [
    (dataset.get_anchor_gen1(), dataset.get_individual_id())
    for dataset in gedcom_instance.gedcom_datasets
]

# Display total records moved to smaller pile
print(f'Total Records Moved into Smaller Pile: {len(individuals)}')


# Call process_individual for every (name, id) tuple and turn the results
# into one DataFrame row each.
combined_df_rows = []
for name, individual_id in individuals:
    individual_data = process_individual(individual_id, gedcom_instance)
    cm, sort, parent_pairs_a10 = (
        individual_data['cM'],
        individual_data['Sort'],
        individual_data['Parent Pairs A10'],
    )
    # The 'Parent Pair' of row 0 of the module-level generation_table is
    # used as the most distant ancestor.
    most_distant_ancestor = generation_table.iloc[0]['Parent Pair']
    combined_df_rows += [[individual_id, name, sort, cm, most_distant_ancestor, parent_pairs_a10]]

combined_df = pd.DataFrame(
    combined_df_rows,
    columns=['ID#', 'Name', 'Match to', 'cM', 'Most Distant Ancestor', 'Ancestral Line A10'],
)

# Function to create hotlinks
def create_hotlink(row):
    """Return an HTML anchor linking the row's 'ID#' to its vertical chart page.

    *row* only needs to support ``row['ID#']``; the id is used both in the
    link target and as the visible link text.
    """
    person_id = row['ID#']
    href = (
        "https://yates.one-name.net/tng/verticalchart.php?personID="
        f"{person_id}&tree=tree1&parentset=0&display=vertical&generations=8"
    )
    return f'<a href="{href}" target="_blank">{person_id}</a>'

# Add the 'LUN#' column: one hotlink per row, built from that row's 'ID#'.
combined_df['LUN#'] = combined_df.apply(create_hotlink, axis=1)

# Put 'LUN#' in front of the ancestral line column.
combined_df = combined_df[
    ['ID#', 'Name', 'Match to', 'cM', 'Most Distant Ancestor', 'LUN#', 'Ancestral Line A10']
]

# Number the rows from 1 for display (the export below omits the index).
combined_df.index += 1

# Show every record, then write the frame to an Excel workbook.
print(combined_df)
combined_df.to_excel('/content/output.xlsx', index=False)

Enter prime_surname (default: Yates): 


NameError: ignored

In [1]:
!pip install pandas
!pip install python-gedcom

Collecting python-gedcom
  Downloading python_gedcom-1.0.0-py2.py3-none-any.whl (35 kB)
Installing collected packages: python-gedcom
Successfully installed python-gedcom-1.0.0
