<a href="https://colab.research.google.com/github/ronyates47/Gedcom-Utils/blob/main/another_start_2023_0918_1638_hrs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
#!pip install pandas
#!pip install python-gedcom
import glob
from gedcom.element.individual import IndividualElement
from gedcom.parser import Parser
import pandas as pd

generation_table = pd.DataFrame(columns=['Generation', 'Parent Pair'])
# Initialize the last_prime_surname variable

last_prime_surname = None

class Gedcom:
    def __init__(self, file_name):
        self.file_name = file_name
        self.gedcom_datasets = []
        self.filter_pool = []

    @staticmethod
    def get_standard_name(file_path):
        file_name = file_path.split('/')[-1]
        if '.' in file_name:
            file_name = file_name.rsplit('.', 1)[0]
        standard_name = file_name.replace(' ', '_').lower()
        return standard_name

    def parse_gedcom(self):
        with open(self.file_name, 'r', encoding='utf-8-sig') as f:
            gedcom_lines = f.readlines()

        current_dataset = None
        npfx_count = 0
        total_count = 0
        for line in gedcom_lines:
            parts = line.strip().split(' ', 2)
            level = int(parts[0])
            tag = parts[1]
            value = parts[2] if len(parts) > 2 else None

            if level == 0 and tag.startswith('@') and tag.endswith('@') and value == 'INDI':
                total_count += 1
                current_dataset = GedcomDataset(tag)
                self.gedcom_datasets.append(current_dataset)
            elif current_dataset is not None:
                if level == 1 and tag in ['NAME', 'FAMC']:
                    current_key = tag
                    current_dataset.add_extractable_detail(current_key, value)
                elif level == 2 and tag == 'NPFX':
                    npfx_count += 1
                    current_dataset.add_extractable_detail(tag, value)

        print(f'Parsed {total_count} total records')

        for dataset in self.gedcom_datasets:
            if dataset.get_extractable_NPFX():
                self.filter_pool.append(dataset)

class GedcomDataset:
    def __init__(self, gen_person):
        self.gen_person = gen_person
        self.extractable_detail = {}
        self.ancestral_line = []  # Initialize the ancestral_line attribute as an empty list

    def add_extractable_detail(self, key, value):
        self.extractable_detail[key] = value

    def get_gen_person(self):
        name = self.extractable_detail.get('NAME', '')
        first_name, last_name = name.split('/', 1)
        first_name = first_name.split(' ')[0]
        last_name = last_name.rstrip('/')
        self.anchor_gen1 = last_name.replace(" ", "") + first_name.replace(" ", "")
        return self.gen_person.strip('@')

    def get_anchor_gen1(self):
        return self.anchor_gen1

    def get_extractable_NPFX(self):
        return self.extractable_detail.get('NPFX', '')

    def get_extractable_cm(self):
        npfx_value = self.extractable_detail.get('NPFX', '')
        if '&' in npfx_value:
            cm_value = npfx_value.split('&')[0].strip()
        else:
            cm_value = npfx_value.strip()
        try:
            int(cm_value)
            return cm_value
        except ValueError:
            return 'error'

    def get_extractable_sort(self):
        npfx_value = self.extractable_detail.get('NPFX', '')
        if '&' in npfx_value:
            sort_value = npfx_value.split('&')[1].strip()
            return sort_value
        else:
            return ''

    def get_extractable_FAMC(self):
        return self.extractable_detail.get('FAMC', '').strip('@')

    def truncate_ancestral_line(self, surname):
        # Start at the most distant ancestor and work your way down
        for i in range(len(self.ancestral_line) - 1, -1, -1):
            # Check if the current ancestor's surname matches the inputted surname
            if self.ancestral_line[i].split('&')[0] == surname:
                # If it does, truncate the ancestral line at this point
                self.ancestral_line = self.ancestral_line[:i+1]
                break

    def populate_ancestral_line(self, records):
        # Code to populate the ancestral_line attribute with the individual's ancestral line
        individual_id = self.get_gen_person()
        self.find_ancestors(individual_id, records)

    def find_ancestors(self, individual_id, records):
        if individual_id not in records:
            return
        record = records[individual_id]
        famc_start = record.find('1 FAMC @') + 8
        famc_end = record.find('@', famc_start)
        famc_id = record[famc_start:famc_end]
        if famc_id not in records:
            return

        fam_record = records[famc_id]
        wife_start = fam_record.find('1 WIFE @') + 8
        wife_end = fam_record.find('@', wife_start)
        mother_id = fam_record[wife_start:wife_end]

        husb_start = fam_record.find('1 HUSB @') + 8
        husb_end = fam_record.find('@', husb_start)
        father_id = fam_record[husb_start:husb_end]

        if mother_id and mother_id in records:
            mother_record = records[mother_id]
            mother_name = extract_name(mother_record)
            self.ancestral_line.append(mother_name)
            self.find_ancestors(mother_id, records)

        if father_id and father_id in records:
            father_record = records[father_id]
            father_name = extract_name(father_record)
            self.ancestral_line.append(father_name)
            self.find_ancestors(father_id, records)

def select_gedcom_file():
    gedcom_files = glob.glob('*.ged')
    if not gedcom_files:
        print("No GEDCOM files found in the Colab contents.")
        return None

    print("List of GEDCOM files:")
    for i, file_name in enumerate(gedcom_files, 1):
        print(f"{i}. {file_name}")

    return gedcom_files[0]

# Read the GEDCOM file and split it into individual and family records
gedcom_file_path = select_gedcom_file()  # Add this line
with open(gedcom_file_path, 'r') as file:
    data = file.read()

# Split data by lines starting with '0 '
data = data.split('\n0 ')

# Function definitions
def extract_id(record):
    id_start = record.find('@') + 1
    id_end = record.find('@', id_start)
    return record[id_start:id_end]

def extract_name(record):
    name_start = record.find('1 NAME ') + 6
    name_end = record.find('\n', name_start)
    name = record[name_start:name_end]
    first_name, last_name = name.split('/', 1)
    first_name = first_name[:10] # Use slicing syntax to extract the first 10 characters of the first_name variable
    last_name = last_name[:10].rstrip('/') # Use slicing syntax to extract the first 10 characters of the last_name variable
    return last_name.replace(" ", "") + first_name.replace(" ", "")

# Process the records
records = {extract_id(record): record for record in data}

# Initialize global variables at the top of your script
all_branches = []
visited_pairs = set()
branch_score = 0  # Initialize the branch score for each branch
dna_line = []

# Existing function definitions and variables here, like 'all_branches' and 'visited_pairs'

def find_relations(individual_id, generation, records, direction="up"):
    global all_branches  # Declare all_branches as global

    # Initialize the branch score for each branch
    branch_score = 0

    if individual_id not in records:
        return

    record = records[individual_id]
    famc_start = record.find('1 FAMC @') + 8
    famc_end = record.find('@', famc_start)
    famc_id = record[famc_start:famc_end]
    if famc_id not in records:
        return

    fam_record = records[famc_id]
    wife_start = fam_record.find('1 WIFE @') + 8
    wife_end = fam_record.find('@', wife_start)
    mother_id = fam_record[wife_start:wife_end]

    husb_start = fam_record.find('1 HUSB @') + 8
    husb_end = fam_record.find('@', husb_start)
    father_id = fam_record[husb_start:husb_end]

    mother_name = extract_name(records[mother_id]) if mother_id in records else None
    father_name = extract_name(records[father_id]) if father_id in records else None

    bio_parent_pair = None
    if mother_name and father_name:
        bio_parent_pair = (mother_name, father_name)

    # Special interest in lines with 'surname'
    if mother_name and "Yates" in mother_name:
        branch_score += generation  # Increment the score by the generation number
    if father_name and "Yates" in father_name:
        branch_score += generation  # Increment the score by the generation number

    # Store bio_line info
    if bio_parent_pair:
        branch_info = {
            'bio_parent_pair': bio_parent_pair,
            'generation': generation,
            'branch_score': branch_score,
        }
        all_branches.append(branch_info)

def find_child_ids(individual_id, records):
    child_ids = []
    if individual_id not in records:
        return child_ids

    record = records[individual_id]
    fams_start = record.find('1 FAMS @')

    while fams_start != -1:
        fams_start += 8
        fams_end = record.find('@', fams_start)
        fams_id = record[fams_start:fams_end]

        if fams_id in records:
            fam_record = records[fams_id]
            chil_start = fam_record.find('1 CHIL @')

            while chil_start != -1:
                chil_start += 8
                chil_end = fam_record.find('@', chil_start)
                chil_id = fam_record[chil_start:chil_end]
                child_ids.append(chil_id)

                chil_start = fam_record.find('1 CHIL @', chil_end)

        fams_start = record.find('1 FAMS @', fams_end)

    return child_ids

# Global variable to store all branches (both upwards and downwards)
all_branches = []

# Function to find descendants
def find_descendants(individual_id, generation, records, direction="down"):
    global all_branches
    branch_score = 0  # Initialize the branch score for each branch

    # First, find the children of the individual
    child_ids = find_child_ids(individual_id, records)

    for child_id in child_ids:
        child_name = extract_name(records[child_id]) if child_id in records else None

        # If child name is of special interest, update the branch score
        if child_name and "Yates" in child_name:
            branch_score += generation  # Increment the score by the generation number

        # Store the info
        branch_info = {
            'bio_parent_pair': (extract_name(records[individual_id]), child_name),
            'generation': generation,
            'branch_score': branch_score
        }
        all_branches.append(branch_info)

        # Recursive call to find the descendants of the child
        find_descendants(child_id, generation + 1, records, direction)

def find_branches(individual_id, generation, records, branch_index, direction="up"):
    global all_branches

    if individual_id not in records:
        return

    record = records[individual_id]
    famc_start = record.find('1 FAMC @') + 8
    famc_end = record.find('@', famc_start)
    famc_id = record[famc_start:famc_end]

    # Check if this is a branch_head_pair
    is_branch_head = False
    if famc_id not in records:
        is_branch_head = True

    if not is_branch_head:
        fam_record = records[famc_id]
        # Extract mother and father IDs like you've done before

        # Rest of your code to identify mother and father

        if mother_name and father_name:
            bio_parent_pair = (mother_name, father_name)

            # Handle special interest in Yates surname
            if "Yates" in mother_name or "Yates" in father_name:
                branch_score = generation

            branch_info = {
                'bio_parent_pair': bio_parent_pair,
                'generation': generation,
                'branch_score': branch_score,
                'branch_index': branch_index,
                'is_branch_head': is_branch_head
            }
            all_branches.append(branch_info)

            # Recursive calls, incrementing the generation and keeping the same branch_index
            find_branches(mother_id, generation + 1, records, branch_index, direction)
            find_branches(father_id, generation + 1, records, branch_index, direction)
    else:
        # Mark this pair as a branch head
        branch_info = {
            'bio_parent_pair': None,  # Or whatever you'd like to represent the head
            'generation': generation,
            'branch_score': None,  # Or another placeholder
            'branch_index': branch_index,
            'is_branch_head': is_branch_head
        }
        all_branches.append(branch_info)

# Initialization
branch_index = 1  # You can make this dynamic based on the individual_id or any other criterion
all_branches = []

# Usage
find_branches("anchor_gen1_id_here", 0, records, branch_index)


def process_individual(individual_id, generation, records):
    global visited_pairs  # Declare visited_pairs as global
    global all_branches  # Declare all_branches as global
    global generation_table  # Declare generation_table as global

    if individual_id in visited_pairs:
        return
    visited_pairs.add(individual_id)  # Mark this individual as visited

    # Initialize branch_score for this function
    branch_score = 0

    # Call the function to find ancestors
    find_relations(individual_id, generation, records, direction="up")

    # Call the function to find descendants
    find_descendants(individual_id, generation, records, direction="down")

    # Make sure generation is an integer before trying to add it
    if isinstance(generation, int):
        # Check if the surname matches
        if "Yates" in extract_name(records[individual_id]):
            branch_score += generation
    else:
        print(f"Warning: generation is not an int, it's a {type(generation)}")

    # Initialize the generation table if it's not done yet
    if generation_table is None:
        generation_table = pd.DataFrame(columns=['Generation', 'Parent Pair'])

    # Sort or filter the branches
    sorted_branches = sorted(all_branches, key=lambda x: x['branch_score'], reverse=True)

    # Create a dictionary to return
    individual_data = {
        "sorted_branches": sorted_branches,
        "generation_table": generation_table,
    }

    return individual_data


# Function Definitions
def input_prime_surname(last_prime_surname=None):
    if last_prime_surname:
        last_name = last_prime_surname.capitalize()
    else:
        last_name = "Yates"
    return last_name

# Set the prime surname
prime_surname = input_prime_surname(last_prime_surname)
last_prime_surname = prime_surname

# Initialize gedcom_instance to None
gedcom_instance = None

# Let user select GEDCOM file
gedcom_file_path = select_gedcom_file()

if gedcom_file_path:
    gedcom_instance = Gedcom(gedcom_file_path)
    gedcom_instance.parse_gedcom()

# Initialize the list of individuals
individuals = []

if gedcom_instance:
    for dataset in gedcom_instance.filter_pool:
        individual_id = dataset.get_gen_person()
        last_name = dataset.get_anchor_gen1()
        individuals.append((last_name, individual_id))
    print(f'Records Moved into Smaller Pile {len(individuals)}')

# Initialize an empty dictionary to hold individual data
individual_data = {}

if gedcom_instance:
    for dataset in gedcom_instance.filter_pool:
        if dataset.get_gen_person() == individual_id:
            individual_data['cM'] = dataset.get_extractable_cm()
            individual_data['Sort'] = dataset.get_extractable_sort()
            break

# Replace 'Parent Pairs A10' with 'DNA Line'
individual_data['DNA Line'] = '|'.join([f"{row['Parent Pair']}" for index, row in generation_table.iterrows()])

gedcom_files = glob.glob('*.ged')
if gedcom_files:
    file_path = gedcom_files[0]
else:
    print("No GEDCOM files found.")
    # Read the GEDCOM file and split it into individual and family records
    with open(file_path, 'r') as file:
        data = file.read()
    data = data.split('\n0 ')
    records = {extract_id(record): record for record in data}

combined_df_rows = []
for name, individual_id in individuals:
    individual_data = process_individual(individual_id, 0, records)  # Assuming 0 is the starting generation number

    # Initialize variables to "N/A" (or some other default value)
    cm = "N/A"
    sort = "N/A"
    dna_line_data = "N/A"

    if individual_data is not None:  # Add this check
        # Check for each expected key before trying to access it
        cm = individual_data.get('cM', "N/A")
        sort = individual_data.get('Sort', "N/A")
        dna_line_data = individual_data.get('DNA Line', "N/A")

    most_distant_ancestor = generation_table.iloc[0]['Parent Pair'] if (generation_table is not None and len(generation_table) > 0) else "N/A"

    combined_df_rows.append([individual_id, name, sort, cm, most_distant_ancestor, parent_pairs_a10])

combined_df = pd.DataFrame(combined_df_rows, columns=['ID#', 'Name', 'Match to', 'cM', 'Most Distant Ancestor', 'Ancestral Line A10'])


# Function to create hotlinks
def create_hotlink(row):
    url_base = "https://yates.one-name.net/tng/verticalchart.php?personID="
    person_id = row['ID#']
    hotlink = f'<a href="{url_base}{person_id}&tree=tree1&parentset=0&display=vertical&generations=8" target="_blank">{person_id}</a>'
    return hotlink

# Apply the hotlink function to create the 'LUN#' column
combined_df['LUN#'] = combined_df.apply(lambda row: create_hotlink(row), axis=1)

# Change the order of the columns
combined_df = combined_df[['ID#', 'Name', 'Match to', 'cM', 'Most Distant Ancestor', 'LUN#', 'Ancestral Line A10']]

# Adjust index to start from 1 instead of 0
combined_df.index = combined_df.index + 1

# Print all records from the DataFrame
print(combined_df)

# Export the combined_df DataFrame to an Excel file
combined_df.to_excel('/content/output.xlsx', index=False)

import pandas as pd

# Convert all_branches to a DataFrame
all_branches_df = pd.DataFrame(all_branches)

# Export the DataFrame to an Excel file
all_branches_df.to_excel("all_branches.xlsx", index=False)

# Print each branch in a readable format to the console
#for i, branch in enumerate(all_branches):
#    print(f"Branch {i+1}:")
#    print(f"\tBio Parent Pair: {branch['bio_parent_pair']}")
#    print(f"\tGeneration: {branch['generation']}")
#    print(f"\tBranch Score: {branch['branch_score']}")
#    print("="*40)  # Separating line




List of GEDCOM files:
1. yates-one-name-study.ged
List of GEDCOM files:
1. yates-one-name-study.ged
Parsed 50776 total records
Records Moved into Smaller Pile 375
        ID#                Name Match to   cM Most Distant Ancestor  \
1    I13817        YatesWilliam      N/A  N/A                   N/A   
2    I21743          HuntKelsey      N/A  N/A                   N/A   
3    I23678       JohnsonDonald      N/A  N/A                   N/A   
4    I26925            HudsonJL      N/A  N/A                   N/A   
5    I31861         ChurchDebra      N/A  N/A                   N/A   
..      ...                 ...      ...  ...                   ...   
371  I50494       BeaversLaNell      N/A  N/A                   N/A   
372  I50511       StiversKaylee      N/A  N/A                   N/A   
373  I50522  FehrenbacherAngela      N/A  N/A                   N/A   
374  I50535      PetersonHannah      N/A  N/A                   N/A   
375  I50551      NicholsTheresa      N/A  N/A           

In [1]:
!pip install pandas
!pip install python-gedcom

Collecting python-gedcom
  Downloading python_gedcom-1.0.0-py2.py3-none-any.whl (35 kB)
Installing collected packages: python-gedcom
Successfully installed python-gedcom-1.0.0
