<a href="https://colab.research.google.com/github/ronyates47/Gedcom-Utils/blob/main/A_v_33_00_YatesStudy_stable_2024_gedcom.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas
!pip install python-gedcom
!pip install openpyxl

Collecting python-gedcom
  Downloading python_gedcom-1.0.0-py2.py3-none-any.whl (35 kB)
Installing collected packages: python-gedcom
Successfully installed python-gedcom-1.0.0


In [12]:
import csv
import glob
from gedcom.element.individual import IndividualElement
from gedcom.parser import Parser
import pandas as pd

anchor_gen1 = None

class GedcomDataset:
    """Details extracted for one GEDCOM individual (INDI record).

    `gen_person` is the raw xref tag (e.g. '@I12@'); `extractable_detail`
    maps GEDCOM tags ('NAME', 'FAMC', 'NPFX') to their raw values.
    """

    def __init__(self, gen_person):
        self.gen_person = gen_person
        self.extractable_detail = {}
        self.anchor_gen1 = None  # 'SurnameGiven' key; None until a NAME is known

    def add_extractable_detail(self, key, value):
        """Store (or overwrite) the raw value recorded for a GEDCOM tag."""
        self.extractable_detail[key] = value

    def _detail(self, key):
        """Return the stored value for `key`, treating missing/None as ''."""
        value = self.extractable_detail.get(key, '')
        return '' if value is None else value

    def _build_anchor(self):
        """Compose 'SurnameGiven' from the NAME detail ('Given Middle /Surname/')."""
        given_part, _, surname_part = self._detail('NAME').partition('/')
        first_name = given_part.split(' ')[0]
        last_name = surname_part.rstrip('/')
        return last_name.replace(" ", "") + first_name.replace(" ", "")

    def get_gen_person(self):
        """Return the individual's id without the surrounding '@'.

        Side effects kept from the original: refreshes `self.anchor_gen1` and
        mirrors it into the module-level `anchor_gen1` variable.
        """
        global anchor_gen1
        self.anchor_gen1 = self._build_anchor()
        anchor_gen1 = self.anchor_gen1
        return self.gen_person.strip('@')

    def get_anchor_gen1(self):
        """Return the 'SurnameGiven' key for this individual.

        The key is computed from NAME on demand, so the result no longer
        depends on get_gen_person() having been called first. Without a NAME
        detail the previously cached value (None by default) is returned.
        """
        if 'NAME' in self.extractable_detail:
            self.anchor_gen1 = self._build_anchor()
        return self.anchor_gen1

    def get_extractable_NPFX(self):
        """Return the raw NPFX value ('' when absent)."""
        return self._detail('NPFX')

    def get_extractable_cm(self):
        """Return the part of NPFX before the first '&' if it parses as an int, else 'error'.

        The value is returned as the stripped string, not as an int.
        """
        cm_value = self._detail('NPFX').split('&')[0].strip()
        try:
            int(cm_value)
        except ValueError:
            return 'error'
        return cm_value

    def get_extractable_sort(self):
        """Return the NPFX segment after the first '&' (stripped), or '' if there is no '&'."""
        npfx_value = self._detail('NPFX')
        if '&' not in npfx_value:
            return ''
        return npfx_value.split('&')[1].strip()

    def get_extractable_FAMC(self):
        """Return the FAMC family id without the surrounding '@' ('' when absent)."""
        return self._detail('FAMC').strip('@')

# Function definitions
def extract_id(record):
    """Return the text between the first two '@' characters of a GEDCOM record.

    For '@I12@ INDI ...' this is 'I12'. When the record has no '@' pair the
    result is whatever plain slicing on the str.find() values yields.
    """
    opening_at = record.find('@')
    closing_at = record.find('@', opening_at + 1)
    return record[opening_at + 1:closing_at]


def extract_name(record):
    """Build the compact 'SurnameGiven' key from a raw GEDCOM individual record.

    Reads the first '1 NAME ' line, splits it at the first '/', truncates both
    halves to 10 characters and removes blanks.

    Returns '' when the record has no NAME line; a name without '/' yields the
    given-name part only (previously both cases raised or returned garbage).
    """
    marker_pos = record.find('1 NAME ')
    if marker_pos == -1:
        return ''
    # Offset 6 (one less than the marker length) keeps the blank after 'NAME',
    # so that blank counts toward the 10-character cut below. This is kept on
    # purpose: the hard-coded lineage prefixes elsewhere in this notebook were
    # built from keys truncated this way.
    name_start = marker_pos + 6
    name_end = record.find('\n', name_start)
    # NAME may be the last line of the record (no trailing newline).
    name = record[name_start:] if name_end == -1 else record[name_start:name_end]
    first_name, _, last_name = name.partition('/')
    first_name = first_name[:10]
    last_name = last_name[:10].rstrip('/')
    return last_name.replace(" ", "") + first_name.replace(" ", "")

name_to_id = {}   # Global dictionary: anchor name -> individual ID (filled by Gedcom.parse_gedcom)


class Gedcom:
    """Parse one GEDCOM file and keep the individuals that carry an NPFX tag.

    After parse_gedcom(), `gedcom_datasets` holds one GedcomDataset per INDI
    record and `filter_pool` holds the subset that passed the filters.
    """

    def __init__(self, file_name):
        self.file_name = file_name    # path of the GEDCOM file
        self.gedcom_datasets = []     # every individual found in the file
        self.filter_pool = []         # individuals kept after filtering

    @staticmethod
    def get_standard_name(file_path):
        """Return the file's base name without extension, lower-cased, blanks as '_'."""
        file_name = file_path.split('/')[-1]
        if '.' in file_name:
            file_name = file_name.rsplit('.', 1)[0]
        return file_name.replace(' ', '_').lower()

    def parse_gedcom(self):
        """Scan the file, collect NAME/FAMC/NPFX per individual and build `filter_pool`.

        Also fills the module-level `name_to_id` map and prints record counts.
        """
        global name_to_id  # Declare name_to_id as global to modify it
        current_dataset = None
        npfx_count = 0
        total_count = 0

        with open(self.file_name, 'r', encoding='utf-8-sig') as f:
            for line in f:
                parts = line.strip().split(' ', 2)
                # Blank or malformed lines have no '<level> <tag>' pair: skip them.
                if len(parts) < 2:
                    continue
                try:
                    level = int(parts[0])
                except ValueError:
                    continue
                tag = parts[1]
                value = parts[2] if len(parts) > 2 else ''

                if level == 0:
                    if tag.startswith('@') and tag.endswith('@') and value == 'INDI':
                        total_count += 1
                        current_dataset = GedcomDataset(tag)
                        self.gedcom_datasets.append(current_dataset)
                    else:
                        # Any other level-0 record ends the current individual, so
                        # its level-1 lines are not attributed to that person.
                        current_dataset = None
                elif current_dataset is not None:
                    if level == 1 and tag in ('NAME', 'FAMC'):
                        current_dataset.add_extractable_detail(tag, value)
                    elif level == 2 and tag == 'NPFX':
                        npfx_count += 1
                        current_dataset.add_extractable_detail(tag, value)

        print(f'GEDCOM contained {total_count} total records')
        print(f'Records tagged and filtered by NPFX: {npfx_count}')

        # A person's NAME is only known after the record has been read, so the
        # name -> id map is filled here rather than when the INDI line is seen.
        for dataset in self.gedcom_datasets:
            individual_id = dataset.get_gen_person()
            name_to_id[dataset.get_anchor_gen1()] = individual_id

        # First level of filtering: keep those with an NPFX value
        self.filter_pool = [ds for ds in self.gedcom_datasets if ds.get_extractable_NPFX()]

        # Second level of filtering: optional manual ID list from Excel
        manual_filter_activated = True  # set to False to skip the Excel-based filter
        if manual_filter_activated:
            self.apply_manual_filter()

    def apply_manual_filter(self, file_path='filtered_ids.xlsx'):
        """Keep only the pool entries whose id appears in the 'ID' column of `file_path`.

        IDs are compared as strings. A missing file or missing 'ID' column
        leaves the pool unchanged and prints a notice.
        """
        import pandas as pd
        try:
            df = pd.read_excel(file_path)
        except FileNotFoundError:
            print(f"{file_path} not found. Skipping second-level manual filter.")
            return
        if 'ID' not in df.columns:
            print("Column 'ID' not found in the Excel file.")
            return

        manual_filtered_ids = set(df['ID'].astype(str))  # Ensure IDs are strings
        print(f"Manual filter IDs loaded: {len(manual_filtered_ids)}")
        # Debug output to verify IDs before and after filtering
        print("IDs before manual filter:", [ds.get_gen_person() for ds in self.filter_pool][:10])
        self.filter_pool = [ds for ds in self.filter_pool if str(ds.get_gen_person()) in manual_filtered_ids]
        print("IDs after manual filter:", [ds.get_gen_person() for ds in self.filter_pool][:10])
        print(f"After manual filter, total records: {len(self.filter_pool)}")

    def check_and_apply_exclusion_filter(self, file_path='/content/exclude_ids.xlsx'):
        """Drop pool entries whose id is listed in the 'ID' column of `file_path`, if it exists."""
        import os  # not imported at the top of this cell
        import pandas as pd

        if not os.path.exists(file_path):
            print(f"No exclusion filter applied, '{file_path}' not found. Check the path and ensure the file is uploaded to Colab.")
            return
        try:
            df_exclude = pd.read_excel(file_path)
            if 'ID' not in df_exclude.columns:
                print("Column 'ID' not found in the Excel file.")
                return
            exclude_ids = set(df_exclude['ID'].astype(str))  # Ensure conversion to string
            print(f"Exclusion filter IDs loaded: {len(exclude_ids)}")
            print(f"Sample of IDs to exclude: {list(exclude_ids)[:5]}")
            initial_count = len(self.filter_pool)
            self.filter_pool = [ds for ds in self.filter_pool if str(ds.get_gen_person()) not in exclude_ids]
            print(f"Excluded {initial_count - len(self.filter_pool)} records based on exclusion IDs.")
        except Exception as e:
            # Best-effort by design: a broken exclusion file must not abort the run.
            print(f"Failed to apply exclusion filter: {str(e)}")



def input_prime_surname(last_prime_surname=None):
    """Prompt for the prime surname.

    When a previous surname is supplied it is offered as the default and is
    returned if the user just presses Enter.
    """
    if not last_prime_surname:
        return input("Enter prime_surname: ")
    answer = input(f"Enter prime_surname (default: {last_prime_surname}): ")
    return answer if answer else last_prime_surname

def select_gedcom_file():
    """Return the first '*.ged' file (alphabetically) in the working directory, or None.

    glob.glob() returns matches in arbitrary order, so the list is sorted to
    make the choice reproducible between runs. The interactive selection loop
    that used to follow the return statement was unreachable and is removed.
    """
    gedcom_files = sorted(glob.glob('*.ged'))
    if not gedcom_files:
        print("No GEDCOM files found.")
        return None

    print("Automatically selecting the first GEDCOM file.")
    return gedcom_files[0]

gedcom_file_path = select_gedcom_file()  # first *.ged file in the working directory, or None
if not gedcom_file_path:
    # Everything below needs gedcom_instance and records; stop here with a
    # clear message instead of failing later with a NameError.
    raise FileNotFoundError("No GEDCOM (*.ged) file found in the working directory.")

# Use the selected GEDCOM file path to create an instance of the Gedcom class
gedcom_instance = Gedcom(gedcom_file_path)
gedcom_instance.parse_gedcom()

individuals = []  # (anchor name, individual id) for every dataset in the filter pool
for dataset in gedcom_instance.filter_pool:
    # get_gen_person() is called first because it also refreshes the anchor name.
    individual_id = dataset.get_gen_person()
    last_name = dataset.get_anchor_gen1()
    individuals.append((last_name, individual_id))

print(f'Records tagged and filtered by NPFX: {len(individuals)}')

# Read the file again and split it into raw level-0 records keyed by xref id.
# Same encoding as Gedcom.parse_gedcom(), so a BOM or non-ASCII names are
# decoded identically in both passes.
with open(gedcom_file_path, 'r', encoding='utf-8-sig') as file:
    data = file.read()
data = data.split('\n0 ')
records = {extract_id(record): record for record in data}

def has_both_parents(records, mother_id, father_id):
    """Return True when both the mother's and the father's id are keys of `records`."""
    return all(parent_id in records for parent_id in (mother_id, father_id))

visited_pairs = set()     # (father_id, mother_id) pairs already recorded
generation_table = []     # (generation, (father_id, mother_id)) in discovery order


def _linked_id(record, marker):
    """Return the xref id that follows `marker` (e.g. '1 FAMC @') in `record`, or '' if absent."""
    marker_pos = record.find(marker)
    if marker_pos == -1:
        return ''
    id_start = marker_pos + len(marker)
    id_end = record.find('@', id_start)
    if id_end == -1:
        return ''
    return record[id_start:id_end]


def find_parents(individual_id, generation, records, _visited=None):
    """Walk up the tree from `individual_id` and record each parent couple once.

    Appends (generation, (father_id, mother_id)) to the module-level
    `generation_table` for every couple whose two members are in `records`,
    using the module-level `visited_pairs` to avoid duplicates.

    `_visited` is internal: the set of individuals already expanded during
    this walk. It stops endless recursion on cyclic data and avoids walking a
    shared ancestor's subtree again (which could not add anything new).
    """
    if _visited is None:
        _visited = set()
    if individual_id in _visited or individual_id not in records:
        return
    _visited.add(individual_id)

    famc_id = _linked_id(records[individual_id], '1 FAMC @')
    if famc_id not in records:
        return

    fam_record = records[famc_id]
    mother_id = _linked_id(fam_record, '1 WIFE @')
    father_id = _linked_id(fam_record, '1 HUSB @')

    if mother_id and mother_id in records and father_id and father_id in records:
        parent_pair = (father_id, mother_id)
        if parent_pair not in visited_pairs:
            visited_pairs.add(parent_pair)
            generation_table.append((generation, parent_pair))

    if mother_id:
        find_parents(mother_id, generation + 1, records, _visited)

    if father_id:
        find_parents(father_id, generation + 1, records, _visited)

def extract_name(record):
    """Build the compact 'SurnameGiven' key from a raw GEDCOM individual record.

    Reads the first '1 NAME ' line, splits it at the first '/', truncates both
    halves to 10 characters and removes blanks.

    Returns '' when the record has no NAME line; a name without '/' yields the
    given-name part only (previously both cases raised or returned garbage).
    """
    marker_pos = record.find('1 NAME ')
    if marker_pos == -1:
        return ''
    # Offset 6 (one less than the marker length) keeps the blank after 'NAME',
    # so that blank counts toward the 10-character cut below. This is kept on
    # purpose: the hard-coded lineage prefixes elsewhere in this notebook were
    # built from keys truncated this way.
    name_start = marker_pos + 6
    name_end = record.find('\n', name_start)
    # NAME may be the last line of the record (no trailing newline).
    name = record[name_start:] if name_end == -1 else record[name_start:name_end]
    first_name, _, last_name = name.partition('/')
    first_name = first_name[:10]
    last_name = last_name[:10].rstrip('/')
    return last_name.replace(" ", "") + first_name.replace(" ", "")

def _xref_after(record, marker):
    """Return the xref id that follows `marker` in `record`, or '' when the tag is missing."""
    marker_pos = record.find(marker)
    if marker_pos == -1:
        return ''
    id_start = marker_pos + len(marker)
    id_end = record.find('@', id_start)
    return '' if id_end == -1 else record[id_start:id_end]


def find_distant_ancestors(individual_id, records, path=None):
    """Return every ancestral path (list of ids) that starts at `individual_id`.

    Each path begins with the individual and follows father and/or mother
    links until a person has no family record or no parent present in
    `records`. The father's branches are listed before the mother's.

    Returns [] only when `individual_id` itself is not in `records`. A person
    whose family lists no resolvable parent ends the path instead of dropping
    it, and an id that already occurs on the current path (cyclic data) also
    ends the path rather than recursing forever.
    """
    path = [] if path is None else path
    if individual_id in path:
        return [path]
    path.append(individual_id)

    if individual_id not in records:
        return []

    famc_id = _xref_after(records[individual_id], '1 FAMC @')
    if famc_id not in records:
        return [path]

    fam_record = records[famc_id]
    mother_id = _xref_after(fam_record, '1 WIFE @')
    father_id = _xref_after(fam_record, '1 HUSB @')

    paths = []
    for parent_id in (father_id, mother_id):
        if parent_id and parent_id in records:
            # Each branch continues on its own copy of the path walked so far.
            paths.extend(find_distant_ancestors(parent_id, records, list(path)))

    return paths if paths else [path]
# Convenience alias for the pool of datasets kept by Gedcom.parse_gedcom().
filtered_datasets = gedcom_instance.filter_pool

#global generation_table
#global visited_pairs

def calculate_score(distant_ancestors_paths, records, surname='Yates'):
    """Pick the ancestral path in which `surname` appears most and deepest.

    Every id on a path is converted to its name key with extract_name(); a
    name containing `surname` at position g (0-based) adds g + 1 to the
    path's score. The first path with the highest score wins.

    Returns (score, names_of_winning_path, ids_of_winning_path), or
    (0, [], []) when no paths are given.
    """
    name_paths = [
        [extract_name(records.get(person_id, '')) for person_id in path]
        for path in distant_ancestors_paths
    ]
    if not name_paths:
        return 0, [], []

    path_scores = [
        sum(position + 1 for position, name in enumerate(name_path) if surname in name)
        for name_path in name_paths
    ]
    # max() keeps the earliest index on ties, like the original dict-based lookup.
    winning_path_index = max(range(len(path_scores)), key=path_scores.__getitem__)

    return (
        path_scores[winning_path_index],
        name_paths[winning_path_index],
        distant_ancestors_paths[winning_path_index],
    )

def filter_ancestral_line(winning_path_ids, generation_table):
    """Keep the (generation, (id1, id2)) entries where either id lies on the winning path.

    The function used to be defined twice with identical bodies; this is the
    single remaining definition. Ids are looked up in a set so each entry is
    checked in constant time.
    """
    winning_ids = set(winning_path_ids)
    matching_table = []
    for generation, pair in generation_table:
        id1, id2 = pair
        if id1 in winning_ids or id2 in winning_ids:
            matching_table.append((generation, pair))
    return matching_table

# NOTE: a per-individual loop used to run here. It repeated the
# find_parents / find_distant_ancestors / calculate_score / filter_ancestral_line
# pipeline for every dataset but never stored or printed its result, so it only
# added run time. The loop further down that calls process_individual()
# produces the rows that are exported.

def process_individual(individual_id, gedcom_instance, records):
    """Build the exported ancestral-line string and cM/Sort values for one individual.

    Resets the module-level `generation_table` and `visited_pairs` (they are
    filled by find_parents), selects the winning ancestral path and joins the
    couples on it, oldest first, with '~~~'. The individual's own anchor name
    is the last element when the individual is found in the filter pool.

    Returns (individual_data, line_string) where individual_data has the keys
    'cM', 'Sort' and 'Filtered Ancestral Line'. 'cM' and 'Sort' are 'N/A' when
    the individual is not in `gedcom_instance.filter_pool`.
    """
    global generation_table
    global visited_pairs

    generation_table = []
    visited_pairs = set()

    find_parents(individual_id, 1, records)
    distant_ancestors_paths = find_distant_ancestors(individual_id, records)
    _, _, winning_path_ids = calculate_score(distant_ancestors_paths, records)
    filtered_ancestral_line = filter_ancestral_line(winning_path_ids, generation_table)
    filtered_ancestral_line.sort(key=lambda entry: entry[0])

    cm_value = 'N/A'
    sort_value = 'N/A'
    # Local on purpose: an individual that is not found must not inherit the
    # anchor name left behind by a previously processed person.
    anchor_name = None
    for dataset in gedcom_instance.filter_pool:
        if dataset.get_gen_person() == individual_id:
            cm_value = dataset.get_extractable_cm()
            sort_value = dataset.get_extractable_sort()
            anchor_name = dataset.get_anchor_gen1()
            break

    # Built youngest-first (anchor, then couples by generation), reversed below.
    filtered_ancestral_line_names = []
    if anchor_name is not None:
        filtered_ancestral_line_names.append(anchor_name)

    for _, pair in filtered_ancestral_line:
        name_pair = [extract_name(records.get(person_id, '')) for person_id in pair]
        filtered_ancestral_line_names.append(f"{name_pair[0]}&{name_pair[1]}")

    filtered_ancestral_line_names.reverse()
    filtered_ancestral_line_str = "~~~".join(filtered_ancestral_line_names)

    individual_data = {
        'cM': cm_value,
        'Sort': sort_value,
        'Filtered Ancestral Line': filtered_ancestral_line_str
    }

    return individual_data, filtered_ancestral_line_str

import pandas as pd
from datetime import datetime

# Module-level state: visited_pairs is shared with find_parents();
# combined_df_rows collects one output row per individual.
visited_pairs = set()
combined_df_rows = []

# Main Loop
for dataset in gedcom_instance.filter_pool:
    individual_id = dataset.get_gen_person()

    # Start each individual from empty traversal state.
    visited_pairs.clear()
    generation_table = []

    individual_data, filtered_ancestral_line_str = process_individual(individual_id, gedcom_instance, records)
    combined_df_rows.append([
        individual_id,
        individual_data['Sort'],
        extract_name(records.get(individual_id, '')),
        individual_data['cM'],
        filtered_ancestral_line_str,
    ])

# One row per individual, in filter-pool order.
columns = ['ID#', 'Match to', 'Name', 'cM', 'Yates DNA Ancestral Line']
combined_df = pd.DataFrame(data=combined_df_rows, columns=columns)

#**********************************************************************************WORKING
# Function to remove the named prefix from the 'Yates DNA Ancestral Line' column
def remove_prefix(row, prefix='YatesJohn&SearchingStill~~~YatesWilliam&SearchingStill~~~YatesWilliam&SearchingStill~~~YatesEdmund&CornellMargaret~~~YatesRichard&AshendonJoan~~~YatesJohn&HydeAlice~~~YatesThomas&WhiteFrances~~~'):
    """Strip `prefix` from the row's 'Yates DNA Ancestral Line' value when it starts with it.

    Designed for DataFrame.apply(..., axis=1). Rows whose value is not a
    string (e.g. NaN) or does not start with the prefix are returned as is.
    A changed row is returned as a copy, because functions passed to apply
    should not mutate the object they receive.
    """
    ancestral_line = row['Yates DNA Ancestral Line']
    if isinstance(ancestral_line, str) and ancestral_line.startswith(prefix):
        row = row.copy()
        row['Yates DNA Ancestral Line'] = ancestral_line[len(prefix):]
    return row

# Run remove_prefix on every row (axis=1 passes each row as a Series).
combined_df = combined_df.apply(remove_prefix, axis=1)
#**********************************************************************************WORKING

# Function to add hotlinks
def create_hotlink(row):
    """Return an HTML anchor to the person's vertical chart, or '' when 'ID#' is null.

    The id is used both as the personID query value and as the link text.
    """
    person_id = row['ID#']
    if pd.isnull(person_id):
        return ''  # Return an empty string for null values
    url_base = "https://yates.one-name.net/tng/verticalchart.php?personID="
    additional_params = "&tree=tree1&parentset=0&display=vertical&generations=15"
    target = f"{url_base}{person_id}{additional_params}"
    return f'<a href="{target}">{person_id}</a>'

# Add the HTML link column (one anchor per row).
combined_df['Link'] = combined_df.apply(create_hotlink, axis=1)

# Keep only the exported columns, then order rows by ancestral line and
# match name, both descending.
ordered_columns = ['Match to', 'Name', 'cM', 'Link', 'Yates DNA Ancestral Line']
combined_df = combined_df[ordered_columns].sort_values(
    by=['Yates DNA Ancestral Line', 'Match to'],
    ascending=[False, False],
)

# Save the version with hotlinks
output_excel_with_links = '/content/1_data_with-Hotlinks.xlsx'
combined_df.to_excel(output_excel_with_links, index=False)
print(f"Excel file with hotlinks saved at: {output_excel_with_links}")

import pandas as pd
from datetime import datetime

# combined_df comes from the code above and carries the 'Link' column with
# one HTML anchor per row.

# Work on a copy so the hotlink version stays untouched.
combined_df_no_links = combined_df.copy()

# strip_links() below reduces an anchor to its visible text (the plain ID);
# values without an '<a href=' tag are passed through unchanged.
def strip_links(link_text):
    """Return the visible text of an '<a href=...>text</a>' string.

    Text that does not contain '<a href=' is returned unchanged.
    """
    if '<a href=' not in link_text:
        return link_text
    # Everything after the last '">' ...
    after_tag = link_text.rpartition('">')[2]
    # ... up to the first closing tag.
    return after_tag.partition('</a>')[0]

# Replace every anchor in 'Link' by its plain text.
combined_df_no_links['Link'] = combined_df_no_links['Link'].apply(strip_links)

# Timestamped file name so earlier exports are not overwritten.
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
output_excel_no_links = f'/content/2_data_NO_Hotlinks_{timestamp}.xlsx'

# Saving is best-effort: a failure is reported but does not stop the cell.
try:
    combined_df_no_links.to_excel(output_excel_no_links, index=False)
except Exception as e:
    print(f"Failed to save the Excel file without hotlinks: {e}")
else:
    print(f"Excel file without hotlinks saved at: {output_excel_no_links}")

##########################################################################################################
##########################################################################################################
##########################################################################################################

Automatically selecting the first GEDCOM file.
GEDCOM contained 55730 total records
Records tagged and filtered by NPFX: 1045
filtered_ids.xlsx not found. Skipping second-level manual filter.
Records tagged and filtered by NPFX: 1045
Excel file with hotlinks saved at: /content/1_data_with-Hotlinks.xlsx
Excel file without hotlinks saved at: /content/2_data_NO_Hotlinks_2024-05-14_205411.xlsx


In [10]:
#Super visualize

import pandas as pd

def remove_prefix(row, prefix='YatesEdmund&CornellMargaret~~~YatesRichard&AshendonJoan~~~YatesJohn&HydeAlice~~~YatesThomas&WhiteFrances~~~'):
    """Strip `prefix` from the row's 'Yates DNA Ancestral Line' value when it starts with it.

    Designed for DataFrame.apply(..., axis=1). Non-string values (blank Excel
    cells are read back as NaN) and lines without the prefix are returned
    unchanged. A changed row is returned as a copy. The per-row debug print
    was removed; it produced one output line per record.
    """
    ancestral_line = row['Yates DNA Ancestral Line']
    if isinstance(ancestral_line, str) and ancestral_line.startswith(prefix):
        row = row.copy()
        row['Yates DNA Ancestral Line'] = ancestral_line[len(prefix):]
    return row

def load_data(file_path):
    """Read an Excel sheet, strip the lineage prefix from every row and return a list of row dicts.

    Prints the first rows before and after the prefix removal.
    """
    raw_df = pd.read_excel(file_path)
    print("Initial data loaded. Sample rows before processing:")
    print(raw_df.head())

    stripped_df = raw_df.apply(remove_prefix, axis=1)
    print("Data after removing prefix. Sample rows after processing:")
    print(stripped_df.head())

    return stripped_df.to_dict('records')

def filter_data(main_data, exclude_data):
    """Return the records of `main_data` whose ancestral line does not occur in `exclude_data`.

    Both arguments are iterables of dicts with a 'Yates DNA Ancestral Line' key.
    """
    line_key = 'Yates DNA Ancestral Line'
    excluded_lines = {item[line_key] for item in exclude_data}
    kept_records = []
    for record in main_data:
        if record[line_key] in excluded_lines:
            continue
        kept_records.append(record)
    return kept_records

def process_data(data, starting_plane=8, min_fq=3):
    """Expand each ancestral line into (generation, parents, couple) rows and aggregate them.

    Parameters:
        data: records (list of dicts or DataFrame) with 'Yates DNA Ancestral
            Line' and 'cM'; 'ID' is optional.
        starting_plane: generation number given to the first node of a line.
        min_fq: minimum number of occurrences for a group to be reported.

    Returns a DataFrame with columns 'Gen #', 'Parents', 'Offspring & Spouse',
    'FQ' (group size) and 'QI' (mean cM, truncated to int; 0 when no numeric
    cM exists).

    NOTE(review): the first node of every line has no parent (None) and is
    therefore left out by groupby's default handling of missing keys. This
    matches the previous output; confirm whether that is intended.
    """
    result_columns = ['Gen #', 'Parents', 'Offspring & Spouse', 'FQ', 'QI']
    df = pd.DataFrame(data)
    expanded_data = []

    for _, row in df.iterrows():
        line = row['Yates DNA Ancestral Line']
        if not isinstance(line, str):
            continue  # blank cells come back from Excel as NaN
        nodes = line.split('~~~')
        for i, node in enumerate(nodes):
            expanded_data.append({
                'Gen #': i + starting_plane,
                'Offspring & Spouse': node,
                # The parent couple is simply the previous node on the same line.
                'Parents': nodes[i - 1] if i > 0 else None,
                'cM': row['cM'],
                'ID': row.get('ID', '')  # Ensure ID is optionally included
            })

    if not expanded_data:
        return pd.DataFrame(columns=result_columns)

    expanded_df = pd.DataFrame(expanded_data)
    # Coerce before aggregating: non-numeric markers such as 'error' must not
    # reach the mean (coercing the already aggregated column was too late).
    expanded_df['cM'] = pd.to_numeric(expanded_df['cM'], errors='coerce')
    results_df = expanded_df.groupby(['Gen #', 'Parents', 'Offspring & Spouse']).agg(
        FQ=('Offspring & Spouse', 'size'),
        QI=('cM', 'mean')
    ).reset_index()
    results_df['QI'] = results_df['QI'].fillna(0).astype(int)
    return results_df[results_df['FQ'] >= min_fq]

def main():
    """Load main and exclusion sheets, drop excluded lines, and write both result workbooks."""
    main_data = load_data('/content/main_data_dict.xlsx')
    exclude_data = load_data('/content/exclude_ids.xlsx')
    remaining_data = filter_data(main_data, exclude_data)

    # Records that survived the exclusion, saved for inspection.
    pd.DataFrame(remaining_data).to_excel('/content/remaining_desired_dict.xlsx', index=False)

    # Aggregated generation table.
    process_data(remaining_data).to_excel('/content/generations_fq-qi.xlsx', index=False)
    print("Final output saved. generations_fq-qi.xlsx.")


if __name__ == "__main__":
    main()


Initial data loaded. Sample rows before processing:
         Match to               Name  cM  \
0          marmar      YatesLucilleA  33   
1          marmar     StroudTerriLei  24   
2  yates,timothyb   MottelerSamantha  19   
3     yates,johnh     PerkinsTeresaG  13   
4    yates,ronald  LeavertonMelissaA  11   

                                                Link  \
0  <a href="https://yates.one-name.net/tng/vertic...   
1  <a href="https://yates.one-name.net/tng/vertic...   
2  <a href="https://yates.one-name.net/tng/vertic...   
3  <a href="https://yates.one-name.net/tng/vertic...   
4  <a href="https://yates.one-name.net/tng/vertic...   

                            Yates DNA Ancestral Line  
0  YatesWilliamT&ShelhorseMaryPoll~~~YatesThomas&...  
1  YatesWilliamT&ShelhorseMaryPoll~~~YatesThomas&...  
2  YatesWilliamR&DavisMalissa~~~YatesWilliamE&Kay...  
3  YatesWilliamP&McKinneyElizabeth~~~YatesJamesMc...  
4  YatesWilliamP&McKinneyElizabeth~~~YatesJamesMc...  
Processing line:

In [16]:


import pandas as pd

# Define the prefix globally (also used by the verification step below)
prefix_to_remove = 'YatesJohn&SearchingStill~~~YatesWilliam&SearchingStill~~~YatesWilliam&SearchingStill~~~YatesEdmund&CornellMargaret~~~YatesRichard&AshendonJoan~~~YatesJohn&HydeAlice~~~YatesThomas&WhiteFrances~~~'

def remove_prefix(row):
    """Strip the module-level `prefix_to_remove` from the row's 'Yates DNA Ancestral Line'.

    Designed for DataFrame.apply(..., axis=1). Non-string values (blank Excel
    cells are read back as NaN) and lines without the prefix are returned
    unchanged; a changed row is returned as a copy.
    """
    ancestral_line = row['Yates DNA Ancestral Line']
    if isinstance(ancestral_line, str) and ancestral_line.startswith(prefix_to_remove):
        row = row.copy()
        row['Yates DNA Ancestral Line'] = ancestral_line[len(prefix_to_remove):]
    return row

# combined_df is expected to exist already; to start from a file instead:
# combined_df = pd.read_excel('path_to_your_data.xlsx')

# Strip the prefix row by row.
combined_df = combined_df.apply(remove_prefix, axis=1)

# Verification step: does any line still start with the prefix?
still_prefixed = combined_df['Yates DNA Ancestral Line'].str.startswith(prefix_to_remove)
sample_check = still_prefixed.any()
if sample_check:
    print("Some prefixes not removed, please check the data.")
else:
    print("All prefixes successfully removed.")



def load_data(file_path):
    """Read an Excel sheet and return it as a DataFrame with the lineage prefix stripped from every row."""
    raw_df = pd.read_excel(file_path)
    return raw_df.apply(remove_prefix, axis=1)

def search_for_specific_line(df, line_to_search):
    """Report whether `line_to_search` occurs in any 'Yates DNA Ancestral Line' value.

    The text is matched literally (regex=False); previously it was interpreted
    as a regular expression, so characters such as '.' or '(' in a name could
    match the wrong rows or raise. Missing values never match.

    Returns the matching rows (empty DataFrame when nothing matches).
    """
    found = df[df['Yates DNA Ancestral Line'].str.contains(line_to_search, na=False, regex=False)]
    if not found.empty:
        print("Line still found after prefix removal:", found)
    else:
        print("Line not found, prefix removal successful.")
    return found

def main():
    """Load the main sheet (prefix stripped) and check that one known full line no longer occurs in it."""
    target_line = 'YatesJohn&SearchingStill~~~YatesWilliam&SearchingStill~~~YatesWilliam&SearchingStill~~~YatesEdmund&CornellMargaret~~~YatesRichard&AshendonJoan~~~YatesJohn&HydeAlice~~~YatesThomas&WhiteFrances~~~YatesJohnThom&HatfieldeElizabeth~~~YatesRobertJo&DysonMary~~~YatesJosephCh&MarcelisHuybertje~~~YatesRobertJo&DeGraafMargaret~~~YatesJames&JonesMaryAnn~~~YatesDavid&DavisHepsathe~~~YatesJohnFran&StreetRachel~~~YatesWilliam&HartleyRachelMi~~~GriffithCarlAlvi&YatesMaudAlzo~~~GriffithAlvinAll&TracyRoseMary~~~ShuckMelvinEa&GriffithLorettaJ~~~ShuckHarley'
    search_for_specific_line(load_data('/content/main_data_dict.xlsx'), target_line)


if __name__ == "__main__":
    main()



All prefixes successfully removed.
Line not found, prefix removal successful.


In [8]:
#visualize

import pandas as pd
import re
import os

def load_data_to_dict(file_path, columns):
    """Read the given columns of an Excel sheet and return the rows as a list of dicts.

    Prints how many rows were read.
    """
    frame = pd.read_excel(file_path, usecols=columns)
    row_count = len(frame)
    print(f"Initial data load: {row_count} rows")
    return frame.to_dict(orient='records')

def extract_id_from_link(data_dict):
    """Add an 'ID' key to every entry whose 'Link' contains 'personID=I<digits>'.

    `data_dict` is a list of row dicts and is modified in place and returned.
    Entries whose 'Link' is missing or not a string (blank Excel cells are
    read back as NaN) are skipped instead of raising.
    """
    pattern = re.compile(r'personID=(I\d+)')
    count_with_ids = 0
    for entry in data_dict:
        link = entry.get('Link')
        if not isinstance(link, str):
            continue
        match = pattern.search(link)
        if match:
            entry['ID'] = match.group(1)
            count_with_ids += 1
    print(f"Extracted IDs from HTML: {count_with_ids} rows have IDs")
    return data_dict

def apply_exclusion_filter(data_dict, exclude_file_path):
    """Split the entries into (remaining, excluded) using the 'ID' column of the exclusion workbook.

    When the workbook does not exist, the input is returned untouched together
    with an empty excluded list.
    """
    if not os.path.exists(exclude_file_path):
        print(f"{exclude_file_path} not found, no exclusion filter applied.")
        return data_dict, []

    exclude_ids = set(pd.read_excel(exclude_file_path)['ID'].astype(str))
    remaining, excluded = [], []
    for entry in data_dict:
        bucket = excluded if entry.get('ID') in exclude_ids else remaining
        bucket.append(entry)
    print(f"Excluded records count: {len(excluded)}")
    print(f"Expected remaining records count: {len(remaining)}")
    return remaining, excluded

def save_and_reload_data(remaining_data, filename='remaining_data.xlsx'):
    """Write the records to `filename` as an Excel sheet and return the sheet read back as a DataFrame."""
    pd.DataFrame(remaining_data).to_excel(filename, index=False)
    print(f"Data saved to {filename}.")
    reloaded = pd.read_excel(filename)
    return reloaded

def process_data(data_dict, starting_plane=8, min_fq=3):
    """Expand each ancestral line into (generation, parents, couple) rows and aggregate them.

    Parameters:
        data_dict: records (list of dicts or DataFrame) with 'Yates DNA
            Ancestral Line' and 'cM'; 'ID' is optional.
        starting_plane: generation number given to the first node of a line.
        min_fq: minimum number of occurrences for a group to be reported.

    Returns a DataFrame indexed by ('Gen #', 'Parents', 'Offspring & Spouse')
    with columns 'FQ' (group size) and 'QI' (mean cM, truncated to int; 0 when
    no numeric cM exists). Prints the number of input and output rows.

    NOTE(review): the first node of every line has no parent (None) and is
    therefore left out by groupby's default handling of missing keys. This
    matches the previous output; confirm whether that is intended.
    """
    index_columns = ['Gen #', 'Parents', 'Offspring & Spouse']
    df = pd.DataFrame(data_dict)
    print(f"Data loaded into DataFrame for processing: {len(df)} rows")

    expanded_data = []
    for _, row in df.iterrows():
        line = row['Yates DNA Ancestral Line']
        if not isinstance(line, str):
            continue  # blank cells come back from Excel as NaN
        nodes = line.split('~~~')
        for i, node in enumerate(nodes):
            expanded_data.append({
                'Gen #': i + starting_plane,
                'Offspring & Spouse': node,
                # The parent couple is simply the previous node on the same line.
                'Parents': nodes[i - 1] if i > 0 else None,
                'cM': row['cM'],
                'ID': row.get('ID')
            })

    if expanded_data:
        expanded_df = pd.DataFrame(expanded_data)
        # Coerce before aggregating: non-numeric markers such as 'error' must
        # not reach the mean (coercing the aggregated column was too late).
        expanded_df['cM'] = pd.to_numeric(expanded_df['cM'], errors='coerce')
        results_df = expanded_df.groupby(index_columns).agg(
            FQ=('Offspring & Spouse', 'size'),
            QI=('cM', 'mean')
        ).reset_index()
        results_df['QI'] = results_df['QI'].fillna(0).astype(int)
    else:
        results_df = pd.DataFrame(columns=index_columns + ['FQ', 'QI'])

    # set_index returns a new frame; no in-place change on a filtered slice.
    final_results_df = results_df[results_df['FQ'] >= min_fq].set_index(index_columns)
    print(f"Final data ready for output: {len(final_results_df)} rows")

    return final_results_df

def main():
    """Run the pipeline: load, extract IDs, exclude, save and reload, aggregate, export."""
    file_path = '/content/main_data_dict.xlsx'
    if not os.path.exists(file_path):
        print(f"File {file_path} not found.")
        return

    data_dict = extract_id_from_link(
        load_data_to_dict(file_path, ["cM", "Yates DNA Ancestral Line", "Link"])
    )
    remaining_data, excluded_data = apply_exclusion_filter(data_dict, '/content/exclude_ids.xlsx')
    remaining_df = save_and_reload_data(remaining_data)
    final_df = process_data(remaining_df)
    final_df.to_excel('generations_fq-qi.xlsx', index=True)
    print("Final output saved. generations_fq-qi.xlsx.")


if __name__ == "__main__":
    main()


#***************************************************************************************************************
#***************************************************************************************************************
#***************************************************************************************************************

Initial data load: 1045 rows
Extracted IDs from HTML: 1045 rows have IDs
Excluded records count: 334
Expected remaining records count: 711
Data saved to remaining_data.xlsx.
Data loaded into DataFrame for processing: 711 rows
Final data ready for output: 288 rows
Final output saved. generations_fq-qi.xlsx.
