<a href="https://colab.research.google.com/github/ronyates47/Gedcom-Utils/blob/main/REFACTOR_Gold__1_%26_2_%26_3_20251014.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
!pip install pandas
!pip install python-gedcom
!pip install openpyxl
!pip install xlsxwriter
!pip install mlxtend
!pip caas_jupyter_tools

ERROR: unknown command "caas_jupyter_tools"


In [21]:
#credentials
# SECURITY: this cell used to hard-code the Gmail app password and the FTP password.
# Secrets that were committed with the notebook must be treated as leaked and rotated.
# Secrets are now read from the existing environment or requested without echo.

import os
from getpass import getpass


def _ensure_env(name, default=None, secret=False):
    """Make sure os.environ[name] is populated without embedding secrets in the notebook.

    A value that is already exported always wins. Otherwise a non-secret setting falls
    back to `default`, and a secret is requested interactively via getpass (no echo).
    When no interactive input is available the variable is left unset and a warning is
    printed, so later code fails with a clear KeyError instead of using a stale secret.
    """
    if os.environ.get(name):
        return
    if not secret:
        if default is not None:
            os.environ[name] = default
        return
    try:
        value = getpass(f"Enter {name}: ")
    except (EOFError, NotImplementedError) as exc:
        # EOFError: no stdin; NotImplementedError: front end without input support.
        print(f"⚠️  {name} is not set and could not be prompted for ({exc!r}).")
        return
    if value:
        os.environ[name] = value


# Gmail SMTP creds
_ensure_env('GMAIL_USER', 'yatesvilleron@gmail.com')
_ensure_env('GMAIL_APP_PASSWORD', secret=True)

# FTPS upload creds — make sure FTP_PASS is exactly your password, no < or >
_ensure_env('FTP_HOST', 'ftp.one-name.net')
_ensure_env('FTP_PORT', '21')
_ensure_env('FTP_USER', 'admin@yates.one-name.net')
_ensure_env('FTP_PASS', secret=True)


In [23]:
# Cell 1 20250513
#!/usr/bin/env python
"""
GEDCOM Composite Score Script using:
 - Chunk-based Parallel Processing for Speed (Stage 1: genealogical line creation)
 - A Trie-based approach, then final "Value" = 5 * (number of couples with node.count >=2) + (total couples)

For ancestral lines where none of the couples are repeated (a one-off line), the Value is still computed.
Now, instead of composite scoring, two new columns are added:
  - Value Range (the numeric bracket)
  - Value Label (a descriptive label)

Exports final CSV/HTML sorted by "Yates DNA Ancestral Line", including a 'haplogroup' column.
"""
import csv
import glob
import logging
import functools
import os
from datetime import datetime
from collections import defaultdict, Counter
import numpy as np
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
from IPython.display import display, Javascript

# Configure logging once for the notebook: INFO level, timestamped messages.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

###############################################################################
# Global Variables
###############################################################################
# "<Surname><Given>" key of the record most recently passed through
# GedcomDataset.get_gen_person(), which writes it via `global`.
anchor_gen1 = None
# Scratch state filled by find_parents(); process_record_wrapper() resets both
# before handling each individual.
visited_pairs = set()
generation_table = []

###############################################################################
# Trie Data Structure
###############################################################################
class TrieNode:
    """One vertex of the couples trie: a pass-through counter plus child vertices."""
    def __init__(self):
        # children maps a couple string to the TrieNode reached through that couple;
        # count is how many inserted lines have passed through this vertex.
        self.children, self.count = {}, 0

class Trie:
    """Prefix tree over ancestral lines in which every edge is one couple string."""
    def __init__(self):
        self.root = TrieNode()

    def insert_line(self, couples_list):
        """Walk (and extend where needed) the trie along couples_list.

        Every node reached on the way has its count incremented by one.
        """
        node = self.root
        for key in couples_list:
            child = node.children.get(key)
            if child is None:
                child = TrieNode()
                node.children[key] = child
            child.count += 1
            node = child

    def get_couple_count(self, couples_list):
        """Return the node counts met while following couples_list from the root.

        The walk stops at the first couple that is not present; a 0 is recorded for
        that couple and nothing is reported for the couples after it.
        """
        tallies = []
        node = self.root
        for key in couples_list:
            node = node.children.get(key)
            if node is None:
                tallies.append(0)
                break
            tallies.append(node.count)
        return tallies

###############################################################################
# Utility: chunk generator
###############################################################################
def chunks(lst, n):
    """Yield consecutive slices of lst, each holding at most n items."""
    yield from (lst[offset:offset + n] for offset in range(0, len(lst), n))

###############################################################################
# GedcomDataset
###############################################################################
class GedcomDataset:
    """Details extracted from one GEDCOM INDI record.

    `gen_person` is the record's cross-reference id as it appears in the file
    (with surrounding '@'). `extractable_detail` maps a GEDCOM tag (NAME, FAMC,
    NPFX) to the raw text stored for it. The NPFX text is read as
    "<cM>&<match-to>**<haplogroup>", where the "&..." and "**..." parts are optional.

    Values stored as None (a tag line that carried no value) are treated as empty
    strings by every getter instead of raising.
    """
    def __init__(self, gen_person):
        self.gen_person = gen_person
        self.extractable_detail = {}
        self.anchor_gen1 = None

    def add_extractable_detail(self, key, value):
        """Store (or overwrite) the raw value recorded for a GEDCOM tag."""
        self.extractable_detail[key] = value

    def _detail(self, key):
        """Return the stored text for key; a missing or None value becomes ''."""
        value = self.extractable_detail.get(key)
        return '' if value is None else value

    def get_gen_person(self):
        """Return the record id without '@'.

        Side effect: recomputes the "<Surname><Given>" key from NAME and stores it on
        both self.anchor_gen1 and the module-level `anchor_gen1`.
        """
        name = self._detail('NAME')
        given_part, _, surname_part = name.partition('/')
        first_name = given_part.split(' ')[0]
        last_name = surname_part.rstrip('/')
        self.anchor_gen1 = last_name.replace(" ", "") + first_name.replace(" ", "")
        global anchor_gen1
        anchor_gen1 = self.anchor_gen1
        return self.gen_person.strip('@')

    def get_extractable_NPFX(self):
        """Return the raw NPFX text, or '' when the record has none."""
        return self._detail('NPFX')

    def get_extractable_cm(self):
        """Return the leading cM field of NPFX as a string, or '' if it is not an integer."""
        npfx_value = self._detail('NPFX')
        if '&' in npfx_value:
            cm_value = npfx_value.split('&')[0].strip()
        elif '**' in npfx_value:
            cm_value = npfx_value.split('**')[0].strip()
        else:
            cm_value = npfx_value.strip()
        try:
            int(cm_value)  # validation only; the original text is what gets returned
        except ValueError:
            return ''
        return cm_value

    def get_extractable_sort(self):
        """Return the text between '&' and an optional '**' in NPFX, or '' without '&'."""
        npfx_value = self._detail('NPFX')
        if '&' not in npfx_value:
            return ''
        sort_part = npfx_value.split('&')[1]
        return sort_part.split('**')[0].strip()

    def get_extractable_YDNA(self):
        """Return the text following the first '**' in NPFX, or '' when there is none."""
        npfx_value = self._detail('NPFX')
        if '**' not in npfx_value:
            return ''
        return npfx_value.split('**')[1].strip()

    def get_extractable_FAMC(self):
        """Return the FAMC pointer without '@', or '' when the record has none."""
        return self._detail('FAMC').strip('@')

###############################################################################
# Gedcom Class
###############################################################################
class Gedcom:
    """Reads a GEDCOM file and collects the individuals that carry an NPFX tag.

    After parse_gedcom(), `gedcom_datasets` holds one GedcomDataset per INDI record
    and `filter_pool` holds the subset whose NPFX text is non-empty (optionally
    narrowed further by the ids listed in filtered_ids.xlsx).
    """
    def __init__(self, file_name):
        self.file_name = file_name
        self.gedcom_datasets = []
        self.filter_pool = []

    def parse_gedcom(self):
        """Parse the file, fill gedcom_datasets/filter_pool and return the autosomal count.

        The autosomal count is the number of non-empty NPFX values minus those that
        contain '**' (the marker this notebook uses for Y-DNA information).
        Blank lines and lines without a numeric level are skipped instead of aborting
        the whole parse.
        """
        current_dataset = None
        npfx_count = 0
        ydna_count = 0
        total_count = 0

        with open(self.file_name, 'r', encoding='utf-8-sig') as f:
            for line in f:
                parts = line.strip().split(' ', 2)
                if len(parts) < 2:
                    continue  # blank line or a lone token: nothing to extract
                try:
                    level = int(parts[0])
                except ValueError:
                    continue  # not a "<level> <tag> ..." line
                tag = parts[1]
                value = parts[2] if len(parts) > 2 else ''

                if level == 0:
                    # Every level-0 line starts a new record. Only INDI records are
                    # collected; for any other record type the current dataset is
                    # cleared so its sub-tags (e.g. a SUBM "1 NAME") cannot overwrite
                    # the details of the previous individual.
                    if tag.startswith('@') and tag.endswith('@') and value == 'INDI':
                        total_count += 1
                        current_dataset = GedcomDataset(tag)
                        self.gedcom_datasets.append(current_dataset)
                    else:
                        current_dataset = None
                elif current_dataset is not None:
                    if level == 1 and tag in ['NAME', 'FAMC']:
                        current_dataset.add_extractable_detail(tag, value)
                    elif level == 2 and tag == 'NPFX' and value:
                        npfx_count += 1
                        current_dataset.add_extractable_detail(tag, value)
                        if '**' in value:
                            ydna_count += 1

        autosomal_count = npfx_count - ydna_count
        print(f"GEDCOM contained {total_count} total records")
        print(f"Records tagged and filtered by NPFX: {npfx_count}")
        print(f"Records with YDNA information: {ydna_count}")
        print(f"Autosomal matches: {autosomal_count}")

        self.filter_pool.extend(ds for ds in self.gedcom_datasets if ds.get_extractable_NPFX())

        # Optional second-level filter: keep only the ids listed in filtered_ids.xlsx.
        manual_filter_activated = True
        if manual_filter_activated:
            try:
                df = pd.read_excel('filtered_ids.xlsx')
            except FileNotFoundError:
                logger.warning("filtered_ids.xlsx not found. Skipping second-level manual filter.")
            else:
                manual_filtered_ids = set(df['ID'])
                self.filter_pool = [d for d in self.filter_pool if d.get_gen_person() in manual_filtered_ids]
                print(f"After manual filter, total records: {len(self.filter_pool)}")
                logger.info(f"After manual filter, total records: {len(self.filter_pool)}")

        return autosomal_count

###############################################################################
# quick_extract_name
###############################################################################
def quick_extract_name(full_text):
    """Build a compact "<Surname><Given>" key from the first `1 NAME` line of a record.

    Returns "UnknownName" when the text has no `1 NAME` line. When the name has no
    '/'-delimited surname, the first 10 characters of the name (spaces removed) are
    returned. Otherwise the surname and the given part are each cut to 10 characters
    and stripped of spaces before being concatenated.
    """
    bare_marker = "1 NAME "
    idx = full_text.find("\n" + bare_marker)
    if idx != -1:
        start = idx + len(bare_marker) + 1  # +1 skips the leading newline
    elif full_text.startswith(bare_marker):
        # No newline precedes the marker here, so only the marker itself is skipped
        # (the previous code also skipped the first character of the name).
        start = len(bare_marker)
    else:
        return "UnknownName"
    end = full_text.find('\n', start)
    if end == -1:
        end = len(full_text)
    name_line = full_text[start:end].strip()
    if '/' not in name_line:
        return name_line[:10].replace(" ", "")
    first_name, last_name = name_line.split('/', 1)
    last_name = last_name.replace("/", "").strip()
    return last_name[:10].replace(" ", "") + first_name[:10].replace(" ", "")

###############################################################################
# Parents & Ancestors
###############################################################################
def find_parents(individual_id, generation, parents_map):
    """Record every ancestral couple of individual_id in the module-level generation_table.

    parents_map maps a child id to a (father_id, mother_id) tuple. Each distinct
    couple is appended once, as (generation, (father_id, mother_id)), using the
    generation at which it is first reached; visited_pairs tracks what was seen.

    A couple that was already recorded is not walked again: its ancestors were (or
    are being) recorded by the first visit, so the result is unchanged, while
    repeated ancestors no longer cause exponential rework and a cyclic parent link
    in the data can no longer recurse forever.
    """
    global visited_pairs, generation_table
    if individual_id not in parents_map:
        return
    father_id, mother_id = parents_map[individual_id]
    if not father_id and not mother_id:
        return
    pair = (father_id, mother_id)
    if pair in visited_pairs:
        return
    visited_pairs.add(pair)
    generation_table.append((generation, pair))
    if father_id:
        find_parents(father_id, generation+1, parents_map)
    if mother_id:
        find_parents(mother_id, generation+1, parents_map)

def find_distant_ancestors(individual_id, parents_map, path=None):
    """Return every root-ward path that starts at individual_id.

    Each path is a list of ids beginning with individual_id (preceded by any ids in
    the optional `path` prefix) and ending at an ancestor with no recorded parents.
    Father branches are listed before mother branches.

    A parent that already occurs on the current path is not followed, so cyclic
    parent links in the data end the path instead of recursing forever. The
    caller's `path` list is copied, never modified.
    """
    path = [] if path is None else list(path)
    path.append(individual_id)
    father_id, mother_id = parents_map.get(individual_id, (None, None))
    paths = []
    for parent_id in (father_id, mother_id):
        if parent_id and parent_id not in path:
            paths.extend(find_distant_ancestors(parent_id, parents_map, path))
    return paths if paths else [path]

###############################################################################
# filter_ancestral_line
###############################################################################
def filter_ancestral_line(winning_path_ids, generation_table_local, names_map):
    """Render the couples that touch the winning path as one "~~~"-joined string.

    A (generation, (id1, id2)) entry is kept when either id is in winning_path_ids.
    Kept couples are ordered by generation and then emitted in reverse, each as
    "<name1>&<name2>", with "UnknownName" standing in for ids absent from names_map.
    """
    on_path = sorted(
        (
            (generation, (id1, id2))
            for generation, (id1, id2) in generation_table_local
            if id1 in winning_path_ids or id2 in winning_path_ids
        ),
        key=lambda entry: entry[0],
    )
    labels = [
        f"{names_map.get(id1, 'UnknownName')}&{names_map.get(id2, 'UnknownName')}"
        for _, (id1, id2) in on_path
    ]
    return "~~~".join(reversed(labels))

###############################################################################
# process_record_wrapper (parallel) - STAGE 1
###############################################################################
def process_record_wrapper(individual_id, gedcom_instance, parents_map, names_map):
    """Build the output row for one individual (Stage 1 worker).

    Resets the module-level scratch state, records the individual's ancestral
    couples, picks the root-ward path whose names score highest for 'Yates'
    (each matching name contributes its 1-based position; the first path wins a
    tie), and renders the couples touching that path.

    Returns [ID#, Match to, Name, cM, Yates DNA Ancestral Line, haplogroup].
    """
    global generation_table, visited_pairs, anchor_gen1
    visited_pairs = set()
    generation_table = []

    find_parents(individual_id, 1, parents_map)

    def yates_weight(path):
        # Deeper occurrences of the surname weigh more than shallow ones.
        return sum(
            position
            for position, pid in enumerate(path, start=1)
            if 'Yates' in names_map.get(pid, "UnknownName")
        )

    candidates = find_distant_ancestors(individual_id, parents_map)
    best_path = max(candidates, key=yates_weight, default=None) or []

    ancestors_only = {pid for pid in best_path if pid != individual_id}
    line_str = filter_ancestral_line(ancestors_only, generation_table, names_map)

    # First dataset in the pool whose id matches supplies the NPFX-derived fields.
    record = next(
        (ds for ds in gedcom_instance.filter_pool if ds.get_gen_person() == individual_id),
        None,
    )
    if record is None:
        cm_value = sort_value = ydna_value = ''
    else:
        cm_value = record.get_extractable_cm()
        sort_value = record.get_extractable_sort()
        ydna_value = record.get_extractable_YDNA()

    short_name = names_map.get(individual_id, "UnknownName")
    # Return columns: ID#, Match to, Name, cM, Yates DNA Ancestral Line, haplogroup
    return [individual_id, sort_value, short_name, cm_value, line_str, ydna_value]

###############################################################################
# main()
###############################################################################
def main():
    """Run Stage 1 end to end: parse the GEDCOM, build and score lines, export CSV/HTML.

    Steps:
      1. Pick the first *.ged file in the working directory and parse it.
      2. Write the autosomal count to autosomal_count.txt.
      3. Index every level-0 record, derive short names and a child -> parents map.
      4. Build one row per NPFX-tagged individual in a process pool.
      5. Score each ancestral line with a trie:
         Value = 5 * (#couples whose trie node count >= 2) + (total couples).
      6. Export final_combined_df_with_value_labels.csv and
         HTML_combined_df_with_value_labels.html, sorted by ancestral line.

    Returns None; exits early (after printing) when no GEDCOM file is found.
    """
    def select_gedcom():
        """Return the first *.ged file found in the working directory, or None."""
        files = glob.glob("*.ged")
        if not files:
            print("No GEDCOM files found.")
            return None
        print("Automatically selecting the first GEDCOM file.")
        return files[0]

    def pointer_after(txt, marker):
        """Return the id that follows `marker` (up to the next '@'), or None if absent."""
        pos = txt.find(marker)
        if pos == -1:
            return None
        start = pos + len(marker)
        return txt[start:txt.find('@', start)]

    def assign_value_range_label(val):
        """Map a numeric Value to its (range text, descriptive label) pair."""
        try:
            v = float(val)
        except (TypeError, ValueError):
            return "", ""
        if v >= 60: return ">=60", "1-likely correct"
        if 47 <= v <= 59: return "59~47", "2-lines forming"
        if 34 <= v <= 46: return "46~34", "3-patterns emerging"
        if 21 <= v <= 33: return "33~21", "4-notable patterns"
        if 8 <= v <= 20: return "20~8", "5-patterns stable"
        if 1 <= v <= 7:  return f"{v:.0f}", "6-need research"
        return f"{v:.0f}", "0-uncategorized"

    gedcom_file_path = select_gedcom()
    if not gedcom_file_path:
        print("No GEDCOM file selected; exiting.")
        return

    ged = Gedcom(gedcom_file_path)
    autosomal_count = ged.parse_gedcom()
    filter_count = len(ged.filter_pool)

    with open("autosomal_count.txt", "w") as f:
        f.write(str(autosomal_count))

    print("Records tagged and filtered by NPFX:", filter_count)

    # Same encoding as Gedcom.parse_gedcom, so a leading BOM never ends up in the text.
    with open(gedcom_file_path, 'r', encoding='utf-8-sig') as f:
        raw_data = f.read()

    # Index every level-0 record that carries an @id@ on its first line.
    all_records = {}
    for blk in raw_data.split('\n0 '):
        blk = blk.strip()
        if not blk:
            continue
        flend = blk.find('\n')
        if flend == -1:
            flend = len(blk)
        first_line = blk[:flend]
        if '@' in first_line:
            start = first_line.find('@') + 1
            end = first_line.find('@', start)
            rec_id = first_line[start:end].strip()
            all_records[rec_id] = blk

    names_map = {rec_id: quick_extract_name("\n" + txt) for rec_id, txt in all_records.items()}

    # Collect (husband, wife, children) for records that look like families.
    families = {}
    for rec_id, txt in all_records.items():
        if 'FAM' in txt[:50]:
            husb_id = pointer_after(txt, '1 HUSB @')
            wife_id = pointer_after(txt, '1 WIFE @')
            kids = [ln.split('@')[1] for ln in txt.split('\n') if ln.strip().startswith('1 CHIL @')]
            families[rec_id] = (husb_id, wife_id, kids)

    parents_map = {}
    for f_id, m_id, k_list in families.values():
        for kid in k_list:
            parents_map[kid] = (f_id, m_id)

    individual_ids = [d.get_gen_person() for d in ged.filter_pool]
    print(f"Processing {len(individual_ids)} individuals with chunk-based parallel...")

    chunk_size = 50
    max_workers = os.cpu_count() or 4
    logger.info("Starting chunk-based parallel processing with %d workers.", max_workers)

    # One map() call with chunksize ships the (large) bound arguments once per chunk
    # of ids rather than once per id, and keeps all workers busy without a barrier
    # after every chunk. Result order still follows individual_ids.
    func = functools.partial(process_record_wrapper, gedcom_instance=ged, parents_map=parents_map, names_map=names_map)
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        results_iter = executor.map(func, individual_ids, chunksize=chunk_size)
        combined_rows = list(tqdm(results_iter, total=len(individual_ids), desc="Building Yates Lines (Stage 1)"))

    line_col = "Yates DNA Ancestral Line"
    columns = ["ID#", "Match to", "Name", "cM", line_col, "haplogroup"]
    df = pd.DataFrame(combined_rows, columns=columns)
    df.index += 1

    # Drop the long prefix that is common to the main trunk so lines start where they differ.
    prefix = "YatesJohn&SearchingStill~~~YatesWilliam&SearchingStill~~~YatesWilliam&SearchingStill~~~YatesEdmund&CornellMargaret~~~YatesRichard&AshendonJoan~~~YatesJohn&HydeAlice~~~YatesThomas&FauconerElizabeth~~~"
    df[line_col] = [
        line[len(prefix):] if line.startswith(prefix) else line
        for line in df[line_col]
    ]

    # Split every line into its couples once; blank/NaN lines become an empty list.
    couples_per_row = [
        [x.strip() for x in line_str.split("~~~") if x.strip()] if pd.notna(line_str) else []
        for line_str in df[line_col]
    ]

    logger.info("Building Trie from reversed lines...")
    trie = Trie()
    for couples_list in couples_per_row:
        if couples_list:
            trie.insert_line(couples_list)

    logger.info("Computing 'Value' = 5*(#couples with node.count >=2) + (total couples) ...")
    values = []
    for couples_list in couples_per_row:
        node_counts = trie.get_couple_count(couples_list)
        prefix_count = sum(1 for c in node_counts if c >= 2)
        values.append(5 * prefix_count + len(couples_list))
    df["Value"] = values

    # Built from lists (not zip(*...)) so an empty pool yields empty columns, not an error.
    labelled = [assign_value_range_label(v) for v in df["Value"]]
    df["Value Range"] = [value_range for value_range, _ in labelled]
    df["Value Label"] = [value_label for _, value_label in labelled]

    df = df.sort_values(by=[line_col])

    csv_name = "final_combined_df_with_value_labels.csv"
    df.to_csv(csv_name, index=False)
    logger.info("Exported final DataFrame to '%s'.", csv_name)

    html_name = "HTML_combined_df_with_value_labels.html"
    css_style = """
    <style>
    table { width: 100%; border-collapse: collapse; margin: 20px 0; }
    table, th, td { border: 1px solid #333; }
    th, td { padding: 8px 12px; text-align: center; }
    th { background-color: #f2f2f2; }
    /* Left-align the last column */
    td:nth-child(7) { text-align: left; }
    </style>
    """
    final_cols = ["ID#", "cM", "haplogroup", "Match to", "Value Range", "Value Label", line_col]
    # NOTE(review): escape=False writes GEDCOM-derived text into the page unescaped;
    # kept as-is because later cells read this HTML back.
    html_content = css_style + df.to_html(index=False, columns=final_cols, escape=False)
    with open(html_name, "w", encoding="utf-8") as f:
        f.write(html_content)
    logger.info("Exported HTML to '%s'.", html_name)

if __name__ == '__main__':
    main()
    # Best-effort completion popup; it only has an effect inside a notebook front end.
    # Catch Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
    try:
        display(Javascript('alert("✅ GEDCOM processing (and HTML export) is complete!");'))
    except Exception as exc:
        logger.debug("Completion alert could not be displayed: %s", exc)

import os
import pandas as pd
import smtplib, ssl
from email.mime.text import MIMEText

def send_email(subject, body, to_addr):
    """Send a plain-text e-mail through Gmail over implicit TLS (port 465).

    The sender address and app password are read from the GMAIL_USER and
    GMAIL_APP_PASSWORD environment variables; the sender is also the From address.
    """
    host, tls_port = 'smtp.gmail.com', 465
    sender = os.environ['GMAIL_USER']
    password = os.environ['GMAIL_APP_PASSWORD']

    message = MIMEText(body)
    for header, value in (('Subject', subject), ('From', sender), ('To', to_addr)):
        message[header] = value

    tls_context = ssl.create_default_context()
    with smtplib.SMTP_SSL(host, tls_port, context=tls_context) as smtp:
        smtp.login(sender, password)
        smtp.send_message(message)

# Email summary (only total lines): count the rows of the CSV exported above and
# mail that count to the account owner.
df_summary = pd.read_csv("final_combined_df_with_value_labels.csv")
total = df_summary.shape[0]
summary = "GEDCOM processing complete!\n\nTotal lines: " + str(total)

send_email("✅ Cell #1 Report Ready", summary, os.environ['GMAIL_USER'])





Automatically selecting the first GEDCOM file.




GEDCOM contained 62030 total records
Records tagged and filtered by NPFX: 1623
Records with YDNA information: 90
Autosomal matches: 1533
Records tagged and filtered by NPFX: 1623
Processing 1623 individuals with chunk-based parallel...


Building Yates Lines (Stage 1): 100%|██████████| 1623/1623 [12:00<00:00,  2.25it/s]


<IPython.core.display.Javascript object>

In [28]:
# REFACTOR-Gold 2 — create the Report Card (mobile-friendly, sortable)
# Gold 2 create the Report Card (mobile-friendly, sortable)
# -*- coding: utf-8 -*-
# DNA Cousin Surname REFACTOR (Stokes Display, 2 columns)
# Resolver precedence:
#   1) match_to_unmasked.csv (fresh upload)  [remote fetch supported]
#   2) match_to_unmasked.cache.json (last good snapshot)
#   3) built-in DEFAULT_MATCH_TO_UNMASKED
# Features kept: "(back N Gens)", matchee first-name truncated to 4, Column A CSV, sticky headers, back-to-top.
# NEW: Optional remote read for inputs + remote upload for outputs (same FTP as HTML).

# ========= CUT START [1/3] Config + Resolver + Helpers ===========================================
import os, re, json, time, io, posixpath
import pandas as pd
from ftplib import FTP_TLS

# ========= LAYOUT =========
# Pixel sizes for the report table; the code that consumes them is outside this part
# of the cell. NOTE(review): presumably overall table width and Column A width — confirm.
TABLE_WIDTH_PX = 6000
COL_A_PX       = 1220

# ========= DATA / OUTPUT =========
# CSV_PATH is the file name that cell 1's main() writes with DataFrame.to_csv.
CSV_PATH            = "final_combined_df_with_value_labels.csv"   # main input CSV (will be pulled from server if REMOTE_READ)
LOCAL_NAME          = "dna_cousin_surname_REFACTOR.htm"
REMOTE_NAME         = "dna_cousin_surname_REFACTOR.htm"
LINEAGE_HEADER      = "Lineage (Starting with oldest ancestor, the line is:)"
ARROW_ENTITY        = "&rarr;"
# When True, build_header() strips the trailing period from the sentence it returns.
REMOVE_PERIOD_AT_END = True

# Local exports
MATCH_COUSINS_CSV   = "the_match_cousins.csv"            # Column A
RESOLVER_CSV        = "match_to_unmasked.csv"            # fresh upload (optional)
RESOLVER_CACHE_JSON = "match_to_unmasked.cache.json"     # last good snapshot (auto-written)

# ========= Remote I/O toggles (use same creds/dir as HTML upload) =========
REMOTE_READ        = True   # Pull input CSV (and resolver CSV if present) from server before processing
UPLOAD_COLUMN_A    = True   # Push the_match_cousins.csv to server after build
UPLOAD_CACHE_JSON  = False  # If you want to mirror the cache JSON on server, set True

# ========= Optional remote subdirectory (same place you host the HTML) =========
# If your files live in a subfolder (e.g., /public_html/gengen), set FTP_DIR in env.
# ftp_connect() changes into this directory right after logging in.
FTP_DIR = os.environ.get("FTP_DIR", "").strip()  # e.g., "gengen" or "public_html/gengen"

# ========= Built-in fallback resolver (only used if neither CSV nor cache is available) =========
# Maps a masked "Match to" code to the display name shown in the report.
# Keys are lowercase because resolve_match_to() lowercases and strips the code before lookup.
# NOTE(review): this table contains real personal names — confirm it is acceptable to publish
# them with the notebook.
DEFAULT_MATCH_TO_UNMASKED = {
    "1200am":"Cheryl Midnight","adamssarah":"Sarah Adams","addison,david":"Dave Addison","amanic":"Amanda Radnage",
    "beardali":"Alice Beard","birdwelljac":"Jacalyn Yates","bucb":"Beth Buckley","camry":"Cami Crockett","camry_jy":"Jamie Yates",
    "cagilaba,leigh":"Leigh Yates","evansdei":"Deidre Evans","franch,mike":"Mike Franch","fridine":"Nadine Brown",
    "girtain,alma":"Alma Girtain","girtain,andy":"Andy Girtain","girtain,kathryn":"Kathryn Girtain","girtain,theresa":"Theresa Girtain",
    "girtja":"Josh Girtain","handmer":"Meredith Aronson","hatpat":"Pat Hatfield","hatpatm":"Virginia Looney","hell-bry":"Bobby Yates",
    "hellopt":"Pat Thomas","hendricksjas":"Jim Hendricks","henryche":"Cheryl Henry","husainir":"Rebecca Husaini","klingal":"Albert Kling",
    "kuhlmanj?":"Steve Kuhlman","leedon":"Donna Lee","lewiscla":"Claudia Lewis","littleil":"Ilene Little","lovewalk1":"Linda Lovett",
    "marma":"Mary Marshall","mccollummike":"Mike McCollum","milocan":"Candy Milovich","powers,kath":"Kathy Powers",
    "rophy":"Phyllis Rounsevell","rophy_rd":"Robyn Billinghurst","sarpri":"Sarah Price","smittybec":"Rebecca Smith",
    "solyons":"Stephanie Yates","stetlerkar":"Karen Stetler","sudie":"Wanda Tabor","walclif":"Ray Walton","weeksjerri":"Jerri Weeks",
    "wishardglen":"Glen Wishard","yates,andreal":"Andrea Yates","yates,patricial":"Pat Yates","yates,robertd":"Robert Yates",
    "yates,ronald":"Ron Yates","yates,timothyb":"Tim Yates","yates,timothyj":"Tim Joe Yates","yatescmartin":"Charles Yates",
    "yatesjamesrob":"James Yates","yatesjohnrob":"John Yates","yates_nj-a":"Arthur Yates","yates_nj-h":"Howard Yates",
    "yeatesd_gn":"Gillian Yates","yeatesd_mb":"Margaret Yates","yeatesd_ws":"Will Yeates",
}

# ---------- FTP helpers ----------
def ftp_connect():
    """Open an FTPS session using the FTP_* environment variables and enter FTP_DIR.

    Reads FTP_HOST, FTP_PORT (default 21), FTP_USER and FTP_PASS. After login the
    data channel is switched to protected mode; if the server refuses, a warning is
    printed and the session continues (best effort, as before, but no longer silent).
    When FTP_DIR is set, the session changes into it, creating missing path
    components one by one if a direct change fails.

    Returns the connected FTP_TLS object. If any step fails the socket is closed
    before the exception propagates, so a failed attempt does not leak a connection.
    """
    ftps = FTP_TLS()
    try:
        ftps.connect(os.environ['FTP_HOST'], int(os.environ.get('FTP_PORT', 21)))
        ftps.login(os.environ['FTP_USER'], os.environ['FTP_PASS'])
        try:
            ftps.prot_p()  # secure data connection
        except Exception as exc:
            print(f"⚠️  Could not enable protected data channel (PROT P): {exc}")
        if FTP_DIR:
            try:
                ftps.cwd(FTP_DIR)
            except Exception:
                # Try to walk nested directories (e.g., "a/b/c"), creating as needed.
                if FTP_DIR.startswith("/"):
                    # Splitting drops the leading slash, so anchor an absolute path first.
                    ftps.cwd("/")
                for p in [p for p in FTP_DIR.split("/") if p]:
                    try:
                        ftps.mkd(p)
                    except Exception:
                        pass  # typically "already exists"; the cwd below is the real check
                    ftps.cwd(p)
    except Exception:
        ftps.close()
        raise
    return ftps

def _remote_path(name: str) -> str:
    # Ensure POSIX-style join regardless of OS
    return posixpath.join(FTP_DIR, name) if FTP_DIR else name

def ftp_download_if_exists(ftps: FTP_TLS, remote_name: str, local_name: str) -> bool:
    """Download remote_name to local_name; return True on success, False otherwise.

    The data is first written to "<local_name>.part" and only moved over
    local_name once the transfer has finished. A missing or unreadable remote file
    therefore leaves an existing local copy untouched (previously it was truncated
    and then deleted). Any failure is reported with print() and not raised.
    """
    partial_name = f"{local_name}.part"
    try:
        with open(partial_name, "wb") as f:
            ftps.retrbinary(f"RETR {remote_name}", f.write)
        os.replace(partial_name, local_name)
        print(f"⬇️  Pulled remote file: {remote_name} → {os.path.abspath(local_name)}")
        return True
    except Exception as e:
        print(f"ℹ️  Remote not found or unreadable: {remote_name} ({e})")
        try:
            if os.path.exists(partial_name):
                os.remove(partial_name)
        except OSError:
            pass  # a leftover .part file is harmless; the next run overwrites it
        return False

def ftp_upload_overwrite(ftps: FTP_TLS, local_path: str, remote_name: str):
    """Store local_path on the server as remote_name, replacing any existing file.

    Raises RuntimeError (carrying the underlying error text) when the local file
    cannot be opened or the transfer fails.
    """
    try:
        with open(local_path, "rb") as source:
            ftps.storbinary("STOR " + remote_name, source)
        print(f"⬆️  Uploaded: {local_path} → {remote_name}")
    except Exception as err:
        message = f"Upload failed for {local_path} → {remote_name}: {err}"
        raise RuntimeError(message)

# ---------- resolver loading ----------
def _read_csv_flexible(path):
    """Read 2-column resolver CSV with flexible casing of headers.
       Expected headers (case-insensitive): 'Match to', 'Unmasked'.
       If headers not found, use the first two columns."""
    encodings = ("utf-8-sig","utf-8","cp1252","iso-8859-15","latin1")
    last_err = None
    df = None
    for enc in encodings:
        try:
            df = pd.read_csv(path, encoding=enc)
            break
        except Exception as e:
            last_err = e
            df = None
    if df is None:
        raise last_err if last_err else ValueError("CSV read failed")

    if df.empty:
        raise ValueError("CSV empty")

    cols_map = {str(c).strip().lower(): c for c in df.columns}
    code_col = None
    name_col = None
    for k, c in cols_map.items():
        if k.replace("_"," ").startswith("match to"):
            code_col = c
        if k.startswith("unmasked"):
            name_col = c
    if code_col is None or name_col is None:
        # fallback to first 2 columns
        if len(df.columns) < 2:
            raise ValueError("CSV must have at least two columns")
        code_col, name_col = df.columns[0], df.columns[1]

    sub = df[[code_col, name_col]].copy()
    sub[code_col] = sub[code_col].astype(str).str.strip()
    sub[name_col] = sub[name_col].astype(str).str.strip()
    sub = sub[sub[code_col] != ""]
    if sub.empty:
        raise ValueError("CSV has no usable rows")

    # build dict (lowercase keys)
    out = {}
    for _, r in sub.iterrows():
        k = r[code_col].strip().lower()
        v = r[name_col].strip()
        if k:
            out[k] = v
    return out

def load_resolver():
    """Return the code -> display-name mapping, trying CSV, then cache, then built-in.

    1. With REMOTE_READ enabled, first try to pull RESOLVER_CSV from the server.
    2. If RESOLVER_CSV exists locally and parses, use it and snapshot it to
       RESOLVER_CACHE_JSON. A failure to write the snapshot is reported but no
       longer discards the freshly parsed mapping.
    3. Otherwise use the cached JSON snapshot (keys lowercased and stripped).
    4. Otherwise use a copy of DEFAULT_MATCH_TO_UNMASKED.

    Problems at any stage are printed and the next source is tried; nothing is raised.
    """
    # If remote read is enabled, try to pull resolver CSV from server first
    if REMOTE_READ:
        ftps = None
        try:
            ftps = ftp_connect()
            ftp_download_if_exists(ftps, _remote_path(RESOLVER_CSV), RESOLVER_CSV)
        except Exception as e:
            print(f"⚠️  Remote resolver fetch skipped: {e}")
        finally:
            # Always release the session, even when the download step failed.
            if ftps is not None:
                try:
                    ftps.quit()
                except Exception:
                    ftps.close()

    # 1) fresh CSV
    if os.path.exists(RESOLVER_CSV):
        mapping = None
        try:
            mapping = _read_csv_flexible(RESOLVER_CSV)
        except Exception as e:
            print(f"⚠️  Resolver CSV unusable, falling back: {e}")
        if mapping is not None:
            # cache snapshot for next runs
            try:
                with open(RESOLVER_CACHE_JSON, "w", encoding="utf-8") as f:
                    json.dump(mapping, f, ensure_ascii=False, indent=0)
                print(f"Resolver: CSV ({len(mapping)}), cached")
            except Exception as e:
                print(f"Resolver: CSV ({len(mapping)}), cache not written: {e}")
            return mapping
    # 2) cache
    if os.path.exists(RESOLVER_CACHE_JSON):
        try:
            with open(RESOLVER_CACHE_JSON, "r", encoding="utf-8") as f:
                mapping = json.load(f)
            # ensure keys are lowercased
            mapping = {str(k).strip().lower(): str(v) for k, v in mapping.items()}
            print(f"Resolver: CACHE ({len(mapping)})")
            return mapping
        except Exception as e:
            print(f"⚠️  Resolver cache unusable, falling back: {e}")
    # 3) built-in
    print(f"Resolver: BUILT-IN ({len(DEFAULT_MATCH_TO_UNMASKED)})")
    return DEFAULT_MATCH_TO_UNMASKED.copy()

# Resolved once when the cell runs; resolve_match_to() looks codes up in this mapping.
MATCH_TO_UNMASKED = load_resolver()

# ---------- helpers ----------
def find_col(df, patterns, prefer_exact=None):
    """Locate a column of df by preferred name, then by regex; return None if absent.

    Names in prefer_exact are tried in order: an exact match wins, then a
    case-insensitive match. After that each regex in patterns is tried in order
    (case-insensitive search) and the first matching column is returned.
    """
    columns = list(df.columns)
    by_lower = {col.lower(): col for col in columns}
    for wanted in (prefer_exact or ()):
        if wanted in df.columns:
            return wanted
        folded = wanted.lower()
        if folded in by_lower:
            return by_lower[folded]
    for pattern in patterns:
        matcher = re.compile(pattern, re.I)
        hit = next((col for col in columns if matcher.search(col)), None)
        if hit is not None:
            return hit
    return None

# Separators accepted between lineage tokens: arrows (literal or HTML entity), ';', '>',
# ',', and runs of two or more '~', '/' or '|', each with optional surrounding whitespace.
SEP_RE = re.compile(r"\s*(?:→|&rarr;|\u2192|;|>|,|~{2,}|/{2,}|\|{2,})\s*")

def split_tokens(s):
    """Split a lineage string on SEP_RE and return the non-blank, stripped pieces.

    Missing values give an empty list; non-string values are converted with str().
    """
    if pd.isna(s):
        return []
    text = s if isinstance(s, str) else str(s)
    pieces = (chunk.strip() for chunk in SEP_RE.split(text))
    return [piece for piece in pieces if piece]

def _clean_piece(text: str) -> str:
    t = re.sub(r'~+', ' ', str(text))
    t = re.sub(r'\s+', ' ', t)
    return t.strip()

_PARTICLES = {"de","del","della","der","van","von","da","dos","das","di","la","le","du","of"}
def _smart_title(token: str) -> str:
    if not token: return token
    token = re.sub(r"(^|\b)([a-z])(['’])([a-z])",
                   lambda m: m.group(1)+m.group(2).upper()+m.group(3)+m.group(4).upper(),
                   token.lower())
    token = "-".join([w.capitalize() for w in token.split("-")])
    token = re.sub(r"\bmc([a-z])",  lambda m: "Mc"+m.group(1).upper(), token)
    token = re.sub(r"\bmac([a-z])", lambda m: "Mac"+m.group(1).upper(), token)
    return token

def smart_titlecase(name: str) -> str:
    """Title-case a full name, turning "Last, First" into "First Last".

    Words after the first that appear in _PARTICLES (e.g. "van", "de") are kept in
    lower case; every other word goes through _smart_title().
    """
    cleaned = _clean_piece(name)
    if not cleaned:
        return cleaned
    if "," in cleaned:
        surname, given = (part.strip() for part in cleaned.split(",", 1))
        words = f"{given} {surname}".split()
    else:
        words = cleaned.split()
    return " ".join(
        word.lower() if position and word.lower() in _PARTICLES else _smart_title(word)
        for position, word in enumerate(words)
    )

def surname_given_from_token(token):
    """Split a fused "SurnameGiven" token and return ("Given Surname",) as a 1-tuple.

    The split point is the first lower-to-upper transition, or failing that the
    first capital after position 0. Capitals inside the given part get a space in
    front of them. A token with no split point is returned unchanged (stripped).
    """
    text = token.strip()
    if not text:
        return (text,)
    positions = range(1, len(text))
    split_at = next(
        (i for i in positions if text[i-1].islower() and text[i].isupper()),
        None,
    )
    if split_at is None:
        split_at = next((i for i in positions if text[i].isupper()), None)
    if split_at is None:
        return (text,)
    family, given = text[:split_at], text[split_at:]
    spaced = re.sub(r'(?<!^)([A-Z])', r' \1', given)
    return (f"{spaced.strip()} {family.strip()}",)

def normalize_person_name(s: str) -> str:
    """Return a display form of a person name; missing values become "".

    "Last, First" is reordered to "First Last". A single alphabetic word is treated
    as a fused "SurnameGiven" token and split before title-casing.
    """
    if pd.isna(s):
        return ""
    text = _clean_piece(str(s))
    if "," in text:
        surname, given = (part.strip() for part in text.split(",", 1))
        text = f"{given} {surname}"
    fused = " " not in text and text.isalpha()
    return smart_titlecase(surname_given_from_token(text)[0] if fused else text)

def truncate_first(name: str, n: int = 4) -> str:
    """Shorten the first word of ``name`` to ``n`` characters and keep only the last word.

    A one-word name yields just its truncated form; longer names yield
    "<truncated first> <last word>" (middle words are dropped).  A blank name
    returns the stripped (empty) string.
    """
    stripped = name.strip()
    if not stripped:
        return stripped

    words = stripped.split()
    short_first = words[0][:n]
    if len(words) == 1:
        return short_first
    return f"{short_first} {words[-1]}"

def derive_common_from_first_token(tokens):
    """Derive a (husband, wife) pair from the first lineage token.

    The first token is cleaned and split once on "&" or " and "
    (case-insensitive).  Each half is title-cased; a half without a space is
    first un-fused via ``surname_given_from_token``.  Returns ``("", "")`` when
    ``tokens`` is empty or the first token does not split into exactly two parts.
    """
    if not tokens:
        return ("", "")

    couple_text = _clean_piece(tokens[0])
    halves = re.split(r"\s*(?:&| and )\s*", couple_text, maxsplit=1, flags=re.I)
    if len(halves) != 2:
        return ("", "")

    def _norm(s):
        if " " in s:
            return smart_titlecase(s)
        return smart_titlecase(surname_given_from_token(s)[0])

    husband = _norm(halves[0])
    wife = _norm(halves[1])
    return (husband, wife)

def degree_label_from_generations(g):
    """Map a generation count to a relationship label.

    1 -> "parents", anything below 1 -> "self", 2 -> "grandparents",
    3 -> "great-grandparents", and beyond that "<g-2>×-great-grandparents".
    """
    if g == 1:
        return "parents"
    if g < 1:
        return "self"
    if g == 2:
        return "grandparents"

    greats = g - 2
    return "great-grandparents" if greats == 1 else f"{greats}\u00d7-great-grandparents"


def build_header(subject_name, cm_val, matchee_name, gens, husband, wife):
    """Compose the one-sentence "Match Summary" text for a row.

    The matchee is shown via ``truncate_first(matchee_name, 4)``.  ``cm_val`` is
    rounded to a whole number when it converts to float; otherwise its stripped
    string form (or "0" if that is empty) is used.  The trailing period is
    removed when the module flag ``REMOVE_PERIOD_AT_END`` is truthy.
    """
    matchee_display = truncate_first(matchee_name, 4)

    try:
        cm_text = str(int(round(float(cm_val))))
    except Exception:
        cm_text = str(cm_val).strip() or "0"

    degree_label = degree_label_from_generations(gens)
    sentence = (
        f"{subject_name} is a {cm_text} cM cousin match to {matchee_display}, whose "
        f"{degree_label} (back {gens} Gens) "
        f"are {husband} & {wife}."
    )
    if REMOVE_PERIOD_AT_END:
        sentence = re.sub(r'\.\s*$', '', sentence)
    return sentence

def resolve_match_to(code: str) -> str:
    """Look up a "Match to" code in ``MATCH_TO_UNMASKED``.

    The lookup key is the code stripped and lower-cased.  Unknown codes are
    returned unchanged; non-string input yields ``""``.
    """
    if isinstance(code, str):
        lookup_key = str(code).strip().lower()
        return MATCH_TO_UNMASKED.get(lookup_key, str(code))
    return ""
# ========= CUT END   [1/3] ========================================================================


# ========= CUT START [2/3] Transform & Column A ================================================
import os
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO

# ---------- Input HTML path ----------
HTML_PATH = "/content/HTML_combined_df_with_value_labels.html"

# ---------- Load data table from HTML ----------
if not os.path.exists(HTML_PATH):
    raise FileNotFoundError(f"HTML source file not found: {HTML_PATH}")

with open(HTML_PATH, "r", encoding="iso-8859-15", errors="ignore") as f:
    soup = BeautifulSoup(f, "html.parser")

tables = soup.find_all("table")
if not tables:
    raise ValueError("No <table> found in HTML file.")

# Parse every <table> independently (StringIO avoids the pandas FutureWarning for
# literal HTML).  A table that fails to parse is skipped, but the reason is kept
# so that a total failure explains itself instead of hiding the real cause
# (for example a missing parser backend).
dfs = []
table_parse_errors = []
for table_index, t in enumerate(tables):
    try:
        dfs.append(pd.read_html(StringIO(str(t)))[0])
    except Exception as exc:
        table_parse_errors.append(f"table #{table_index}: {type(exc).__name__}: {exc}")

if table_parse_errors:
    print(f"⚠️  Skipped {len(table_parse_errors)} of {len(tables)} HTML table(s) that could not be parsed.")

if not dfs:
    raise ValueError(
        "Could not parse any HTML table into a DataFrame.\n" + "\n".join(table_parse_errors)
    )

# Heuristic: choose the table with the most “hits” on expected columns
def _score(df):
    cols = [str(c).lower() for c in df.columns]
    targets = [
        "match to", "match", "name",
        "cm", "c m", "centimorgan", "centimorgans",
        "lineage", "path", "yates dna ancestral line",
        "id#","id"
    ]
    return sum(any(t in c for t in targets) for c in cols)

# Keep the candidate table with the highest keyword score (first one wins ties).
df = max(dfs, key=_score)
print(f"✅ Loaded HTML table — {len(df)} rows, {len(df.columns)} columns")

# ---------- Detect columns using existing helper (from [1/3]) ----------
subject_code_col = find_col(df, [r'^match\s*to$'], ["Match to","Match"])
# Matchee: prefer Name; if missing, allow ID#/ID as fallback
matchee_col = find_col(df, [r'^name$', r'^id\#?$', r'^id$'], ["Name","ID#","ID"])
cm_col = find_col(df, [r'\bc[\s\-_]*m\b', r'centi.?morgan'], ["cM","cm","Centimorgans"])
path_col = find_col(
    df,
    [r'\blineage\b', r'\bpath', r'ancestral\s*line', r'yates dna ancestral line'],
    ["Yates DNA Ancestral Line","path_tokens"],
)
husb_col = find_col(df, [r'\bhusband\b', r'common.*husb'], ["common_husband","Common Husband"])
wife_col = find_col(df, [r'\bwife\b', r'common.*wife'], ["common_wife","Common Wife"])

# The three columns below are mandatory; report every one that was not detected.
core_columns = (("Match to", subject_code_col), ("cM", cm_col), ("path", path_col))
missing_core = [label for label, detected in core_columns if not detected]
if missing_core:
    raise ValueError(f"Missing core columns in HTML table: {missing_core}\nAvailable: {list(df.columns)}")

if not matchee_col:
    # No Name/ID#-like column at all: fall back to the subject-code column and say so.
    matchee_col = subject_code_col
    print("ℹ️  Matchee column not found; using 'Match to' (resolved) as matchee display.")

# ---------- Build display (same semantics as CSV pipeline) ----------
# When no Name/ID# column exists, matchee_col falls back to the "Match to" column.
# That column holds codes, so they must go through the resolver, as the fallback
# notice announces; otherwise the raw code would be shown as the matchee.
matchee_is_subject_code = (matchee_col == subject_code_col)
sep = " %s " % ARROW_ENTITY  # loop-invariant separator between lineage tokens

headers, lineages = [], []
for _, row in df.iterrows():
    subject_name = normalize_person_name(resolve_match_to(row[subject_code_col]))
    matchee_raw  = row.get(matchee_col)
    if matchee_is_subject_code:
        matchee_raw = resolve_match_to(matchee_raw)
    matchee_name = normalize_person_name(matchee_raw) if pd.notna(matchee_raw) else ""
    cm_val  = row[cm_col]
    tokens  = split_tokens(row[path_col])
    gens    = len(tokens)  # one generation per lineage token
    if husb_col and wife_col and pd.notna(row.get(husb_col)) and pd.notna(row.get(wife_col)):
        husband, wife = smart_titlecase(str(row[husb_col])), smart_titlecase(str(row[wife_col]))
    else:
        # No explicit couple columns: derive the couple from the first lineage token.
        husband, wife = derive_common_from_first_token(tokens)
    header_text  = build_header(subject_name, cm_val, matchee_name, gens, husband, wife)
    lineage_text = sep.join(tokens) if tokens else ""
    headers.append(header_text); lineages.append(lineage_text)

LINEAGE_HEADER_SAFE = LINEAGE_HEADER  # from [1/3]
df["Match Summary"] = headers
df[LINEAGE_HEADER_SAFE]  = lineages
display_df = df[["Match Summary", LINEAGE_HEADER_SAFE]]

# ---------- export Column A ----------
display_df[["Match Summary"]].to_csv(MATCH_COUSINS_CSV, index=False, encoding="iso-8859-15")
print("✅ Wrote local CSV (Column A):", os.path.abspath(MATCH_COUSINS_CSV))
# ========= CUT END   [2/3] ================================================================




# ========= CUT START [3/3] HTML + Upload + Report ================================================
# ---------- HTML table ----------
# escape=False: the cell text is written into the page as-is (not HTML-escaped).
html_table = display_df.to_html(index=False, escape=False, classes="sortable")

# Give the table its id and inject a <colgroup> that fixes the width of column A.
# (The former no-op replace of '<th>Match Summary</th>' with itself was removed, and
# the two successive replaces on the <table> tag are merged into one.)
colgroup_html = ("<colgroup>\n  <col style=\"width:%dpx;\" />\n  <col />\n</colgroup>\n") % (COL_A_PX)
html_table = html_table.replace(
    '<table border="1" class="dataframe sortable">',
    '<table border="1" class="dataframe sortable" id="refactor-table">\n' + colgroup_html,
    1
)

# NOTE(review): pandas normally indents the <tr> that follows <tbody>, so this exact
# pattern may never match and id="first-row" may never be applied — confirm. The
# pattern is left unchanged on purpose: the page script re-sorts rows on load, so a
# tagged row would not necessarily stay first.
html_table = html_table.replace('<tbody>\n<tr>', '<tbody>\n<tr id="first-row">', 1)

html_table_scrolling = '<div class="table-scroll">\n' + html_table + '\n</div>'

# ---------- CSS ----------
# Inline <style> block for the generated page.  The template carries two "%d"
# placeholders, both filled with TABLE_WIDTH_PX: the max-width of the .wrap
# container and the fixed width of table.sortable.
TABLE_CSS = (
    "<style type=\"text/css\">\n"
    "  html { scroll-behavior: smooth; }\n"
    "  body { font-family: Georgia, \"Times New Roman\", serif; background:#ffffff; color:#222; margin:0; padding:0; line-height:1.5; }\n"
    "  .wrap { max-width:%dpx; margin:0 auto; background:#ffffff; padding:20px; padding-bottom:48px; }\n"
    "  a { color:#154b8b; text-decoration:none; } a:hover { text-decoration:underline; }\n"
    "  h1 { margin:0 0 6px 0; font-size:26px; line-height:1.2; }\n"
    "  .topbar { display:flex; justify-content:space-between; align-items:flex-start; gap:10px; margin-bottom:6px; }\n"
    "  .topbar .left, .topbar .right { font-size:12px; }\n"
    "  .updated { font-size:11px; color:#555; }\n"
    "  .sortbar { margin:6px 0 10px 0; font-size:13px; background:#b4c3e3; padding:6px 8px; border-radius:6px; display:flex; flex-wrap:wrap; gap:5px; align-items:center; }\n"
    "  .btn { display:inline-block; border:1px solid #3e5a97; background:#5b79b8; color:#fff; padding:4px 9px; text-decoration:none; cursor:pointer; border-radius:5px; line-height:1.2; }\n"
    "  .btn:hover { background:#4668aa; }\n"
    "  .table-scroll { max-height:70vh; overflow-y:auto; overflow-x:auto; border:1px solid #ddd; }\n"
    "  table.sortable { border-collapse:collapse; width:%dpx; table-layout:fixed; }\n"
    "  table.sortable th, table.sortable td { border:1px solid #ddd; padding:6px 8px; vertical-align:top; }\n"
    "  table.sortable th { background:#e3eaf8; text-align:left; position:sticky; top:0; z-index:2; box-shadow:0 1px 0 #ccc; }\n"
    "  table.sortable td { word-wrap:break-word; overflow-wrap:break-word; }\n"
    "  #first-row td { border-top:2px solid #999; }\n"
    "  .back-to-top { position:fixed; right:16px; bottom:16px; padding:6px 10px; border:1px solid #3e5a97; background:#5b79b8; color:#fff; cursor:pointer; border-radius:6px; font-size:12px; display:none; z-index:9999; }\n"
    "  .back-to-top:hover { background:#4668aa; }\n"
    "</style>\n"
) % (TABLE_WIDTH_PX, TABLE_WIDTH_PX)  # NOTE: %-formatting — any literal percent sign added to the CSS must be written as %%

# ---------- HTML ----------
# Complete page source.  The template carries two "%s" placeholders:
#   1) inside <head>  -> TABLE_CSS (the inline stylesheet)
#   2) inside .wrap   -> html_table_scrolling (the table wrapped in .table-scroll)
# The page declares charset iso-8859-15.
# NOTE: %-formatting — any literal percent sign added to the markup or script must be written as %%.
FULL_HTML = (
    "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
    "  \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
    "<html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"en\">\n"
    "<head>\n"
    "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-15\" />\n"
    "<meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\" />\n"
    "<title>DNA Cousin Surname &mdash; REFACTOR (Stokes Display)</title>\n"
    "%s"
    "</head>\n"
    "<body id=\"top\">\n"
    "<div class=\"wrap\">\n"
    "  <div class=\"topbar\">\n"
    "    <div class=\"left\">&laquo; <a href=\"https://yates.one-name.net/gengen/dna_cousin_surname_study.htm\">Return to Study Home</a></div>\n"
    "    <div class=\"right\">\n"
    "      <div class=\"updated\" id=\"last-updated\"></div>\n"
    "    </div>\n"
    "  </div>\n"
    "  <h1>DNA Cousin Surname &mdash; REFACTOR (Stokes Display)</h1>\n"
    # Sort bar: each .btn span carries the column index and direction read by the click handler below.
    "  <div class=\"sortbar\">\n"
    "    <span class=\"btn\" data-sort-col=\"0\" data-sort-dir=\"asc\">Sort Match Summary &uarr;</span>\n"
    "    <span class=\"btn\" data-sort-col=\"0\" data-sort-dir=\"desc\">Sort Match Summary &darr;</span>\n"
    "    <span class=\"btn\" data-sort-col=\"1\" data-sort-dir=\"asc\">Sort Lineage &uarr;</span>\n"
    "    <span class=\"btn\" data-sort-col=\"1\" data-sort-dir=\"desc\">Sort Lineage &darr;</span>\n"
    "    <a class=\"btn\" href=\"gengen/images/cousin-calculator.jpg\" target=\"_blank\">Cousin Connection</a>\n"
    "    <a class=\"btn\" href=\"gengen/images/Shared_cM_Project_v4.jpg\" target=\"_blank\">Cousin by DNA</a>\n"
    "  </div>\n"
    "  %s\n"
    "</div>\n"
    "<button id=\"back-to-top\" class=\"back-to-top\" aria-label=\"Back to top\">&#9650; Top</button>\n"
    # Inline script: text-based row sorting for #refactor-table (sort-bar buttons and
    # header clicks), an initial ascending sort on column 1, the "Last updated" stamp
    # taken from document.lastModified, and the back-to-top button.
    "<script type=\"text/javascript\">\n"
    "//<![CDATA[\n"
    "(function () {\n"
    "  function textOf(cell) { return (cell.textContent || cell.innerText || '').trim().toLowerCase(); }\n"
    "  function sortTable(tbl, colIndex, dir) {\n"
    "    var tbody = tbl.tBodies[0]; if (!tbody) return;\n"
    "    var rows = Array.prototype.slice.call(tbody.rows);\n"
    "    rows.sort(function(a, b) {\n"
    "      var A = textOf(a.cells[colIndex] || {}), B = textOf(b.cells[colIndex] || {});\n"
    "      if (A < B) return dir === 'asc' ? -1 : 1;\n"
    "      if (A > B) return dir === 'asc' ? 1 : -1; return 0;\n"
    "    });\n"
    "    var frag = document.createDocumentFragment();\n"
    "    for (var i=0; i<rows.length; i++) frag.appendChild(rows[i]);\n"
    "    tbody.appendChild(frag);\n"
    "  }\n"
    "  var tbl = document.getElementById('refactor-table'); if (!tbl) return;\n"
    "  var bar = document.querySelector('.sortbar');\n"
    "  if (bar) { bar.addEventListener('click', function(e) {\n"
    "    var btn = e.target.closest('.btn'); if (!btn) return;\n"
    "    var col = parseInt(btn.getAttribute('data-sort-col'), 10);\n"
    "    var dir = btn.getAttribute('data-sort-dir') || 'asc'; sortTable(tbl, col, dir);\n"
    "  }, false); }\n"
    "  if (tbl.tHead && tbl.tHead.rows.length) {\n"
    "    var ths = tbl.tHead.rows[0].cells;\n"
    "    for (var i=0; i<ths.length; i++) (function(idx){\n"
    "      var dir = 'asc'; ths[idx].addEventListener('click', function(){ dir = (dir === 'asc') ? 'desc' : 'asc'; sortTable(tbl, idx, dir); }, false);\n"
    "    })(i);\n"
    "  }\n"
    "  // Initial sort\n"
    "  sortTable(tbl, 1, 'asc');\n"
    "  // Last updated (topbar)\n"
    "  var el = document.getElementById('last-updated'); if (el) {\n"
    "    var d = new Date(document.lastModified || new Date());\n"
    "    function z(n){return (n<10?'0':'')+n;}\n"
    "    el.innerHTML = 'Last updated: ' + d.getFullYear() + '-' + z(d.getMonth()+1) + '-' + z(d.getDate()) + ' ' + z(d.getHours()) + ':' + z(d.getMinutes());\n"
    "  }\n"
    "  // Back to top listens to both window and .table-scroll\n"
    "  var btt = document.getElementById('back-to-top'); var container = document.querySelector('.table-scroll');\n"
    "  function onAnyScroll(){ var y=(window.scrollY||window.pageYOffset||0); var cy=container?container.scrollTop:0; btt.style.display = (y>200||cy>200)?'block':'none'; }\n"
    "  window.addEventListener('scroll', onAnyScroll, {passive:true}); if (container) container.addEventListener('scroll', onAnyScroll, {passive:true}); onAnyScroll();\n"
    "  btt.addEventListener('click', function(){ if (container) container.scrollTo({top:0, behavior:'smooth'}); window.scrollTo({top:0, behavior:'smooth'}); });\n"
    "})();\n"
    "//]]>\n"
    "</script>\n"
    "</body>\n"
    "</html>\n"
) % (TABLE_CSS, html_table_scrolling)

# ---------- Save & Upload ----------
# Write the page locally; characters outside iso-8859-15 become numeric character references.
with open(LOCAL_NAME, "w", encoding="iso-8859-15", errors="xmlcharrefreplace") as page_file:
    page_file.write(FULL_HTML)

with ftp_connect() as ftps:
    # Best-effort removal of the previous remote page before uploading the new one.
    try:
        ftps.delete(_remote_path(REMOTE_NAME))
    except Exception:
        pass
    ftp_upload_overwrite(ftps, LOCAL_NAME, _remote_path(REMOTE_NAME))

    # Optional companion files: uploaded only when enabled and present on disk.
    if UPLOAD_COLUMN_A:
        if os.path.exists(MATCH_COUSINS_CSV):
            ftp_upload_overwrite(ftps, MATCH_COUSINS_CSV, _remote_path(MATCH_COUSINS_CSV))

    if UPLOAD_CACHE_JSON:
        if os.path.exists(RESOLVER_CACHE_JSON):
            ftp_upload_overwrite(ftps, RESOLVER_CACHE_JSON, _remote_path(RESOLVER_CACHE_JSON))

# ---------- Report: Resolver usage counts (CSV + console preview) ----------
from collections import Counter


def _normalise_code(raw):
    """Strip and lower-case a subject code so it can be compared with resolver keys."""
    return str(raw).strip().lower()


# Tally every non-empty subject code in the dataset ("nan" is the string form of a missing cell).
used_series = df[subject_code_col].astype(str).map(_normalise_code)
counts = Counter(code for code in used_series if code and code != "nan")

all_keys = set(MATCH_TO_UNMASKED.keys())

# 1) Every resolver entry with its count (zeros included), in key order.
rows = [
    (code, MATCH_TO_UNMASKED.get(code, ""), counts.get(code, 0))
    for code in sorted(all_keys)
]

# 2) Codes present in the data but absent from the resolver, flagged as unmapped.
extra_codes = sorted(set(counts.keys()) - all_keys)
rows.extend((code, "(unmapped)", counts.get(code, 0)) for code in extra_codes)

usage_df = pd.DataFrame(rows, columns=["Match to (code)", "Unmasked", "Count"]).sort_values(
    ["Count", "Match to (code)"], ascending=[False, True]
)

# Save CSV locally and upload
RESOLVER_USAGE_CSV = "resolver_usage_report.csv"
usage_df.to_csv(RESOLVER_USAGE_CSV, index=False, encoding="iso-8859-15")
print("✅ Wrote resolver usage CSV:", os.path.abspath(RESOLVER_USAGE_CSV))

# A failed upload is reported but does not stop the cell.
try:
    with ftp_connect() as ftps:
        ftp_upload_overwrite(ftps, RESOLVER_USAGE_CSV, _remote_path(RESOLVER_USAGE_CSV))
    print(f"✅ Uploaded resolver usage CSV: https://yates.one-name.net/{RESOLVER_USAGE_CSV}")
except Exception as e:
    print(f"⚠️  Upload of resolver usage CSV failed: {e}")

# Console preview
print(f"📊 Resolver usage (top 30 of {len(usage_df)} rows):")
for raw_code, raw_unmasked, raw_count in usage_df.head(30).itertuples(index=False, name=None):
    code = str(raw_code)
    unm  = str(raw_unmasked)
    cnt  = int(raw_count)
    print(f"   {code:<20s} → {unm[:28]:<28s} : {cnt}")
# ========= CUT END   [3/3] =======================================================================


⬇️  Pulled remote file: match_to_unmasked.csv → /content/match_to_unmasked.csv
Resolver: CSV (77), cached
✅ Loaded HTML table — 1623 rows, 7 columns
✅ Wrote local CSV (Column A): /content/the_match_cousins.csv
⬆️  Uploaded: dna_cousin_surname_REFACTOR.htm → dna_cousin_surname_REFACTOR.htm
⬆️  Uploaded: the_match_cousins.csv → the_match_cousins.csv
✅ Wrote resolver usage CSV: /content/resolver_usage_report.csv
⬆️  Uploaded: resolver_usage_report.csv → resolver_usage_report.csv
✅ Uploaded resolver usage CSV: https://yates.one-name.net/resolver_usage_report.csv
📊 Resolver usage (top 30 of 89 rows):
   yates,ronald         → Ron Yates                    : 135
   marmar               → Mary Marshall                : 69
   y-dna                → (unmapped)                   : 68
   hatpat               → Pat Hatfield                 : 64
   wishardglen          → Glen Wishard                 : 60
   yates,patricial      → Pat Yates                    : 54
   yates,johnh          → J Harry Ya

In [None]:
# Gold Cell 3 for Y-DNA Grid with Auto-Adjusting Column Widths

import os
import pandas as pd
from datetime import datetime
from zoneinfo import ZoneInfo
from ftplib import FTP_TLS

# ── PATHS ─────────────────────────────────────────────────────────────────
combo_csv  = "/content/y_dna_user_detail_combo.csv"
output_csv = "/content/y_dna_grid.csv"
output_htm = "/content/y_dna_grid.htm"

# ── 1) Load vertical data ─────────────────────────────────────────────────
df = pd.read_csv(combo_csv)

# Rename “Date” → “Era” (reassigned rather than mutated in place)
if "Date" in df.columns:
    df = df.rename(columns={"Date": "Era"})

# ── 2) Insert Action *after* Era ──────────────────────────────────────────
# Era is expected at index 1, so Action goes at index 2.  The guard keeps the
# cell re-runnable: DataFrame.insert raises ValueError if the column already exists.
if "Action" not in df.columns:
    df.insert(2, "Action", ["→"] * len(df))

# ── 3) Save vertical CSV ─────────────────────────────────────────────────
df.to_csv(output_csv, index=False)
print(f"✅ Saved vertical grid CSV to {output_csv}")

# ── 4) Build HTML ─────────────────────────────────────────────────────────
now = datetime.now(ZoneInfo("America/New_York"))
# %Z prints the zone abbreviation actually in effect (EST or EDT); the previous
# hard-coded "EDT" was wrong for half of the year.
ts  = now.strftime("%-m/%-d/%y, %-I:%M %p %Z")
cols = df.columns.tolist()

# Page head, stylesheet and the opening of the table, up to the header <tr>.
# The only interpolated value is {ts}; CSS braces are doubled because this is an f-string.
# The white link colour is scoped to header cells ("thead th a"): as a global "a" rule
# it also made the "Return to DNA Cousin Surname Study" link white on the pale page.
html = f"""<!DOCTYPE html>
<html>
<head><meta charset="UTF-8"><title>Yates Y-DNA Grid</title>
<style>
body {{
  background: #faf9d3;
  font-family: Arial, sans-serif;
  font-size: 14px;
  margin: 0;
  padding: 0;
}}
.container {{
  padding: 10px;
}}
.table-container {{
  overflow-x: auto;
  max-height: 80vh;
}}
table {{
  border: 2px solid #333;
  border-collapse: collapse;
  margin: 0 auto;
}}
table.mainsection {{
  /* allows CSS targeting of blank under “Year” */
}}
thead {{
  display: table-header-group;
}}
thead th {{
  position: sticky;
  top: 0;
  background: #333;
  color: #fff;
  padding: 6px;
  border: 1px solid #999;
  z-index: 3;
}}
thead th a {{
  color: #fff;
  text-decoration: underline;
}}
.era {{
  background: #666;
  color: #eee;
  padding: 6px;
  border: 1px solid #999;
  font-size: 0.9em;
}}
.action {{
  background: #fff;
  padding: 6px;
  border: 1px solid #999;
  text-align: center;
}}
td {{
  padding: 6px;
  border: 1px solid #999;
  text-align: center;
}}
th:nth-child(n+4),
td:nth-child(n+4) {{
  border: 1px solid #333;
}}
.match {{
  background: #fff;
}}
.blank {{
  background: #ccc;
  color: #ccc;
}}
/* make the blank under the “Year” header match the era-cell background */
table.mainsection td.blank:nth-child(2) {{
  background-color: #fdfcd0;
}}
</style>
</head>
<body>
  <div class="container">
    <h1 style="text-align:center">Yates Y-DNA Grid</h1>
    <p style="text-align:center;font-size:0.9em">Updated: {ts}</p>
    <p style="text-align:center;margin-bottom:12px">
      <a href="https://yates.one-name.net/gengen/dna_cousin_surname_study.htm">
        Return to DNA Cousin Surname Study
      </a>
    </p>
    <div class="table-container">
      <table class="mainsection">
        <thead>
          <tr>"""

# Header row
def _grid_header_cell(position, col_name):
    """Return the <th> for one column: fixed titles for the first three, a chart link for the rest."""
    fixed_titles = ("SNP", "Year", "Action")
    if position < len(fixed_titles):
        return f"<th>{fixed_titles[position]}</th>"
    # The person ID is the part of the column name before the first "-", upper-cased.
    pid = col_name.split("-")[0].upper()
    return (
        '<th>'
        f'<a href="https://yates.one-name.net/tng/verticalchart.php?'
        f'personID={pid}&tree=tree1&parentset=0&display=vertical&generations=15">{col_name}</a>'
        '</th>'
    )


def _grid_body_cell(position, value):
    """Return the <td> for one value; empty/NaN values (except in column 0) render as a blank dash."""
    if position == 0:
        return f"<td>{value}</td>"
    if pd.isna(value) or not str(value).strip():
        return '<td class="blank">–</td>'
    css_class = {1: "era", 2: "action"}.get(position, "match")
    return f'<td class="{css_class}">{value}</td>'


html += "".join(_grid_header_cell(i, c) for i, c in enumerate(cols))

html += """
          </tr>
        </thead>
        <tbody>"""

# Data rows
html += "".join(
    "<tr>" + "".join(_grid_body_cell(i, row[c]) for i, c in enumerate(cols)) + "</tr>"
    for _, row in df.iterrows()
)

html += """
        </tbody>
      </table>
    </div>
  </div>
</body>
</html>"""

with open(output_htm, "w", encoding="utf-8") as page_file:
    page_file.write(html)
print(f"✅ Saved vertical XHTML to {output_htm}")

# ── 5) FTP upload ────────────────────────────────────────────────────────
import ftplib  # local import: only the FTP exception base class is needed here

# The context manager closes the control connection even when login or an upload fails.
with FTP_TLS() as ftp:
    ftp.connect(os.environ["FTP_HOST"], int(os.environ["FTP_PORT"]))
    ftp.login(os.environ["FTP_USER"], os.environ["FTP_PASS"])
    ftp.prot_p()  # switch the data connection to TLS as well
    for path in (output_csv, output_htm):
        fn = os.path.basename(path)
        try:
            ftp.delete(fn)
        except ftplib.Error:
            # Best effort: the file may simply not exist yet.  Network failures
            # (OSError) are no longer swallowed, unlike with the old bare except.
            pass
        with open(path, "rb") as fp:
            ftp.storbinary(f"STOR {fn}", fp)
print("✅ Uploaded CSV & HTML to server")



✅ Saved vertical grid CSV to /content/y_dna_grid.csv
✅ Saved vertical XHTML to /content/y_dna_grid.htm
✅ Uploaded CSV & HTML to server


In [None]:
# EXP

import os
import pandas as pd
from datetime import datetime
from zoneinfo import ZoneInfo
from ftplib import FTP_TLS

# ── CONFIG ───────────────────────────────────────────────────────────────
info_csv   = "/content/haplogroup_info.csv"
user_csv   = "/content/y_dna_user_detail.csv"
output_csv = "/content/y_dna_grid.csv"
output_htm = "/content/y_dna_grid.htm"

# ── 1) Load & prepare haplogroup info ───────────────────────────────────
df_info = pd.read_csv(info_csv)
if "Date" in df_info.columns:
    df_info = df_info.rename(columns={"Date": "Era"})
# Keep the first row for each haplogroup.
df_info = df_info.loc[df_info["Haplogroup"].drop_duplicates().index]
hap_order = df_info["Haplogroup"].tolist()
era_map   = dict(zip(df_info["Haplogroup"], df_info.get("Era", [""] * len(df_info))))

# ── 2) Load user detail table ───────────────────────────────────────────
df_users = pd.read_csv(user_csv)
if "User_ID" not in df_users.columns:
    df_users = df_users.rename(columns={df_users.columns[0]: "User_ID"})
# One chain per user: the non-empty cells of the row, in column order.
user_chains = [
    [str(v) for v in row.drop(labels=["User_ID"]).tolist() if pd.notna(v) and str(v).strip()]
    for _, row in df_users.iterrows()
]

# ── 3) Insert new SNPs after parent ──────────────────────────────────────
# An unknown SNP goes right after its predecessor in the chain.  An unknown SNP
# with no predecessor (the first item of a chain) is appended at the end, so
# every chain item ends up in hap_order.  Previously such a first item was
# skipped, and the next unknown item then crashed in hap_order.index(prev)
# with ValueError.
for chain in user_chains:
    prev = None
    for h in chain:
        if h not in hap_order:
            if prev in hap_order:
                hap_order.insert(hap_order.index(prev) + 1, h)
            else:
                hap_order.append(h)
        prev = h
# Build final eras list (SNPs without an entry in the info table get an empty era)
eras = [era_map.get(h, "") for h in hap_order]

# ── 4) Build horizontal grid DataFrame ───────────────────────────────────
for h in hap_order:
    if h not in df_users.columns:
        df_users[h] = ""
df_grid_h = df_users[["User_ID"] + hap_order]

# ── 5) Transform to vertical layout ─────────────────────────────────────
df_vert = df_grid_h.set_index("User_ID").T
# Insert Era as first column
df_vert.insert(0, 'Era', eras)
df_vert.index.name = 'SNP'
df_grid = df_vert.reset_index()

# ── 6) Save vertical CSV ─────────────────────────────────────────────────
df_grid.to_csv(output_csv, index=False)
print(f"✅ Vertical grid CSV saved to {output_csv}")

# ── 7) Generate XHTML (vertical) ────────────────────────────────────────
now = datetime.now(ZoneInfo("America/New_York"))
# %Z prints the zone abbreviation actually in effect (EST or EDT) instead of a fixed "EDT".
ts  = now.strftime("%-m/%-d/%y, %-I:%M %p %Z")

template = '''<!DOCTYPE html>
<html><head><meta charset="UTF-8"><title>Yates Y-DNA Grid</title>
<style>
  body { background:#faf9d3; font-family:Arial,Helvetica,sans-serif; font-size:14px; }
  table { width:100%; border:1px solid #333; border-collapse:collapse; table-layout:auto; }
  th { background:#333; color:#fff; padding:6px; border:1px solid #999; }
  .era { background:#666; color:#eee; padding:6px; border:1px solid #999; font-size:0.9em; }
  td { padding:6px; border:1px solid #999; text-align:center; white-space:nowrap; }
  .match { background:#fff; }
  .blank { background:#ccc; color:#ccc; }
</style>
</head><body>
  <h1 style="text-align:center;">Yates Y-DNA Grid</h1>
  <table>
'''  # end template

# Build header row: the first two columns are SNP and Era, the rest are one per user.
cols = df_grid.columns.tolist()
header_html = '<tr><th>SNP</th><th>Era</th>' + ''.join(f'<th>{u}</th>' for u in cols[2:]) + '</tr>'

# Build data rows
rows_html = []
for _, row in df_grid.iterrows():
    cells = []
    for u in cols[2:]:
        v = row[u]
        if pd.isna(v) or not str(v).strip():
            cells.append('<td class="blank">–</td>')
        else:
            cells.append(f'<td class="match">{v}</td>')
    # A missing era would otherwise be printed literally as "nan".
    era_value = row["Era"]
    era_text = "" if pd.isna(era_value) else era_value
    rows_html.append(f'<tr><td>{row["SNP"]}</td><td class="era">{era_text}</td>' + ''.join(cells) + '</tr>')

# Combine and save HTML
html = template + header_html + '\n' + '\n'.join(rows_html) + f'''
  </table>
  <p style="text-align:right;font-size:0.9em;">Updated: {ts}</p>
</body>
</html>'''
with open(output_htm, 'w', encoding='utf-8') as f:
    f.write(html)
print(f"✅ Vertical XHTML Grid saved to {output_htm}")

# ── 8) FTP Upload ───────────────────────────────────────────────────────
import ftplib  # local import: only the FTP exception base class is needed here

# The context manager closes the control connection even when login or an upload fails.
with FTP_TLS() as ftp:
    ftp.connect(os.environ['FTP_HOST'], int(os.environ.get('FTP_PORT',21)))
    ftp.login(os.environ['FTP_USER'], os.environ['FTP_PASS'])
    ftp.prot_p()  # switch the data connection to TLS as well
    for path in [output_csv, output_htm]:
        name = os.path.basename(path)
        try:
            ftp.delete(name)
        except ftplib.Error:
            # Best effort: the file may simply not exist yet.  Network failures
            # (OSError) are no longer swallowed, unlike with the old bare except.
            pass
        with open(path,'rb') as fp:
            ftp.storbinary(f"STOR {name}", fp)
print("✅ Uploaded to server.")



✅ Vertical grid CSV saved to /content/y_dna_grid.csv
✅ Vertical XHTML Grid saved to /content/y_dna_grid.htm
✅ Uploaded to server.


In [None]:
# Y-DNA cell 1

# === Cell 1: New user settings ===
# Inputs for the following "Cell 2" (Load → Append User → Save), which reads
# MASTER_CSV, adds a column named USER_ID, and writes the result to UPDATED_CSV.
USER_ID       = 'I56217'  # the new column header
PATH_STRING   = (      # the SNP chain for this user; Cell 2 splits it on '>'
    "R-M207 > R-M173 > R-M343 > R-M269 > R-FT266064 > R-FT266579 > R-FTF17042"
)
INSERT_MISSING = True       # if True, adds any SNPs from PATH_STRING that aren't yet rows
MASTER_CSV     = '/content/y_dna_user_detail_combo.csv'          # input grid, read by Cell 2
UPDATED_CSV    = '/content/y_dna_user_detail_combo_updated.csv'  # output path, written by Cell 2


In [None]:
# Cell 2: Load → Append User → Save

import pandas as pd

# 1) Load the existing master CSV
df = pd.read_csv(MASTER_CSV)

# 2) Normalize the first column name to 'SNP' for easy matching
first_col = df.columns[0]
if first_col != 'SNP':
    df = df.rename(columns={first_col: 'SNP'})

# 3) Parse the new user's SNP chain
# PATH_STRING is written as "A > B > C", so each piece must be stripped.  Without
# the strip the pieces keep their surrounding spaces, never match an existing SNP
# row, and are all appended as new whitespace-padded rows.
chain = [snp.strip() for snp in PATH_STRING.split('>') if snp.strip()]

# 4) Optionally insert any SNPs not yet present (appends at bottom)
if INSERT_MISSING:
    missing = [s for s in chain if s not in df['SNP'].values]
    if missing:
        df = pd.concat([df, pd.DataFrame([{'SNP': s} for s in missing])],
                       ignore_index=True)

# 5) Create the new user column in the next free position
#    (an existing column with the same name is reset)
df[USER_ID] = ''

# 6) Populate: copy the SNP value into that column where it matches the chain
df.loc[df['SNP'].isin(chain), USER_ID] = df['SNP']

# 7) Save the updated CSV back to /content
df.to_csv(UPDATED_CSV, index=False)
print(f"✅ Updated CSV saved to {UPDATED_CSV}")


✅ Updated CSV saved to /content/y_dna_user_detail_combo_updated.csv
