<a href="https://colab.research.google.com/github/ronyates47/Gedcom-Utils/blob/main/Gold__1_%26_2_20250511.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
!pip install pandas
!pip install python-gedcom
!pip install openpyxl
!pip install xlsxwriter
!pip install mlxtend




In [9]:
# Credentials
#
# Secrets are not stored in the notebook. Each value is taken from the
# environment when it is already set; otherwise the user is prompted for it.
# Passwords are read with getpass so they are neither echoed nor saved in the
# cell output.
# NOTE(review): the Gmail app password and FTP password that were previously
# committed in this cell should be revoked and rotated.

import os
from getpass import getpass

# Gmail SMTP creds
if not os.environ.get('GMAIL_USER'):
    os.environ['GMAIL_USER'] = input('Gmail address: ').strip()
if not os.environ.get('GMAIL_APP_PASSWORD'):
    os.environ['GMAIL_APP_PASSWORD'] = getpass('Gmail app password: ')

# FTPS upload creds (host and port are not secret, so they keep defaults)
os.environ.setdefault('FTP_HOST', 'ftp.one-name.net')
os.environ.setdefault('FTP_PORT', '21')
if not os.environ.get('FTP_USER'):
    os.environ['FTP_USER'] = input('FTP user: ').strip()
if not os.environ.get('FTP_PASS'):
    os.environ['FTP_PASS'] = getpass('FTP password: ')


In [None]:
# Cell 1 20250513
#!/usr/bin/env python
"""
GEDCOM Composite Score Script using:
 - Chunk-based Parallel Processing for Speed (Stage 1: genealogical line creation)
 - A Trie-based approach, then final "Value" = 5 * (number of couples with node.count >=2) + (total couples)

For ancestral lines where none of the couples are repeated (a one-off line), the Value is still computed.
Now, instead of composite scoring, two new columns are added:
  - Value Range (the numeric bracket)
  - Value Label (a descriptive label)

Exports final CSV/HTML sorted by "Yates DNA Ancestral Line", including a 'haplogroup' column.
"""
import csv
import glob
import logging
import functools
import os
from datetime import datetime
from collections import defaultdict, Counter
import numpy as np
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
from IPython.display import display, Javascript

# Configure logging once for the notebook; `logger` is the module-level logger
# used by the parsing and export code below.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

###############################################################################
# Global Variables
###############################################################################
# Overwritten as a side effect of GedcomDataset.get_gen_person() with the
# surname + first name of the most recently queried individual.
anchor_gen1 = None
# Parent pairs already recorded by find_parents(); reset for every record in
# process_record_wrapper().
visited_pairs = set()
# (generation, (father_id, mother_id)) tuples collected by find_parents().
generation_table = []

###############################################################################
# Trie Data Structure
###############################################################################
class TrieNode:
    """One couple in the trie of ancestral lines.

    A node records how many inserted lines pass through it and links to the
    nodes of the couples that follow it.
    """
    def __init__(self):
        # Child nodes keyed by the couple that follows this one.
        self.children = dict()
        # Number of inserted lines that pass through this node.
        self.count = 0

class Trie:
    """Prefix tree over ancestral lines, each line being a sequence of couples."""
    def __init__(self):
        self.root = TrieNode()

    def insert_line(self, couples_list):
        """Add one line, incrementing the count of every node along its path."""
        node = self.root
        for couple in couples_list:
            child = node.children.get(couple)
            if child is None:
                child = node.children[couple] = TrieNode()
            child.count += 1
            node = child

    def get_couple_count(self, couples_list):
        """Return the node count for each couple along the line.

        Walking stops at the first couple that is not in the trie; a single 0
        is appended for that couple and nothing is reported for the rest.
        """
        tallies = []
        node = self.root
        for couple in couples_list:
            node = node.children.get(couple)
            if node is None:
                tallies.append(0)
                return tallies
            tallies.append(node.count)
        return tallies

###############################################################################
# Utility: chunk generator
###############################################################################
def chunks(lst, n):
    """Yield consecutive slices of `lst`, each holding at most `n` items."""
    yield from (lst[offset:offset + n] for offset in range(0, len(lst), n))

###############################################################################
# GedcomDataset
###############################################################################
class GedcomDataset:
    """Details extracted for one GEDCOM individual (INDI) record.

    `gen_person` is the record's xref as it appears in the file (for example
    ``@I123@``). Tag values are stored in `extractable_detail`; the parser in
    this file stores NAME, FAMC and NPFX. The NPFX value is read as
    ``<cM>&<sort>**<Y-DNA>``, where the ``&`` and ``**`` parts are optional.

    A tag stored with a value of None (a line that had no payload) is treated
    as an empty string by every getter, so such records no longer raise.
    """
    def __init__(self, gen_person):
        self.gen_person = gen_person
        self.extractable_detail = {}
        self.anchor_gen1 = None

    def add_extractable_detail(self, key, value):
        """Store `value` under tag `key`, replacing any earlier value."""
        self.extractable_detail[key] = value

    def _detail(self, key):
        """Return the stored value for `key`, or '' when missing or None."""
        return self.extractable_detail.get(key) or ''

    def get_gen_person(self):
        """Return the individual's ID without the surrounding '@'.

        Side effect: computes surname + first given name (spaces removed) from
        NAME and stores it in `self.anchor_gen1` and in the module-level
        `anchor_gen1`.
        """
        name = self._detail('NAME')
        parts = name.split('/', 1)
        first_name = parts[0].split(' ')[0]
        last_name = parts[1].rstrip('/') if len(parts) > 1 else ""
        self.anchor_gen1 = last_name.replace(" ", "") + first_name.replace(" ", "")
        global anchor_gen1
        anchor_gen1 = self.anchor_gen1
        return self.gen_person.strip('@')

    def get_extractable_NPFX(self):
        """Return the raw NPFX value ('' when absent)."""
        return self._detail('NPFX')

    def get_extractable_cm(self):
        """Return the leading cM part of NPFX as a string, or '' if it is not an integer."""
        npfx_value = self._detail('NPFX')
        if '&' in npfx_value:
            cm_value = npfx_value.split('&')[0].strip()
        elif '**' in npfx_value:
            cm_value = npfx_value.split('**')[0].strip()
        else:
            cm_value = npfx_value.strip()
        try:
            # Validation only: the original text is returned, not the int.
            int(cm_value)
            return cm_value
        except ValueError:
            return ''

    def get_extractable_sort(self):
        """Return the text between '&' and '**' (or the end) of NPFX, else ''."""
        npfx_value = self._detail('NPFX')
        if '&' in npfx_value:
            sort_part = npfx_value.split('&')[1]
            if '**' in sort_part:
                sort_value = sort_part.split('**')[0].strip()
            else:
                sort_value = sort_part.strip()
            return sort_value
        return ''

    def get_extractable_YDNA(self):
        """Return the text after the first '**' in NPFX, else ''."""
        npfx_value = self._detail('NPFX')
        if '**' in npfx_value:
            ydna_value = npfx_value.split('**')[1].strip()
            return ydna_value
        return ''

    def get_extractable_FAMC(self):
        """Return the FAMC family ID without the surrounding '@' ('' when absent)."""
        return self._detail('FAMC').strip('@')

###############################################################################
# Gedcom Class
###############################################################################
class Gedcom:
    """Reads a GEDCOM file and collects the individuals that carry an NPFX tag.

    Attributes:
        file_name: path of the GEDCOM file to read.
        gedcom_datasets: one GedcomDataset per INDI record, in file order.
        filter_pool: the datasets with a non-empty NPFX value, optionally
            narrowed further by ``filtered_ids.xlsx``.
    """
    def __init__(self, file_name):
        self.file_name = file_name
        self.gedcom_datasets = []
        self.filter_pool = []

    def parse_gedcom(self):
        """Parse the file, fill `gedcom_datasets` / `filter_pool`, print a summary.

        Only ``1 NAME``, ``1 FAMC`` and ``2 NPFX`` lines of INDI records are
        kept. If ``filtered_ids.xlsx`` exists in the working directory, the
        pool is restricted to the IDs in its ``ID`` column.

        Returns:
            int: NPFX-tagged records minus those whose NPFX contains ``**``
            (reported as "Autosomal matches").
        """
        current_dataset = None
        npfx_count = 0
        ydna_count = 0
        total_count = 0

        # utf-8-sig transparently drops a leading byte-order mark.
        with open(self.file_name, 'r', encoding='utf-8-sig') as f:
            for line in f:
                parts = line.strip().split(' ', 2)
                # Blank lines and lines without "<level> <tag>" cannot be
                # interpreted; skip them instead of crashing.
                if len(parts) < 2:
                    continue
                try:
                    level = int(parts[0])
                except ValueError:
                    continue
                tag = parts[1]
                value = parts[2] if len(parts) > 2 else None

                if level == 0:
                    if tag.startswith('@') and tag.endswith('@') and value == 'INDI':
                        total_count += 1
                        current_dataset = GedcomDataset(tag)
                        self.gedcom_datasets.append(current_dataset)
                    else:
                        # Any other level-0 record ends the current individual,
                        # so its sub-lines (e.g. a "1 NAME" of a submitter or
                        # repository record) cannot overwrite that individual.
                        current_dataset = None
                elif current_dataset is not None:
                    if level == 1 and tag in ['NAME', 'FAMC']:
                        current_dataset.add_extractable_detail(tag, value)
                    elif level == 2 and tag == 'NPFX' and value:
                        # An NPFX line without a value carries no tagging
                        # information and is ignored.
                        npfx_count += 1
                        current_dataset.add_extractable_detail(tag, value)
                        if '**' in value:
                            ydna_count += 1

        autosomal_count = npfx_count - ydna_count
        print(f"GEDCOM contained {total_count} total records")
        print(f"Records tagged and filtered by NPFX: {npfx_count}")
        print(f"Records with YDNA information: {ydna_count}")
        print(f"Autosomal matches: {autosomal_count}")

        for ds in self.gedcom_datasets:
            if ds.get_extractable_NPFX():
                self.filter_pool.append(ds)

        manual_filter_activated = True
        if manual_filter_activated:
            try:
                df = pd.read_excel('filtered_ids.xlsx')
            except FileNotFoundError:
                logger.warning("filtered_ids.xlsx not found. Skipping second-level manual filter.")
            else:
                manual_filtered_ids = set(df['ID'])
                self.filter_pool = [d for d in self.filter_pool if d.get_gen_person() in manual_filtered_ids]
                print(f"After manual filter, total records: {len(self.filter_pool)}")
                logger.info(f"After manual filter, total records: {len(self.filter_pool)}")

        return autosomal_count

###############################################################################
# quick_extract_name
###############################################################################
def quick_extract_name(full_text):
    """Return a compact "SurnameGiven" label from a record's first NAME line.

    The value of the first ``1 NAME`` line found after a newline is used; if
    there is none but the text itself starts with ``1 NAME ``, that line is
    used. When the value has no '/', its first 10 characters (spaces removed)
    are returned. Otherwise the result is the first 10 characters of the
    surname followed by the first 10 characters of the given names, each with
    spaces removed.

    Returns "UnknownName" when the text has no NAME line.
    """
    tag = "1 NAME "
    idx = full_text.find("\n" + tag)
    if idx != -1:
        # Skip the newline as well as the tag.
        start = idx + 1 + len(tag)
    elif full_text.startswith(tag):
        # No preceding newline here, so only the tag itself is skipped
        # (skipping one more character would drop the first letter).
        start = len(tag)
    else:
        return "UnknownName"
    end = full_text.find('\n', start)
    if end == -1:
        end = len(full_text)
    name_line = full_text[start:end].strip()
    if '/' not in name_line:
        return name_line[:10].replace(" ", "")
    first_name, last_name = name_line.split('/', 1)
    last_name = last_name.replace("/", "").strip()
    return last_name[:10].replace(" ", "") + first_name[:10].replace(" ", "")

###############################################################################
# Parents & Ancestors
###############################################################################
def find_parents(individual_id, generation, parents_map):
    """Record every ancestral couple of `individual_id` in the module globals.

    Each newly seen (father_id, mother_id) pair is added to `visited_pairs`
    and appended to `generation_table` as ``(generation, pair)``, where
    `generation` is the depth at which the pair was first reached.

    A pair that was already recorded is not walked again: its ancestors were
    recorded on the first visit, so repeating the walk would add nothing. This
    avoids re-traversing shared ancestry and stops the recursion if the data
    contains a parent/child cycle.

    Args:
        individual_id: ID whose parents are looked up in `parents_map`.
        generation: generation number to record for this individual's parents.
        parents_map: mapping of child ID to (father_id, mother_id).
    """
    global visited_pairs, generation_table
    if individual_id not in parents_map:
        return
    father_id, mother_id = parents_map[individual_id]
    if not father_id and not mother_id:
        return
    pair = (father_id, mother_id)
    if pair in visited_pairs:
        return
    visited_pairs.add(pair)
    generation_table.append((generation, pair))
    if father_id:
        find_parents(father_id, generation+1, parents_map)
    if mother_id:
        find_parents(mother_id, generation+1, parents_map)

def find_distant_ancestors(individual_id, parents_map, path=None):
    """Return every root-ward path from `individual_id` as a list of ID lists.

    `individual_id` is appended to `path` (a new list when None is given), and
    each known parent is then followed on a copy of that path, father first.
    A path ends at an individual with no entry in `parents_map` or with
    neither parent known.
    """
    lineage = [] if path is None else path
    lineage.append(individual_id)
    father_id, mother_id = parents_map.get(individual_id, (None, None))
    branches = [
        trail
        for parent_id in (father_id, mother_id) if parent_id
        for trail in find_distant_ancestors(parent_id, parents_map, lineage[:])
    ]
    return branches or [lineage]

###############################################################################
# filter_ancestral_line
###############################################################################
def filter_ancestral_line(winning_path_ids, generation_table_local, names_map):
    """Render the couples touching the winning path as one "~~~"-joined string.

    A (generation, (id1, id2)) entry is kept when either ID is in
    `winning_path_ids`. Kept entries are ordered by generation and then
    reversed, so the highest generation number comes first. Each couple is
    written as "name1&name2"; IDs missing from `names_map` show as
    "UnknownName".
    """
    on_path = [
        entry for entry in generation_table_local
        if entry[1][0] in winning_path_ids or entry[1][1] in winning_path_ids
    ]
    ordered = sorted(on_path, key=lambda entry: entry[0])
    couples = []
    for _, (first_id, second_id) in reversed(ordered):
        first_name = names_map.get(first_id, "UnknownName")
        second_name = names_map.get(second_id, "UnknownName")
        couples.append(f"{first_name}&{second_name}")
    return "~~~".join(couples)

###############################################################################
# process_record_wrapper (parallel) - STAGE 1
###############################################################################
def process_record_wrapper(individual_id, gedcom_instance, parents_map, names_map):
    """Build the output row for one individual (Stage 1 worker).

    The module-level `generation_table` and `visited_pairs` are reset, the
    individual's ancestral couples are collected, and among all root-ward
    paths the one with the highest "Yates" score is chosen. The score is the
    sum of the 1-based positions of path members whose short name contains
    'Yates'; the first path wins a tie. The couples touching that path form
    the ancestral line string.

    Returns:
        list: [ID#, Match to, Name, cM, Yates DNA Ancestral Line, haplogroup]
    """
    global generation_table, visited_pairs, anchor_gen1
    # Fresh per-record state for find_parents().
    visited_pairs = set()
    generation_table = []

    find_parents(individual_id, 1, parents_map)
    candidate_paths = find_distant_ancestors(individual_id, parents_map)

    def yates_weight(path):
        return sum(
            depth
            for depth, pid in enumerate(path, start=1)
            if 'Yates' in names_map.get(pid, "UnknownName")
        )

    # max() keeps the first of several equally scored paths.
    best_path = max(candidate_paths, key=yates_weight, default=None) or []

    ancestors_on_path = {pid for pid in best_path if pid != individual_id}
    line_str = filter_ancestral_line(ancestors_on_path, generation_table, names_map)

    matched = next(
        (ds for ds in gedcom_instance.filter_pool if ds.get_gen_person() == individual_id),
        None,
    )
    if matched is None:
        cm_value = sort_value = ydna_value = ''
    else:
        cm_value = matched.get_extractable_cm()
        sort_value = matched.get_extractable_sort()
        ydna_value = matched.get_extractable_YDNA()

    # Return columns: ID#, Match to, Name, cM, Yates DNA Ancestral Line, haplogroup
    return [
        individual_id,
        sort_value,
        names_map.get(individual_id, "UnknownName"),
        cm_value,
        line_str,
        ydna_value,
    ]

###############################################################################
# main()
###############################################################################
def main():
    """Run Stage 1 end to end: parse the GEDCOM, build lines, score, export.

    Steps:
      1. Pick the alphabetically first ``*.ged`` file in the working directory.
      2. Parse it with ``Gedcom.parse_gedcom`` and write the autosomal count to
         ``autosomal_count.txt``.
      3. Build ID -> short name and child -> (father, mother) maps from the
         raw file text.
      4. Build one ancestral line per filtered individual in a process pool.
      5. Score each line through a Trie:
         Value = 5 * (couples whose trie node count >= 2) + (total couples).
      6. Add "Value Range" / "Value Label", sort by line, export CSV and HTML.

    Returns None; returns early (after a message) when no GEDCOM file exists.
    """
    line_col = "Yates DNA Ancestral Line"

    def select_gedcom():
        """Return the alphabetically first *.ged file, or None if there is none."""
        # glob returns names in arbitrary order; sorting makes "first" stable.
        files = sorted(glob.glob("*.ged"))
        if not files:
            print("No GEDCOM files found.")
            return None
        print("Automatically selecting the first GEDCOM file.")
        return files[0]

    def xref_after(text, marker):
        """Return the ID following `marker` up to the next '@', or None if absent."""
        pos = text.find(marker)
        if pos == -1:
            return None
        start = pos + len(marker)
        return text[start:text.find('@', start)]

    gedcom_file_path = select_gedcom()
    if not gedcom_file_path:
        print("No GEDCOM file selected; exiting.")
        return

    ged = Gedcom(gedcom_file_path)
    autosomal_count = ged.parse_gedcom()
    filter_count = len(ged.filter_pool)

    # Persisted so the export cell can show the total without re-parsing.
    with open("autosomal_count.txt", "w") as f:
        f.write(str(autosomal_count))

    print("Records tagged and filtered by NPFX:", filter_count)

    # utf-8-sig matches Gedcom.parse_gedcom and drops a leading byte-order mark.
    with open(gedcom_file_path, 'r', encoding='utf-8-sig') as f:
        raw_data = f.read()

    # Split into level-0 records, keyed by the text between the first two '@'
    # on each record's first line.
    all_records = {}
    for blk in raw_data.split('\n0 '):
        blk = blk.strip()
        if not blk:
            continue
        flend = blk.find('\n')
        if flend == -1:
            flend = len(blk)
        first_line = blk[:flend]
        if '@' in first_line:
            start = first_line.find('@') + 1
            end = first_line.find('@', start)
            rec_id = first_line[start:end].strip()
            all_records[rec_id] = blk

    names_map = {
        rec_id: quick_extract_name("\n" + txt)
        for rec_id, txt in all_records.items()
    }

    # Child -> (father, mother). A record counts as a family when "FAM" occurs
    # in its first 50 characters; a child listed in several families keeps the
    # parents of the last one in file order.
    parents_map = {}
    for rec_id, txt in all_records.items():
        if 'FAM' not in txt[:50]:
            continue
        husb_id = xref_after(txt, '1 HUSB @')
        wife_id = xref_after(txt, '1 WIFE @')
        kids = [ln.split('@')[1] for ln in txt.split('\n') if ln.strip().startswith('1 CHIL @')]
        for kid in kids:
            parents_map[kid] = (husb_id, wife_id)

    individual_ids = [d.get_gen_person() for d in ged.filter_pool]
    print(f"Processing {len(individual_ids)} individuals with chunk-based parallel...")

    combined_rows = []
    chunk_size = 50
    max_workers = os.cpu_count() or 4
    logger.info("Starting chunk-based parallel processing with %d workers.", max_workers)

    # The worker callable is the same for every chunk, so build it once.
    func = functools.partial(process_record_wrapper, gedcom_instance=ged, parents_map=parents_map, names_map=names_map)
    with ProcessPoolExecutor(max_workers=max_workers) as executor, tqdm(total=len(individual_ids), desc="Building Yates Lines (Stage 1)") as pbar:
        for chunk in chunks(individual_ids, chunk_size):
            combined_rows.extend(executor.map(func, chunk))
            pbar.update(len(chunk))

    columns = ["ID#", "Match to", "Name", "cM", "Yates DNA Ancestral Line", "haplogroup"]
    df = pd.DataFrame(combined_rows, columns=columns)
    df.index += 1

    # Drop this fixed leading run of couples from any line that starts with it.
    prefix = "YatesJohn&SearchingStill~~~YatesWilliam&SearchingStill~~~YatesWilliam&SearchingStill~~~YatesEdmund&CornellMargaret~~~YatesRichard&AshendonJoan~~~YatesJohn&HydeAlice~~~YatesThomas&WhiteFrances~~~"
    df[line_col] = df[line_col].map(
        lambda line: line[len(prefix):] if line.startswith(prefix) else line
    )

    # Split every line into its couples once; blank lines become empty lists.
    couples_per_row = [
        [c.strip() for c in line.split("~~~") if c.strip()]
        if pd.notna(line) and line.strip() else []
        for line in df[line_col]
    ]

    logger.info("Building Trie from reversed lines...")
    trie = Trie()
    for couples_list in couples_per_row:
        if couples_list:
            trie.insert_line(couples_list)

    logger.info("Computing 'Value' = 5*(#couples with node.count >=2) + (total couples) ...")
    values = []
    for couples_list in couples_per_row:
        node_counts = trie.get_couple_count(couples_list)
        prefix_count = sum(1 for c in node_counts if c >= 2)
        values.append(5 * prefix_count + len(couples_list))
    df["Value"] = values

    def assign_value_range_label(val):
        """Map a numeric Value to its (range text, label) pair."""
        try:
            v = float(val)
        except (TypeError, ValueError):
            return "", ""
        if v >= 60: return ">=60", "1-likely correct"
        if 47 <= v <= 59: return "59~47", "2-lines forming"
        if 34 <= v <= 46: return "46~34", "3-patterns emerging"
        if 21 <= v <= 33: return "33~21", "4-notable patterns"
        if 8 <= v <= 20: return "20~8", "5-patterns stable"
        if 1 <= v <= 7:  return f"{v:.0f}", "6-need research"
        return f"{v:.0f}", "0-uncategorized"

    # Built as two lists rather than zip(*...) so an empty frame does not fail.
    range_label_pairs = [assign_value_range_label(v) for v in df["Value"]]
    df["Value Range"] = [value_range for value_range, _ in range_label_pairs]
    df["Value Label"] = [value_label for _, value_label in range_label_pairs]

    df = df.sort_values(by=[line_col])

    csv_name = "final_combined_df_with_value_labels.csv"
    df.to_csv(csv_name, index=False)
    logger.info("Exported final DataFrame to '%s'.", csv_name)

    html_name = "HTML_combined_df_with_value_labels.html"
    css_style = """
    <style>
    table { width: 100%; border-collapse: collapse; margin: 20px 0; }
    table, th, td { border: 1px solid #333; }
    th, td { padding: 8px 12px; text-align: center; }
    th { background-color: #f2f2f2; }
    /* Left-align the last column */
    td:nth-child(7) { text-align: left; }
    </style>
    """
    final_cols = ["ID#", "cM", "haplogroup", "Match to", "Value Range", "Value Label", "Yates DNA Ancestral Line"]
    html_content = css_style + df.to_html(index=False, columns=final_cols, escape=False)
    with open(html_name, "w", encoding="utf-8") as f:
        f.write(html_content)
    logger.info("Exported HTML to '%s'.", html_name)

if __name__ == '__main__':
    main()
    # The browser alert is a convenience only, so a failure to show it is
    # ignored. Catching Exception (not a bare except) still lets
    # KeyboardInterrupt and SystemExit propagate.
    try:
        display(Javascript('alert("✅ GEDCOM processing (and HTML export) is complete!");'))
    except Exception:
        pass

import smtplib, ssl
from email.mime.text import MIMEText

def send_email(subject, body, to_addr):
    """Send a plain-text e-mail through smtp.gmail.com over SSL (port 465).

    The sender address and password are read from the GMAIL_USER and
    GMAIL_APP_PASSWORD environment variables.

    Raises:
        KeyError: if either environment variable is not set.
    """
    gmail_user = os.environ['GMAIL_USER']
    gmail_secret = os.environ['GMAIL_APP_PASSWORD']

    message = MIMEText(body)
    for header, value in (('Subject', subject), ('From', gmail_user), ('To', to_addr)):
        message[header] = value

    tls_context = ssl.create_default_context()
    with smtplib.SMTP_SSL('smtp.gmail.com', 465, context=tls_context) as smtp:
        smtp.login(gmail_user, gmail_secret)
        smtp.send_message(message)

# Email summary
# The exported CSV is re-read so the report reflects what was written to disk.
df_summary = pd.read_csv("final_combined_df_with_value_labels.csv")
total = df_summary.shape[0]
top5 = (
    df_summary
    .sort_values('Value', ascending=False)['Yates DNA Ancestral Line']
    .head(5)
    .tolist()
)
summary = (
    f"GEDCOM processing complete!\n\nTotal lines: {total}\nTop 5 lines:\n"
    + "\n".join(f"- {line}" for line in top5)
)
# The report is mailed to the sending account itself.
send_email(
    subject="✅ Cell #1 Report Ready",
    body=summary,
    to_addr=os.environ['GMAIL_USER'],
)




Automatically selecting the first GEDCOM file.




GEDCOM contained 60283 total records
Records tagged and filtered by NPFX: 1467
Records with YDNA information: 90
Autosomal matches: 1377
Records tagged and filtered by NPFX: 1467
Processing 1467 individuals with chunk-based parallel...


Building Yates Lines (Stage 1): 100%|██████████| 1467/1467 [13:57<00:00,  1.75it/s]


<IPython.core.display.Javascript object>

In [None]:
# Cell 2: XHTML Template + Export + Root FTP Upload 20250513

import pandas as pd
from datetime import datetime
from zoneinfo import ZoneInfo
from ftplib import FTP_TLS
import os

# ————— Load Data —————
df = pd.read_csv("final_combined_df_with_value_labels.csv")

# ————— Hyperlink haplogroup values, send all Y- designations to a single overview page —————
hap_base = "gengen/haplogroup/"
ydna_overview = "gengen/Y-designation-overview.htm"


def _haplogroup_cell(code):
    """Return the HTML for one haplogroup cell.

    Empty values stay empty, codes starting with "Y-" link to the shared
    overview page, and every other code links to its own page under hap_base.
    """
    if not code:
        return ""
    target = ydna_overview if code.startswith("Y-") else f"{hap_base}{code}.htm"
    return f'<a href="{target}" target="_blank">{code}</a>'


# Missing haplogroups are blanked first so every cell is a string.
df['haplogroup'] = df['haplogroup'].fillna('').map(_haplogroup_cell)


# ————— Load counts —————
# Current count written by Cell 1; None when the file is missing or unreadable.
try:
    with open("autosomal_count.txt", "r") as f:
        autosomal_count = int(f.read().strip())
except (OSError, ValueError):
    autosomal_count = None

# Count stored by the previous run of this cell, if any.
prev_count = None
additional_str = ""
if os.path.exists("autosomal_count_prev.txt"):
    try:
        with open("autosomal_count_prev.txt", "r") as f:
            prev_count = int(f.read().strip())
    except (OSError, ValueError):
        prev_count = None

if autosomal_count is not None and prev_count is not None:
    diff_count = autosomal_count - prev_count
    # "+d" prints an explicit sign, so a decrease reads "-3" instead of "+-3".
    additional_str = f" ({diff_count:+d} since last run)"

# ————— Get current Eastern time & formatted timestamp —————
now = datetime.now(ZoneInfo("America/New_York"))
# %Z prints the zone abbreviation in effect ("EDT" in summer, "EST" in winter)
# instead of a hard-coded "EDT".
updated_str = now.strftime("%d %B %Y at %H%M hours %Z")

# ————— XHTML Template —————
full_html_template = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
  <meta name="GENERATOR" content="Yatesville"/>
  <meta name="viewport" content="width=device-width, initial-scale=1"/>
  <title>DNA Report Card</title>
  <script src="../sorttable.js" type="text/javascript"></script>
  <style type="text/css">
    body { font-family: Arial, Helvetica, sans-serif; font-size: 14px; background-color: #faf9d3; }
    .output-table table {
      width:100%; border-collapse:collapse; margin:15px 0; background-color:#faf9d3;
    }
    .output-table table, .output-table th, .output-table td {
      border:1px solid #333; text-align:center; padding:5px 8px; background-color:#faf9d3;
    }
    .output-table th { background-color:#ffffcc; white-space:nowrap; }
    .output-table th:hover { background-color:#ffeb99; }
    .output-table td:nth-child(6) { min-width:180px; }
    .output-table td:last-child, .output-table th:last-child {
      text-align:left; white-space:nowrap;
    }
    /* make the haplogroup (3rd) column wider by ~6 characters */
    .output-table th:nth-child(3),
    .output-table td:nth-child(3) {
  min-width: 140px;
}

  </style>
</head>
<body>
<div align="center">
  <table class="fullpage-definedsection" cellpadding="0"><tr valign="top"><td>
    <table class="headersection" cellpadding="0"><tr valign="top"><td></td></tr></table>
    <table class="mainsection" cellpadding="7">
      <tr valign="top"><td>
        <h2>A report card for your DNA family tree</h2>
        <font size="-2">
          Return to <a href="https://yates.one-name.net/gengen/dna_cousin_surname_study.htm">Study Home</a>
          &nbsp;|&nbsp;
          Autosomal matches: {autosomal_count}{additional_str}
          &nbsp;|&nbsp;
          Updated: {updated_timestamp}
        </font>
        <p>Imagine you have a report card for your family tree that tells you how your family tree compares to other collateral family tree lines.<br><br>Here is how we break it down:</p>
        <p>Think of value like the total number of points you get from finding all the important family connections in your tree<br>
        and comparing them to all the other trees included in the Yates study.</p>
        <p>We then group them as a way to signal which ones seem to have potential for study:
          <b>&ge;60:</b> likely correct, <b>59–47:</b> lines forming, <b>46–34:</b> patterns emerging,
          <b>33–21:</b> notable patterns, <b>20–8:</b> patterns stable, and <b>7–1:</b> need research.</p>
        <p><b><i><font size="-1">Click on the header to sort any column</font></i></b>
          (And, remember <a href="https://yates.one-name.net/gengen/dna_theory_of_the_case.htm" target="_blank">what this is telling</a> us....)</p>

      </td></tr>
    </table>
    <div class="output-table" style="margin-top:10px;">
      <!-- TABLE_PLACEHOLDER -->
    </div>
  </td></tr></table>
</div>
<button onclick="topFunction()" id="myBtn" title="Go to top"
  style="position:fixed;bottom:40px;right:40px;z-index:99;background-color:red;color:white;
         padding:12px 20px;border:none;border-radius:10px;cursor:pointer;font-size:16px;">
  Top
</button>
<script>
let mybutton = document.getElementById("myBtn");
window.onscroll = function() {
  if (document.body.scrollTop > 20 || document.documentElement.scrollTop > 20) {
    mybutton.style.display = "block";
  } else {
    mybutton.style.display = "none";
  }
};
function topFunction() {
  document.body.scrollTop = 0;
  document.documentElement.scrollTop = 0;
}
</script>
</body>
</html>"""

# ————— Build and inject table, counts, and timestamp —————
final_cols = ["ID#", "cM", "haplogroup", "Match to", "Value Range", "Value Label", "Yates DNA Ancestral Line"]
df = df.sort_values(by=final_cols[-1])

# escape=False keeps the haplogroup hyperlinks as real HTML; the "sortable"
# class is added for the sorttable.js script referenced by the template.
html_table = df.to_html(
    index=False,
    columns=final_cols,
    escape=False,
    classes="dataframe sortable"
)

# Only a missing count is shown as "Unknown"; a real count of 0 is shown as 0.
count_text = "Unknown" if autosomal_count is None else str(autosomal_count)

final_html = (
    full_html_template
    .replace("{autosomal_count}", count_text)
    .replace("{additional_str}", additional_str)
    .replace("{updated_timestamp}", updated_str)
)
# The table is injected last so its contents are never touched by the
# placeholder replacements above.
final_html = final_html.replace("<!-- TABLE_PLACEHOLDER -->", html_table)

# ————— Save to local files —————
# The same page is written under two names; the .htm copy is the one uploaded.
for out_name in ("HTML_combined_df_with_value_labels.html", "dna_cousin_surname_app.htm"):
    with open(out_name, "w", encoding="utf-8") as f:
        f.write(final_html)

# ————— FTP Upload to ROOT —————
# Connection settings come from the environment: host, user and password are
# required (KeyError when unset), while the port falls back to 21.
ftp_host, ftp_user, ftp_pass = (
    os.environ[key] for key in ('FTP_HOST', 'FTP_USER', 'FTP_PASS')
)
ftp_port = int(os.getenv('FTP_PORT', 21))

def upload_to_root(filenames):
    """Upload local files over FTPS into the directory the login lands in.

    Connects with the module-level ftp_host / ftp_port / ftp_user / ftp_pass,
    switches the data channel to protected mode, and for each name deletes any
    existing remote copy, uploads the file, then tries to set mode 644.
    The connection is closed even if a step fails.

    Args:
        filenames: iterable of local file names; each is stored remotely under
            the same name.
    """
    # The context manager quits and closes the connection on success and on
    # error, so a failed upload does not leave the socket open.
    with FTP_TLS() as ftps:
        ftps.connect(ftp_host, ftp_port)
        ftps.login(ftp_user, ftp_pass)
        ftps.prot_p()
        for fname in filenames:
            # Best effort: the remote file may not exist yet.
            try:
                ftps.delete(fname)
            except Exception:
                pass
            with open(fname, 'rb') as f:
                print(f"→ uploading {fname} …", end=' ')
                ftps.storbinary(f"STOR {fname}", f)
                print("done")
            # Best effort: a failing SITE CHMOD is ignored.
            try:
                ftps.sendcmd(f"SITE CHMOD 644 {fname}")
            except Exception:
                pass
    print("✅ All files uploaded to One Name Study.")

# Run upload
upload_to_root(["dna_cousin_surname_app.htm"])

# ————— Update previous autosomal count for next run —————
# Written only after the upload call returns, and only when a count is known.
if autosomal_count is not None:
    with open("autosomal_count_prev.txt", "w") as prev_file:
        print(autosomal_count, end="", file=prev_file)


→ uploading dna_cousin_surname_app.htm … done
✅ All files uploaded to One Name Study.


In [12]:
# Cell 3 the Y-DNA compare

import pandas as pd
import os
import re
from IPython.display import HTML, display

# ——— File Paths ———
input_csv, output_csv, output_html = (
    "/content/y_dna_comparison.csv",
    "y_dna_comparison_fixed.csv",
    "y_dna_comparison_fixed.htm",
)

# ——— Load CSV ———
# Every cell is read as text, missing cells become empty strings, and header
# labels lose surrounding whitespace.
df = pd.read_csv(input_csv, dtype=str).fillna("").rename(columns=str.strip)

# ——— Select user column ———
user_col = 'i1-Big'
if not (user_col in df.columns):
    raise KeyError(f"Column '{user_col}' not found.")

# ——— Extract & clean haplogroup path ———
def clean_id(text):
    """Remove parenthesised parts and surrounding whitespace; non-strings give ''."""
    if not isinstance(text, str):
        return ""
    without_parens = re.sub(r"\(.*?\)", "", text)
    return without_parens.strip()

# The user's full haplogroup path is taken from the first row of their column
# and split on ">" into cleaned haplogroup names.
raw_path = df.at[0, user_col]
haplogroups_in_path = []
for segment in raw_path.split(">"):
    if segment.strip():
        haplogroups_in_path.append(clean_id(segment))

# ——— Clean column 1 for reliable comparison ———
first_col_ids = df.iloc[:, 0].map(clean_id)

# ——— Fill user column only on matching rows ———
# Rows whose first-column haplogroup is on the path get the whole path string;
# all other rows get an empty cell.
df[user_col] = [
    raw_path if hap in haplogroups_in_path else ""
    for hap in first_col_ids
]

df.to_csv(output_csv, index=False)

# ——— CSS: gray-out empty user columns (starting at col 4) ———
user_start_col = 4
user_col_count = len(df.columns) - 3
# One ":empty" rule per column position from user_start_col onwards.
nth_child_css = "\n".join(
    f"td:nth-child({i}):empty {{ background-color:#666; color:#fff; }}"
    for i in range(user_start_col, user_start_col + user_col_count)
)

style = f"""
<style>
  table.ydna-table {{ width:100%; border-collapse:collapse; }}
  th, td {{ border:1px solid #ccc; padding:8px; vertical-align:top; }}
  th:nth-child(1), th:nth-child(2),
  td:nth-child(1), td:nth-child(2) {{ text-align:center; min-width:10ch; }}
  th:nth-child(3), td:nth-child(3) {{ text-align:left; }}
  th {{ background-color:#f2f2f2; }}
  {nth_child_css}
</style>
"""

# The page is the stylesheet followed by the table; cell text is not escaped.
html_table = df.to_html(index=False, classes="ydna-table", border=1, escape=False)
final_html = style + html_table

with open(output_html, "w", encoding="utf-8") as out_file:
    out_file.write(final_html)

display(HTML(final_html))
for label, saved_path in (("CSV", output_csv), ("HTML", output_html)):
    print(f"✅ {label} saved as: {saved_path}")


i1-Big,Description of mutation events and the human migrations and social identities
R-M207,"Originating around 27,000 BCE, carriers belonged to Upper Paleolithic hunter-gatherer bands across Eastern Europe and Central Asia. They hunted giant game on the mammoth-steppe and likely used clan totems or landmark-based nicknames to identify kin."
,"Emerging around 25,000 BCE during the Last Glacial Maximum, M173 carriers were small ice-age survival groups roaming Siberia and western Russia. They used seasonal camp names or game-based epithets to distinguish kin."
,"Dating to roughly 22,000 BCE, M343 (R1b) groups clustered in the Franco-Cantabrian refugium of southwestern Europe. Cro-Magnon communities painted caves and hunted reindeer, naming bands after rock shelters or streams."
,"Arising about 20,000 BCE as post-glacial populations expanded north from refugia, L754 carriers explored new western European landscapes. They used local topographical features—caves, rivers—as informal clan identifiers."
,"Splitting around 18,500 BCE among settlers in Iberia and southern France, L761 carriers repopulated thawed lands. Seasonal hunting grounds and coastal landmarks provided the band names marking each sister line."
,"Appearing near 17,000 BCE with Magdalenian societies in France and northern Spain, L389 carriers inhabited tool-making camps and decorated caves. These camps and painting sites doubled as clan identifiers."
,"Originating around 14,000 BCE in post-glacial western European plains, P297 carriers followed game migrations and seasonal fish runs. Animal-track totems and fishing-site names served to distinguish lineages."
,"Dating to about 12,000 BCE, M269 spread rapidly with Mesolithic foragers across Europe. Tribal identities formed around seasonal gatherings and ancestral cave names."
,"Emerging around 10,000 BCE among early Neolithic pioneers in the Balkans, L23 carriers brought farming into central Europe. Kinship groups often took names from local rivers or earthen longhouses."
,"Arising roughly 9,000 BCE along Atlantic-coast settlements, L51 carriers built megalithic structures. Passage-grave sites and distinctive pottery styles served as clan labels."


✅ CSV saved as: y_dna_comparison_fixed.csv
✅ HTML saved as: y_dna_comparison_fixed.htm


In [28]:
# CELL EXP

# Y-DNA HTML Table Export from CSV with auto column widths
import pandas as pd
from IPython.display import display, HTML

# Load the CSV file
csv_path = "/content/y_dna_comparison.csv"
df = pd.read_csv(csv_path)

# Remove first column if it's just a row number or placeholder
first_label = df.columns[0]
if first_label.lower() in ['position', 'row', 'index']:
    df = df.drop(columns=first_label)

# The page is collected as a list of fragments and joined once at the end.
# NOTE(review): header and cell text are inserted verbatim (no HTML escaping);
# confirm the CSV never contains '<', '>' or '&' that should display literally.
html_parts = ["""
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Yates Y-DNA Comparison</title>
<style>
  body { font-family: Arial, sans-serif; }
  table { border-collapse: collapse; table-layout: auto; width: auto; }
  th, td { border: 1px solid #999; padding: 6px; text-align: center; white-space: nowrap; }
  th { background-color: #333; color: white; }
  td.blank { background-color: #ccc; color: #ccc; }
</style>
</head>
<body>
<h2>Yates Y-DNA Haplogroup Comparison</h2>
<table>
  <tr>
"""]

# Header row
html_parts.extend(f"<th>{col}</th>" for col in df.columns)
html_parts.append("</tr>\n")

# Data rows: missing or whitespace-only cells become a grey dash cell
for _, row in df.iterrows():
    html_parts.append("  <tr>")
    for cell in row:
        is_blank = pd.isna(cell) or str(cell).strip() == ""
        html_parts.append('<td class="blank">–</td>' if is_blank else f"<td>{cell}</td>")
    html_parts.append("</tr>\n")

# Close out
html_parts.append("""
</table>
</body>
</html>
""")
html = "".join(html_parts)

# Write to .htm file
output_path = "/content/y_dna_output.htm"
with open(output_path, "w", encoding="utf-8") as out_file:
    out_file.write(html)

print(f"✅ HTML output saved to: {output_path}")






✅ HTML output saved to: /content/y_dna_output.htm


In [29]:
# FULL CODE TO ADD A NEW USER TO THE CSV

import pandas as pd

# === Step 1: Load existing Y-DNA comparison grid ===
csv_path = "/content/y_dna_comparison.csv"
df = pd.read_csv(csv_path)

# === Step 2: Define new user (copy/paste next one here) ===
new_user_input = "i27775-Big&R-M207>M173>M343>L754>L761>L389>P297>M269>L23>L51>P310>L151>U106>Z2265>BY30097>FTT8>Z381>Z301>L48>Y37962>S23189>FT6679>L200>A11431>ACT920>BY15306>BY15314>BY92194>FT8553>FT8982>FT267444>FTF17042>FT266579>FT266064"

# === Step 3: Parse user ID and haplogroups ===
if '&' not in new_user_input:
    raise ValueError("Input must contain '&' between userID and haplogroup path.")

user_id, haplo_chain = new_user_input.split("&", 1)
# Pasted input often carries stray spaces or a trailing '>'; strip both parts
# and drop empty segments so they do not become column names or cells.
user_id = user_id.strip()
haplogroups = [part.strip() for part in haplo_chain.split(">") if part.strip()]
if not user_id or not haplogroups:
    raise ValueError("Input must have a non-empty userID and haplogroup path.")

# === Step 4: Pad existing DataFrame if needed ===
# One haplogroup per row: the frame is extended when the new path is longer
# than the existing grid; shorter paths leave the remaining rows empty.
max_rows = max(len(df), len(haplogroups))
df = df.reindex(range(max_rows))  # Ensure enough rows
df[user_id] = pd.Series(haplogroups)

# === Step 5: Save updated file ===
df.to_csv(csv_path, index=False)
print(f"✅ Added {user_id} to {csv_path} with {len(haplogroups)} haplogroups.")


✅ Added i27775-Big to /content/y_dna_comparison.csv with 34 haplogroups.
