<a href="https://colab.research.google.com/github/ronyates47/Gedcom-Utils/blob/main/GOLD_May.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas
!pip install python-gedcom
!pip install openpyxl
!pip install xlsxwriter
!pip install mlxtend


Collecting python-gedcom
  Downloading python_gedcom-1.0.0-py2.py3-none-any.whl.metadata (15 kB)
Downloading python_gedcom-1.0.0-py2.py3-none-any.whl (35 kB)
Installing collected packages: python-gedcom
Successfully installed python-gedcom-1.0.0
Collecting xlsxwriter
  Downloading XlsxWriter-3.2.3-py3-none-any.whl.metadata (2.7 kB)
Downloading XlsxWriter-3.2.3-py3-none-any.whl (169 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.4/169.4 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xlsxwriter
Successfully installed xlsxwriter-3.2.3


In [2]:
import os

# Credentials setup.
#
# SECURITY: passwords must never be stored in the notebook. The values that
# used to be written here were saved in plain text, so they should be treated
# as compromised and rotated (Gmail app password and FTP password).
from getpass import getpass

# Non-secret connection settings. `setdefault` lets values that are already
# present in the environment (e.g. injected by the runtime) take precedence.
os.environ.setdefault('GMAIL_USER', 'yatesvilleron@gmail.com')
os.environ.setdefault('FTP_HOST', 'ftp.one-name.net')
os.environ.setdefault('FTP_PORT', '21')
os.environ.setdefault('FTP_USER', 'admin@yates.one-name.net')
os.environ.setdefault('FTP_REMOTE_DIR', '/public_html/gengen/')

# Secrets are prompted for once per session (input is not echoed) and only
# when the environment does not already provide them.
for _secret_name, _secret_prompt in (
    ('GMAIL_APP_PASSWORD', 'Gmail app password: '),
    ('FTP_PASS', 'FTP password: '),
):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass(_secret_prompt)




In [25]:
# GOLD_May.ipynb

#!/usr/bin/env python
"""
GEDCOM Composite Score Script using:
 - Chunk-based Parallel Processing for Speed (Stage 1: genealogical line creation)
 - A Trie-based approach, then final "Value" = 5 * (number of couples with node.count >=2) + (total couples)

For ancestral lines where none of the couples are repeated (a one-off line), the Value is still computed.
Now, instead of composite scoring, two new columns are added:
  - Value Range (the numeric bracket)
  - Value Label (a descriptive label)

Exports final CSV/HTML sorted by "Yates DNA Ancestral Line".
"""
import csv
import glob
import logging
import functools
import os
from datetime import datetime
from collections import defaultdict, Counter
import numpy as np
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
from IPython.display import display, Javascript

# Configure logging once for the whole script: INFO level, timestamped lines.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

###############################################################################
# Global Variables
###############################################################################
# Anchor key (surname + first given name, spaces removed) of the most recent
# record on which GedcomDataset.get_gen_person() was called.
anchor_gen1 = None
# Scratch state for one ancestor walk: both are reset at the start of
# process_record_wrapper() and filled by find_parents().
visited_pairs = set()      # (father_id, mother_id) couples already recorded
generation_table = []      # (generation, (father_id, mother_id)) in discovery order

###############################################################################
# Trie Data Structure
###############################################################################
class TrieNode:
    """One trie position: a couple reached through a specific sequence of couples.

    ``count`` is how many inserted lines passed through this node and
    ``children`` maps the next couple string to its node.
    """
    def __init__(self):
        self.children = {}
        self.count = 0

class Trie:
    """Prefix tree over ancestral lines (each line is an ordered list of couples)."""

    def __init__(self):
        self.root = TrieNode()

    def insert_line(self, couples_list):
        """Add one line, incrementing the count of every node along its path."""
        node = self.root
        for key in couples_list:
            try:
                node = node.children[key]
            except KeyError:
                fresh = TrieNode()
                node.children[key] = fresh
                node = fresh
            node.count += 1

    def get_couple_count(self, couples_list):
        """Return the node counts met while walking ``couples_list`` from the root.

        The walk stops at the first couple that is not in the trie; a single
        0 is recorded for that couple and nothing after it is reported.
        """
        tally = []
        node = self.root
        for key in couples_list:
            node = node.children.get(key)
            if node is None:
                tally.append(0)
                return tally
            tally.append(node.count)
        return tally

###############################################################################
# Utility: chunk generator
###############################################################################
def chunks(lst, n):
    """Yield consecutive slices of ``lst`` that each hold at most ``n`` items."""
    starts = range(0, len(lst), n)
    yield from (lst[begin:begin + n] for begin in starts)

###############################################################################
# GedcomDataset & Gedcom Classes
###############################################################################
class GedcomDataset:
    """Details collected for one GEDCOM individual record.

    ``gen_person`` is the record pointer as it appears in the file (with its
    surrounding '@'). ``extractable_detail`` maps a tag ('NAME', 'FAMC',
    'NPFX') to the raw text stored for it; the text may be None when the
    GEDCOM line carried a tag without a value, so every reader below goes
    through ``_detail`` which normalises that to ''.

    The NPFX text is read as ``<cM>&<sort>**<YDNA>`` where the '&<sort>' and
    '**<YDNA>' parts are each optional.
    """

    def __init__(self, gen_person):
        self.gen_person = gen_person
        self.extractable_detail = {}
        self.anchor_gen1 = None

    def add_extractable_detail(self, key, value):
        """Store (or overwrite) the raw text for ``key``."""
        self.extractable_detail[key] = value

    def _detail(self, key):
        """Return the stored text for ``key``; '' when missing or stored as None."""
        return self.extractable_detail.get(key) or ''

    def get_gen_person(self):
        """Return the record ID without '@'.

        Side effect: recomputes ``self.anchor_gen1`` (surname followed by the
        first given name, spaces removed) and mirrors it into the module-level
        ``anchor_gen1`` global.
        """
        global anchor_gen1
        name = self._detail('NAME')
        parts = name.split('/', 1)
        first_name = parts[0].split(' ')[0]
        last_name = parts[1].rstrip('/') if len(parts) > 1 else ""
        self.anchor_gen1 = last_name.replace(" ", "") + first_name.replace(" ", "")
        anchor_gen1 = self.anchor_gen1
        return self.gen_person.strip('@')

    def get_anchor_gen1(self):
        """Return the anchor key computed by the last ``get_gen_person`` call (None before that)."""
        return self.anchor_gen1

    def get_extractable_NPFX(self):
        """Return the raw NPFX text ('' when absent)."""
        return self._detail('NPFX')

    def get_extractable_cm(self):
        """Return the leading cM part of NPFX as a string, or '' if it is not an integer."""
        npfx_value = self._detail('NPFX')
        # '&' takes precedence over '**' when deciding where the cM part ends.
        if '&' in npfx_value:
            cm_value = npfx_value.split('&')[0].strip()
        elif '**' in npfx_value:
            cm_value = npfx_value.split('**')[0].strip()
        else:
            cm_value = npfx_value.strip()
        try:
            int(cm_value)
        except ValueError:
            return ''
        return cm_value

    def get_extractable_sort(self):
        """Return the text between the first '&' and the following '**' (or the next '&'/end)."""
        npfx_value = self._detail('NPFX')
        if '&' not in npfx_value:
            return ''
        sort_part = npfx_value.split('&')[1]
        return sort_part.split('**')[0].strip()

    def get_extractable_YDNA(self):
        """Return the text after the first '**' (up to the next '**'), or ''."""
        npfx_value = self._detail('NPFX')
        if '**' in npfx_value:
            return npfx_value.split('**')[1].strip()
        return ''

    def get_extractable_FAMC(self):
        """Return the FAMC family pointer without '@' ('' when absent)."""
        return self._detail('FAMC').strip('@')

class Gedcom:
    """Reads a GEDCOM file and collects the individuals that carry an NPFX tag.

    After ``parse_gedcom``:
      * ``gedcom_datasets`` holds one GedcomDataset per INDI record,
      * ``filter_pool`` holds those with a non-empty NPFX, optionally narrowed
        to the IDs listed in 'filtered_ids.xlsx',
      * ``total_count`` / ``npfx_count`` / ``ydna_count`` hold the tallies.
    """

    def __init__(self, file_name):
        self.file_name = file_name
        self.gedcom_datasets = []
        self.filter_pool = []
        # will be set in parse_gedcom():
        self.total_count = 0
        self.npfx_count  = 0
        self.ydna_count  = 0

    @staticmethod
    def _split_line(line):
        """Split one GEDCOM line into ``(level, tag, value)``.

        ``value`` is None when the line has no third field. Returns None for
        lines that cannot be interpreted (blank, a level with no tag, or a
        first field that is not an integer) so the caller can skip them
        instead of crashing.
        """
        parts = line.strip().split(' ', 2)
        if len(parts) < 2:
            return None
        try:
            level = int(parts[0])
        except ValueError:
            return None
        value = parts[2] if len(parts) > 2 else None
        return level, parts[1], value

    def parse_gedcom(self):
        """Parse the file and return ``npfx_count - ydna_count``.

        Raises whatever ``open`` raises when the GEDCOM file cannot be read.
        A missing 'filtered_ids.xlsx' is not an error: the manual filter is
        skipped with a warning.
        """
        current_dataset = None
        npfx_count = 0
        ydna_count = 0
        total_count = 0

        # utf-8-sig transparently drops a leading byte-order mark if present.
        with open(self.file_name, 'r', encoding='utf-8-sig') as f:
            for line in f:
                parsed = self._split_line(line)
                if parsed is None:
                    continue
                level, tag, value = parsed

                if level == 0:
                    if tag.startswith('@') and tag.endswith('@') and value == 'INDI':
                        total_count += 1
                        current_dataset = GedcomDataset(tag)
                        self.gedcom_datasets.append(current_dataset)
                    else:
                        # Any other level-0 record ends the current individual;
                        # otherwise a later '1 NAME' line belonging to a
                        # different record would overwrite this person's name.
                        current_dataset = None
                elif current_dataset is not None:
                    if level == 1 and tag in ['NAME', 'FAMC']:
                        current_dataset.add_extractable_detail(tag, value)
                    elif level == 2 and tag == 'NPFX' and value:
                        # An NPFX tag with no text carries no information and
                        # is neither counted nor stored.
                        npfx_count += 1
                        current_dataset.add_extractable_detail(tag, value)
                        if '**' in value:
                            ydna_count += 1

        self.total_count = total_count
        self.npfx_count  = npfx_count
        self.ydna_count  = ydna_count

        # build filter_pool
        for ds in self.gedcom_datasets:
            if ds.get_extractable_NPFX():
                self.filter_pool.append(ds)

        # manual filter: keep only the IDs listed in the spreadsheet's 'ID' column
        try:
            df_manual = pd.read_excel('filtered_ids.xlsx')
            manual_ids = set(df_manual['ID'])
            self.filter_pool = [
                d for d in self.filter_pool if d.get_gen_person() in manual_ids
            ]
        except FileNotFoundError:
            logger.warning("filtered_ids.xlsx not found; skipping manual filter.")

        # return autosomal_count
        return npfx_count - ydna_count

###############################################################################
# Ancestor-finding Helpers
###############################################################################
def quick_extract_name(full_text):
    """Build a compact 'SurnameGiven' key from the first '1 NAME' line of a record.

    Args:
        full_text: raw text of one GEDCOM record.

    Returns:
        * "UnknownName" when the record has no level-1 NAME line;
        * the first 10 characters of the name with spaces removed when the
          name has no '/' surname delimiter;
        * otherwise the surname followed by the given names, each truncated
          to 10 characters before spaces are removed.
    """
    name_marker = "\n1 NAME "
    idx = full_text.find(name_marker)
    if idx != -1:
        start = idx + len(name_marker)
    elif full_text.startswith(name_marker[1:]):
        # NAME is the very first line: there is no leading newline, so the
        # prefix to skip is one character shorter than the marker.
        start = len(name_marker) - 1
    else:
        return "UnknownName"
    end = full_text.find('\n', start)
    if end == -1:
        end = len(full_text)
    name_line = full_text[start:end].strip()
    if '/' not in name_line:
        return name_line[:10].replace(" ", "")
    first_name, last_name = name_line.split('/', 1)
    last_name = last_name.replace("/", "").strip()
    return last_name[:10].replace(" ", "") + first_name[:10].replace(" ", "")

def find_parents(individual_id, generation, parents_map):
    """Record every ancestor couple of ``individual_id`` in the module globals.

    Walks father-first, depth-first through ``parents_map`` (child ID ->
    (father_id, mother_id), either of which may be None). Each couple is
    appended once to ``generation_table`` as ``(generation, (father_id,
    mother_id))`` with the generation at which it was first reached, and
    remembered in ``visited_pairs``.

    A couple that was already recorded is not walked again: everything above
    it has been recorded by the first visit, so the result is unchanged while
    repeated work is avoided and a cyclic (corrupt) parent chain can no longer
    recurse without end.
    """
    global visited_pairs, generation_table
    if individual_id not in parents_map:
        return
    father_id, mother_id = parents_map[individual_id]
    if not father_id and not mother_id:
        return
    pair = (father_id, mother_id)
    if pair in visited_pairs:
        return
    visited_pairs.add(pair)
    generation_table.append((generation, pair))
    if father_id:
        find_parents(father_id, generation + 1, parents_map)
    if mother_id:
        find_parents(mother_id, generation + 1, parents_map)

def find_distant_ancestors(individual_id, parents_map, path=None):
    """Return every ancestral path that starts at ``individual_id``.

    Args:
        individual_id: ID to start from.
        parents_map: child ID -> (father_id, mother_id); either may be None.
        path: IDs already on the path (used by the recursion). It is copied,
            never modified.

    Returns:
        A list of paths. Each path is a list of IDs running from the starting
        person up to an ancestor with no recorded parents; the father's
        branches come before the mother's.

    A person who is already on the current path (a cyclic, corrupt parent
    chain) ends that path instead of recursing without end.
    """
    path = [] if path is None else list(path)
    if individual_id in path:
        return [path]
    path.append(individual_id)

    father_id, mother_id = parents_map.get(individual_id, (None, None))
    parents = [pid for pid in (father_id, mother_id) if pid]
    if not parents:
        return [path]

    paths = []
    for pid in parents:
        # The callee copies ``path``, so sibling branches never share a list.
        paths.extend(find_distant_ancestors(pid, parents_map, path))
    return paths

def filter_ancestral_line(winning_ids, gen_table, names_map):
    """Render the couples that touch ``winning_ids`` as one '~~~'-joined line.

    ``gen_table`` holds ``(generation, (id1, id2))`` entries. A couple is kept
    when at least one member is a non-None ID found in ``winning_ids`` and
    neither member is None. Couples are written as 'name1&name2' (names from
    ``names_map``, "Unknown" when missing), highest generation first.
    """
    def touches_line(couple):
        first, second = couple
        return (first in winning_ids and first is not None) or (
            second in winning_ids and second is not None
        )

    # Stable ascending sort, then walked backwards: identical ordering to
    # "sort, build, reverse" including the order of same-generation couples.
    ordered = sorted(
        (entry for entry in gen_table if touches_line(entry[1])),
        key=lambda entry: entry[0],
    )
    rendered = []
    for _, (first, second) in reversed(ordered):
        if first is None or second is None:
            continue
        left = names_map.get(first, "Unknown")
        right = names_map.get(second, "Unknown")
        rendered.append(f"{left}&{right}")
    return "~~~".join(rendered)

def process_record_wrapper(individual_id, ged, parents_map, names_map):
    """Build one output row for ``individual_id``.

    Steps:
      1. reset the module-level scratch state and collect all ancestor
         couples with ``find_parents``;
      2. enumerate every ancestral path and keep the one with the highest
         'Yates' score (the first such path on ties). A path scores the sum of
         the 1-based positions of the people whose name key contains 'Yates';
      3. render the couples on that path with ``filter_ancestral_line``;
      4. look up the cM and sort values of the matching ``ged.filter_pool``
         entry (both stay '' when there is none).

    Returns:
        ``[individual_id, sort_val, name, cm, line_str]``.
    """
    global generation_table, visited_pairs
    generation_table = []
    visited_pairs = set()

    find_parents(individual_id, 1, parents_map)
    paths = find_distant_ancestors(individual_id, parents_map)

    def yates_score(path):
        return sum(i + 1 for i, pid in enumerate(path) if 'Yates' in names_map.get(pid, ""))

    # max() returns the first maximal element, matching a strict ">" scan.
    best_path = max(paths, key=yates_score, default=[])

    cleaned = [pid for pid in best_path if pid != individual_id]
    line_str = filter_ancestral_line(set(cleaned), generation_table, names_map)

    cm = sort_val = ""
    for ds in ged.filter_pool:
        if ds.get_gen_person() == individual_id:
            cm = ds.get_extractable_cm()
            sort_val = ds.get_extractable_sort()
            break

    return [individual_id, sort_val, names_map.get(individual_id, "Unknown"), cm, line_str]

###############################################################################
# Main
###############################################################################
def main():
    """Run the whole report: parse, build lines, score, export, notify.

    1. Pick a ``*.ged`` file in the working directory and parse it.
    2. Print the record counts and keep the autosomal count of this run and
       of the previous run in ``autosomal_count.txt`` /
       ``autosomal_count_prev.txt``.
    3. Build the child -> parents map and the name map from the raw file and
       compute one ancestral line per selected individual in a process pool.
    4. Score each line with the trie
       (``Value = 5 * couples shared with another line + number of couples``)
       and attach a range and a label.
    5. Write ``final_combined_df_with_value_labels.csv`` and
       ``HTML_combined_df_with_value_labels.html``.
    6. Send a notification e-mail when Gmail credentials are configured.
    """
    def select_gedcom():
        # Sorted so the choice is reproducible when several files are present.
        files = sorted(glob.glob("*.ged"))
        if not files:
            print("No GEDCOM files found.")
            return None
        if len(files) > 1:
            logger.warning("Multiple GEDCOM files found; using %s", files[0])
        return files[0]

    gedfile = select_gedcom()
    if not gedfile:
        return

    ged = Gedcom(gedfile)
    autosomal_count = ged.parse_gedcom()
    filter_count = len(ged.filter_pool)

    # --- print counts once ---
    print(f"GEDCOM contained {ged.total_count} total records")
    print(f"Records tagged by NPFX: {ged.npfx_count}")
    print(f"Records with YDNA information: {ged.ydna_count}")
    print(f"Records after manual filter: {filter_count}")

    # handle previous-run counts
    prev = None
    if os.path.exists("autosomal_count.txt"):
        try:
            with open("autosomal_count.txt") as fh:
                prev = int(fh.read().strip())
        except (OSError, ValueError):
            # Unreadable or non-numeric file: behave as if there was no previous run.
            prev = None
    if prev is not None:
        with open("autosomal_count_prev.txt", "w") as fh:
            fh.write(str(prev))
    with open("autosomal_count.txt", "w") as fh:
        fh.write(str(autosomal_count))

    # print autosomal matches
    if prev is not None:
        diff = autosomal_count - prev
        # ":+d" prints an explicit sign, so a drop reads "-3" rather than "+-3".
        print(f"Autosomal matches: {autosomal_count} ({diff:+d} since last run)")
    else:
        print(f"Autosomal matches: {autosomal_count}")

    # build parents_map & names_map
    # Same encoding as Gedcom.parse_gedcom so a byte-order mark is handled alike.
    with open(gedfile, 'r', encoding='utf-8-sig') as fh:
        raw = fh.read()
    blocks = [b.strip() for b in raw.split('\n0 ') if b.strip()]
    all_recs = {}
    for blk in blocks:
        header = blk.split('\n', 1)[0]
        if '@' in header:
            rid = header.split('@')[1]
            all_recs[rid] = blk

    parents_map, names_map = {}, {}
    for rid, blk in all_recs.items():
        names_map[rid] = quick_extract_name("\n" + blk)
        if '1 HUSB @' in blk or '1 WIFE @' in blk:
            husb = blk.split('1 HUSB @')[1].split('@')[0] if '1 HUSB @' in blk else None
            wife = blk.split('1 WIFE @')[1].split('@')[0] if '1 WIFE @' in blk else None
            kids = [ln.split('@')[1] for ln in blk.split('\n') if ln.startswith('1 CHIL @')]
            for k in kids:
                parents_map[k] = (husb, wife)

    ids = [d.get_gen_person() for d in ged.filter_pool]
    print(f"Processing {len(ids)} individuals...")

    # parallel processing (submitted in chunks so the progress bar can advance)
    rows = []
    func = functools.partial(process_record_wrapper, ged=ged, parents_map=parents_map, names_map=names_map)
    with ProcessPoolExecutor(max_workers=os.cpu_count() or 4) as exe, tqdm(total=len(ids), desc="Building Yates Lines") as pbar:
        for chunk in chunks(ids, 50):
            rows.extend(exe.map(func, chunk))
            pbar.update(len(chunk))

    df = pd.DataFrame(rows, columns=["ID#", "Match to", "Name", "cM", "Yates DNA Ancestral Line"])
    df.index += 1

    # remove prefix: the leading run of couples listed here is stripped from
    # any line that starts with it
    def fix_pref(r):
        pref = ("YatesJohn&SearchingStill~~~YatesWilliam&SearchingStill~~~"
                "YatesWilliam&SearchingStill~~~YatesEdmund&CornellMargaret~~~"
                "YatesRichard&AshendonJoan~~~YatesJohn&HydeAlice~~~"
                "YatesThomas&WhiteFrances~~~")
        if r["Yates DNA Ancestral Line"].startswith(pref):
            r["Yates DNA Ancestral Line"] = r["Yates DNA Ancestral Line"][len(pref):]
        return r
    df = df.apply(fix_pref, axis=1)

    # build trie, compute Value
    trie = Trie()
    for _, r in df.iterrows():
        line = r["Yates DNA Ancestral Line"]
        if not line.strip():
            continue
        couples = [x for x in line.split("~~~") if x.strip()]
        trie.insert_line(couples)

    vals, pcs = [], []
    for _, r in df.iterrows():
        line = r["Yates DNA Ancestral Line"]
        if not line.strip():
            vals.append(0)
            pcs.append(0)
        else:
            couples = [x for x in line.split("~~~") if x.strip()]
            counts = trie.get_couple_count(couples)
            # couples whose trie node is shared by at least two lines
            pc = sum(1 for c in counts if c >= 2)
            vals.append(5 * pc + len(couples))
            pcs.append(pc)
    df["Value"], df["PrefixCount"] = vals, pcs

    # assign labels
    def label(v):
        v = float(v)
        if v >= 60:
            return ">=60", "1-likely correct"
        if 47 <= v <= 59:
            return "59~47", "2-lines forming"
        if 34 <= v <= 46:
            return "46~34", "3-patterns emerging"
        if 21 <= v <= 33:
            return "33~21", "4-notable patterns"
        if 8 <= v <= 20:
            return "20~8", "5-patterns stable"
        if 1 <= v <= 7:
            return f"{v:.0f}", "6-need research"
        return f"{v:.0f}", "0-uncategorized"

    ranges, labels = [], []
    for v in df["Value"]:
        r, l = label(v)
        ranges.append(r)
        labels.append(l)
    df["Value Range"], df["Value Label"] = ranges, labels

    df.sort_values("Yates DNA Ancestral Line", inplace=True)
    df.drop("PrefixCount", axis=1, inplace=True)
    df.to_csv("final_combined_df_with_value_labels.csv", index=False)
    css = """
    <style>table{width:100%;border-collapse:collapse;margin:20px 0;}
    th,td{border:1px solid #333;padding:8px 12px;text-align:center;}
    th{background:#f2f2f2;}td:nth-child(6){text-align:left;}</style>"""
    # NOTE(review): escape=False writes GEDCOM-derived text into the page
    # unescaped; confirm the source data is trusted.
    html = css + df.to_html(
        index=False,
        columns=["ID#", "cM", "Match to", "Value Range", "Value Label", "Yates DNA Ancestral Line"],
        escape=False
    )
    with open("HTML_combined_df_with_value_labels.html", "w", encoding="utf-8") as f:
        f.write(html)
    logger.info("Export complete.")

    # optional email
    import smtplib, ssl
    from email.mime.text import MIMEText
    def send_email(sub, body, to):
        srv, pt = 'smtp.gmail.com', 465
        snd, pw = os.environ['GMAIL_USER'], os.environ['GMAIL_APP_PASSWORD']
        msg = MIMEText(body); msg['Subject'] = sub; msg['From'] = snd; msg['To'] = to
        ctx = ssl.create_default_context()
        with smtplib.SMTP_SSL(srv, pt, context=ctx) as s:
            s.login(snd, pw); s.send_message(msg)

    # The e-mail is optional: without credentials the report is still complete.
    if not (os.environ.get('GMAIL_USER') and os.environ.get('GMAIL_APP_PASSWORD')):
        logger.warning("GMAIL_USER / GMAIL_APP_PASSWORD not set; skipping notification e-mail.")
        return

    total = len(df)
    send_email("✅ GEDCOM Report Ready", f"Processed {total} lines", os.environ['GMAIL_USER'])

if __name__ == '__main__':
    main()


GEDCOM contained 60113 total records
Records tagged by NPFX: 1446
Records with YDNA information: 90
Records after manual filter: 78
Autosomal matches: 1356 (+0 since last run)
Processing 78 individuals...


Building Yates Lines: 100%|██████████| 78/78 [00:44<00:00,  1.77it/s]


In [26]:
# KEEP THIS TO REFERENCE CELL 1 TO RUN CELL 2 AUTOMATICALLY
# Cell 2: XHTML Template + Export + Root FTP Upload

import pandas as pd
from IPython.display import Javascript  # comment out alert if not needed
from datetime import datetime
from zoneinfo import ZoneInfo
from ftplib import FTP_TLS
import os

# ————— Load Data —————
# Read the scored table by file name from the current working directory
# (the file name matches the CSV that main() exports in the cell above).
df = pd.read_csv("final_combined_df_with_value_labels.csv")

# ————— Load counts —————
def read_count_file(path):
    """Return the integer stored in ``path``.

    Returns None when the file is missing, unreadable, or does not contain
    an integer.
    """
    try:
        with open(path, "r") as f:
            return int(f.read().strip())
    except (OSError, ValueError):
        return None


def format_count_delta(current, previous):
    """Return ' (+N since last run)' with an explicit sign, or '' if either count is unknown."""
    if current is None or previous is None:
        return ""
    # ":+d" keeps "+0" / "+5" and renders a drop as "-5" instead of "+-5".
    return f" ({current - previous:+d} since last run)"


autosomal_count = read_count_file("autosomal_count.txt")
prev_count = read_count_file("autosomal_count_prev.txt")
additional_str = format_count_delta(autosomal_count, prev_count)

# ————— Current EST timestamp —————
# Zone-aware "now" in America/New_York; %Z renders the zone abbreviation in
# effect at that moment (EST or EDT), so the label follows daylight saving.
now_est = datetime.now(ZoneInfo("America/New_York"))
updated_timestamp = now_est.strftime('%Y-%m-%d %H:%M:%S %Z')

# ————— Build HTML —————
# Page skeleton. The markers {autosomal_count}, {additional_str},
# {updated_timestamp} and <!-- TABLE_PLACEHOLDER --> are filled in later with
# str.replace (not str.format), which is why the CSS/JS braces are left as-is.
# The legend thresholds mirror the "Value Range" labels shown in the table.
full_html_template = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
  <meta name="GENERATOR" content="Yatesville"/>
  <meta name="viewport" content="width=device-width, initial-scale=1"/>
  <title>DNA Report Card</title>
  <script src="../sorttable.js" type="text/javascript"></script>
  <style type="text/css">
    body { font-family: Arial, Helvetica, sans-serif; font-size: 14px; background-color: #faf9d3; }
    .output-table table {
      width:100%; border-collapse:collapse; margin:15px 0; background-color:#faf9d3;
    }
    .output-table table, .output-table th, .output-table td {
      border:1px solid #333; text-align:center; padding:5px 8px; background-color:#faf9d3;
    }
    .output-table th { background-color:#ffffcc; white-space:nowrap; }
    .output-table th:hover { background-color:#ffeb99; }
    .output-table td:nth-child(5) { min-width:180px; }
    .output-table td:last-child, .output-table th:last-child {
      text-align:left; white-space:nowrap;
    }
  </style>
</head>
<body>
<div align="center">
  <table class="fullpage-definedsection" cellpadding="0"><tr valign="top"><td>
    <table class="headersection" cellpadding="0"><tr valign="top"><td></td></tr></table>
    <table class="mainsection" cellpadding="7">
      <tr valign="top"><td>
        <h2>A report card for your DNA family tree</h2>
        <font size="-2">
          Return to <a href="https://yates.one-name.net/gengen/dna_cousin_surname_study.htm">Study Home</a>
          &nbsp;|&nbsp;
          Autosomal matches: {autosomal_count}{additional_str}
          &nbsp;|&nbsp;
          Updated: {updated_timestamp}
        </font>
        <p>Imagine you have a report card for your family tree that tells you how your family tree compares to other collateral family tree lines.<br><br>Here is how we break it down:</p>
        <p>Think of value like the total number of points you get from finding all the important family connections in your tree<br>
        and comparing them to all the other trees included in the Yates study.</p>
        <p>We then group them as a way to signal which ones seem to have potential for study:
          <b>&gt;=60:</b> likely correct, <b>59–47:</b> lines forming, <b>46–34:</b> patterns emerging,
          <b>33–21:</b> notable patterns, <b>20–8:</b> patterns stable, and <b>7–1:</b> need research.</p>
        <p><b><i><font size="-1">Click on the header to sort any column</font></i></b>
          (And, remember <a href="https://yates.one-name.net/gengen/dna_theory_of_the_case.htm" target="_blank">what this is telling</a> us....)</p>
      </td></tr>
    </table>
    <div class="output-table" style="margin-top:10px;">
      <!-- TABLE_PLACEHOLDER -->
    </div>
  </td></tr></table>
</div>
<button onclick="topFunction()" id="myBtn" title="Go to top"
  style="position:fixed;bottom:40px;right:40px;z-index:99;background-color:red;color:white;
         padding:12px 20px;border:none;border-radius:10px;cursor:pointer;font-size:16px;">
  Top
</button>
<script>
let mybutton = document.getElementById("myBtn");
window.onscroll = function() {
  if (document.body.scrollTop > 20 || document.documentElement.scrollTop > 20) {
    mybutton.style.display = "block";
  } else {
    mybutton.style.display = "none";
  }
};
function topFunction() {
  document.body.scrollTop = 0;
  document.documentElement.scrollTop = 0;
}
</script>
</body>
</html>"""

# Columns shown on the page, in display order.
final_cols = ["ID#", "cM", "Match to", "Value Range", "Value Label", "Yates DNA Ancestral Line"]
df.sort_values(by=["Yates DNA Ancestral Line"], inplace=True)
# "sortable" is the class the page's sorttable.js script looks for.
html_table = df.to_html(index=False, columns=final_cols, escape=False, classes="dataframe sortable")

# Inject counts and timestamp
# Only a missing count is shown as "Unknown"; a genuine count of 0 is printed as 0.
count_text = "Unknown" if autosomal_count is None else str(autosomal_count)
final_html = (
    full_html_template
    .replace("{autosomal_count}", count_text)
    .replace("{additional_str}", additional_str)
    .replace("{updated_timestamp}", updated_timestamp)
)
# The table goes in last so that text inside it is never mistaken for a marker.
final_html = final_html.replace("<!-- TABLE_PLACEHOLDER -->", html_table)

# Save locally
with open("dna_cousin_surname_app.htm", "w", encoding="utf-8") as f:
    f.write(final_html)

# Notebook-level alert (commented out)
# display(Javascript('alert("✅ DNA Report Card generated locally.");'))

# ————— FTP Upload to ROOT —————
# Connection settings come from the environment. FTP_REMOTE_DIR is not
# consulted here: no directory change is issued, so files are stored in the
# directory the server places the session in after login.
ftp_host = os.environ['FTP_HOST']
ftp_port = int(os.environ.get('FTP_PORT', 21))
ftp_user = os.environ['FTP_USER']
ftp_pass = os.environ['FTP_PASS']

def upload_to_root(filenames):
    """Upload each local file over FTPS, replacing any remote copy of the same name.

    Args:
        filenames: names of files in the current directory; the same name is
            used on the server.

    Raises:
        OSError / ftplib errors from connect, login or the transfer itself.
        Failures of the preparatory delete and of the final chmod are
        tolerated because neither prevents the upload.
    """
    from ftplib import Error as FTPError

    # The context manager closes the connection even if an upload fails.
    with FTP_TLS() as ftps:
        ftps.connect(ftp_host, ftp_port)
        ftps.login(ftp_user, ftp_pass)
        ftps.prot_p()  # protect the data channel as well as the control channel
        for fname in filenames:
            try:
                ftps.delete(fname)
            except FTPError:
                # Usually means there is no previous copy to remove.
                pass
            with open(fname, 'rb') as f:
                print(f"→ uploading {fname} …", end=' ')
                ftps.storbinary(f"STOR {fname}", f)
                print("done")
            try:
                ftps.sendcmd(f"SITE CHMOD 644 {fname}")
            except FTPError as exc:
                # Not every server accepts SITE CHMOD; the file is uploaded regardless.
                print(f"  (could not set permissions on {fname}: {exc})")
    print("✅ All files uploaded to One Name Study.")

# Run upload
upload_to_root(["dna_cousin_surname_app.htm"])




→ uploading dna_cousin_surname_app.htm … done
✅ All files uploaded to One Name Study.
