<a href="https://colab.research.google.com/github/ronyates47/Gedcom-Utils/blob/main/gedcomscript_v_230730_1035_hrs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas
!pip install python-gedcom

Collecting python-gedcom
  Downloading python_gedcom-1.0.0-py2.py3-none-any.whl (35 kB)
Installing collected packages: python-gedcom
Successfully installed python-gedcom-1.0.0


In [2]:
# Keep as baseline
class Gedcom:
    """Line-oriented GEDCOM reader that collects individuals tagged with NPFX.

    ``parse_gedcom`` builds one ``GedcomDataset`` per ``INDI`` record and keeps
    the ones that carry a non-empty ``NPFX`` value in ``filter_pool``.
    ``print_extractable_details`` prints that pool as a fixed-width table.

    Attributes:
        file_name: Path of the GEDCOM file to read.
        gedcom_datasets: Every ``INDI`` record found by the last parse.
        filter_pool: The subset of ``gedcom_datasets`` that has an NPFX value.
    """

    def __init__(self, file_name):
        self.file_name = file_name
        self.gedcom_datasets = []
        self.filter_pool = []

    def parse_gedcom(self):
        """Read ``file_name`` and (re)populate ``gedcom_datasets``/``filter_pool``.

        Each line is split into ``level``, ``tag`` and an optional ``value``.
        For every individual, the level-1 ``NAME`` and ``FAMC`` values and the
        level-2 ``NPFX`` value are recorded. Record and NPFX counts are printed.

        Raises:
            OSError: If the file cannot be opened.
        """
        # Empty the lists in place so that calling this method again does not
        # append a second copy of every record.
        self.gedcom_datasets.clear()
        self.filter_pool.clear()

        current_dataset = None
        npfx_count = 0
        total_count = 0
        skipped_count = 0

        # utf-8-sig transparently drops a leading byte-order mark if present.
        with open(self.file_name, 'r', encoding='utf-8-sig') as f:
            for line in f:
                parts = line.strip().split(' ', 2)
                if parts == ['']:
                    continue  # blank line: nothing to parse
                if len(parts) < 2:
                    skipped_count += 1
                    continue
                try:
                    level = int(parts[0])
                except ValueError:
                    # Not a "<level> <tag> [value]" line; report it below
                    # instead of aborting the whole parse.
                    skipped_count += 1
                    continue
                tag = parts[1]
                value = parts[2] if len(parts) > 2 else None

                if level == 0:
                    if value == 'INDI' and tag.startswith('@') and tag.endswith('@'):
                        total_count += 1
                        current_dataset = GedcomDataset(tag)
                        self.gedcom_datasets.append(current_dataset)
                    else:
                        # Any other level-0 record ends the current individual.
                        # Without this, a level-1 NAME of a following non-INDI
                        # record would overwrite the individual's NAME.
                        current_dataset = None
                elif current_dataset is not None:
                    if level == 1 and tag in ('NAME', 'FAMC'):
                        current_dataset.add_extractable_detail(tag, value)
                    elif level == 2 and tag == 'NPFX':
                        npfx_count += 1
                        current_dataset.add_extractable_detail(tag, value)

        print(f'Found {total_count} total records')
        print(f'Found {npfx_count} NPFX tags')
        if skipped_count:
            print(f'Skipped {skipped_count} malformed lines')

        # Filter datasets based on presence of NPFX tag
        self.filter_pool.extend(
            dataset for dataset in self.gedcom_datasets
            if dataset.get_extractable_NPFX()
        )

    def print_extractable_details(self, limit=500):
        """Print the filtered individuals as a centered, fixed-width table.

        Args:
            limit: Maximum number of rows to print. Defaults to 500;
                ``None`` prints every entry of ``filter_pool``.
        """
        headers = ['ID#', 'Anchor_gen1', 'cM', 'Sort', 'Parents']
        col_widths = [max(len(header), 20) for header in headers]
        print(''.join(
            header.center(width) for header, width in zip(headers, col_widths)
        ))
        for dataset in self.filter_pool[:limit]:
            # get_gen_person() is deliberately evaluated before
            # get_anchor_gen1(): the former is what computes the anchor value.
            row = [
                dataset.get_gen_person(),
                dataset.get_anchor_gen1(),
                dataset.get_extractable_cm(),
                dataset.get_extractable_sort(),
                dataset.get_extractable_FAMC(),
            ]
            print(''.join(
                str(data).center(width) for data, width in zip(row, col_widths)
            ))

class GedcomDataset:
    """Details extracted for one individual (``INDI``) record.

    Attributes:
        gen_person: The record's cross-reference id as it appears in the file,
            including the surrounding ``@`` characters.
        extractable_detail: Mapping of tag name (``NAME``, ``FAMC``, ``NPFX``)
            to the raw value stored for that tag; a value may be ``None``.
        anchor_gen1: Surname followed by first given name, without spaces.
            Set by ``get_gen_person`` and ``get_anchor_gen1``.
    """

    def __init__(self, gen_person):
        self.gen_person = gen_person
        self.extractable_detail = {}

    def add_extractable_detail(self, key, value):
        """Store ``value`` under ``key``, replacing any earlier value."""
        self.extractable_detail[key] = value

    def _detail(self, key):
        """Return the stored value for ``key`` as a string ('' if absent/None)."""
        # A tag that is present without a value is stored as None, so the
        # plain .get(key, '') default is not enough to guarantee a string.
        return self.extractable_detail.get(key) or ''

    def _build_anchor_gen1(self):
        """Build the anchor key: surname + first given name, spaces removed."""
        # partition() tolerates a NAME without '/' (the surname is then '').
        given_part, _, surname_part = self._detail('NAME').partition('/')
        first_name = given_part.split(' ')[0]
        # NOTE(review): only trailing '/' is removed, so a suffix after the
        # closing slash ("/Smith/ Jr") stays in the key as "Smith/Jr".
        # Kept as-is to avoid changing existing keys; confirm intent.
        last_name = surname_part.rstrip('/')
        return last_name.replace(" ", "") + first_name.replace(" ", "")

    def get_gen_person(self):
        """Return the record id without ``@``; also refreshes ``anchor_gen1``."""
        self.anchor_gen1 = self._build_anchor_gen1()
        return self.gen_person.strip('@')

    def get_anchor_gen1(self):
        """Return the anchor key computed from the current ``NAME`` value."""
        # Computed here as well, so this no longer fails when it is called
        # before get_gen_person().
        self.anchor_gen1 = self._build_anchor_gen1()
        return self.anchor_gen1

    def get_extractable_NPFX(self):
        """Return the raw NPFX value, or '' when there is none."""
        return self._detail('NPFX')

    def get_extractable_cm(self):
        """Return the part of NPFX before the first '&' if it is an integer.

        Returns:
            The stripped text when ``int()`` accepts it, otherwise ``'error'``.
        """
        # NOTE(review): the recorded notebook output shows 'error' in every
        # row, so the real NPFX values apparently are not plain integers;
        # confirm the expected NPFX format.
        cm_value = self._detail('NPFX').split('&')[0].strip()
        try:
            int(cm_value)
        except ValueError:
            return 'error'
        return cm_value

    def get_extractable_sort(self):
        """Return the NPFX text between the first and second '&', else ''."""
        pieces = self._detail('NPFX').split('&')
        return pieces[1].strip() if len(pieces) > 1 else ''

    def get_extractable_FAMC(self):
        """Return the FAMC (family-as-child) reference without ``@``."""
        return self._detail('FAMC').strip('@')

# Driver: parse the GEDCOM file and print the NPFX-tagged individuals.
# The path is relative, so 'dna_generations.ged' must be in the current
# working directory when this cell runs.
dna_generations = Gedcom('dna_generations.ged')
# Prints the record/NPFX counts and fills dna_generations.filter_pool.
dna_generations.parse_gedcom()
# Prints a header row followed by at most 500 rows from filter_pool.
dna_generations.print_extractable_details()


Found 50775 total records
Found 155 NPFX tags
        ID#             Anchor_gen1              cM                 Sort              Parents       
        I335            MorganMorgan           error                                    F274        
        I353           PrickettJacob           error                                                
        I360            MorganJames            error                                    F258        
       I1193            MorganMorgan           error                                                
       I1706             YatesJohn             error                                    F403        
       I3670            EminentJames           error                                                
       I4352             GlennJames            error                                   F2337        
       I5620             YatesAnne             error                                   F8412        
       I6966           GillilandRalph        