In [4]:
import sys
import re

def _unquote_fields(values_line, separator):
    """Split one table row on *separator* and unquote every field.

    For each field the text between the first and the last single quote is
    kept (quotes in between are preserved); a field without a quoted part
    yields an empty string.
    """
    fields = []
    for raw_field in values_line.split(separator):
        quoted = re.search(r"'(.*)'", raw_field)
        fields.append(quoted.group(1) if quoted else '')
    return fields


def _collect_part_entries(part_name, headers, value_lines, entries,
                          sep_input_file, sep_output_file):
    """Turn the buffered rows of one part into CSV lines stored in *entries*.

    *entries* maps taes_number -> formatted output line and is updated in
    place; rows that are malformed, carry an accessory, or repeat an already
    known taes_number are reported on stdout and skipped.
    """
    accessory_column = "ACCESSOIRE (OPT='-')=PART_NUMBER"
    # Marker left in the unquoted field when the row has no accessory, i.e. the
    # raw field looks like '-'='<part number>'.
    no_accessory_marker = "-'='"

    if not value_lines:
        return

    # The column position is the same for every row of the part, so look it up
    # once; a missing column is reported instead of aborting the whole run.
    try:
        acc_index = headers.index(accessory_column)
    except ValueError:
        print(f"Key error: {accessory_column!r} in part {part_name}")
        return

    for values_line in value_lines:
        values = _unquote_fields(values_line, sep_input_file)

        # Rows too short to hold the accessory column fall through to the
        # length check below instead of raising IndexError.
        if acc_index < len(values):
            accessory_value = values[acc_index]
            # Without the marker an accessory name is attached and the PN is a
            # duplicate of a value from another part name: skip it (TBC).
            if no_accessory_marker not in accessory_value:
                print(f"Warning: part_number with ACCESSOIRE < {accessory_value} > found in < {part_name} > part name. Not added to output.")
                continue
            # Keep only the PN value by dropping the marker.
            values[acc_index] = accessory_value.replace(no_accessory_marker, "")

        if len(values) != len(headers):
            print(f"Data mismatch in headers and values for part {part_name}: Expected {len(headers)}, found {len(values)}")
            print(f"Headers: {headers}")
            print(f"Values: {values}")
            continue

        data = dict(zip(headers, values))
        try:
            taes_number = data[accessory_column]
            if taes_number in entries:
                print(f"Warning: DUPLICATE taes_number < {taes_number} > found in < {part_name} > part name. Not added to output.")
            else:
                alt_symbols = data['ALT_SYMBOLS'].strip('()')
                entries[taes_number] = sep_output_file.join(
                    [taes_number, part_name, data['JEDEC_TYPE'], alt_symbols]
                ) + '\n'
        except KeyError as e:
            print(f"Key error: {e} in part {part_name}")


def process_file(input_file, output_file):
    """
    Process the input file to extract electronic part information,
    ensuring no duplicate taes_numbers and that entries are sorted by taes_number.
    Author: Raphael Leveque

    The input is read line by line. A line containing ``PART '<name>'`` opens
    a part, a line starting with ':' inside a part gives its '|'-separated
    column names, and a line containing END_PART closes it. Every other
    non-blank line inside a part that does not end with ';' is treated as a
    data row. The output is a ';'-separated CSV with the columns
    taes_number, HDLSChematicSymbol, AllegroFootprint and AltSymbols, sorted
    by taes_number as text. Skipped rows are reported on stdout.

    Args:
    input_file (str): Path to the input file.
    output_file (str): Path to the output CSV file.
    """
    sep_input_file = '|'
    sep_output_file = ';'
    part_pattern = re.compile(r"PART '([^']+)'")

    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
        headers = []
        part_name = ''
        processing_part = False
        values_buffer = []
        entries = {}

        # Header row: one name per output column, all joined by the separator.
        outfile.write(sep_output_file.join(
            ['taes_number', 'HDLSChematicSymbol', 'AllegroFootprint', 'AltSymbols']
        ) + '\n')

        for line in infile:
            # Only a line that really matches PART '<name>' opens a part; other
            # lines that merely contain "PART " (e.g. END_PART followed by a
            # space) are handled by the branches below.
            part_match = part_pattern.search(line)
            if part_match:
                part_name = part_match.group(1)
                processing_part = True
                values_buffer = []
            elif processing_part and line.startswith(':'):
                headers = line.strip(':;\n').split(sep_input_file)
            elif processing_part and 'END_PART' in line:
                _collect_part_entries(part_name, headers, values_buffer, entries,
                                      sep_input_file, sep_output_file)
                processing_part = False
                values_buffer = []
            elif processing_part:
                stripped = line.strip()
                # Blank lines carry no data; lines ending with ';' are not rows.
                if stripped and not stripped.endswith(';'):
                    values_buffer.append(stripped)

        # Iterating a sorted dict yields its keys, i.e. the taes_numbers.
        for taes_number in sorted(entries):
            outfile.write(entries[taes_number])

        print("Processing complete.")

# Convert the part table export into the sorted, de-duplicated CSV.
input_path, output_path = 'part_table_taes_16022024.ptf', 'output.csv'
process_file(input_file=input_path, output_file=output_path)


Skipping entry with disallowed ACCESSOIRE PN - HEATSINK 573400'='99253651 - in ADA4870 part name - to avoid PN duplicate from other part name
Skipping entry with disallowed ACCESSOIRE PN - BARRETE_NAS_1370_C'='91778591 - in ADC309 part name - to avoid PN duplicate from other part name
Skipping entry with disallowed ACCESSOIRE PN - SUPCCJ-CMS'='91764307 - in ADG406X part name - to avoid PN duplicate from other part name
Skipping entry with disallowed ACCESSOIRE PN - SUPCCJ-CMS'='91844464 - in ADG406X part name - to avoid PN duplicate from other part name
Skipping entry with disallowed ACCESSOIRE PN - SUPCCJ-CMS'='99206944 - in ADG428 part name - to avoid PN duplicate from other part name
Skipping entry with disallowed ACCESSOIRE PN - SUPCCJ-CMS'='99206943 - in ADG429 part name - to avoid PN duplicate from other part name
Skipping entry with disallowed ACCESSOIRE PN - SUPCCJ-CMS'='91769109 - in ADG526X part name - to avoid PN duplicate from other part name
Skipping entry with disallowed 