In [None]:
import sys
import requests
from bs4 import BeautifulSoup
import unicodedata
import itertools
import itertools
import pprint
import re

# ACLE page listing every Advanced SIMD (NEON) intrinsic in HTML tables.
URL = "https://arm-software.github.io/acle/neon_intrinsics/advsimd.html"
# A timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops an HTTP error page from being parsed as if it were
# the intrinsics table (which would silently yield empty lists below).
page = requests.get(URL, timeout=30)
page.raise_for_status()

soup = BeautifulSoup(page.content, "html.parser")
# Column 1 <code> cells hold the intrinsic declarations, column 5 <code> cells
# the architecture strings; NFKD normalisation folds compatibility characters
# (e.g. non-breaking spaces) into their plain equivalents.
intrinsics = [unicodedata.normalize('NFKD', code.text) for code in soup.select("tbody > tr > td:nth-child(1) > code")]
archs =  [unicodedata.normalize('NFKD', code.text) for code in soup.select("tbody > tr > td:nth-child(5) > code")]

# Collapse every run of whitespace to a single space and drop the space that
# this leaves directly after an opening parenthesis.
decls = [' '.join(entry.split()).replace('( ', '(') for entry in intrinsics]

# Pair each declaration with its architecture string by position.
# NOTE(review): zip() stops at the shorter list, so this assumes both selectors
# return exactly one element per table row - confirm against the page.
decl_archs = list(zip(decls, archs))


In [None]:
def grouper(iterable, n, *, incomplete='fill', fillvalue=None):
    """Split `iterable` into consecutive, non-overlapping tuples of length `n`.

    `incomplete` selects what happens to a short final group:
      'fill'   -- pad it with `fillvalue`:  grouper('ABCDEFG', 3, fillvalue='x') --> ABC DEF Gxx
      'strict' -- raise ValueError:         grouper('ABCDEFG', 3, incomplete='strict') --> ABC DEF ValueError
      'ignore' -- drop it:                  grouper('ABCDEFG', 3, incomplete='ignore') --> ABC DEF

    Returns a lazy iterator of tuples. Any other `incomplete` value raises
    ValueError as soon as the function is called.
    """
    # n references to one shared iterator: zipping them pulls n consecutive
    # items per output tuple.
    shared = iter(iterable)
    columns = (shared,) * n
    if incomplete == 'ignore':
        return zip(*columns)
    if incomplete == 'strict':
        return zip(*columns, strict=True)
    if incomplete != 'fill':
        raise ValueError('Expected fill, strict, or ignore')
    return itertools.zip_longest(*columns, fillvalue=fillvalue)

In [None]:
# Bit widths that may follow a type letter in an intrinsic's type suffix.
lengths = ["8", "16", "32", "64", "128", "256"]

def prefix_all(prefix, list):
  """Return a new list holding `prefix + elem` for every elem of `list`, in order."""
  prefixed = []
  for elem in list:
    prefixed.append(prefix + elem)
  return prefixed

# Every token that NEONIdent treats as an element-type suffix rather than as
# part of the operation name.
types = set(
  prefix_all('u', lengths)      # unsigned integers
  + prefix_all('s', lengths)    # signed integers
  + prefix_all('f', lengths)    # floating point
  + prefix_all('p', lengths)    # polynomial
  + [
    'bf16',                     # brain floating point
    'mf8',                      # modal 8-bit floating point
  ]
)


class NEONIdent:
  """Splits a NEON intrinsic identifier into operation name, suffix, types and flags.

  Attributes set by the constructor:
    name   -- remaining underscore-joined components once the leading 'v', a
              trailing 'q', the type tokens and the flag tokens are removed
    suffix -- last character of the first component when it is one of
              `suffixes`, otherwise None
    types  -- {'out': ..., 'in': ...}; one type token fills both slots, two
              tokens are taken as (out, in), any other count gives None/None
    scalar, high, low, lane -- True when 'n', 'high', 'low' or 'lane'/'laneq'
              appears as its own component
  """
  suffixes = ['q', # Q-register
              'w', # Widening, i.e one of the inputs is longer than the other and the output will be placed in a longer type
              'l'] # Long, i.e. the output type is twice as long as the inputs

  def __init__(self, func_name):
    """Decode `func_name` (e.g. 'vaddq_s8'); leading underscores are ignored."""
    self.scalar = False
    self.lane = False
    self.high = False
    self.low = False
    parts = func_name.lstrip('_').split('_')

    head = parts[0]
    if head.startswith('v'):
      head = head[1:] # remove first char

    # Slicing (not indexing) keeps an empty head from raising IndexError.
    last = head[-1:]
    if last in NEONIdent.suffixes:
      self.suffix = last
      # Only 'q' is stripped; 'w' and 'l' stay part of the name. The test looks
      # at the last character alone, so any head ending in one of these letters
      # is treated as carrying that suffix.
      if last == 'q':
        head = head[:-1]
    else:
      self.suffix = None
    parts[0] = head

    name_types = [part for part in parts if part in types]
    parts = [part for part in parts if part not in types]

    if len(name_types) == 1:
      out_type = in_type = name_types[0]
    elif len(name_types) == 2:
      out_type, in_type = name_types
    else:
      out_type = in_type = None

    self.types = {'out' : out_type, 'in': in_type}

    # Each flag token is removed from the name (first occurrence only) and
    # recorded on the matching attribute; 'lane' and 'laneq' share one flag.
    for token, attr in (('n', 'scalar'),
                        ('high', 'high'),
                        ('low', 'low'),
                        ('lane', 'lane'),
                        ('laneq', 'lane')):
      if token in parts:
        parts.remove(token)
        setattr(self, attr, True)

    self.name = '_'.join(parts)


  def __repr__(self):
    return f'NEON:{{ name: "{self.name}", suffix: {self.suffix}, types: {self.types}}}'

In [None]:
# Maps a decoded intrinsic base name (NEONIdent.name) to the readable wrapper
# name used in the generated header. Names absent from this table are used
# unchanged by Function.
#
# Distinct instructions that accept the same argument types must map to
# distinct names, otherwise the generated header defines the same function
# twice. For that reason the rounding narrowing shifts (qrshrn*, rshrn) carry
# "round", and the maxnm/minnm forms carry "_strict" like their reduce_* and
# pairwise_* counterparts; "hsub" (halving subtract) mirrors "hadd".
name_map = {
    "aba": "subtract_abs_add",
    "abal": "subtract_abs_add",
    "abd": "subtract_abs",
    "abdd": "absolute_difference",
    "abdh": "absolute_difference",
    "abdl": "subtract_abs_long",
    "abds": "absolute_difference",
    "abs": "absolute",
    "absd": "absolute",
    "absh": "absolute",
    "add": "add",
    "addd": "add",
    "addh": "add",
    "addhn": "add_narrow_high",
    "addl": "add_long",
    "addlv": "reduce_add_long",
    "addv": "reduce_add",
    "addw": "add",
    "aesd": "aes_decrypt",
    "aese": "aes_encrypt",
    "aesimc": "aes_inverse_mix_columns",
    "aesmc": "aes_mix_columns",
    "amax": "absolute_max",
    "amin": "absolute_min",
    "and": "bitwise_and",
    "bcax": "bit_clear_xor",
    "bfdot": "dot_product",
    "bfmlalb": "multiply_add_long_widen_bottom",
    "bfmlalt": "multiply_add_long_widen_top",
    "bfmmla": "matrix_multiply_add",
    "bic": "bitwise_clear",
    "bsl": "bitwise_select",
    "cadd_rot270": "complex_add_rotate_270",
    "cadd_rot90": "complex_add_rotate_90",
    "cage": "absolute_greater_than_or_equal",
    "caged": "absolute_greater_than_or_equal",
    "cageh": "absolute_greater_than_or_equal",
    "cages": "absolute_greater_than_or_equal",
    "cagt": "absolute_greater_than",
    "cagtd": "absolute_greater_than",
    "cagth": "absolute_greater_than",
    "cagts": "absolute_greater_than",
    "cale": "absolute_less_than_or_equal",
    "caled": "absolute_less_than_or_equal",
    "caleh": "absolute_less_than_or_equal",
    "cales": "absolute_less_than_or_equal",
    "calt": "absolute_less_than",
    "caltd": "absolute_less_than",
    "calth": "absolute_less_than",
    "calts": "absolute_less_than",
    "ce": "equal",
    "ceq": "equal",
    "ceqd": "equal",
    "ceqh": "equal",
    "ceqs": "equal",
    "ceqz": "equal_to_zero",
    "ceqzd": "equal_to_zero",
    "ceqzh": "equal_to_zero",
    "ceqzs": "equal_to_zero",
    "cge": "greater_than_or_equal",
    "cged": "greater_than_or_equal",
    "cgeh": "greater_than_or_equal",
    "cges": "greater_than_or_equal",
    "cgez": "greater_than_or_equal_to_zero",
    "cgezd": "greater_than_or_equal_to_zero",
    "cgezh": "greater_than_or_equal_to_zero",
    "cgezs": "greater_than_or_equal_to_zero",
    "cgt": "greater_than",
    "cgtd": "greater_than",
    "cgth": "greater_than",
    "cgts": "greater_than",
    "cgtz": "greater_than_zero",
    "cgtzd": "greater_than_zero",
    "cgtzh": "greater_than_zero",
    "cgtzs": "greater_than_zero",
    "cle": "less_than_or_equal",
    "cled": "less_than_or_equal",
    "cleh": "less_than_or_equal",
    "cles": "less_than_or_equal",
    "clez": "less_than_or_equal_to_zero",
    "clezd": "less_than_or_equal_to_zero",
    "clezh": "less_than_or_equal_to_zero",
    "clezs": "less_than_or_equal_to_zero",
    "cls": "count_leading_sign_bits",
    "clt": "less_than",
    "cltd": "less_than",
    "clth": "less_than",
    "clts": "less_than",
    "cltz": "less_than_zero",
    "cltzd": "less_than_zero",
    "cltzh": "less_than_zero",
    "cltzs": "less_than_zero",
    "clz": "count_leading_zero_bits",
    "cmla": "complex_multiply_add",
    "cmla_rot180": "complex_multiply_add_rotate_180",
    "cmla_rot270": "complex_multiply_add_rotate_270",
    "cmla_rot90": "complex_multiply_add_rotate_90",
    "cnt": "count_active_bits",
    "combine": "combine",
    "copy": "copy",
    "crc32b": "crc32",
    "crc32cb": "crc32_castagnoli",
    "crc32cd": "crc32_castagnoli",
    "crc32ch": "crc32_castagnoli",
    "crc32cw": "crc32_castagnoli",
    "crc32d": "crc32",
    "crc32h": "crc32",
    "crc32w": "crc32",
    "create": "create",
    "cvt": "convert",
    "cvta": "convert_round_to_nearest_with_ties_away_from_zero",
    "cvtad": "convert_round_to_nearest_with_ties_away_from_zero",
    "cvtah": "convert_round_to_nearest_with_ties_away_from_zero",
    "cvtas": "convert_round_to_nearest_with_ties_away_from_zero",
    "cvtd": "convert",
    "cvth": "convert",
    "cvtm": "convert_round_toward_negative_infinity",
    "cvtmd": "convert_round_toward_negative_infinity",
    "cvtmh": "convert_round_toward_negative_infinity",
    "cvtms": "convert_round_toward_negative_infinity",
    "cvtn": "convert_round_to_nearest_with_ties_to_even",
    "cvtnd": "convert_round_to_nearest_with_ties_to_even",
    "cvtnh": "convert_round_to_nearest_with_ties_to_even",
    "cvtns": "convert_round_to_nearest_with_ties_to_even",
    "cvtp": "convert_round_toward_positive_infinity",
    "cvtpd": "convert_round_toward_positive_infinity",
    "cvtph": "convert_round_toward_positive_infinity",
    "cvtps": "convert_round_toward_positive_infinity",
    "cvts": "convert",
    "cvtx": "convert_round_to_odd",
    "cvtxd": "convert_round_to_odd",
    "div": "divide",
    "divh": "divide",
    "dot": "dot_product",
    "dup": "duplicate",
    "dupb": "duplicate",
    "dupd": "duplicate",
    "duph": "duplicate",
    "dups": "duplicate",
    "eor": "bitwise_xor",
    "eor3": "xor",
    "ext": "extract",
    "fma": "multiply_add_fused",
    "fmad": "multiply_add_fused",
    "fmah": "multiply_add_fused",
    "fmas": "multiply_add_fused",
    "fmlal": "multiply_add_long_fused",
    "fmlsl": "multiply_subtract_long_fused",
    "fms": "multiply_subtract_fused",
    "fmsd": "multiply_subtract_fused",
    "fmsh": "multiply_subtract_fused",
    "fmss": "multiply_subtract_fused",
    "get": "get",
    "hadd": "add_halve",
    "hsub": "subtract_halve",
    "ld1": "load1",
    "ld1_dup": "load1_duplicate",
    "ld1_x2": "load1_x2",
    "ld1_x3": "load1_x3",
    "ld1_x4": "load1_x4",
    "ld2": "load2",
    "ld2_dup": "load2_duplicate",
    "ld3": "load3",
    "ld3_dup": "load3_duplicate",
    "ld4": "load4",
    "ld4_dup": "load4_duplicate",
    "ldap1": "load_acquire1",
    "ldr": "load_register",
    "luti2": "lookup_table_2bit_index",
    "luti4": "lookup_table_4bit_index",
    "luti4_x2": "lookup_table_4bit_index",
    "max": "max",
    "maxh": "max",
    "maxnm": "max_strict",
    "maxnmh": "max_strict",
    "maxnmv": "reduce_max_strict",
    "maxv": "reduce_max",
    "min": "min",
    "minh": "min",
    "minnm": "min_strict",
    "minnmh": "min_strict",
    "minnmv": "reduce_min_strict",
    "minv": "reduce_min",
    "mla": "multiply_add",
    "mlal": "multiply_add_long",
    "mls": "multiply_subtract",
    "mlsl": "multiply_subtract_long",
    "mmla": "matrix_multiply_add",
    "mov": "move",
    "movl": "move_long",
    "movn": "move_narrow",
    "movn_high": "move_high_narrow",
    "mul": "multiply",
    "muld": "multiply",
    "mulh": "multiply",
    "mull": "multiply_long",
    "muls": "multiply",
    "mulx": "multiply_extended",
    "mulxd": "multiply_extended",
    "mulxh": "multiply_extended",
    "mulxs": "multiply_extended",
    "mvn": "bitwise_not",
    "neg": "negate",
    "negd": "negate",
    "negh": "negate",
    "orn": "bitwise_or_not",
    "orr": "bitwise_or",
    "padal": "pairwise_add_accumulate_long",
    "padd": "pairwise_add",
    "paddd": "pairwise_add",
    "paddl": "pairwise_add_long",
    "padds": "pairwise_add",
    "pmax": "pairwise_max",
    "pmaxnm": "pairwise_max_strict",
    "pmaxnmqd": "pairwise_max_strict",
    "pmaxnms": "pairwise_max_strict",
    "pmaxqd": "pairwise_max",
    "pmaxs": "pairwise_max",
    "pmin": "pairwise_min",
    "pminnm": "pairwise_min_strict",
    "pminnmqd": "pairwise_min_strict",
    "pminnms": "pairwise_min_strict",
    "pminqd": "pairwise_min",
    "pmins": "pairwise_min",
    "qabs": "absolute_saturate",
    "qabsb": "absolute_saturate",
    "qabsd": "absolute_saturate",
    "qabsh": "absolute_saturate",
    "qabss": "absolute_saturate",
    "qadd": "add_saturate",
    "qaddb": "add_saturate",
    "qaddd": "add_saturate",
    "qaddh": "add_saturate",
    "qadds": "add_saturate",
    "qdmlal": "multiply_double_add_saturate_long",
    "qdmlalh": "multiply_double_add_saturate_long",
    "qdmlals": "multiply_double_add_saturate_long",
    "qdmlsl": "multiply_double_subtract_saturate_long",
    "qdmlslh": "multiply_double_subtract_saturate_long",
    "qdmlsls": "multiply_double_subtract_saturate_long",
    "qdmulh": "multiply_double_saturate_high",
    "qdmulhh": "multiply_double_saturate_high",
    "qdmulhs": "multiply_double_saturate_high",
    "qdmull": "multiply_double_saturate_long",
    "qdmullh": "multiply_double_saturate_long",
    "qdmulls": "multiply_double_saturate_long",
    "qmovn": "move_saturate_narrow",
    "qmovnd": "move_saturate_narrow",
    "qmovnh": "move_saturate_narrow",
    "qmovns": "move_saturate_narrow",
    "qmovun": "move_unsigned_saturate_narrow",
    "qmovund": "move_unsigned_saturate_narrow",
    "qmovunh": "move_unsigned_saturate_narrow",
    "qmovuns": "move_unsigned_saturate_narrow",
    "qneg": "negate_saturate",
    "qnegb": "negate_saturate",
    "qnegd": "negate_saturate",
    "qnegh": "negate_saturate",
    "qnegs": "negate_saturate",
    "qrdmlah": "multiply_double_add_round_saturate_high",
    "qrdmlahh": "multiply_double_add_round_saturate_high",
    "qrdmlahs": "multiply_double_add_round_saturate_high",
    "qrdmlsh": "multiply_double_subtract_round_saturate_high",
    "qrdmlshh": "multiply_double_subtract_round_saturate_high",
    "qrdmlshs": "multiply_double_subtract_round_saturate_high",
    "qrdmulh": "multiply_double_round_saturate_high",
    "qrdmulhh": "multiply_double_round_saturate_high",
    "qrdmulhs": "multiply_double_round_saturate_high",
    "qrshl": "shift_left_round_saturate",
    "qrshlb": "shift_left_round_saturate",
    "qrshld": "shift_left_round_saturate",
    "qrshlh": "shift_left_round_saturate",
    "qrshls": "shift_left_round_saturate",
    "qrshrn": "shift_right_round_saturate_narrow",
    "qrshrnd": "shift_right_round_saturate_narrow",
    "qrshrnh": "shift_right_round_saturate_narrow",
    "qrshrns": "shift_right_round_saturate_narrow",
    "qrshrun": "shift_right_unsigned_saturate_narrow",
    "qrshrund": "shift_right_unsigned_saturate_narrow",
    "qrshrunh": "shift_right_unsigned_saturate_narrow",
    "qrshruns": "shift_right_unsigned_saturate_narrow",
    "qshl": "shift_left_saturate",
    "qshlb": "shift_left_saturate",
    "qshld": "shift_left_saturate",
    "qshlh": "shift_left_saturate",
    "qshls": "shift_left_saturate",
    "qshlu": "shift_left_unsigned_saturate",
    "qshlub": "shift_left_unsigned_saturate",
    "qshlud": "shift_left_unsigned_saturate",
    "qshluh": "shift_left_unsigned_saturate",
    "qshlus": "shift_left_unsigned_saturate",
    "qshrn": "shift_right_saturate_narrow",
    "qshrnd": "shift_right_saturate_narrow",
    "qshrnh": "shift_right_saturate_narrow",
    "qshrns": "shift_right_saturate_narrow",
    "qshrun": "shift_right_saturate_narrow_unsigned",
    "qshrund": "shift_right_saturate_narrow_unsigned",
    "qshrunh": "shift_right_saturate_narrow_unsigned",
    "qshruns": "shift_right_saturate_narrow_unsigned",
    "qsub": "subtract_saturate",
    "qsubb": "subtract_saturate",
    "qsubd": "subtract_saturate",
    "qsubh": "subtract_saturate",
    "qsubs": "subtract_saturate",
    "qtbl1": "table_lookup1_saturate",
    "qtbl2": "table_lookup2_saturate",
    "qtbl3": "table_lookup3_saturate",
    "qtbl4": "table_lookup4_saturate",
    "qtbx1": "table_extend1_saturate",
    "qtbx2": "table_extend2_saturate",
    "qtbx3": "table_extend3_saturate",
    "qtbx4": "table_extend4_saturate",
    "raddhn": "add_round_narrow_high",
    "rax1": "rotate_and_xor",
    "rbit": "reverse_bits",
    "recpe": "reciprocal_estimate",
    "recped": "reciprocal_estimate",
    "recpeh": "reciprocal_estimate",
    "recpes": "reciprocal_estimate",
    "recps": "reciprocal_step",
    "recpsd": "reciprocal_step",
    "recpsh": "reciprocal_step",
    "recpss": "reciprocal_step",
    "recpxd": "reciprocal_exponent",
    "recpxh": "reciprocal_exponent",
    "recpxs": "reciprocal_exponent",
    "reinterpret": "reinterpret",
    "rev16": "reverse_16bit",
    "rev32": "reverse_32bit",
    "rev64": "reverse_64bit",
    "rhadd": "add_halve_round",
    "rnd": "round",
    "rnd32x": "round_to_32bit_integer_using_current_mode",
    "rnd32z": "round_to_32bit_integer",
    "rnd64x": "round_to_64bit_integer_using_current_mode",
    "rnd64z": "round_to_64bit_integer",
    "rnda": "round_to_nearest_with_ties_away_from_zero",
    "rndah": "round_to_nearest_with_ties_away_from_zero",
    "rndh": "round",
    "rndi": "round_using_current_mode",
    "rndih": "round_using_current_mode",
    "rndm": "round_toward_negative_infinity",
    "rndmh": "round_toward_negative_infinity",
    "rndn": "round_to_nearest_with_ties_to_even",
    "rndnh": "round_to_nearest_with_ties_to_even",
    "rndns": "round_to_nearest_with_ties_to_even",
    "rndp": "round_toward_positive_infinity",
    "rndph": "round_toward_positive_infinity",
    "rndx": "round_inexact",
    "rndxh": "round_inexact",
    "rshl": "shift_left_round",
    "rshld": "shift_left_round",
    "rshr": "shift_right_round",
    "rshrd": "shift_right_round",
    "rshrn": "shift_right_round_narrow",
    "rsqrte": "reciprocal_sqrt_estimate",
    "rsqrted": "reciprocal_sqrt_estimate",
    "rsqrteh": "reciprocal_sqrt_estimate",
    "rsqrtes": "reciprocal_sqrt_estimate",
    "rsqrts": "reciprocal_sqrt_step",
    "rsqrtsd": "reciprocal_sqrt_step",
    "rsqrtsh": "reciprocal_sqrt_step",
    "rsqrtss": "reciprocal_sqrt_step",
    "rsra": "shift_right_accumulate_round",
    "rsrad": "shift_right_accumulate_round",
    "rsubhn": "subtract_round_narrow_high",
    "scale": "scale_exponent",
    "set": "set",
    "sha1c": "sha1_choose",
    "sha1h": "sha1_fixed_rotate",
    "sha1m": "sha1_majority",
    "sha1p": "sha1_parity",
    "sha1su0": "sha1_schedule_update_0",
    "sha1su1": "sha1_schedule_update_1",
    "sha256h": "sha256_hash_part_1",
    "sha256h2": "sha256_hash_part_2",
    "sha256su0": "sha256_schedule_update_0",
    "sha256su1": "sha256_schedule_update_1",
    "sha512h": "sha512_hash_part_1",
    "sha512h2": "sha512_hash_part_2",
    "sha512su0": "sha512_schedule_update_0",
    "sha512su1": "sha512_schedule_update_1",
    "shl": "shift_left",
    "shld": "shift_left",
    "shll": "shift_left_long",
    "shr": "shift_right",
    "shrd": "shift_right",
    "shrn": "shift_right_narrow",
    "sli": "shift_left_insert",
    "slid": "shift_left_insert",
    "sm3partw1": "sm3_part_w_1",
    "sm3partw2": "sm3_part_w_2",
    "sm3ss1": "sm3_ss_1",
    "sm3tt1a": "sm3_tt_1a",
    "sm3tt1b": "sm3_tt_1b",
    "sm3tt2a": "sm3_tt_2a",
    "sm3tt2b": "sm3_tt_2b",
    "sm4e": "sm4_encode",
    "sm4ekey": "sm4_encode_key",
    "sqadd": "add_saturate",
    "sqaddb": "add_saturate",
    "sqaddd": "add_saturate",
    "sqaddh": "add_saturate",
    "sqadds": "add_saturate",
    "sqrt": "square_root",
    "sqrth": "square_root",
    "sra": "shift_right_add",
    "srad": "shift_right_add",
    "sri": "shift_right_insert",
    "srid": "shift_right_insert",
    "st1": "store1",
    "st1_x2": "store1_x2",
    "st1_x3": "store1_x3",
    "st1_x4": "store1_x4",
    "st2": "store2",
    "st3": "store3",
    "st4": "store4",
    "stl1": "store_release1",
    "str": "store_register",
    "sub": "subtract",
    "subd": "subtract",
    "subh": "subtract",
    "subhn": "subtract_narrow_high",
    "subl": "subtract_long",
    "subw": "subtract",
    "sudot": "dot_product",
    "tbl1": "table_lookup1",
    "tbl2": "table_lookup2",
    "tbl3": "table_lookup3",
    "tbl4": "table_lookup4",
    "tbx1": "table_extend1",
    "tbx2": "table_extend2",
    "tbx3": "table_extend3",
    "tbx4": "table_extend4",
    "trn": "transpose",
    "trn1": "transpose_step_1",
    "trn2": "transpose_step_2",
    "tst": "compare_test_nonzero",
    "tstd": "compare_test_nonzero",
    "uqadd": "add_saturate",
    "uqaddb": "add_saturate",
    "uqaddd": "add_saturate",
    "uqaddh": "add_saturate",
    "uqadds": "add_saturate",
    "usdot": "dot_product",
    "usmmla": "matrix_multiply_add",
    "uzp": "unzip",
    "uzp1": "unzip1",
    "uzp2": "unzip2",
    "xar": "xor_and_rotate",
    "zip": "zip",
    "zip1": "zip1",
    "zip2": "zip2",
}

#pprint({k: name_map[k] for k in sorted(name_map)})

In [None]:
import functools


# Preferred ordering of NEON vector types when sorting the generated
# functions. Each entry must appear exactly once: Var.type_map keeps only the
# last index of a repeated name, which would silently re-rank that type.
type_order = [
  "uint8x8_t",
  "uint8x16_t",
  "int8x8_t",
  "int8x16_t",
  "uint16x4_t",
  "uint16x8_t",
  "int16x4_t",
  "int16x8_t",
  "uint32x2_t",
  "uint32x4_t",
  "int32x2_t",
  "int32x4_t",
  "uint64x1_t",
  "uint64x2_t",
  "int64x1_t",
  "int64x2_t",
  "float16x4_t",
  "float16x8_t",
  "float32x2_t",
  "float32x4_t",
  "float64x1_t",
  "float64x2_t",
  "poly8x8_t",
  "poly16x4_t",
]

class Var:
  """One parameter of an intrinsic declaration: a type plus an identifier.

  Comparison looks at the type only, ranked by its position in `type_order`.
  Because `__eq__` is defined without `__hash__`, instances are unhashable.
  """
  # type name -> rank (its index in type_order)
  type_map = { key:value for (value,key) in enumerate(type_order) }

  def __init__(self, string):
    """Parse 'TYPE ... IDENT': the last whitespace-separated token is the identifier."""
    components = string.split()
    self.ident = components.pop(-1)
    self.type = ' '.join(components)

  def __str__(self):
    return f"{self.type} {self.ident}"

  def __repr__(self):
    return f'Var:{{ type: "{self.type}", ident: "{self.ident}"}}'

  def __eq__(self, other):
    """Equal only when both types are ranked and share a rank.

    Two variables with the same *unranked* type therefore compare unequal.
    """
    if self.type in Var.type_map and other.type in Var.type_map:
      return Var.type_map[self.type] == Var.type_map[other.type]
    else:
      return False

  def __lt__(self, other):
    """Ranked types order by rank and sort before every unranked type."""
    if self.type in Var.type_map and other.type in Var.type_map:
      return Var.type_map[self.type] < Var.type_map[other.type]
    elif self.type in Var.type_map and other.type not in Var.type_map:
      return True
    else:
      return False

class Function:
  """One scraped intrinsic declaration plus the data needed to emit a wrapper.

  Attributes:
    decl        -- the declaration text as passed in
    return_type -- first whitespace-separated token before the argument list
    intrinsic   -- second token, i.e. the intrinsic's identifier
    args        -- list of Var, without a trailing `const int` parameter
    const       -- identifier of that trailing `const int` parameter, or None
    decoded     -- NEONIdent built from `intrinsic`
    archs       -- the architecture string split on '/'
    name        -- name_map entry for decoded.name (decoded.name itself when
                   unmapped) followed by '_lane', '_high', '_low' as flagged
  """
  def __init__(self, decl_arch):
    """Build from a `(declaration, architectures)` pair.

    Raises ValueError when the declaration has no non-empty parenthesised
    argument list.
    """
    (decl, arch) = decl_arch
    self.decl = decl
    in_parens = r"\((.)+\)"
    found = re.search(in_parens, decl)
    if found is None:
      # Name the offending declaration instead of failing on None.group().
      raise ValueError(f"no argument list found in declaration: {decl!r}")
    args = found.group().removeprefix('(').removesuffix(')')
    tokens = re.sub(in_parens, '', decl).split()
    self.return_type = tokens.pop(0)
    self.intrinsic = tokens.pop(0)
    self.args = [Var(arg) for arg in args.split(',')]
    self.decoded = NEONIdent(self.intrinsic)
    self.archs = arch.split('/')
    # A trailing `const int` parameter is an immediate; it is split off so the
    # generator can turn it into a template parameter.
    if self.args[-1].type == "const int":
      self.const = self.args[-1].ident
      self.args = self.args[:-1]
    else:
      self.const = None
    self.name = name_map.get(self.decoded.name, self.decoded.name)
    if self.decoded.lane:
      self.name += '_lane'
    if self.decoded.high:
      self.name += '_high'
    if self.decoded.low:
      self.name += '_low'

  def __repr__(self):
    return f'Function:{{ name: "{self.name}" intrinsic: "{self.intrinsic}", decoded: {self.decoded}, return_type: "{self.return_type}", args: "{self.args}" }}'

  def __equal__(self, other):
    """True when the positionally paired arguments all compare equal.

    NOTE: despite the dunder-style name this is NOT Python's `==` hook (that
    is `__eq__`), so `==`, `in` and hashing on Function keep their default
    identity semantics. The name is kept as-is because other code tests
    membership of Function objects in lists and would change meaning if this
    became `__eq__`. zip() pairs only up to the shorter argument list, and two
    empty argument lists compare equal.
    """
    return all(arg1 == arg2 for (arg1, arg2) in zip(self.args, other.args))

  def __lt__(self, other):
    """Order by the first positionally paired arguments that differ."""
    for (arg1, arg2) in zip(self.args, other.args):
      if arg1 == arg2:
        continue
      return arg1 < arg2
    return False


In [None]:
# Parse every scraped (declaration, architectures) pair into a Function,
# leaving out declarations whose text contains 'fpm'.
funcs = [Function(pair) for pair in decl_archs if 'fpm' not in pair[0]]

In [None]:
def _uses_fp16_or_fp64(f):
    """True when the return type or any argument type mentions float16 or float64."""
    return (
        "float16" in f.return_type
        or any("float16" in arg.type for arg in f.args)
        or "float64" in f.return_type
        or any("float64" in arg.type for arg in f.args)
    )


# Everything that lists "v7" among its architectures.
v7_funcs = [f for f in funcs if "v7" in f.archs]

# Lists "A32" but not "v7".
a32_funcs = [f for f in funcs if "A32" in f.archs and "v7" not in f.archs]

# Lists "A64" but not "A32".
a64_funcs = [f for f in funcs if "A64" in f.archs and "A32" not in f.archs]

# v7 functions that touch neither float16 nor float64 (vfpv4 stuff removed).
vfpv3_funcs = [f for f in v7_funcs if not _uses_fp16_or_fp64(f)]

# The remaining v7 functions: the vfpv4 stuff.
vfpv4_funcs = [f for f in v7_funcs if _uses_fp16_or_fp64(f)]

In [None]:
# import pprint
# names = [f for f in a64_funcs if f.name == '']
# pprint.pp(list(dict.fromkeys(names)))

In [None]:
# uniquify (vshll_n duped for some reason): keying on the declaration text
# keeps one Function per distinct declaration, the last occurrence winning.
uniq_funcs = dict((fn.decl, fn) for fn in a64_funcs).values()

In [None]:
# From here on `funcs` means the de-duplicated A64-only set built above.
# NOTE(review): this rebinds the name the architecture-filter cell reads, so
# re-running that cell after this one filters the narrowed set instead of the
# full scrape - confirm the rebinding is intended, or use a separate name.
funcs = uniq_funcs

In [None]:
from pprint import pprint

# Group functions by the signature their generated wrapper will have, i.e.
# "<wrapper name>(<argument list>)". The key uses func.name (including any
# _lane/_high/_low suffix) so that it matches the name the generator emits:
# the key is printed verbatim as the primary template declaration, and the
# wrappers in the group are emitted as its specializations.
name_args = {}
# Base names that have no name_map entry; reported once after the loop.
missing = {}
for func in funcs:
  base_name = func.decoded.name
  if base_name not in name_map:
    # Unmapped intrinsics are left out of the grouping and only recorded.
    missing[base_name] = ''
    continue
  args = ', '.join(str(arg) for arg in func.args)
  key = f"{func.name}({args})"
  name_args.setdefault(key, []).append(func)

pprint(missing)

# Signatures shared by more than one intrinsic need a template to tell the
# wrappers apart.
needs_template = {k:v for (k,v) in name_args.items() if len(v) > 1}
needs_template_sig = needs_template.keys()
needs_template_funcs = [f for v in needs_template.values() for f in v]

In [None]:

# Wrapper names to leave out of the generated header; nothing is excluded yet.
blacklist = []

# Wrapper names emitted as plain `inline` rather than through the `nce` macro.
no_constexpr = [
  # loads
  "load1", "load2", "load3", "load4",
  "load1_duplicate", "load2_duplicate", "load3_duplicate", "load4_duplicate",
  "load1_x2", "load1_x3", "load1_x4",
  # stores
  "store1", "store2", "store3", "store4",
  "store1_x2", "store1_x3", "store1_x4",
]

# Attribute text placed in front of every generated wrapper (note the trailing space).
always_inline = "[[gnu::always_inline]] "

def simplify_type(type):
  """Return the text of `type` before its first 'x' (all of it if none) plus '_v'."""
  element, _, _ = type.partition('x')
  return element + "_v"

def generate_function(func):
  """Return the one-line C++ wrapper that forwards `func`'s arguments to its intrinsic.

  The line is prefixed with "template <> " when `func` is one of
  `needs_template_funcs`, and uses `inline` instead of the `nce` macro when
  the wrapper name is listed in `no_constexpr`.
  """
  params = ', '.join(str(arg) for arg in func.args)
  # Identifiers may carry a '*' from the declaration; strip it for the call.
  forwarded = ', '.join(arg.ident.replace('*','') for arg in func.args)
  pieces = []
  if func in needs_template_funcs:
    pieces.append("template <> ")
  pieces.append(always_inline)
  pieces.append("inline " if func.name in no_constexpr else "nce ")
  pieces.append(f"{func.return_type} {func.name}({params}) {{ return {func.intrinsic}({forwarded}); }}")
  return ''.join(pieces)

def generate_templated_function(func):
  args = ', '.join([str(arg) for arg in func.args])
  arg_idents = ', '.join([arg.ident.replace('*','') for arg in func.args])
  const_name = func.const
  args = args.replace(f', const int {const_name}', '')
  definition = f"template <int {const_name}>"
  definition += always_inline
  definition += f"nce {func.return_type} {func.name}({args}) {{ return {func.intrinsic}({arg_idents}, {const_name}); }}"
  return definition


In [None]:
#with open('neon.hpp', 'w') as sys.stdout:
# Header preamble: include guard, the NEON header, and the `nce` macro
# (constexpr under clang, plain inline elsewhere), then the namespace opening.
for line in (
  '#pragma once',
  '#include <arm_neon.h>',
  '#ifdef __cplusplus',
  '#ifdef __clang__',
  '#define nce constexpr',
  '#else',
  '#define nce inline',
  '#endif',
  '',
  'namespace neon {',
  '// clang-format off',
):
  print(line)

# One primary template declaration per signature shared by several intrinsics.
for name_arg in needs_template:
  print(f"template <typename T> nce T {name_arg};")

# One wrapper per function, in sorted order, skipping blacklisted names.
for func in sorted(funcs):
  if func.name in blacklist:
    continue
  emit = generate_function if func.const is None else generate_templated_function
  out = emit(func)
  print(out)

for line in ('// clang-format on', '}  // namespace neon', '#endif'):
  print(line)