# Betaface Parser notebook

I'll try to build up a parser for the Betaface data in a clear and repeatable way,
using nothing beyond the Python (3.11) standard library.

In [10]:
from csv import reader, DictWriter, DictReader
from dataclasses import dataclass
from pathlib import Path
from typing import Generator
from itertools import islice

In [11]:
@dataclass
class FacialAttribute:
  """
  A class to represent a facial attribute.
  """
  name: str
  raw: str
  text: str | None = None
  value: float | None = None
  label: str | None = None
  
  def __str__(self) -> str:
    return f"{self.name}: {self.raw}"
  
  def make_face_part(self, *header: str) -> dict[str, str | int | float]:
    """Return a dictionary of face part."""
    text: int | float | str = ""
    if self.text is not None:
      text = self.text.strip()
    elif self.value is not None:
      text = self.value

    label = ""

    # here we introduce other labels:
    match self.name:
      case "gender":
        label = ".male/female"
      case "age":
        label = ".number"
      case "expression":
        label = ".expression"
      case _ if "color" in self.name:
        label = ".color"

    if text in ["yes", "no"]:
      label = ".yes/no"
      text = int(self.text == "yes")

    col = f"{self.name}{label}"
    if header and col not in header:
      # print(f"finding missing col header {col}")
      col = next(filter(lambda c: c.startswith(self.name), header))
      # print(f"found missing col header {col}")

    return {
        col: text,
        **(
          {} if f"{self.name}.% value" not in header
          else {f"{self.name}.% value": self.value}
          if self.value is not None
          else {f"{self.name}.% value": 999}
        ),
      }


In [12]:

def rows(filename) -> Generator[list[str], None, None]:
  """
  Yield every record of a comma-separated file as a list of strings, one
  string per region between commas.

  The file stays open until the generator is exhausted or closed.

  :param filename: Path of the file to read (anything Path() accepts).
  """
  # The csv module asks for newline="" so that it sees the original line
  # endings; without it, newlines inside quoted fields are rewritten.
  with Path(filename).open("r", newline="") as f:
    yield from reader(f)


In [13]:
def parse_attribute(raw_attribute: str) -> "FacialAttribute | None":
  """
  Convert a string into a FacialAttribute.

  raw_attribute is expected to have the format
    "<name>: <text> [(<value>)]"
  where a trailing "%" on the value is ignored. When there is no
  parenthesised value and <text> itself is a number, the number becomes the
  value and the text is None.

  Returns None for an empty or whitespace-only string.
  Raises ValueError when a parenthesised value is not a number.
  """
  # Strip before the emptiness check: a blank cell (e.g. the " " left after a
  # trailing ", ") must not become an attribute with an empty name.
  stripped = raw_attribute.strip() if raw_attribute else ""
  if not stripped:
    return None

  name, _, raw = stripped.partition(": ")
  text_part, _, number_part = raw.partition(" (")

  text: str | None = text_part.strip()
  value: float | None
  if number_part:
    value = float(number_part.rstrip("%)"))
  else:
    try:
      # No parenthesised number: the whole value may itself be numeric.
      value = float(text_part.strip().rstrip("%)"))
      text = None
    except ValueError:
      value = None

  return FacialAttribute(
    name=name.strip(),
    raw=raw.strip(),
    text=text,
    value=value,
  )

In [14]:
# Sanity checks for parse_attribute: text with a percentage, text only, and
# a bare number.

assert parse_attribute("attractive: no (77%)") == FacialAttribute(
  name="attractive",
  raw="no (77%)",
  text="no",
  value=77,
)
assert parse_attribute("gray hair: no") == FacialAttribute(
  name="gray hair",
  raw="no",
  text="no",
)
assert parse_attribute("yaw: 3.44") == FacialAttribute(
  name="yaw",
  raw="3.44",
  value=3.44,
)

In [15]:
def parse_row(row: list[str]) -> Generator[FacialAttribute, None, None]:
  """
  Yield the result of parse_attribute for each cell of the row, skipping
  cells for which it returns nothing.
  """
  for cell in row:
    attribute = parse_attribute(cell)
    if attribute:
      yield attribute

In [16]:
# One sample result line; print every parsed attribute followed by the
# column(s) it produces when no header is supplied.
test_row = "5oclock shadow: no (51%), age: 24 (60%), arched eyebrows: no (14%), attractive: yes (32%), bags under eyes: no, bald: no (67%), bangs: no (94%), beard: no (85%), big lips: no (59%), big nose: no (60%), black hair: no (90%), blond hair: no (49%), blurry: no, brown hair: yes (33%), bushy eyebrows: yes (28%), chubby: no (79%), double chin: no (97%), expression: neutral (80%), gender: male (27%), glasses: no, goatee: no, gray hair: no (80%), heavy makeup: no (75%), high cheekbones: no, mouth open: no (89%), mustache: no (99%), narrow eyes: no, oval face: yes (81%), pale skin: yes (19%), pitch: -9.76, pointy nose: no (32%), race: white, receding hairline: no (20%), rosy cheeks: no (95%), sideburns: no, straight hair: yes (55%), wavy hair: no (99%), wearing earrings: no (35%), wearing hat: no (95%), wearing lipstick: no (55%), wearing necklace: no (84%), wearing necktie: no (13%), yaw: 0.87, young: yes (87%), chin size: small, color clothes middle: fff7fe (44%), color hair: 7e563e (90%), color skin: d29799, eyebrows corners: low, eyebrows position: extra low, eyebrows size: extra thin, eyes corners: average, eyes distance: far, eyes position: high, eyes shape: extra round, glasses rim: no, hair beard: none, hair color type: brown light (90%), hair forehead: no, hair length: none, hair mustache: none, hair sides: very thin, hair top: short, head shape: extra rect, head width: extra narrow, mouth corners: low, mouth height: thin, mouth width: extra small, nose shape: extra straight, nose width: extra wide, teeth visible: no,"
for attr in parse_row(test_row.split(",")):
    print(attr, attr.make_face_part(), sep="\n")


5oclock shadow: no (51%)
{'5oclock shadow.yes/no': 0}
age: 24 (60%)
{'age.number': '24'}
arched eyebrows: no (14%)
{'arched eyebrows.yes/no': 0}
attractive: yes (32%)
{'attractive.yes/no': 1}
bags under eyes: no
{'bags under eyes.yes/no': 0}
bald: no (67%)
{'bald.yes/no': 0}
bangs: no (94%)
{'bangs.yes/no': 0}
beard: no (85%)
{'beard.yes/no': 0}
big lips: no (59%)
{'big lips.yes/no': 0}
big nose: no (60%)
{'big nose.yes/no': 0}
black hair: no (90%)
{'black hair.yes/no': 0}
blond hair: no (49%)
{'blond hair.yes/no': 0}
blurry: no
{'blurry.yes/no': 0}
brown hair: yes (33%)
{'brown hair.yes/no': 1}
bushy eyebrows: yes (28%)
{'bushy eyebrows.yes/no': 1}
chubby: no (79%)
{'chubby.yes/no': 0}
double chin: no (97%)
{'double chin.yes/no': 0}
expression: neutral (80%)
{'expression.expression': 'neutral'}
gender: male (27%)
{'gender.male/female': 'male'}
glasses: no
{'glasses.yes/no': 0}
goatee: no
{'goatee.yes/no': 0}
gray hair: no (80%)
{'gray hair.yes/no': 0}
heavy makeup: no (75%)
{'heavy ma

In [19]:
all_cols = {}

# Each raw export becomes a list of rows, each row a list of FacialAttributes.
parsed_rows_30_72 = [list(parse_row(line)) for line in rows("../betaface-raw-30-72.txt")]
parsed_rows_95_176 = [list(parse_row(line)) for line in rows("../betaface-raw-95-176.txt")]
parsed_rows_new = [list(parse_row(line)) for line in rows("../betaface-new-raw.txt")]


header = None
preprocessed: list[dict] = []

# Put a 1-based ItemNumber attribute at the front of every new row.
for item_number, row in enumerate(parsed_rows_new, start=1):
    row.insert(0, parse_attribute(f"ItemNumber: {item_number}"))


# The column names are the first record of the backed-up preprocessed CSV.
with Path("../betaface-preprocessed.bkp.csv").open("r") as header_file:
    header = next(reader(header_file))
# with Path("../betaface-new-header.csv").open("r") as f:
#     header = next(reader(f))

# Collect every column the new rows produce; later cells overwrite earlier
# ones, so what accumulates is the set of column names in first-seen order.
for face_attrs in parsed_rows_new:
    for attr in face_attrs:
        all_cols.update(attr.make_face_part(*header))

# for f in parsed_rows_95_176:
#     for attr in f:
#         all_cols |= attr.make_face_part(*header)

def row_to_face(fattrs: list[FacialAttribute]) -> dict[str, str | int | float]:
    """
    Build one output record from the attributes of a single face.

    Every column named in the module-level all_cols starts at 999.0 and is
    then overwritten by whatever make_face_part (called with the module-level
    header) returns for each attribute.

    :param fattrs: The facial attributes of one face.
    :return: A mapping from column name to cell value.
    """
    face: dict[str, str | int | float] = dict.fromkeys(all_cols, 999.)
    for facial_attr in fattrs:
        face.update(facial_attr.make_face_part(*header))
    return face

# One output record per parsed row; the other data sets are switched off.
all_faces = [
    #*map(row_to_face, parsed_rows_30_72),
    #*map(row_to_face, parsed_rows_95_176),
    *map(row_to_face, parsed_rows_new),

]


# Every generated column must already exist in the reference header.
for col in all_cols:
    assert col in header, f"{col} not found in header"

# newline="" is what the csv module asks for on files it reads or writes.
with Path("../betaface-preprocessed.bkp.csv").open("r", newline="") as f:
    records = DictReader(f, fieldnames=header)
    # With explicit fieldnames DictReader treats the first line as data, and
    # here that line is the header itself, so drop it.
    next(records, None)
    preprocessed = list(records)

# newline="" keeps the "excel" dialect's "\r\n" terminator from being
# translated a second time where text mode rewrites "\n".
# NOTE(review): no header row is written, so the columns follow the insertion
# order of all_cols rather than the order of header - confirm this is intended.
with Path("./betaface-processed-new.csv").open("wt", newline="") as f:
    writer = DictWriter(
        f,
        fieldnames=list(all_cols),
        delimiter=",",
        dialect="excel",
    )
    #for face in preprocessed:
        #writer.writerow(face)
    writer.writerows(all_faces)

# with Path("../betaface-preprocessed.csv").open("at+") as f:
#     writer = DictWriter(
#         f,
#         fieldnames=header,
#         delimiter=",",
#     )
#     writer.writeheader()
#     for face in all_faces:
#         writer.writerow(face)


StopIteration: 