In [376]:
from datetime import datetime, date
from pydantic import BaseModel, Field, validator
from typing import Any, Optional

In [377]:
import re
from functools import singledispatchmethod

def to_snake_case(string: str) -> str:
    """Lower-case ``string`` and turn every space character into an underscore.

    Only the plain space (``' '``) is replaced; tabs, other whitespace and
    punctuation are left exactly as they are.
    """
    lowered = string.lower()
    # Splitting on an explicit ' ' keeps empty pieces, so runs of spaces
    # become runs of underscores.
    return '_'.join(lowered.split(' '))

class RegexMatch(str):
    """A ``str`` that also compares equal to a compiled regex which matches it.

    ``RegexMatch(text) == pattern`` is true when ``pattern.match(text)``
    succeeds (i.e. the pattern matches at the start of the text). This lets
    compiled patterns be used as value patterns in a ``match`` statement.
    After a successful comparison the named groups of the match are available
    as ``self.groups``; the attribute does not exist before the first
    successful pattern comparison.

    Comparison with anything that is not an ``re.Pattern`` falls back to the
    normal ``str`` equality.
    """

    # Defining __eq__ in a class body implicitly sets __hash__ to None, which
    # would make this str subclass unhashable. Restore the inherited hash so
    # instances can still be used in sets and as dict keys.
    __hash__ = str.__hash__

    @singledispatchmethod
    def __eq__(self, other) -> bool:
        # Default branch: plain string comparison.
        return super().__eq__(other)

    @__eq__.register
    def _(self, pattern: re.Pattern) -> bool:
        # Dispatched when the right-hand operand is a compiled pattern.
        match = pattern.match(self)
        if not match:
            return False
        # Keep the named groups so the caller can read them after the
        # comparison (e.g. via ``case PATTERN as subject``).
        self.groups = match.groupdict()
        return True
    
class Patterns:
    """Namespace of case-insensitive compiled regexes, one per recognised key.

    Each attribute is an ``re.Pattern`` compiled with ``re.IGNORECASE``.
    """

    release_price = re.compile(r'release price', flags=re.I)
    model = re.compile(r'^model', flags=re.I)
    code_name = re.compile(r'^code name', flags=re.I)
    launch = re.compile(r'^launch', flags=re.I)
    # The named group ``unit`` captures the G/T prefix of "FLOPS". After the
    # closing parenthesis the key must either contain no colon up to its end
    # or continue with ": single precision".
    processing_power = re.compile(
        r'^processing power \((?P<unit>G|T)FLOPS\)([^:]*$|: single precision)',
        flags=re.I,
    )

# Regex fragment for a number: an integral part made of digit runs, each
# optionally followed by a space or comma separator (named group ``sep``),
# then an optional decimal part. Written as a raw string so that ``\d`` and
# ``\.`` are not treated as (invalid) string escape sequences.
re_number = r'(?P<integral>(?:\d+(?P<sep> |,)?)+)(?P<decimal>\.\d+)?'

def preprocess(**data):
    for key, value in data.items():
        match RegexMatch(key):
            case Patterns.release_price:
                key = 'release_price'
                match = re.search(f"\$\s*({re_number})", value)
                value = float(match.group(1)) if match else None

            case Patterns.model:
                key = 'model'

            case Patterns.code_name:
                key = 'code_name'

            case Patterns.launch:
                key = 'launch'

            case Patterns.processing_power as field:
                key = 'processing_power'
                match = re.search(re_number, value)
                value = float(match.group(0).replace(match.group('sep'), '')) if match else None

                match field.groups['unit']:
                    case 'G':
                        pass
                    case 'T':
                        value /= 1e3
                    case _:
                        raise ValueError(f'Unknown unit {field.groups["unit"]}')
                
        yield to_snake_case(key), value


In [378]:
import calendar

# Lookup tables derived from the ``calendar`` month sequences.
# abbr2month: abbreviated month name -> full month name.
abbr2month = {
    short: full
    for short, full in zip(calendar.month_abbr, calendar.month_name)
}
# month2int: full month name -> its position in ``calendar.month_name``.
month2int = {
    name: number
    for number, name in enumerate(calendar.month_name)
}

def str2dt(*, year: str, month: str = None, day: str = None) -> datetime:
    """Build a ``datetime`` from textual date components.

    Args:
        year: Year as a string of digits.
        month: Month name, either abbreviated or full, looked up through
            ``abbr2month`` / ``month2int``. When it is None or not found in
            the tables, month 7 is used.
        day: Day of month as a string of digits; 15 is used when it is None.

    Returns:
        A ``datetime`` at midnight of the resulting date.
    """
    day_number = 15 if day is None else int(day)
    # Expand an abbreviation to the full name; anything that is not a known
    # abbreviation is looked up as given.
    full_name = abbr2month.get(month, month)
    month_number = month2int.get(full_name, 7)
    return datetime(int(year), month_number, day_number)

In [379]:
from lenientlist import LenientList

class GraphicCard(BaseModel):
    # One graphics card entry. Raw keyword arguments are first run through
    # `preprocess`, which renames the keys and parses numeric values, before
    # regular pydantic validation takes place.
    # (Documented with comments instead of a class docstring because pydantic
    # copies a model's docstring into the generated JSON schema.)
    model: str
    launch: date
    code_name: str

    # `preprocess` passes "(GFLOPS)" values through unscaled, so the stored
    # figure is in GFLOPS rather than plain FLOPS.
    processing_power: float = Field(description="Single-precision processing power in GFLOPS.")
    tdp: Optional[float] = Field(title="TDP", description="Thermal Design Power in watts (W).")
    # Parsed by `preprocess` from the first number following a "$" sign.
    release_price: float

    class Config:
        title = 'Graphic Card'

    def __init__(self, **data: Any) -> None:
        # Normalise the raw keys/values before handing them to pydantic.
        super().__init__(**dict(preprocess(**data)))

    # call the validator before field validation using pre=True
    @validator('launch', pre=True)
    def launch_validator(cls, value):
        """Turn a textual launch date into a ``datetime``.

        Accepts text starting with "<Month> <day>, <year>", "<Month> <year>"
        or just "<year>"; the missing parts are filled in by ``str2dt``.
        Values that are already ``date``/``datetime`` objects are returned
        unchanged so a model can be rebuilt from its own ``dict()``.

        Raises:
            ValueError: if the text does not start with a recognisable date.
        """
        if isinstance(value, date):
            return value
        # Raw string: ``\d`` must reach the regex engine untouched rather than
        # being treated as a (invalid) string escape sequence.
        match = re.match(r"((?P<month>[A-Z][a-z]+),? ((?P<day>\d{1,2}),? )?)?(?P<year>\d{4})", value)
        if not match:
            raise ValueError(value)
        return str2dt(**match.groupdict())

# A named group of graphics cards.
class Generation(BaseModel):
    name: str
    # LenientList is imported from the `lenientlist` module; presumably it
    # tolerates entries that fail GraphicCard validation instead of rejecting
    # the whole list - TODO confirm against its implementation.
    gpus: LenientList[GraphicCard] = Field(title="GPUs")

# A named category holding a list of Generation entries.
class Category(BaseModel):
    name: str
    generations: list[Generation]

# Top-level model of the input file: a timestamp plus the list of categories.
class Export(BaseModel):
    # presumably the moment the export was produced - verify against the data
    date: datetime
    categories: list[Category]

# print(Export.schema_json(indent=2))


In [380]:
# Read "nv-gpus-db.json" (path relative to the working directory) and
# validate it against the Export model.
parsed = Export.parse_file("nv-gpus-db.json")

In [381]:
import json

# Write the validated export to "parsed.json" as JSON indented by two spaces.
with open("parsed.json", 'w') as f:
    f.write(parsed.json(indent=2))
