In [1]:
import re
from functools import partial, reduce
from typing import Iterator

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the Wikipedia list of Nvidia GPUs and parse it into a soup.
r = requests.get(
    'https://en.wikipedia.org/wiki/List_of_Nvidia_graphics_processing_units',
    timeout=30,  # do not hang the notebook forever on a stalled connection
)
r.raise_for_status()  # fail loudly instead of silently parsing an HTTP error page
# Name the parser explicitly: without it bs4 picks whichever parser happens to
# be installed (and warns), so the resulting tree can differ between machines.
soup = BeautifulSoup(r.text, 'html.parser')

In [3]:
def css_escape(string: str):
    """Return ``string`` with every bracket character backslash-escaped.

    Each of ``[``, ``]``, ``{``, ``}``, ``(`` and ``)`` is prefixed with a
    single backslash; all other characters are left untouched.
    (Presumably meant for embedding text in a CSS selector - TODO confirm.)
    """
    brackets = re.compile(r'([\[\]{}()])')
    escaped = brackets.sub(r'\\\1', string)
    return escaped


def compose(*fns):
    """Compose single-argument callables right to left.

    ``compose(f, g, h)(x)`` evaluates ``f(g(h(x)))``: the last function given
    runs first. With no functions the result is the identity function.
    """
    def composed(x):
        # Walk the functions from last to first, threading the value through.
        for fn in reversed(fns):
            x = fn(x)
        return x

    return composed


def clean(*strs: str) -> Iterator[str]:
    """Lazily normalise each given string, yielding the results in order.

    This is a generator: nothing is processed until it is iterated.

    Args:
        *strs: the strings to normalise.

    Yields:
        One cleaned string per input, in input order.
    """
    # Applied top to bottom; whitespace is collapsed (and the result stripped)
    # last so that gaps left behind by the earlier removals disappear too.
    substitutions = (
        (r'\[.*?\]', ''),  # remove references like '[1]'
        (r'(?<=\()\s*|\s*(?=\))', ''),  # remove spaces preceded by '(' or followed by ')'
        (r'\s*(?=\/s\b)', ''),  # remove spaces between <unit> and '/s'
        (r'(?<=\b[A-Z])\s*(?=FLOPS)', ''),  # remove spaces between multiplier and 'FLOPS'
        (r'\s+', ' '),  # replace extra spaces
    )
    for s in strs:
        for pattern, replacement in substitutions:
            s = re.sub(pattern, replacement, s)
        yield s.strip()  # remove leading and trailing spaces

In [4]:
def find_flat_children(tag, name, **attrs):
    """Yield the siblings following ``tag`` that are named ``name``.

    Iteration walks ``tag.next_siblings`` and stops at the first sibling whose
    name equals ``tag.name``, so only the nodes between ``tag`` and the next
    node of the same kind are considered.

    Args:
        tag: starting node; must expose ``name`` and ``next_siblings``.
        name: node name to look for (e.g. ``'h3'`` or ``'table'``).
        **attrs: attribute values a sibling must carry to be yielded. A
            sibling that lacks one of these attributes is skipped.

    Yields:
        Matching siblings, in document order.
    """
    for sibling in tag.next_siblings:
        if sibling.name == tag.name:
            # Reached the next node of the same kind as ``tag``: stop here.
            return
        if sibling.name != name:
            continue
        sibling_attrs = sibling.attrs
        # A missing attribute is a non-match, not an error (indexing directly
        # would raise KeyError for tags that do not define the attribute).
        if any(key not in sibling_attrs or sibling_attrs[key] != value
               for key, value in attrs.items()):
            continue
        yield sibling

In [5]:
from io import StringIO
from itertools import chain

# Noise removed from model names: bracketed references (with the whitespace
# around them), a stray 'Model' header cell and the '(OEM)' marker.
model_noise = re.compile(r'\s*\[.*\]\s*|Model|\(OEM\)')

tables = {}  # section title -> DataFrame parsed from the section's first table
keys = {}    # section title -> set of cleaned column-header labels

# The selector is positional, so it breaks when the page layout changes;
# fail with a clear message instead of an AttributeError deep in the loop.
section_heading = soup.select_one('#mw-content-text > div.mw-parser-output > h2:nth-child(7)')
if section_heading is None:
    raise RuntimeError("section heading not found - the page layout may have changed")
generations = find_flat_children(section_heading, 'h3')

for gen in generations:
    headline = gen.select_one('span.mw-headline')
    # Fall back to the heading's own text when the headline span is absent.
    name = headline.text if headline is not None else gen.get_text(strip=True)
    table = next(find_flat_children(gen, 'table'), None)
    if table is None:
        continue
    # read_html wants a file-like object; passing literal HTML is deprecated.
    df = pd.concat(pd.read_html(StringIO(str(table))), axis=1)

    tables[name] = df
    # Multi-row headers give tuple labels: flatten them into their parts. Flat
    # labels are wrapped so they are not split into single characters.
    label_parts = chain.from_iterable(
        label if isinstance(label, tuple) else (label,) for label in df.columns
    )
    keys[name] = set(clean(*map(str, label_parts)))

    # With a multi-row header df['Model'] is a frame; its first column holds
    # the model name.
    model_column = df['Model']
    if isinstance(model_column, pd.DataFrame):
        model_column = model_column.iloc[:, 0]
    models = {
        # Strip after collapsing whitespace so 'X (OEM)' and 'X' become one entry.
        re.sub(r'\s+', ' ', model_noise.sub('', raw)).strip()
        for raw in model_column
        if isinstance(raw, str)  # skip empty (NaN) cells
    }

    print(f"\n\n{name}:")
    for model in sorted(models):
        if model:
            print(f"\t- {model}")



Pre-GeForce:
	- Riva 128
	- Riva 128ZX
	- Riva TNT
	- Riva TNT2
	- Riva TNT2 M64
	- Riva TNT2 Pro
	- Riva TNT2 Ultra
	- STG-2000
	- Vanta
	- Vanta LT


GeForce 256 series:
	- GeForce 256 DDR
	- GeForce 256 SDR


GeForce2 series:
	- GeForce2 GTS
	- GeForce2 MX
	- GeForce2 MX IGP + nForce 220/420
	- GeForce2 MX200
	- GeForce2 MX400
	- GeForce2 Pro
	- GeForce2 Ti
	- GeForce2 Ultra


GeForce3 series:
	- GeForce3
	- GeForce3 Ti200
	- GeForce3 Ti500


GeForce4 series:
	- GeForce MX4000
	- GeForce PCX4300
	- GeForce4 MX IGP + nForce2
	- GeForce4 MX420
	- GeForce4 MX440
	- GeForce4 MX440 8x
	- GeForce4 MX440 SE
	- GeForce4 MX460
	- GeForce4 Ti4200
	- GeForce4 Ti4200 8x
	- GeForce4 Ti4400
	- GeForce4 Ti4400 8x (Ti4800SE)
	- GeForce4 Ti4600
	- GeForce4 Ti4600 8x (Ti4800)


GeForce FX (5xxx) series:
	- GeForce FX 5100
	- GeForce FX 5200
	- GeForce FX 5200 LE
	- GeForce FX 5200 Ultra
	- GeForce FX 5500
	- GeForce FX 5600
	- GeForce FX 5600 Ultra
	- GeForce FX 5600 Ultra Rev.2
	- GeForce FX 5600 

In [6]:


# Column labels shared by the tables of every generation. set.intersection()
# needs at least one set, so an empty `keys` (no tables scraped) is handled
# explicitly instead of raising a TypeError.
common_keys = set.intersection(*keys.values()) if keys else set()
for label in sorted(common_keys):
    print(f"{label!r}")

'Bandwidth (GB/s)'
'Bus width (bit)'
'Code name'
'Core config'
'Fillrate'
'Launch'
'Model'


### common attrs:
  - 'bandwidth'
  - 'bus width'
  - 'code name'
  - 'core config'
  - 'fillrate'
  - 'launch'
  - 'memory'
  - 'model'
  - 'size'

```json
{
    "gpu": {
        "memory": {
            "bandwidth": 0,
            "bus_width": 0,
            "size": 0
        },
        "code_name": "",
        "core_config": "",
        "launch": "",
        "model": ""
    }
}
```
