# Data Wrangling with Dynamic Attributes

In [8]:
from urllib.request import urlopen
import warnings
import os
import json

URL = 'http://www.oreilly.com/pub/sc/osconfeed'
JSON = 'data/osconfeed.json'

def load():
    """Return the OSCON feed as parsed JSON, downloading it on first use.

    When no file exists at ``JSON``, a warning is issued and the feed is
    fetched from ``URL`` and saved there. The local file is then parsed
    with ``json.load`` and the resulting object is returned.
    """
    if not os.path.exists(JSON):
        msg = 'downloading {} to {}'.format(URL, JSON)
        warnings.warn(msg)
        # Read the whole payload before touching the file system, so a
        # failed download cannot leave an empty/truncated cache file that
        # would block every later retry.
        with urlopen(URL) as remote:
            payload = remote.read()
        # open() does not create missing directories.
        target_dir = os.path.dirname(JSON)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        with open(JSON, 'wb') as local:
            local.write(payload)

    # Read the bytes back exactly as they were written: json.load accepts
    # a binary file, so decoding does not depend on the platform's default
    # text encoding.
    with open(JSON, 'rb') as fp:
        return json.load(fp)

In [9]:
feed = load()

In [4]:
sorted(feed['Schedule'].keys())

['conferences', 'events', 'speakers', 'venues']

In [5]:
# Print one line per collection found under 'Schedule': the number of
# records it holds followed by the collection name, ordered by name.
for key, value in sorted(feed['Schedule'].items()):
    print('{:3} {}'.format(len(value), key))

  1 conferences
494 events
357 speakers
 53 venues


In [None]:
feed['Schedule']['speakers'][-1]['name']

In [None]:
feed['Schedule']['speakers'][-1]['serial']

In [None]:
feed['Schedule']['events'][40]['name']

In [None]:
feed['Schedule']['events'][40]['speakers']

## Exploring JSON-Like Data with Dynamic Attributes

In [7]:
from collections import abc

class FrozenJSON:
    """Read-only wrapper that lets the keys of a JSON-like mapping
    be reached with attribute notation."""

    def __init__(self, mapping):
        # Shallow copy: guarantees a real dict and detaches the wrapper
        # from the mapping object the caller passed in.
        self.__data = dict(mapping)

    def __getattr__(self, name):
        # Reached only when regular attribute lookup has failed.
        data = self.__data
        if not hasattr(data, name):
            # Not a dict attribute: treat the name as a key. A missing
            # key surfaces as KeyError.
            return FrozenJSON.build(data[name])
        # Attributes of the underlying dict (keys, items, ...) take
        # precedence over stored keys of the same name.
        return getattr(data, name)

    @classmethod
    def build(cls, obj):
        """Convert ``obj`` for attribute-style navigation.

        Mappings become instances of ``cls``, mutable sequences become
        lists whose items are converted recursively, and anything else
        is returned unchanged.
        """
        if isinstance(obj, abc.Mapping):
            return cls(obj)
        if isinstance(obj, abc.MutableSequence):
            return [cls.build(element) for element in obj]
        return obj

In [8]:
# NOTE: this import rebinds `load`, shadowing the `load` function defined
# earlier in this notebook.
from osconfeed import load
raw_feed = load()
# Wrap the raw data so it can be navigated with attribute notation.
# NOTE(review): the next two cells repeat these two statements.
feed = FrozenJSON(raw_feed)

In [9]:
raw_feed = load()

In [10]:
feed = FrozenJSON(raw_feed)

In [11]:
len(feed.Schedule.speakers)

357

In [12]:
sorted(feed.Schedule.keys())

['conferences', 'events', 'speakers', 'venues']

In [13]:
# Per-collection record counts again, this time through FrozenJSON.
# `items` is an attribute of the underlying dict, so FrozenJSON returns
# the dict's own method here rather than looking up a key.
for key, value in sorted(feed.Schedule.items()):
    print('{:3} {}'.format(len(value), key))

  1 conferences
494 events
357 speakers
 53 venues


In [14]:
feed.Schedule.speakers[-1].name

'Carina C. Zona'

In [15]:
talk = feed.Schedule.events[40]

In [16]:
type(talk)

__main__.FrozenJSON

In [17]:
talk.name

'There *Will* Be Bugs'

In [18]:
talk.speakers

[3471, 5199]

In [19]:
talk.flavor

KeyError: 'flavor'

## The Invalid Attribute Name Problem

In [20]:
grad = FrozenJSON({'name': 'Jim Bo', 'class': 1982})

In [21]:
grad.class

SyntaxError: invalid syntax (<ipython-input-21-ce4327ea3f6c>, line 1)

In [22]:
getattr(grad,'class')

1982

In [24]:
from collections import abc
import keyword

class FrozenJSON:
    """Read-only wrapper that lets the keys of a JSON-like mapping
    be reached with attribute notation.

    Keys that are Python keywords are stored with a trailing underscore
    (``class`` -> ``class_``) so they can be written as attributes.
    """

    def __init__(self, mapping):
        self.__data = {
            (key + '_' if keyword.iskeyword(key) else key): value
            for key, value in mapping.items()
        }

    def __getattr__(self, name):
        # Reached only when regular attribute lookup has failed.
        data = self.__data
        if not hasattr(data, name):
            # Not a dict attribute: treat the name as a key. A missing
            # key surfaces as KeyError.
            return FrozenJSON.build(data[name])
        # Attributes of the underlying dict take precedence over keys.
        return getattr(data, name)

    @classmethod
    def build(cls, obj):
        """Convert ``obj`` for attribute-style navigation.

        Mappings become instances of ``cls``, mutable sequences become
        lists whose items are converted recursively, and anything else
        is returned unchanged.
        """
        if isinstance(obj, abc.Mapping):
            return cls(obj)
        if isinstance(obj, abc.MutableSequence):
            return [cls.build(element) for element in obj]
        return obj

In [27]:
grad = FrozenJSON({'name': 'Jim Bo', 'class': 1982})
grad.class_

1982

In [28]:
x = FrozenJSON({'2be': 'or not'})
x.2be

SyntaxError: invalid syntax (<ipython-input-28-302340948057>, line 2)

## Flexible Object Creation with `__new__`

In [1]:
from collections import abc
import keyword

class FrozenJSON:
    """A read-only facade for navigating a JSON-like object
    using attribute notation.

    Calling ``FrozenJSON(arg)`` returns a ``FrozenJSON`` instance when
    ``arg`` is a mapping, a list of converted items when ``arg`` is a
    mutable sequence, and ``arg`` itself otherwise.
    """

    def __new__(cls, arg):
        if isinstance(arg, abc.Mapping):
            # Ordinary construction; __init__ runs next with the same arg.
            return super().__new__(cls)
        elif isinstance(arg, abc.MutableSequence):
            # The result is not an instance of cls, so __init__ is skipped.
            return [cls(item) for item in arg]
        else:
            return arg

    def __init__(self, mapping):
        self.__data = {}
        for key, value in mapping.items():
            # Keywords cannot be written after a dot: store 'class' as 'class_'.
            if keyword.iskeyword(key):
                key += '_'
            self.__data[key] = value

    def __getattr__(self, name):
        """Return the dict attribute or stored value called ``name``.

        Attributes of the underlying dict (``keys``, ``items``, ...) take
        precedence over stored keys. Raises ``KeyError`` when ``name`` is
        neither, and ``AttributeError`` when the instance has not been
        initialized.
        """
        if name == '_FrozenJSON__data':
            # The storage attribute itself is missing, i.e. __init__ has not
            # run on this instance. Without this guard the `self.__data`
            # lookups below would re-enter __getattr__ until RecursionError.
            raise AttributeError(name)
        if hasattr(self.__data, name):
            return getattr(self.__data, name)
        else:
            return FrozenJSON(self.__data[name])
        

## Restructuring the OSCON Feed with shelve

In [7]:
import warnings

import osconfeed

DB_NAME = 'data/schedule1_db'
CONFERENCE = 'conference.115'

class Record:
    """Simple attribute bag: each keyword argument becomes an
    instance attribute of the same name."""

    def __init__(self, **kwargs):
        vars(self).update(kwargs)
        
def load_db(db):
    """Fill ``db`` with one ``Record`` per entry of the OSCON feed.

    Every record is stored under the key ``'<type>.<serial>'``, where
    ``<type>`` is the collection name minus its last character, and the
    record's own ``serial`` field is replaced by that same key.
    """
    raw_data = osconfeed.load()
    # Keep a space between the verb and the database name.
    warnings.warn('loading ' + DB_NAME)
    for collection, rec_list in raw_data['Schedule'].items():
        record_type = collection[:-1]  # strip the last character of the name
        for record in rec_list:
            key = '{}.{}'.format(record_type, record['serial'])
            record['serial'] = key
            db[key] = Record(**record)


In [8]:
import shelve
db = shelve.open(DB_NAME)

In [11]:
# Populate the database only when the conference record is not stored yet.
if CONFERENCE not in db:
    load_db(db)



In [12]:
speaker = db['speaker.3471']

In [13]:
type(speaker)

__main__.Record

In [14]:
speaker.name, speaker.twitter

('Anna Martelli Ravenscroft', 'annaraven')

In [15]:
db.close()

## Linked Record Retrieval with Properties

In [13]:
import warnings
import inspect

import osconfeed

DB_NAME = 'data/schedule2_db'
CONFERENCE = 'conference.115'

class Record:
    """Simple attribute bag whose instances compare equal when they
    hold the same attributes."""

    def __init__(self, **kwargs):
        # Write straight into the instance dict.
        vars(self).update(kwargs)

    def __eq__(self, other):
        if not isinstance(other, Record):
            # Let Python try the reflected comparison on `other`.
            return NotImplemented
        return vars(self) == vars(other)

class MissingDatabaseError(RuntimeError):
    """Raised when a database is required but was not set.

    ``DbRecord.fetch`` raises it when the lookup fails because no
    database has been registered with ``DbRecord.set_db``.
    """
    
class DbRecord(Record):
    """Record that can look up other records in a shared database."""

    # Database handle stored on DbRecord itself; None until set_db() is called.
    __db = None

    @staticmethod
    def set_db(db):
        """Register ``db`` as the database used by ``fetch``."""
        DbRecord.__db = db

    @staticmethod
    def get_db():
        """Return the registered database (None when none was set)."""
        return DbRecord.__db

    @classmethod
    def fetch(cls, ident):
        """Return the record stored under ``ident``.

        Raises ``MissingDatabaseError`` when the lookup fails with a
        ``TypeError`` and no database has been set; any other
        ``TypeError`` is re-raised unchanged.
        """
        database = cls.get_db()
        try:
            return database[ident]
        except TypeError:
            if database is not None:
                # A real database raised: not our problem to translate.
                raise
            template = "database not set; call '{}.set_db(mydb)'"
            raise MissingDatabaseError(template.format(cls.__name__))

    def __repr__(self):
        if not hasattr(self, 'serial'):
            return super().__repr__()
        return '<{} serial={!r}>'.format(type(self).__name__, self.serial)
        
class Event(DbRecord):
    """Database record for an event, with linked venue and speakers."""

    @property
    def venue(self):
        """The record fetched under ``'venue.<venue_serial>'``."""
        venue_key = 'venue.{}'.format(self.venue_serial)
        # Go through the class so an instance attribute named 'fetch'
        # could never shadow the classmethod.
        return type(self).fetch(venue_key)

    @property
    def speakers(self):
        """List of records fetched for the serials stored under 'speakers'.

        The list is built on first access and cached on the instance.
        """
        if hasattr(self, '_speaker_objs'):
            return self._speaker_objs
        # This property hides the 'speakers' entry of the instance dict,
        # so the raw serials must be read from __dict__ directly.
        serials = self.__dict__['speakers']
        lookup = type(self).fetch
        self._speaker_objs = [lookup('speaker.{}'.format(serial))
                              for serial in serials]
        return self._speaker_objs

    def __repr__(self):
        if not hasattr(self, 'name'):
            return super().__repr__()
        return '<{} {!r}>'.format(type(self).__name__, self.name)

def load_db(db):
    """Fill ``db`` with one record object per entry of the OSCON feed.

    The class used for a collection is the module-level class named
    after the capitalized record type when that is a ``DbRecord``
    subclass, and ``DbRecord`` otherwise. Every record is stored under
    ``'<type>.<serial>'`` and its ``serial`` field is set to that key.
    """
    raw_data = osconfeed.load()
    warnings.warn('loading ' + DB_NAME)
    for collection, rec_list in raw_data['Schedule'].items():
        record_type = collection[:-1]
        candidate = globals().get(record_type.capitalize(), DbRecord)
        usable = inspect.isclass(candidate) and issubclass(candidate, DbRecord)
        factory = candidate if usable else DbRecord
        for record in rec_list:
            key = '{}.{}'.format(record_type, record['serial'])
            record['serial'] = key
            db[key] = factory(**record)

In [15]:
import shelve
db = shelve.open(DB_NAME)
# The conference record serves as the "already loaded" marker: the feed
# is only loaded into the database when that key is absent.
if CONFERENCE not in db:
    load_db(db)

In [16]:
DbRecord.set_db(db)

In [17]:
event = DbRecord.fetch('event.33950')

In [18]:
event

<Event 'There *Will* Be Bugs'>

In [19]:
event.venue

<DbRecord serial='venue.1449'>

In [20]:
event.venue.name

'Portland 251'

In [21]:
# `speakers` is a property on Event: it fetches one record per stored
# speaker serial, so each item exposes `serial` and `name` directly.
for spkr in event.speakers:
    print('{0.serial}: {0.name}'.format(spkr))

speaker.3471: Anna Martelli Ravenscroft
speaker.5199: Alex Martelli


In [22]:
event.speakers

[<DbRecord serial='speaker.3471'>, <DbRecord serial='speaker.5199'>]

In [23]:
db.close()

# Using a Property for Attribute Validation