# Serialization

### CSV

In [1]:
# Read the raw file as a list of text lines (trailing newlines are kept).
with open('CSV-JSON-XML-CocaCola.csv') as fp:
    data = list(fp)

data  # the raw lines look regular enough that a hand-written parser is conceivable

['Year,NOI,Assets,Cash,Share Price\n',
 '2017,"35,410,000","87,896,000","6,006,000",45.88\n',
 '2016,"41,863,000","87,270,000",8555000,41.46\n',
 '2015,"44,294,000",,7309000,42.96\n',
 '2014,"45,998,000",,,42.41\n',
 '2013,"46,854,000",,,40.66']

In [2]:
import csv

# The csv docs recommend newline='' for files handed to the csv module, so
# that line breaks inside quoted fields are passed through untouched.
with open('CSV-JSON-XML-CocaCola.csv', newline='') as fp:
    reader = csv.reader(fp)
    data = list(reader)

data  # much better! each row is a list with one element per column

[['Year', 'NOI', 'Assets', 'Cash', 'Share Price'],
 ['2017', '35,410,000', '87,896,000', '6,006,000', '45.88'],
 ['2016', '41,863,000', '87,270,000', '8555000', '41.46'],
 ['2015', '44,294,000', '', '7309000', '42.96'],
 ['2014', '45,998,000', '', '', '42.41'],
 ['2013', '46,854,000', '', '', '40.66']]

In [3]:
# DictReader uses the first row as field names and yields one mapping per record.
# newline='' is the csv-module recommended way to open the file.
with open('CSV-JSON-XML-CocaCola.csv', newline='') as fp:
    reader = csv.DictReader(fp)
    data = list(reader)

data  # The best for data with headers.

[OrderedDict([('Year', '2017'),
              ('NOI', '35,410,000'),
              ('Assets', '87,896,000'),
              ('Cash', '6,006,000'),
              ('Share Price', '45.88')]),
 OrderedDict([('Year', '2016'),
              ('NOI', '41,863,000'),
              ('Assets', '87,270,000'),
              ('Cash', '8555000'),
              ('Share Price', '41.46')]),
 OrderedDict([('Year', '2015'),
              ('NOI', '44,294,000'),
              ('Assets', ''),
              ('Cash', '7309000'),
              ('Share Price', '42.96')]),
 OrderedDict([('Year', '2014'),
              ('NOI', '45,998,000'),
              ('Assets', ''),
              ('Cash', ''),
              ('Share Price', '42.41')]),
 OrderedDict([('Year', '2013'),
              ('NOI', '46,854,000'),
              ('Assets', ''),
              ('Cash', ''),
              ('Share Price', '40.66')])]

In [4]:
import collections

In [5]:
# Build the mapping in one call; insertion order follows the order of the pairs.
d = collections.OrderedDict([(1, 'a'), (2, 'b'), (3, 'c')])
d

OrderedDict([(1, 'a'), (2, 'b'), (3, 'c')])

In [6]:
# Rows produced by DictReader are addressed by column name rather than position.
first_row = data[0]
print(first_row['Year'], first_row['Assets'], sep='\n')

2017
87,896,000


In [7]:
import collections

# Data is still in strings. Let's convert to correct types.

def _parse_number(text, cast):
    """Convert one CSV cell to a number with ``cast``; blank cells become ``None``."""
    if text is None:
        return None
    # Drop surrounding whitespace and thousands separators before converting.
    cleaned = text.strip().replace(',', '')
    return cast(cleaned) if cleaned else None


def parse_row(row):
    """Convert one CSV record of strings into typed values.

    Args:
        row: Mapping with the keys 'Year', 'NOI', 'Assets', 'Cash' and
            'Share Price', whose values are strings (or ``None``).

    Returns:
        An ``OrderedDict`` with the same keys in the same order. 'Year' is an
        ``int``; 'NOI', 'Assets' and 'Cash' are ``int`` or ``None``;
        'Share Price' is a ``float`` or ``None``. Empty or whitespace-only
        cells map to ``None``.

    Raises:
        ValueError: If 'Year' or any non-blank cell is not a valid number.
        KeyError: If one of the expected columns is missing from ``row``.
    """
    new_row = collections.OrderedDict()
    # The year is mandatory, so a blank value is deliberately an error.
    new_row['Year'] = int(row['Year'])
    for column in ('NOI', 'Assets', 'Cash'):
        new_row[column] = _parse_number(row[column], int)
    new_row['Share Price'] = _parse_number(row['Share Price'], float)
    return new_row

# Show every record after type conversion.
# Note: `row` stays bound to the last record once the loop ends, because loop
# variables live in the notebook's global namespace.
for row in data:
    print(parse_row(row))

OrderedDict([('Year', 2017), ('NOI', 35410000), ('Assets', 87896000), ('Cash', 6006000), ('Share Price', 45.88)])
OrderedDict([('Year', 2016), ('NOI', 41863000), ('Assets', 87270000), ('Cash', 8555000), ('Share Price', 41.46)])
OrderedDict([('Year', 2015), ('NOI', 44294000), ('Assets', None), ('Cash', 7309000), ('Share Price', 42.96)])
OrderedDict([('Year', 2014), ('NOI', 45998000), ('Assets', None), ('Cash', None), ('Share Price', 42.41)])
OrderedDict([('Year', 2013), ('NOI', 46854000), ('Assets', None), ('Cash', None), ('Share Price', 40.66)])


In [8]:
# Export the type-converted records to a new CSV file.
# Each record is parsed once up front, so this cell does not depend on a loop
# variable left over from an earlier cell.
parsed_rows = [parse_row(record) for record in data]

# newline='' lets the csv module control line endings itself; without it,
# platforms that translate '\n' on write end up with blank lines between rows.
with open('CSV-JSON-XML-CocaCola-out.csv', 'w', newline='') as fp:
    writer = csv.writer(fp)
    if parsed_rows:
        writer.writerow(parsed_rows[0].keys())  # header row
    writer.writerows(record.values() for record in parsed_rows)  # data rows; None is written as an empty cell

### JSON

In [9]:
import json

# JSON text is UTF-8, so state the encoding explicitly instead of relying on
# the platform's default text encoding.
with open('CSV-JSON-XML-superheros.json', encoding='utf-8') as fp:
    data = json.load(fp)

data

{'squadName': 'Super hero squad',
 'homeTown': 'Metro City',
 'formed': 2016,
 'secretBase': 'Super tower',
 'active': True,
 'members': [{'name': 'Madame Uppercut',
   'age': 39,
   'secretIdentity': 'Jane Wilson',
   'powers': ['Million tonne punch',
    'Damage resistance',
    'Superhuman reflexes']},
  {'name': 'Molecule Man',
   'age': 29,
   'secretIdentity': 'Dan Jukes',
   'powers': ['Radiation resistance', 'Turning tiny', 'Radiation blast']},
  {'name': 'Eternal Flame',
   'age': 1000000,
   'secretIdentity': 'Unknown',
   'powers': ['Immortality',
    'Heat Immunity',
    'Inferno',
    'Teleportation',
    'Interdimensional travel']}]}

In [10]:
# Check which Python type json.load produced for the top-level JSON value.
type(data)  # JSON objects are loaded as `dict`s

dict

In [11]:
# Remove the key only if it is still there, so re-running this cell does not
# raise KeyError.
if 'secretBase' in data:
    del data['secretBase']

In [12]:
# Display the dict again: the 'secretBase' key is gone.
data

{'squadName': 'Super hero squad',
 'homeTown': 'Metro City',
 'formed': 2016,
 'active': True,
 'members': [{'name': 'Madame Uppercut',
   'age': 39,
   'secretIdentity': 'Jane Wilson',
   'powers': ['Million tonne punch',
    'Damage resistance',
    'Superhuman reflexes']},
  {'name': 'Molecule Man',
   'age': 29,
   'secretIdentity': 'Dan Jukes',
   'powers': ['Radiation resistance', 'Turning tiny', 'Radiation blast']},
  {'name': 'Eternal Flame',
   'age': 1000000,
   'secretIdentity': 'Unknown',
   'powers': ['Immortality',
    'Heat Immunity',
    'Inferno',
    'Teleportation',
    'Interdimensional travel']}]}

In [13]:
# Let's add another superhero
new_member = {
    'age': 64,
    'name': 'Oprah Winfrey',
    'powers': ['Media Proprietor', 'Producer', 'Philanthropist'],
    'secretIdentity': 'Oprah Winfrey'
}

# Guard the append so that re-running this cell does not add a duplicate entry.
if new_member not in data['members']:
    data['members'].append(new_member)

In [14]:
# Display the result: the new member is the last entry in 'members'.
data

{'squadName': 'Super hero squad',
 'homeTown': 'Metro City',
 'formed': 2016,
 'active': True,
 'members': [{'name': 'Madame Uppercut',
   'age': 39,
   'secretIdentity': 'Jane Wilson',
   'powers': ['Million tonne punch',
    'Damage resistance',
    'Superhuman reflexes']},
  {'name': 'Molecule Man',
   'age': 29,
   'secretIdentity': 'Dan Jukes',
   'powers': ['Radiation resistance', 'Turning tiny', 'Radiation blast']},
  {'name': 'Eternal Flame',
   'age': 1000000,
   'secretIdentity': 'Unknown',
   'powers': ['Immortality',
    'Heat Immunity',
    'Inferno',
    'Teleportation',
    'Interdimensional travel']},
  {'age': 64,
   'name': 'Oprah Winfrey',
   'powers': ['Media Proprietor', 'Producer', 'Philanthropist'],
   'secretIdentity': 'Oprah Winfrey'}]}

In [15]:
# Write the modified structure back out as human-readable JSON.
with open('CSV-JSON-XML-superheros-out.json', 'w') as fp:
    json.dump(
        obj=data,
        fp=fp,
        # `indent` is a number of spaces (or a string), not an on/off flag;
        # `True` only worked because bool is an int and meant a 1-space indent.
        indent=4,  # pretty printing
        sort_keys=True,  # sorting for easier lookup by a human
    )

In [16]:
# JSON keeps basic types (numbers, booleans, nested lists/objects), so it is much easier when type parsing is important.
# CSV is a better fit for flat, tabular (relational) data.

In [17]:
# LEFT HERE FOR SEC1

### XML

In [18]:
from lxml import etree

In [19]:
# Read the file as bytes so the XML parser works out the encoding itself.
# Decoding in text mode first uses the platform's default encoding, and lxml
# refuses `str` input that carries an XML encoding declaration.
with open('CSV-JSON-XML-note.xml', 'rb') as fp:
    data = fp.read()

root = etree.fromstring(data)

In [20]:
# tostring() returns bytes, so decode to str to print readable, multi-line XML.
print(etree.tostring(root).decode())

<note date="12/11/99">
    <to>Craig</to>
    <from>Megan</from>
    <heading>Reminder</heading>
    <body>
        <p>Pick up my new spark plugs.</p>
        <p>Love you, M</p>
    </body>
</note>


In [21]:
# Iterating over an element yields its direct children only.
for node in root:
    print(node)

# Notice "note" and "p" tags are not present: "note" is the root itself,
# and the "p" elements are nested one level deeper, inside "body".

<Element to at 0x7fa4fe7adb48>
<Element from at 0x7fa4fe524088>
<Element heading at 0x7fa4fe5240c8>
<Element body at 0x7fa4fe7adb48>


In [22]:
# './/p' searches below root at any depth, which is how nested tags are reached.
tags = root.findall('.//p')  # must use XPath syntax to look up nested tags in tree, e.g. "p"
tags

[<Element p at 0x7fa4fe7add88>, <Element p at 0x7fa4fef09208>]

In [23]:
# .text gives the text content of the first matched "p" element.
tags[0].text

'Pick up my new spark plugs.'

In [24]:
# root.iter() walks the root element and all of its descendants.
# Could iterate over the whole tree, but that is not
# very efficient if we just want the "p" tags.
for element in root.iter():
    print(element)

<Element note at 0x7fa4fef06d48>
<Element to at 0x7fa4fe5241c8>
<Element from at 0x7fa4fe7ade08>
<Element heading at 0x7fa4fef06c88>
<Element body at 0x7fa4fe7adb48>
<Element p at 0x7fa4fe7add88>
<Element p at 0x7fa4fef09208>


In [25]:
# Where did the "note" tag go? It became the root object.
root

<Element note at 0x7fa4fef06d48>

In [26]:
root.attrib  # attributes of the root element, as a name -> value mapping

{'date': '12/11/99'}

In [27]:
etree.tostring(root)  # will return bytes, not str! call .decode() to get text

b'<note date="12/11/99">\n    <to>Craig</to>\n    <from>Megan</from>\n    <heading>Reminder</heading>\n    <body>\n        <p>Pick up my new spark plugs.</p>\n        <p>Love you, M</p>\n    </body>\n</note>'

In [28]:
# tostring() returns bytes, so the output file has to be opened in binary mode.
with open('CSV-JSON-XML-note-out.xml', 'wb') as fp:  # open the file for writing bytes
    fp.write(etree.tostring(root))

In [None]:
# When to use XML? Never. Try not to use XML unless the project specifically requires it.