In [51]:
import codecs
import csv
import json
import pprint
import re

# CSV export that the functions in this file read.
DATAFILE = "arachnid.csv"

# CSV column name -> key used for that value in the output record.
FIELDS = {
    "rdf-schema#label": "label",
    "URI": "uri",
    "rdf-schema#comment": "description",
    "synonym": "synonym",
    "name": "name",
    "family_label": "family",
    "class_label": "class",
    "phylum_label": "phylum",
    "order_label": "order",
    "kingdom_label": "kingdom",
    "genus_label": "genus",
}

# Output keys that are grouped under the nested 'classification' dict
# instead of being stored at the top level of a record.
CLASSIFY = {
    "kingdom",
    "family",
    "order",
    "phylum",
    "genus",
    "class",
}

In [52]:
# Compiled once at import time. Raw string so that "\(" is a regex escape and
# not an (invalid) Python string escape. Non-greedy, so each "(...)" group is
# matched separately.
_PARENTHESISED = re.compile(r'\(.*?\)')


def parse_label(v):
    """Clean a raw label value.

    Every parenthesised group (for example a trailing "(spider)") is removed
    and the result is stripped of leading/trailing whitespace.

    Args:
        v: raw string value of the label column.

    Returns:
        The cleaned string, or None when v is the literal "NULL".
    """
    if v == "NULL":
        return None
    return _PARENTHESISED.sub('', v).strip()


def parse_normal(v):
    """Return v without surrounding whitespace, or None for the "NULL" marker."""
    return None if v == "NULL" else v.strip()

In [58]:
def parse_name(v):
    """Tell whether a raw 'name' value should be discarded.

    Args:
        v: raw string value of the name column.

    Returns:
        True when v is the literal "NULL" or contains at least one character
        that is not an ASCII letter; False otherwise.
    """
    if v == "NULL":
        return True
    # The earlier pattern '^a-zA-Z' had no brackets, so it only matched values
    # that literally start with the text "a-zA-Z". A negated character class
    # with search() detects an offending character anywhere in the value.
    # NOTE(review): digits count as invalid here, mirroring the letters-only
    # set in the original pattern -- confirm that is the intended rule.
    return re.search(r'[^a-zA-Z]', v) is not None

def parse_array(v):
    """Turn a raw multi-value field into a list of strings.

    A value wrapped in braces, e.g. "{One| Two}", is split on "|" and each
    piece is stripped of whitespace. Any other value is returned as a
    one-element list.

    Args:
        v: raw string value of the column.

    Returns:
        None when v is the literal "NULL", otherwise a list of strings.
    """
    if v == "NULL":
        return None
    # startswith/endswith are safe on an empty string, where indexing v[0]
    # would raise IndexError.
    if v.startswith("{") and v.endswith("}"):
        inner = v.lstrip("{").rstrip("}")
        return [piece.strip() for piece in inner.split("|")]
    return [v]

In [61]:
def parse_line(line, process_fields):
    """Build one output record from a CSV row.

    Args:
        line: mapping of CSV column name -> raw string value for one row.
        process_fields: container of the column names that should be used;
            all other columns in the row are ignored.

    Returns:
        A dict with the cleaned top-level values, plus a nested
        'classification' dict holding every value whose output key is listed
        in CLASSIFY. 'name' falls back to the cleaned label when the raw name
        is missing or rejected by parse_name.
    """
    fields = {}
    classification = {}

    for column, raw in line.items():
        if column not in process_fields:
            continue
        if column == 'rdf-schema#label':
            fields['label'] = parse_label(raw)
        elif column == 'name':
            # Kept raw for now; it is validated after the loop, once the
            # label is available as a fallback.
            fields['name'] = raw
        elif FIELDS[column] in CLASSIFY:
            classification[FIELDS[column]] = parse_normal(raw)
        elif column == 'synonym':
            fields[column] = parse_array(raw)
        else:
            fields[FIELDS[column]] = parse_normal(raw)

    # .get() keeps a row without a 'name' or label column from raising
    # KeyError; such a row simply ends up with the label (or None) as name.
    raw_name = fields.get('name')
    if raw_name is None or parse_name(raw_name):
        fields['name'] = fields.get('label')
    else:
        fields['name'] = parse_normal(raw_name)
    fields['classification'] = classification

    return fields

In [55]:
def process_file(filename, fields):
    """Read a CSV file and convert its rows into output records.

    Args:
        filename: path of the CSV file to read.
        fields: dict whose keys are the CSV column names to process.

    Returns:
        A list with one parse_line() result per data row.
    """
    process_fields = fields.keys()
    data = []

    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        # Discard the first three rows after the header.
        # NOTE(review): presumably these are metadata rows of the export --
        # confirm against the data file.
        # next(reader, None) works on Python 2 and 3 (reader.next() is
        # Python 2 only) and does not raise on a file with fewer rows.
        for _ in range(3):
            next(reader, None)

        for line in reader:
            data.append(parse_line(line, process_fields))

    return data

In [17]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
In this problem set you work with another type of infobox data, audit it,
clean it, come up with a data model, insert it into MongoDB and then run some
queries against your database. The set contains data about Arachnid class.

For this exercise, the arachnid data is already in the database. You have been
given the task of including 'binomialAuthority' information in the records.
You will do this by processing the arachnid.csv to extract binomial authority
data and then using this data to update the corresponding database records.

The following things should be done in the function add_field:
- process the csv file and extract 2 fields - 'rdf-schema#label' and
  'binomialAuthority_label'
- clean up the 'rdf-schema#label' the same way as in the first exercise,
  removing redundant "(spider)" suffixes
- return a dictionary with the cleaned 'rdf-schema#label' field values as keys, 
  and 'binomialAuthority_label' field values as values
- if 'binomialAuthority_label' is "NULL" for a row in the csv, skip the item

The following should be done in the function update_db:
- query the 'label' field in the database using rdf-schema#label keys from the
  data dictionary
- update the documents by adding a new item under 'classification' with the key
  'binomialAuthority' and the binomialAuthority_label value from the data
  dictionary as the value

For item {'Argiope': 'Jill Ward'} in the data dictionary, the resulting document structure 
should look like this:

{ 'label': 'Argiope',
  'uri': 'http://dbpedia.org/resource/Argiope_(spider)',
  'description': 'The genus Argiope includes rather large and spectacular spiders that often ...',
  'name': 'Argiope',
  'synonym': ["One", "Two"],
  'classification': {
                    'binomialAuthority': 'Jill Ward',
                    'family': 'Orb-weaver spider',
                    'class': 'Arachnid',
                    'phylum': 'Arthropod',
                    'order': 'Spider',
                    'kingdom': 'Animal',
                    'genus': None
                    }
}

Note that the value in the 'binomialAuthority' field is a placeholder; this is only to 
demonstrate the output structure form, for the entries that require updating.
"""
import codecs
import csv
import json
import re
import pprint

# CSV export that add_field reads.
DATAFILE = "arachnid.csv"

# CSV column name -> key used for that value in the extracted data.
FIELDS = {
    "rdf-schema#label": "label",
    "binomialAuthority_label": "binomialAuthority",
}


# Compiled once at import time. Raw string so that "\(" is a regex escape and
# not an (invalid) Python string escape. Non-greedy, so each "(...)" group is
# matched separately.
_PARENTHESISED = re.compile(r'\(.*?\)')


def parse_label(v):
    """Clean a raw label value.

    Every parenthesised group (for example a trailing "(spider)") is removed
    and the result is stripped of leading/trailing whitespace.

    Args:
        v: raw string value of the label column.

    Returns:
        The cleaned string, or None when v is the literal "NULL".
    """
    if v == "NULL":
        return None
    return _PARENTHESISED.sub('', v).strip()

def add_field(filename, fields):
    """
    Extract binomial authority information from the CSV file.

    Args:
        filename: path of the CSV file to read.
        fields: accepted for interface compatibility; the two columns that
            are read ('rdf-schema#label' and 'binomialAuthority_label') are
            fixed in the body.

    Returns:
        A list of dicts of the form
        {'label': <cleaned label>, 'binomialAuthority': <raw authority>},
        one per row whose binomial authority is present and not "NULL".
    """
    data = []
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        # Discard the first three rows after the header.
        # NOTE(review): presumably these are metadata rows of the export --
        # confirm against the data file.
        # next(reader, None) works on Python 2 and 3 and does not raise on a
        # file with fewer rows.
        for _ in range(3):
            next(reader, None)

        for line in reader:
            # Values are looked up per row, so a row can never pick up a
            # label or authority left over from the previous row.
            authority = line.get('binomialAuthority_label')
            if authority is None or authority == "NULL":
                continue
            raw_label = line.get('rdf-schema#label')
            label = parse_label(raw_label) if raw_label is not None else None
            data.append({"label": label, "binomialAuthority": authority})

    return data


def update_db(data, db):
    """
    Store the binomial authority of each item in the 'arachnid' collection.

    For every item, one document whose 'label' equals item['label'] gets
    item['binomialAuthority'] written to 'classification.binomialAuthority'.
    Items without a matching document are skipped instead of raising.

    Args:
        data: iterable of dicts with the keys 'label' and 'binomialAuthority'.
        db: database handle exposing the 'arachnid' collection.
    """
    for item in data:
        # A targeted $set replaces the find_one()/save() round trip: save()
        # is deprecated and was removed in PyMongo 4, and find_one() returns
        # None for an unknown label, which made the old code raise TypeError.
        db.arachnid.update_one(
            {'label': item['label']},
            {'$set': {
                'classification.binomialAuthority': item['binomialAuthority']
            }}
        )

def test():
    """Run add_field and update_db against a local MongoDB and check one record.

    Connects to mongodb://localhost:27017, uses the 'examples' database, and
    asserts that the document labelled 'Opisthoncana' ends up with
    'Embrik Strand' as its binomial authority. Finally pretty-prints the
    extracted data.
    """
    # Please change only the add_field and update_db functions!
    # Changes done to this function will not be taken into account
    # when doing a Test Run or Submit, they are just for your own reference
    # and as an example for running this code locally!
    
    data = add_field(DATAFILE, FIELDS)
    
    # Imported here so that pymongo is only required when test() is called.
    from pymongo import MongoClient
    client = MongoClient("mongodb://localhost:27017")
    db = client.examples

    update_db(data, db)

    # Spot-check a single known record after the update.
    updated = db.arachnid.find_one({'label': 'Opisthoncana'})
    assert updated['classification']['binomialAuthority'] == 'Embrik Strand'
    pprint.pprint(data)
    
#     for item in data:
#         doc = db.arachnid.find_one({'label': item['label']})
#         doc['classification']['binomialAuthority'] = item['binomialAuthority']
#         print doc
#         print item


# Run the local check only when this file is executed as a script.
if __name__ == "__main__":
    test()

{'binomialAuthority': 'Embrik Strand', 'label': 'Opisthoncana'}
{'binomialAuthority': 'Arthur M. Chickering', 'label': 'Orvilleus'}
{'binomialAuthority': 'Charles Athanase Walckenaer', 'label': 'Six-spotted fishing spider'}
{'binomialAuthority': '{1951 in science|Raymond Robert Forster}', 'label': 'Zealanapis australis'}
