## Preparing Data

In [1]:
import codecs
import csv
import json
import pprint
import re

In [2]:
# CSV file read by process_file().
DATAFILE = 'arachnid.csv'

# Maps a CSV column name (key) to the field name used in the output
# records (value). Columns that are not listed here are ignored.
FIELDS = {
    'rdf-schema#label': 'label',
    'URI': 'uri',
    'rdf-schema#comment': 'description',
    'synonym': 'synonym',
    'name': 'name',
    'family_label': 'family',
    'class_label': 'class',
    'phylum_label': 'phylum',
    'order_label': 'order',
    'kingdom_label': 'kingdom',
    'genus_label': 'genus',
}

In [6]:
def _clean_value(value):
    """Strip surrounding whitespace from a string and map 'NULL' to None.

    Non-string values (lists, dicts, None) are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    stripped = value.strip()
    return None if stripped == 'NULL' else stripped


def _build_record(line, fields):
    """Turn one csv.DictReader row into a cleaned arachnid record."""
    classification_keys = ('kingdom', 'family', 'class', 'phylum', 'order', 'genus')

    arachnid = {}
    classification = {}
    for column, value in line.items():
        if column not in fields:
            continue
        new_key = fields[column]
        if new_key in classification_keys:
            # Strip first, then compare, so that 'NULL ' is also mapped to None.
            classification[new_key] = _clean_value(value)
        else:
            arachnid[new_key] = value

    # Strip redundant parenthesised text from the label, e.g. "Argiope (spider)".
    label = arachnid.get('label')
    if isinstance(label, str):
        label = re.sub(r'\(.*?\)', '', label).strip()
        arachnid['label'] = label

    # Fall back to the label when 'name' is missing, 'NULL' or contains
    # non-alphanumeric characters.
    if 'name' in arachnid and 'label' in arachnid:
        name = arachnid['name']
        if not isinstance(name, str) or name == 'NULL' or not name.isalnum():
            arachnid['name'] = label

    # Convert a real synonym value to a list: drop '{' / '}' and split on '|'.
    synonym = _clean_value(arachnid.get('synonym'))
    if isinstance(synonym, str):
        members = synonym.replace('{', '').replace('}', '').split('|')
        arachnid['synonym'] = [member.strip() for member in members]

    # Strip whitespace from the remaining string values and map 'NULL' to None.
    # Only existing keys are reassigned, so the column order is preserved.
    for key, value in list(arachnid.items()):
        arachnid[key] = _clean_value(value)

    arachnid['classification'] = classification
    return arachnid


def process_file(filename, fields):
    """Read a CSV file and return a list of cleaned arachnid records.

    Args:
        filename: path of the CSV file; its first row must be the header.
        fields: dict mapping CSV column names to output field names. Columns
            not present in this mapping are dropped. Output names 'kingdom',
            'family', 'class', 'phylum', 'order' and 'genus' are nested under
            the record's 'classification' key.

    Returns:
        A list of dicts, one per data row. In each record:
          * text in parentheses is removed from 'label';
          * 'name' falls back to 'label' when it is 'NULL' or not alphanumeric;
          * 'synonym' becomes a list of stripped strings (or None);
          * strings are whitespace-stripped and 'NULL' becomes None.
    """
    data = []
    # newline='' lets the csv module handle line endings itself, so quoted
    # fields containing newlines are parsed correctly.
    with open(filename, "r", newline='') as f:
        reader = csv.DictReader(f)
        # Skip the first three rows after the header.
        # NOTE(review): presumably metadata rows of the export - confirm.
        for _ in range(3):
            next(reader, None)

        for line in reader:
            data.append(_build_record(line, fields))

    return data

In [7]:
def parse_array(v):
    """Parse a '{a|b|c}' style string into a list of stripped items.

    Args:
        v: the raw string value.

    Returns:
        If ``v`` starts with '{' and ends with '}', the list obtained by
        removing the leading/trailing braces, splitting on '|' and stripping
        whitespace from every item. Otherwise a one-element list holding
        ``v`` unchanged (this includes the empty string).
    """
    # startswith/endswith are safe on an empty string, unlike v[0] / v[-1].
    if v.startswith("{") and v.endswith("}"):
        inner = v.lstrip("{").rstrip("}")
        return [item.strip() for item in inner.split("|")]
    return [v]

In [8]:
def test():
    """Check the first record produced by process_file(DATAFILE, FIELDS).

    Pretty-prints the first record, then asserts that it equals the
    expected cleaned record.
    """
    expected_first = {
        "label": "Argiope",
        "name": "Argiope",
        "uri": "http://dbpedia.org/resource/Argiope_(spider)",
        "synonym": None,
        "description": "The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. The etymology of the name is from a Greek name meaning silver-faced.",
        "classification": {
            "kingdom": "Animal",
            "phylum": "Arthropod",
            "class": "Arachnid",
            "order": "Spider",
            "family": "Orb-weaver spider",
            "genus": None,
        },
    }

    records = process_file(DATAFILE, FIELDS)
    first_record = records[0]

    pprint.pprint(first_record)
    assert first_record == expected_first


# Run the check when this code is executed directly (not when imported).
if __name__ == "__main__":
    test()

{'classification': {'class': 'Arachnid',
                    'family': 'Orb-weaver spider',
                    'genus': None,
                    'kingdom': 'Animal',
                    'order': 'Spider',
                    'phylum': 'Arthropod'},
 'description': 'The genus Argiope includes rather large and spectacular '
                'spiders that often have a strikingly coloured abdomen. These '
                'spiders are distributed throughout the world. Most countries '
                'in tropical or temperate climates host one or more species '
                'that are similar in appearance. The etymology of the name is '
                'from a Greek name meaning silver-faced.',
 'label': 'Argiope',
 'name': 'Argiope',
 'synonym': None,
 'uri': 'http://dbpedia.org/resource/Argiope_(spider)'}


## Inserting into DB

In [9]:
import json

In [10]:
def insert_data(data, db):
    """Insert every document of ``data`` into the 'arachnid' collection.

    Args:
        data: iterable of dict documents to store.
        db: database handle exposing an ``arachnid`` collection
            (here a PyMongo database).

    Returns:
        None.
    """
    documents = list(data)
    # Collection.insert() is deprecated in PyMongo 3 and removed in PyMongo 4;
    # insert_many() is its replacement. It rejects an empty list, so skip the
    # call when there is nothing to insert.
    if documents:
        db.arachnid.insert_many(documents)

In [11]:
# Load the prepared records from 'arachnid.json', store them in the local
# MongoDB 'examples' database and print one stored document as a check.
if __name__ == "__main__":

    from pymongo import MongoClient

    client = MongoClient("mongodb://localhost:27017")
    db = client.examples

    # Parse the whole JSON file first; the file is closed before inserting.
    with open('arachnid.json') as json_file:
        data = json.load(json_file)

    insert_data(data, db)
    print(db.arachnid.find_one())

{'_id': ObjectId('5b0ef95ba24d68392c838f6e'), 'synonym': None, 'name': 'Argiope', 'classification': {'kingdom': 'Animal', 'family': 'Orb-weaver spider', 'order': 'Spider', 'phylum': 'Arthropod', 'genus': None, 'class': 'Arachnid'}, 'uri': 'http://dbpedia.org/resource/Argiope_(spider)', 'label': 'Argiope', 'description': 'The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. The etymology of the name is from a Greek name meaning silver-faced.'}


  """
