In [51]:
import codecs
import csv
import json
import pprint
import re

# Path of the CSV file that process_file is run against.
DATAFILE = 'arachnid.csv'

# Source CSV column name -> key used for that value in a parsed record.
FIELDS = {
    'rdf-schema#label': 'label',
    'URI': 'uri',
    'rdf-schema#comment': 'description',
    'synonym': 'synonym',
    'name': 'name',
    'family_label': 'family',
    'class_label': 'class',
    'phylum_label': 'phylum',
    'order_label': 'order',
    'kingdom_label': 'kingdom',
    'genus_label': 'genus',
}

# Record keys that parse_line nests under the 'classification' sub-dict
# instead of storing at the top level of the record.
CLASSIFY = {
    'kingdom',
    'family',
    'order',
    'phylum',
    'genus',
    'class',
}

In [52]:
def parse_label(v):
    """Clean a raw label value by dropping parenthesised text.

    Args:
        v: Raw label string. The literal "NULL" marks a missing value.

    Returns:
        None when v is exactly "NULL"; otherwise v with every "(...)" group
        removed and leading/trailing whitespace stripped.
    """
    if v == "NULL":
        return None
    # Raw string: in a plain literal '\(' is an invalid escape sequence that
    # newer Python versions warn about. The non-greedy '.*?' removes each
    # parenthesised group separately instead of everything between the first
    # '(' and the last ')'.
    return re.sub(r'\(.*?\)', '', v).strip()


def parse_normal(v):
    """Normalise a plain text value.

    Args:
        v: Raw string. The literal "NULL" marks a missing value.

    Returns:
        None when v is exactly "NULL" (checked before stripping); otherwise
        v with leading/trailing whitespace removed.
    """
    return None if v == "NULL" else v.strip()

In [58]:
def parse_name(v):
    """Report whether a raw 'name' value is unusable.

    Despite its name this function does not transform the value; parse_line
    uses the boolean result to decide whether the label should be used in
    place of the name.

    Args:
        v: Raw name string, or None.

    Returns:
        True when v is None, empty, exactly "NULL", or does not begin with
        an ASCII letter; False otherwise.
    """
    if v is None or v == "NULL":
        return True
    # The earlier pattern '^a-zA-Z' had no character-class brackets, so it
    # only matched values starting with the literal text "a-zA-Z".
    # NOTE(review): like the original match() call, this inspects only the
    # first character. Confirm whether names containing non-letters anywhere
    # (spaces, digits, punctuation) should be rejected as well.
    return re.match(r'[a-zA-Z]', v) is None

def parse_array(v):
    """Turn a raw multi-value cell into a list of strings.

    Args:
        v: Raw string, or None. A value wrapped in braces, such as
            "{a|b|c}", is treated as a "|"-separated list.

    Returns:
        None when v is "NULL", empty, or None. For a brace-wrapped value, a
        list of the "|"-separated items with surrounding whitespace removed
        from each. For any other value, a one-element list holding v
        unchanged.
    """
    # An empty cell used to raise IndexError on v[0]; treat it as missing.
    if not v or v == "NULL":
        return None
    if v.startswith("{") and v.endswith("}"):
        # lstrip/rstrip drop every leading "{" and trailing "}", not just one.
        inner = v.lstrip("{").rstrip("}")
        return [item.strip() for item in inner.split("|")]
    return [v]

In [61]:
def parse_line(line, process_fields):
    """Build one cleaned record from a CSV row.

    Args:
        line: Mapping of CSV column name -> raw string value for one row.
        process_fields: Container of column names to keep; every other
            column in line is ignored.

    Returns:
        A dict of cleaned values keyed by the names in the module-level
        FIELDS mapping, plus a 'classification' dict holding the values
        whose mapped key is in CLASSIFY. 'name' is replaced by the cleaned
        label when parse_name flags the raw name as unusable.

    Raises:
        KeyError: if no 'name' value was collected, or if the label is
            needed as a fallback but was not collected.
    """
    record = {}
    taxonomy = {}

    for column in line:
        if column not in process_fields:
            continue
        raw = line[column]

        if column == 'rdf-schema#label':
            record['label'] = parse_label(raw)
            continue
        if column == 'name':
            # Stored raw for now; it is validated and cleaned after the loop.
            record['name'] = raw
            continue

        target = FIELDS[column]
        if target in CLASSIFY:
            taxonomy[target] = parse_normal(raw)
        elif column == 'synonym':
            record[column] = parse_array(raw)
        else:
            record[target] = parse_normal(raw)

    raw_name = record["name"]
    record["name"] = record["label"] if parse_name(raw_name) else parse_normal(raw_name)
    record['classification'] = taxonomy

    return record

In [55]:
def process_file(filename, fields, skip_rows=3):
    """Read a CSV file and parse each data row into a record.

    Args:
        filename: Path of the CSV file to read.
        fields: Mapping whose keys are the CSV column names to process.
        skip_rows: Number of rows after the header row to discard before
            parsing starts. Defaults to 3, the value previously hard-coded
            here. NOTE(review): presumably the file carries three metadata
            rows below the header; confirm against the data.

    Returns:
        A list with one parse_line result per remaining row, in file order.
        The list is empty when the file has no rows beyond the skipped ones.
    """
    process_fields = fields.keys()
    data = []

    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        for _ in range(skip_rows):
            # The builtin next() works on Python 2.6+ and 3, whereas
            # reader.next() exists only on Python 2. The None default keeps
            # a file with fewer rows than skip_rows from raising
            # StopIteration.
            next(reader, None)

        for line in reader:
            data.append(parse_line(line, process_fields))

    return data

In [62]:
# Parse DATAFILE into a list of record dicts, keeping the columns named in FIELDS.
data = process_file(DATAFILE, FIELDS)

In [63]:
# Display the first parsed record as a spot check of the cleaning rules.
data[0]

{'classification': {'class': 'Arachnid',
  'family': 'Orb-weaver spider',
  'genus': None,
  'kingdom': 'Animal',
  'order': 'Spider',
  'phylum': 'Arthropod'},
 'description': 'The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. The etymology of the name is from a Greek name meaning silver-faced.',
 'label': 'Argiope',
 'name': 'Argiope',
 'synonym': None,
 'uri': 'http://dbpedia.org/resource/Argiope_(spider)'}