Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Cleaned-up entities sheet, very extensive normalization.
  • Loading branch information
pudo committed Apr 8, 2012
1 parent 4e8e752 commit 3be3225
Show file tree
Hide file tree
Showing 7 changed files with 24,428 additions and 5,811 deletions.
4 changes: 3 additions & 1 deletion etl/dump_csv.py
Expand Up @@ -7,15 +7,17 @@
"legalStatus", "contactCountry" ORDER BY "legalStatus" ASC;"""
# Query returning every column of every row in the "entity" table.
entities = """SELECT * FROM entity;"""

# Query returning every column of every row in the "network_entity" table.
network_entities = """SELECT * FROM network_entity;"""

def dump_query(engine, q, file_name):
    """Execute query ``q`` on ``engine`` and dump the result to a CSV file.

    :param engine: object exposing ``execute(q)``; passed in by the caller
        (the script entry point obtains it from ``sl.connect``).
    :param q: the query to execute.
    :param file_name: path of the output file; it is opened in ``'wb'`` mode,
        so any existing file at that path is truncated.
    """
    # The context manager guarantees the handle is closed even when the
    # query or the CSV dump raises; the previous code never closed it.
    with open(file_name, 'wb') as fh:
        rp = engine.execute(q)
        sl.dump_csv(sl.resultiter(rp), fh)


if __name__ == '__main__':
    import sys
    # Validate the command line explicitly: an ``assert`` is stripped when
    # Python runs with -O, and the old message never filled in its ``%s``
    # placeholder with the script name.
    if len(sys.argv) != 2:
        sys.exit("Usage: %s [engine-url]" % sys.argv[0])
    engine = sl.connect(sys.argv[1])
    #dump_query(engine, company_types, "legalStatus.csv")
    dump_query(engine, entities, "entity.csv")

19,950 changes: 14,431 additions & 5,519 deletions etl/entities.csv

Large diffs are not rendered by default.

14 changes: 13 additions & 1 deletion etl/entities.py
Expand Up @@ -34,6 +34,7 @@ def update_entities(engine, file_name):
continue
if not row.get('canonicalName'):
row['canonicalName'] = row['etlFingerPrint']
row['canonicalName'] = cleanCanonical(row['canonicalName'])
entity = data.get(fp)
if entity and entity.get('canonicalName') and \
fp != entity.get('canonicalName'):
Expand All @@ -53,11 +54,22 @@ def update_entities(engine, file_name):
fh.close()


def cleanCanonical(name):
    """Return ``name`` with its whitespace normalized.

    Leading and trailing whitespace is removed and every internal run of
    whitespace (spaces, tabs, newlines, carriage returns, ...) is collapsed
    into a single space, regardless of how long the run is.

    :param name: the string to clean.
    :returns: the cleaned string; an empty or all-whitespace input yields ``''``.
    """
    # split() with no argument splits on any whitespace run and drops empty
    # pieces, so re-joining with one space both strips and collapses in a
    # single pass -- a fixed number of replace() calls cannot collapse
    # arbitrarily long runs.
    return ' '.join(name.split())


def create_entities(engine):
log.info("De-normalizing global entities collection...")
table = sl.get_table(engine, 'entity')
for tbl in ['representative', 'person', 'financialDataTurnover',
'organisation']:
'organisation', 'network_entity']:
for row in sl.all(engine, sl.get_table(engine, tbl)):
entity = {'etlFingerPrint': row.get('etlFingerPrint')}
entity['legalStatus'] = row.get('legalStatus', '')
Expand Down
15 changes: 15 additions & 0 deletions etl/load.py
Expand Up @@ -108,6 +108,20 @@ def load_organisations(grano, engine, rep):
return rep


def load_networking(grano, engine, rep):
    """Add or refresh an ASSOCIATED relation on ``rep`` for each of its
    ``network_entity`` rows.

    Rows are selected from the ``network_entity`` table by matching
    ``identificationCode`` against ``rep['identificationCode']``. For each
    row, the row's ``etlFingerPrint`` is resolved through ``canonical_actor``
    and the result becomes the ``target`` of a relation whose ``source`` is
    ``rep.get('id')``.

    :param grano: passed through to ``canonical_actor``.
    :param engine: passed to the ``sl`` table helpers and ``canonical_actor``.
    :param rep: mapping with at least ``identificationCode`` and ``outgoing``
        keys; its ``outgoing`` entry is reassigned on every iteration.
    :returns: the same ``rep`` object.
    """
    network_table = sl.get_table(engine, 'network_entity')
    matches = sl.find(engine, network_table,
                      identificationCode=rep['identificationCode'])
    for network_row in matches:
        actor = canonical_actor(grano, engine, network_row['etlFingerPrint'])

        # Look up a relation of the ASSOCIATED type pointing at this actor
        # (presumably find_relation supplies a fresh one when none exists
        # -- confirm against its definition).
        relation = find_relation(rep['outgoing'], 'target', actor,
                                 {'type': ASSOCIATED['name']})
        relation['type'] = ASSOCIATED['name']
        relation['source'] = rep.get('id')
        relation['target'] = actor
        rep['outgoing'] = replace_relation(rep['outgoing'], 'target',
                                           relation)
    return rep


def load_clients(grano, engine, rep):
for fdto in sl.find(engine, sl.get_table(engine, 'financialDataTurnover'),
representativeEtlId=rep['etlId']):
Expand Down Expand Up @@ -174,6 +188,7 @@ def load(engine, grano):
rep_ent['contactCountry'] = rep_ent['contactCountryNorm']
rep_ent = load_clients(grano, engine, rep_ent)
rep_ent = load_organisations(grano, engine, rep_ent)
rep_ent = load_networking(grano, engine, rep_ent)
rep_ent = load_persons(grano, engine, rep_ent)
rep_ent = load_interests(grano, engine, rep_ent)
rep_ent = load_action_fields(grano, engine, rep_ent)
Expand Down

0 comments on commit 3be3225

Please sign in to comment.