In [None]:
# Reload edited modules automatically before each cell runs, so changes to
# local source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [5]:
import pandas as pd
import sys
import os
import importlib
import ast
import numpy as np

This notebook exemplifies how to apply Geordie to a collection of documents and extract some data in order to start an analysis.

# Load data

In [25]:
# Load the corpus.
data=pd.read_csv("...") # expects data to have, at least, columns "title", "abstract" and "ID"

# Text handed to Geordie: "<title>. <abstract>".
# A missing title or abstract is read as NaN, and NaN + str is NaN, which would
# wipe out the whole text of that record; treat missing parts as empty strings.
data["fulltext"] = data["title"].fillna("")+". "+data["abstract"].fillna("")

# Apply Geordie

In [26]:
# Folder containing the local Geordie checkout. Set the GEORDIE_DIR environment
# variable to point somewhere else without editing the notebook; the default
# keeps the original location.
geordie_dir = os.environ.get("GEORDIE_DIR", '/Users/bertagrimau/dev_projects/geordie/') # change as needed

# os.path.join drops every component that precedes an absolute one, so the
# working directory only matters when geordie_dir is a relative path.
current_dir = os.getcwd()
subfolder_path = os.path.join(current_dir, geordie_dir)

# Add the subfolder to the system path (only once, so re-running this cell
# does not pile up duplicate entries)
if subfolder_path not in sys.path:
    sys.path.append(subfolder_path)

In [None]:
# Geordie is expected to be importable through the folder appended to sys.path above.
import geordie
# Explicit reload so that edits made to the package since it was first imported
# in this kernel are picked up.
importlib.reload(geordie)

In [28]:
# You might need to authenticate with the Hugging Face Hub in order to load the NER model

# from huggingface_hub import notebook_login
# notebook_login()

In [None]:
# Single Geordie instance, reused for every record processed below.
my_geordie = geordie.Geordie()

In [None]:
# Run Geordie over every record. all_results maps each ID to:
#   - a list of {"entity", "osm", "context"} dicts, one per result returned;
#   - [None] when Geordie returned an empty list;
#   - None when processing the record raised an exception.
# NOTE(review): records sharing an ID overwrite each other here - confirm IDs are unique.
all_results = {}

# `record_id` rather than `id`, so the builtin id() is not shadowed in later cells.
for record_id, text in data[["ID","fulltext"]].itertuples(index=False):
    print(record_id)
    try:
        results = my_geordie.process_text(text)
        if results!=[]:
            results_for_record = [
                {"entity":result["entity"],"osm":result["osm"],"context":result["context"]}
                for result in results
            ]
        else:
            results_for_record = [None]
        all_results[record_id]=results_for_record
    except Exception as error:
        # Best effort: report the failure and carry on with the next record.
        print("Exception: ", error)
        all_results[record_id]=None

# Build the frame from (ID, results) pairs. Going through a two-row frame and
# transposing it turns every column into object dtype, including the ID column
# used as the merge key below.
all_results_df = pd.DataFrame(list(all_results.items()), columns=["ID","geordie"])
# all_results_df.to_csv("...")

# Attach the Geordie output to the original records; "fulltext" was only needed as model input.
final = data.merge(all_results_df, on="ID").drop(columns=["fulltext"])
# final.to_csv("...")

# Extract (some) entities for analysis

In [None]:
# Failed records (geordie is None) become "" here, as does any other missing value.
final = final.fillna("")


def _has_geordie_results(value):
    """Tell whether a `geordie` cell holds at least one extracted entity.

    The cell is a list when `final` comes straight from the processing loop,
    where [None] marks a record with no results. Comparing such a list with the
    string "[None]" is always unequal, so the list case is checked explicitly.
    String cells ("" after fillna, or "[None]" - presumably from a frame
    reloaded from CSV, to be confirmed) are still filtered as before.
    """
    if isinstance(value, str):
        return value not in ("", "[None]")
    if isinstance(value, (list, tuple)):
        return any(item is not None for item in value)
    # Any other kind of value was kept by the original filter; keep it too.
    return True


# Keep only the records for which Geordie found something.
no_nulls_df = final.loc[
    final.geordie.apply(_has_geordie_results).astype(bool),
    ["ID", 'title', 'abstract', "geordie"],
].reset_index(drop=True)

In [17]:
def get_entities(x):
    """Collect the "entity" value of every non-None item in x.

    None placeholders are skipped; every other item is indexed with "entity",
    so an item lacking that key raises KeyError. Returns a list, which is
    empty when x holds no usable items.
    """
    return [found["entity"] for found in x if found is not None]

def get_osm(x):
    """Collect the "osm" value of every non-None item in x.

    None placeholders are skipped; an item without an "osm" key contributes
    an empty dict. Returns a list with one element per non-None item.
    """
    return [found.get("osm", {}) for found in x if found is not None]

def get_context(x):
    """Collect the "context" value of every non-None item in x.

    None placeholders are skipped; an item without a "context" key contributes
    an empty dict. Returns a list with one element per non-None item.
    """
    return [found.get("context", {}) for found in x if found is not None]


def get_town(x):
    """Return the settlement name found in an OSM record, or None.

    Looks in x["address"] for "city", then "town", then "village", and returns
    the first of those values that is present and not None.

    x is expected to be a dict. Anything else (None, NaN, "", a list, ...) is
    treated as "no record" and yields None, and so does a record whose
    "address" is missing, None or not a dict.
    """
    # isinstance instead of pd.isnull: pd.isnull returns an array for list-like
    # input, which cannot be used reliably as a single truth value.
    if not isinstance(x, dict):
        return None

    address = x.get("address")
    if not isinstance(address, dict):
        return None

    # Most specific settlement type available, in the original priority order.
    for key in ("city", "town", "village"):
        value = address.get(key)
        if value is not None:
            return value
    return None


def get_region(x, type_region): #type_region="county", "state", etc
    """Return the value stored under `type_region` in an OSM record's address.

    x is expected to be a dict with an "address" dict; `type_region` is the
    address key to read (e.g. "county" or "state").

    Returns None when x is not a dict (None, NaN, "", a list, ...), when its
    "address" is missing, None or not a dict, or when the key is absent.
    """
    # isinstance instead of pd.isnull: pd.isnull returns an array for list-like
    # input, which cannot be used reliably as a single truth value.
    if not isinstance(x, dict):
        return None

    address = x.get("address")
    if not isinstance(address, dict):
        return None

    return address.get(type_region)

In [18]:
# Split the raw Geordie output into three parallel list columns: entity names,
# their OSM records and their contexts, one list element per non-None result.
no_nulls_df = no_nulls_df.assign(
    ents=no_nulls_df.geordie.apply(get_entities),
    osm=no_nulls_df.geordie.apply(get_osm),
    context=no_nulls_df.geordie.apply(get_context),
)

In [19]:
# One row per extracted entity: "ents", "osm" and "context" are exploded together,
# so the i-th entity stays on the same row as its i-th OSM record and context.
# Exploded rows keep the index of their source record, so index values repeat.
df_exp = no_nulls_df.explode(["ents","osm","context"])

In [20]:
# Place names read from each entity's OSM record; None where the record has
# no matching address field.
df_exp["town"] = df_exp["osm"].apply(get_town)
df_exp["county"] = df_exp["osm"].apply(get_region, type_region="county")
df_exp["state"] = df_exp["osm"].apply(get_region, type_region="state")