# Run on sample of 30 unlocked articles

A sample of 30 unlocked articles has been made available (these are articles in front of the News24 paywall) in the file 'source_data/sample_text_30_unlocked.pq'.

When running on a GPU, use `watch -n 1 nvidia-smi` to monitor GPU usage.

## Import required libraries

In [1]:
import json
import time
import pandas as pd
import torch
from kg_builder import kg
from kg_builder import ner
from kg_builder import cr
from kg_builder import rex
from kg_builder import get_wd_relation_data
from kg_builder import chunk_long_articles

## Import data and make articles

In [2]:
# Load the sample of unlocked articles from the parquet file into a DataFrame
df = pd.read_parquet('source_data/sample_text_30_unlocked.pq')

In [3]:
# Build the list of Article instances from the sample DataFrame
articles = kg.make_articles(df=df)

# For a quick test on CPU, uncomment the line below to keep only the first
# 3 articles; leave it commented out for a full run
# articles = articles[0:3]

## Assemble base model outputs

### Run NER and CR
The `cr_tagger` runs both named entity recognition (NER) and coreference resolution (CR), since they are both spaCy-based. Although the outputs from CR were not used in the last iteration of the build-and-evaluate loop, I have left CR in, as results show that it should be explored further in the next iteration.

In [4]:
# Set the model name for coreference resolution
cr_model_name = 'fastcoref'  # other option noted by the author: 'lingmess'

cr_tagger, cr_model_name = cr.setup_cr_tagger(model_name=cr_model_name)

# Run the tagger over the articles in batches of `batch_size`, adding the
# NER (and CR) data to each article
num_articles = len(articles)
batch_size = 50
# Step by batch_size so that changing the variable above actually changes
# the batching (the step used to be a separately hard-coded 50)
start_indices = list(range(0, num_articles, batch_size))
end_indices = start_indices[1:] + [num_articles]
batches = list(zip(start_indices, end_indices))

device = 'cuda' if torch.cuda.is_available() else 'cpu'

start_time = time.time()
for batch in batches:
    cr.get_ner_cr_data(articles=articles[batch[0]:batch[1]], cr_tagger=cr_tagger)
    # Release cached GPU memory between batches
    if device == 'cuda':
        torch.cuda.empty_cache()
    print(f'''>====== {batch[1]} articles processed ======<''')
end_time = time.time()
# Elapsed wall-clock time in seconds for the whole NER/CR pass
time_difference = end_time - start_time
print(time_difference)



Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Inference:   0%|          | 0/5 [00:00<?, ?it/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Inference:   0%|          | 0/5 [00:00<?, ?it/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Inference:   0%|          | 0/5 [00:00<?, ?it/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Inference:   0%|          | 0/5 [00:00<?, ?it/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Inference:   0%|          | 0/5 [00:00<?, ?it/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Inference:   0%|          | 0/5 [00:00<?, ?it/s]

28.9291250705719


In [5]:
# Print the named entities of the first article as a quick sanity check
articles[0].print_named_entities()


Article e8da0128-3676-4dd9-9daa-8511b94ca993
THE ESKOM FILES | In full: Report by Ishmael
--------------------------------------------
ESKOM, ORG, [4:9]
Ishmael Semenya, PERSON, [37:52]
André de Ruyter, PERSON, [96:111]
Eskom, ORG, [159:164]
André de Ruyter, PERSON, [181:196]
Ishmael Semenya SC, PERSON, [320:338]
Eskom, ORG, [371:376]
Solly Tshitangano, PERSON, [394:411]
André de Ruyter, PERSON, [469:484]
Tshitangano, ORG, [656:667]
De Ruyter, PERSON, [692:701]
De Ruyter, PERSON, [829:838]
Ishmael Semenya, PERSON, [882:897]
Tshitangano, PERSON, [961:972]
Public Enterprises, ORG, [1017:1035]
Pravin Gordhan, PERSON, [1045:1059]
Presidency, ORG, [1065:1075]
Parliament, ORG, [1080:1090]
Semenya, ORG, [1093:1100]
Eskom, ORG, [1124:1129]
Tshitangano, PERSON, [1157:1168]
De Ruyter, PERSON, [1211:1220]
Tshitangano, PERSON, [1267:1278]
De Ruyter, PERSON, [1366:1375]
Eskom, ORG, [1405:1410]
Eskom, ORG, [1449:1454]


### Run RE

REBEL is used to extract the main relations that will be used; Flair is used exclusively for `alternate_name` to aid disambiguation.

In [6]:
# Run Rebel to get the main relations of interest
rex_model_name = 'rebel'

if rex_model_name == 'rebel':
    # Setup the required RE tagger
    rex_tagger, rex_tokenizer, device, rex_model_name = rex.setup_rex_tagger(model_name=rex_model_name)

    # And then run the model to add REs to the articles
    start_time = time.time()
    for i, article in enumerate(articles):
        # Split the article text into chunks and extract relations per chunk
        # NOTE(review): unit of max_chunk_size (characters vs tokens) is not
        # visible here - confirm in chunk_long_articles
        chunk_boundaries = chunk_long_articles(article.article_text, max_chunk_size=20000)
        for chunk in chunk_boundaries:
            rex.rebel_get_relations(article=article, rex_tokenizer=rex_tokenizer,
                                    rex_tagger=rex_tagger, device=device, chunk=chunk)
            rex.remove_self_relations(article=article)
        # Clear the CUDA cache every 5 articles
        if device == 'cuda' and (i + 1) % 5 == 0:
            torch.cuda.empty_cache()
        # Progress report every 50 articles
        if (i + 1) % 50 == 0:
            print(f'''>====== {i + 1} articles processed ======<''')
    # Report the total from len(articles) rather than the loop variable, so
    # an empty article list does not raise NameError
    print(f'''>====== {len(articles)} articles processed ======<''')
    end_time = time.time()
    # Elapsed wall-clock time in seconds for the REBEL pass
    time_difference = end_time - start_time
    print(time_difference)

26.331096410751343


In [7]:
# Run Flair to get alternate_name relations
rex_model_name = 'flair'

if rex_model_name == 'flair':
    # Setup the required RE tagger
    rex_tagger, ner_tagger, splitter, device, model_name = rex.setup_rex_tagger(model_name=rex_model_name)

    # And then run the model to add REs to the articles
    start_time = time.time()
    for i, article in enumerate(articles):
        # NOTE(review): restricted=True presumably limits Flair to the
        # alternate_name relation - confirm in rex.flair_get_relations
        rex.flair_get_relations(article=article, splitter=splitter, ner_tagger=ner_tagger,
                                rex_tagger=rex_tagger, device=device, restricted=True)
        rex.remove_self_relations(article=article)
        # Clear the CUDA cache every 5 articles
        if device == 'cuda' and (i + 1) % 5 == 0:
            torch.cuda.empty_cache()
        # Progress report every 50 articles
        if (i + 1) % 50 == 0:
            print(f'''>====== {i + 1} articles processed ======<''')
    # Report the total from len(articles) rather than the loop variable, so
    # an empty article list does not raise NameError
    print(f'''>====== {len(articles)} articles processed ======<''')
    end_time = time.time()
    # Elapsed wall-clock time in seconds for the Flair pass
    time_difference = end_time - start_time
    print(time_difference)

2024-09-11 10:51:20,161 SequenceTagger predicts: Dictionary with 76 tags: <unk>, O, B-CARDINAL, E-CARDINAL, S-PERSON, S-CARDINAL, S-PRODUCT, B-PRODUCT, I-PRODUCT, E-PRODUCT, B-WORK_OF_ART, I-WORK_OF_ART, E-WORK_OF_ART, B-PERSON, E-PERSON, S-GPE, B-DATE, I-DATE, E-DATE, S-ORDINAL, S-LANGUAGE, I-PERSON, S-EVENT, S-DATE, B-QUANTITY, E-QUANTITY, S-TIME, B-TIME, I-TIME, E-TIME, B-GPE, E-GPE, S-ORG, I-GPE, S-NORP, B-FAC, I-FAC, E-FAC, B-NORP, E-NORP, S-PERCENT, B-ORG, E-ORG, B-LANGUAGE, E-LANGUAGE, I-CARDINAL, I-ORG, S-WORK_OF_ART, I-QUANTITY, B-MONEY
52.86539340019226


In [8]:
# Print the relations extracted for the first article as a quick sanity check
articles[0].print_relations()


Article e8da0128-3676-4dd9-9daa-8511b94ca993
THE ESKOM FILES | In full: Report by Ishmael
--------------------------------------------
André de Ruyter >> employer >> ESKOM
[96:111], [4:9]                
ESKOM >> chairperson >> André de Ruyter
[4:9], [96:111]                
André de Ruyter >> position held >> CEO
[469:484], [464:467]                
widely publicised allegations >> participant >> André de Ruyter
[722:751], [469:484]                
widely publicised allegations >> named after >> André de Ruyter
[722:751], [469:484]                
Pravin Gordhan >> position held >> Public Enterprises Minister
[1045:1059], [1017:1044]                
Public Enterprises Minister >> officeholder >> Pravin Gordhan
[1017:1044], [1045:1059]                


## Import additional libraries & ontology data

In [9]:
from datetime import datetime, timedelta
from kg_builder import get_wikidata_prepared_info
from kg_builder import make_lookup_dict_from_df
from kg_builder import track_relation_origin

import os
import logging

# Send DEBUG-level (and above) log records to kg_builder.log in the current
# working directory
log_file_path = os.path.join(os.getcwd(), 'kg_builder.log')
logging.basicConfig(
    level=logging.DEBUG,
    filename=log_file_path,
    encoding='utf-8',
)

# Logger used by the cells below
logger = logging.getLogger(__name__)

In [10]:
# Load the prepared Wikidata reference info; only the first returned item,
# the REBEL/Flair overview of relations to be included, is used here
rebel_flair_overview, _, _, _, _ = get_wikidata_prepared_info(
    'reference_info/wikidata_references.pkl'
)

# Only relations preselected for inclusion are evaluated: these are the rows
# that have a REBEL description
has_rebel_description = rebel_flair_overview['rebel description'].notna()
preselected_rows = rebel_flair_overview[has_rebel_description]

forward_relations = list(preselected_rows['rebel description'])
# presumably maps each REBEL description to its inverse description - verify
# against make_lookup_dict_from_df
inverse_relations = list(
    make_lookup_dict_from_df(preselected_rows, 'rebel description', 'inverse description').values()
)
flair_only_relations = ['alternate_name']  # additional Flair relation not in REBEL

# De-duplicate the combined list
included_relations = list(set(forward_relations + inverse_relations + flair_only_relations))

## Build KG

### Perform base model post-processing

In [11]:
# Start of the KG build run; used later to compute the total runtime
run_start = datetime.now()

# Post-processing steps, applied to every article in exactly this order
postprocessing_steps = (
    rex.populate_inverse_relations,
    rex.get_main_relations_only,
    rex.cleanup_alternate_name_pairs,
    rex.cleanup_duplicate_alternate_name_pairs,
    rex.populate_alt_names_mentions,
    rex.populate_clean_relation_texts,
    rex.cleanup_overlapping_relations,
    rex.remove_ambiguous_relations,
    rex.populate_node_types,
)

for article in articles:
    # Keep only the relation types that were preselected for inclusion
    article.relations = [
        rel for rel in article.relations
        if rel.relation_type in included_relations
    ]
    for apply_step in postprocessing_steps:
        apply_step(article)

### Build KG with EL

Entity linking is incorporated into the final KG building algorithm.

In [12]:
my_kg = kg.KGData()
el_tagger = kg.setup_el_tagger()

# Pause lengths in seconds
# NOTE(review): the pauses presumably throttle calls to the external entity
# linking / Wikidata services - confirm
LONG_SLEEP_SECONDS = 90
SHORT_SLEEP_SECONDS = 30

sleeps = 0  # total seconds spent sleeping, excluded from the reported runtime
start_index, end_index = (0, 31)
selected_articles = articles[start_index:end_index]
for i, article in enumerate(selected_articles):

    logger.info('Fetching article # %s', i + start_index)
    kg.update_kg_from_article(my_kg, article, el_tagger)

    # NOTE(review): the checks below use the zero-based index of the article
    # just processed, so the printed "N articles completed" is one less than
    # the number actually completed; cadence left unchanged on purpose

    # Do a long sleep every 50 articles
    if (i + start_index) % 50 == 0 and i != 0:
        print(f'''{i + start_index} articles completed, long sleep...''')
        sleeps += LONG_SLEEP_SECONDS
        time.sleep(LONG_SLEEP_SECONDS)
        print('resuming...')

    # Do a short sleep every 5 articles
    elif (i + start_index) % 5 == 0 and i != 0:
        print(f'''{i + start_index} articles completed, short sleep...''')
        sleeps += SHORT_SLEEP_SECONDS
        time.sleep(SHORT_SLEEP_SECONDS)
        print('resuming...')

run_end = datetime.now()
# run_start is set before the base model post-processing loop, so the runtime
# covers post-processing plus the KG build, minus the deliberate sleeps
runtime = run_end - run_start - timedelta(seconds=sleeps)
# Report the number of articles actually processed in the selected slice,
# not the size of the full article list
print(f'''>====== {len(selected_articles)} articles processed ======<''')
print(runtime)



5 articles completed, short sleep...
resuming...




10 articles completed, short sleep...
resuming...




15 articles completed, short sleep...
resuming...




20 articles completed, short sleep...
resuming...




25 articles completed, short sleep...
resuming...




0:04:26.352838


## Inspect one entity and relations


In [13]:
# Look up the KG entity named 'Jacob Zuma'. In the displayed result it is:
# - typed as a PERSON
# - matched to a Wikidata item (source: OpenTapioca)
# - populated with AlsoKnownAs terms
jz_entity = [
    candidate
    for candidate in my_kg.entities
    if candidate.Name == 'Jacob Zuma'
]
jz_entity

[KGEntity(EntityId='QgN5rzu3q', Name='Jacob Zuma', Type='PERSON', Updated=datetime.datetime(2024, 9, 11, 10, 52, 28, 207858), WDId='Q57282', WDUrl='https://www.wikidata.org/entity/Q57282', WDDescription='4th President of South Africa (2009–2018)', WDRetry=99, WDSource='OpenTapioca', WDRetryArticles={'20a33955-28e7-4f4c-b8c5-1f01355b7fb1'}, PywikiStatus='Q57282', AlsoKnownAs={'Jacob G. Zuma', 'Msholozi', 'JZ', 'Jacob Gedleyihlekisa Zuma'})]

In [14]:
# Show the KG entity id of the first (here: only) Jacob Zuma match
jz_entity[0].EntityId

'QgN5rzu3q'

In [15]:
# The WD tracker data for Jacob Zuma: one single-item dict per matching
# tracker entry, keyed by the tracker key
jz_wikidata = [
    {wd_id: wd_record}
    for wd_id, wd_record in my_kg.wd_tracker.items()
    if wd_record['WDName'] == 'Jacob Zuma'
]
jz_wikidata

[{'Q57282': {'WDUrl': 'https://www.wikidata.org/entity/Q57282',
   'WDName': 'Jacob Zuma',
   'WDType': 'PERSON',
   'WDAlsoKnown': {'JZ',
    'Jacob G. Zuma',
    'Jacob Gedleyihlekisa Zuma',
    'Msholozi'},
   'WDDescription': '4th President of South Africa (2009–2018)',
   'WDSource': 'OpenTapioca',
   'EntityId': 'QgN5rzu3q'}}]

In [16]:
# Relations found for Jacob Zuma: every KG relation in which the entity
# appears as either the head or the tail
jz_relations = [
    rel
    for rel in my_kg.relations
    if jz_entity[0].EntityId in (rel.HeadId, rel.TailId)
]
jz_relations

[KGRelation(RelationId='TDP3tQUul', HeadId='QgN5rzu3q', HeadName='Jacob Zuma', Type='position held', TailId='h4zGpNzhF', TailName='President', Updated=datetime.datetime(2024, 9, 11, 10, 54, 49, 915163), Instances={('20a33955-28e7-4f4c-b8c5-1f01355b7fb1', 'breaking-zuma-can-avoid-jail-time-if-he-agrees-to-testify-now-zondo-20210222', (1343, 1993)): 1, ('84658550-7d66-46f4-af60-b7ec31a9a7e1', 'agrizzi-will-no-longer-testify-at-mokgoro-inquiry-as-expected-20190214', (1986, 2212)): 1, ('88942040-5427-488a-9b46-517a39b17a28', 'zondo-report-part-4-where-was-the-anc-as-the-guptas-took-control-of-important-soes-asks-the-da-20220429', (2385, 2861)): 1, ('0eea49aa-749d-47b8-887c-e51248e75f5c', 'ralph-mathekga-the-law-is-straightforward-no-one-needs-to-convince-zuma-to-abide-by-court-order-20210209', (17, 629)): 1}, WDType='P39'),
 KGRelation(RelationId='5X4AT5eLf', HeadId='QgN5rzu3q', HeadName='Jacob Zuma', Type='mentioned_in', TailId='20a33955-28e7-4f4c-b8c5-1f01355b7fb1', TailName='breaking-zu

In [17]:
# View the source text from which the first Jacob Zuma relation was identified
# - a cross-sentence relation, as can be seen in the printed passage
track_relation_origin(my_kg.relations, articles, jz_relations[0].RelationId)

Relation TDP3tQUul: Jacob Zuma >> position held >> President

Origin: breaking-zuma-can-avoid-jail-time-if-he-agrees-to-testify-now-zondo-20210222

 “For such relief to be possible and effective, a special arrangement would need to be made to hear Mr Zuma’s evidence before 31 March 2021,” Mosala said.
Shortly after the Constitutional Court gave its unanimous judgment against him, Zuma publicly stated that he would defy the ruling for him to appear and answer questions about his nine years in office just as he had defied the unjust apartheid government – and insisted he was, “... prepared to go to jail”.
True to his word, Zuma did not obey the inquiry summons issued for him to appear on 15 February.
 Mosala argues that the former President’s conduct poses a serious threat to rule of law.


## Do some basic checks

In [18]:
# Collect the ids of all entities in the KG and show how many there are
entities_ids = [entity.EntityId for entity in my_kg.entities]
len(entities_ids)

371

In [19]:
# Count the distinct entity ids referenced by relations (as head or tail);
# this should equal the number of entity ids in the KG
heads = [rel.HeadId for rel in my_kg.relations]
tails = [rel.TailId for rel in my_kg.relations]
both = set(heads).union(tails)
len(both)

371

In [20]:
# Check if any entities appear in relations that don't exist in entities - should be none
# Build the id set once so each membership test is O(1) rather than a scan
# of the whole entities_ids list for every relation
known_entity_ids = set(entities_ids)
[
    (i, relation)
    for i, relation in enumerate(my_kg.relations)
    if relation.HeadId not in known_entity_ids or relation.TailId not in known_entity_ids
]

[]

In [21]:
# List entity ids that are missing from the values of the entities tracker;
# the result should be empty
[
    entity_id
    for entity_id in entities_ids
    if entity_id not in my_kg.entities_tracker.values()
]

[]

In [22]:
# Check that all entities in the wd_tracker have a corresponding WD entry in
# entities: the two counts printed below should be equal
wd_tracker_entities = [record['EntityId'] for record in my_kg.wd_tracker.values()]
entities_with_wdid = [
    ent.EntityId
    for ent in my_kg.entities
    if ent.WDId is not None
]
print(len(wd_tracker_entities))
print(len(entities_with_wdid))

70
70
