In [1]:
# Instantiate relatio's FileLogger before running the pipeline.
# NOTE(review): presumably this routes the library's log messages to a file —
# confirm against relatio._logging.
from relatio._logging import FileLogger
logger = FileLogger()

In [2]:
# Both dataset helpers live in the same module, so import them together.
from relatio.datasets import list_datasets, load_trump_data

# Show the catalogue of datasets that can be loaded
print(list_datasets())

# Fetch the Trump Tweet Archive in its "raw" format
df = load_trump_data(
    "raw"
)


    List of available datasets:

    Trump Tweet Archive
    - function call: load_trump_data()
    - format: 'raw', 'split_sentences', 'srl_res'
    - allennlp version: 0.9
    - srl model: srl-model-2018.05.25.tar.gz
    


In [3]:
# Split into sentences
# Import only the names this cell uses; a wildcard import would hide where
# `Preprocessor` comes from and could silently shadow other notebook names.
from relatio.preprocessing import Preprocessor
from relatio.utils import load_sentences

# One name for the file that is written below and then read back.
SENTENCES_PATH = 'sentences.json'

# Text-cleaning configuration reused by the later role-processing and
# entity-mining cells (they call methods on `p`).
p = Preprocessor(
    spacy_model = "en_core_web_md",
    remove_punctuation = True,
    remove_digits = True,
    lowercase = True,
    lemmatize = True,
    stop_words = [],
    n_process = -1,
    batch_size = 100
)

# Only the first 1000 rows are processed to keep the example fast.
split_sentences = p.split_into_sentences(
    df.iloc[0:1000], output_path=SENTENCES_PATH, progress_bar=True
)

# Reload from the file just written to demonstrate the save/load round trip.
split_sentences = load_sentences(SENTENCES_PATH)

Splitting into sentences...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1156.00it/s]


In [4]:
# Run SRL
# Explicit import instead of `import *` so the origin of `SRL` is visible.
from relatio.semantic_role_labeling import SRL

# Bind the instance to its own name. The original `SRL = SRL(...)` replaced the
# class with its instance, so re-running this cell called the instance with
# constructor keywords instead of building a new labeler.
srl = SRL(
    path = "https://storage.googleapis.com/allennlp-public-models/openie-model.2020.03.26.tar.gz",
    batch_size = 10,
    cuda_device = -1  # NOTE(review): presumably -1 selects the CPU — confirm
)

# NOTE(review): split_sentences[1] is presumably the list of sentence strings
# (index 0 holding document ids) — confirm against load_sentences.
srl_res = srl(split_sentences[1], progress_bar=True)


2022-02-28 19:01:54,300 - INFO - allennlp.common.plugins - Plugin allennlp_models available
2022-02-28 19:01:54,566 - INFO - allennlp.common.file_utils - cache of https://storage.googleapis.com/allennlp-public-models/openie-model.2020.03.26.tar.gz is up-to-date
2022-02-28 19:01:54,567 - INFO - allennlp.models.archival - loading archive file https://storage.googleapis.com/allennlp-public-models/openie-model.2020.03.26.tar.gz from cache at /home/germain/.allennlp/cache/60314a853eb0aaa774d176d878c62469d49872feb4f2bfd071a75c77f6d76707.1b91cc27e347f2df04ce771a304bee2b70a2c487626b67e277d44c593b868c25
2022-02-28 19:01:54,568 - INFO - allennlp.models.archival - extracting archive file /home/germain/.allennlp/cache/60314a853eb0aaa774d176d878c62469d49872feb4f2bfd071a75c77f6d76707.1b91cc27e347f2df04ce771a304bee2b70a2c487626b67e277d44c593b868c25 to temp dir /tmp/tmpyl454q8w
2022-02-28 19:01:54,945 - INFO - allennlp.common.params - dataset_reader.type = srl
2022-02-28 19:01:54,945 - INFO - allennl

2022-02-28 19:01:55,316 - INFO - allennlp.nn.initializers -    encoder._module.layer_2.cell.state_linearity.weight
2022-02-28 19:01:55,317 - INFO - allennlp.nn.initializers -    encoder._module.layer_3.cell.input_linearity.bias
2022-02-28 19:01:55,317 - INFO - allennlp.nn.initializers -    encoder._module.layer_3.cell.input_linearity.weight
2022-02-28 19:01:55,317 - INFO - allennlp.nn.initializers -    encoder._module.layer_3.cell.state_linearity.bias
2022-02-28 19:01:55,318 - INFO - allennlp.nn.initializers -    encoder._module.layer_3.cell.state_linearity.weight
2022-02-28 19:01:55,318 - INFO - allennlp.nn.initializers -    encoder._module.layer_4.cell.input_linearity.bias
2022-02-28 19:01:55,318 - INFO - allennlp.nn.initializers -    encoder._module.layer_4.cell.input_linearity.weight
2022-02-28 19:01:55,319 - INFO - allennlp.nn.initializers -    encoder._module.layer_4.cell.state_linearity.bias
2022-02-28 19:01:55,319 - INFO - allennlp.nn.initializers -    encoder._module.layer_4.c

Running SRL...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 229/229 [01:11<00:00,  3.18it/s]


In [5]:
# Optional shortcut: to save time, uncomment the lines below to load precomputed results from the datasets module.
# split_sentences = load_trump_data("split_sentences")
# srl_res = load_trump_data("srl_res")

In [6]:
from relatio.utils import load_roles

# Pull the selected semantic roles out of the SRL output.
roles, sentence_index = p.extract_roles(
    srl_res,
    used_roles=["ARG0", "B-V", "B-ARGM-NEG", "B-ARGM-MOD", "ARG1", "ARG2"],
    progress_bar=True,
)

# Preview the first five extracted statements.
for statement in roles[:5]:
    print(statement)

# Clean each role, keeping only the listed part-of-speech tags per role,
# and write the result to 'postproc_roles.json'.
postproc_roles = p.process_roles(
    roles,
    dict_of_pos_tags_to_keep={
        "ARG0": ['NOUN', 'PROPN'],
        "B-V": ['VERB'],
        "ARG1": ['NOUN', 'PROPN'],
        "ARG2": ['NOUN', 'PROPN'],
    },
    progress_bar=True,
    output_path='postproc_roles.json',
)

# Read the cleaned roles back from disk (save/load round trip).
postproc_roles = load_roles('postproc_roles.json')

# Preview the same five statements after cleaning.
for statement in postproc_roles[:5]:
    print(statement)

Extracting semantic roles...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2290/2290 [00:00<00:00, 34668.42it/s]


{'B-V': 'have'}
{'ARG0': 'Republicans and Democrats', 'ARG1': 'our economic problems', 'B-V': 'created'}
{'ARG1': 'I', 'ARG2': 'thrilled to be back in the Great city of Charlotte , North Carolina with thousands of hardworking American Patriots who love our Country , cherish our values , respect our laws , and always put AMERICA FIRST', 'B-V': 'was'}
{'ARG1': 'I', 'ARG2': 'back in the Great city of Charlotte , North Carolina with , respect our laws , and always put AMERICA FIRST', 'B-V': 'be'}
{'ARG0': 'thousands of hardworking American Patriots who', 'ARG1': 'our Country', 'B-V': 'love'}
Cleaning roles ARG0...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1900/1900 [00:00<00:00, 2058.60it/s]


Cleaning roles B-V...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4742/4742 [00:01<00:00, 2597.36it/s]


Cleaning roles B-ARGM-MOD...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 490/490 [00:00<00:00, 1240.70it/s]


Cleaning roles ARG1...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3430/3430 [00:01<00:00, 2104.19it/s]


Cleaning roles ARG2...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 955/955 [00:00<00:00, 1444.53it/s]

{'B-V': 'have'}
{'ARG0': 'republicans democrats', 'B-V': 'create', 'ARG1': 'problem'}
{'B-V': '', 'ARG1': '', 'ARG2': 'city charlotte north carolina thousand patriots country value law america'}
{'B-V': 'be', 'ARG1': '', 'ARG2': 'city charlotte north carolina law america'}
{'ARG0': 'thousand patriots', 'B-V': 'love', 'ARG1': 'country'}





In [7]:
from relatio.utils import load_entities

# Mine named entities from the split sentences and write them to 'entities.pkl'.
known_entities = p.mine_entities(
    split_sentences[1],
    clean_entities=True,
    progress_bar=True,
    output_path='entities.pkl',
)

# Read the mined entities back from disk (save/load round trip).
known_entities = load_entities('entities.pkl')

# Print the ten most frequent entities with their counts.
for entity_count in known_entities.most_common(10):
    print(entity_count)

Mining named entities...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2290/2290 [00:01<00:00, 1619.66it/s]

('biden', 75)
('georgia', 58)
('pennsylvania', 53)
('joe biden', 50)
('trump', 36)
('america', 33)
('michigan', 32)
('democrats', 29)
('republicans', 28)
('covid', 27)





In [8]:
from collections import Counter

# Explicit import instead of `from relatio.narrative_models import *`, so the
# origin of `NarrativeModel` is visible and no unrelated names are pulled in.
from relatio.narrative_models import NarrativeModel
from relatio.utils import prettify

# Deterministic narrative model (prints "No training required" when trained).
m = NarrativeModel(model_type = 'deterministic',
                   roles_considered = ['ARG0', 'B-V', 'B-ARGM-NEG', 'B-ARGM-MOD', 'ARG1', 'ARG2'],
                   roles_with_entities = ['ARG0','ARG1','ARG2'],
                   list_of_known_entities = list(known_entities),
                   assignment_to_known_entities = 'character_matching',
                   roles_with_embeddings = [['ARG0','ARG1','ARG2']],
                   threshold = 1)

m.train(postproc_roles)
narratives = m.predict(postproc_roles, progress_bar = True, prettify = False)

# Count only complete narratives: ARG0, B-V and ARG1 must all be present and
# non-empty. One flat predicate replaces three nested `if` statements.
pretty_narratives = Counter(
    prettify(n)
    for n in narratives
    if all(n.get(role) not in ["", None] for role in ('ARG0', 'B-V', 'ARG1'))
)

# Show the ten most frequent narratives with their counts.
for t in pretty_narratives.most_common(10):
    print(t)

No training required: the model is deterministic.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5169/5169 [00:10<00:00, 498.07it/s]

('nate simington have senate', 2)
('steve have complete endorsement', 2)
('biden want country', 2)
('sudan agree israel', 1)
('biden lie pennsylvania', 1)
('bret baier expose fox news|fox', 1)
('trump tell time', 1)
('democrats|republican|rino look d.c.', 1)
('republican not let andrew mccabe|andrew', 1)
('state want state', 1)





In [9]:
# Static narrative model: ARG0 phrases are clustered separately from
# ARG1/ARG2 phrases, with one cluster count per role group.
m = NarrativeModel(
    model_type='static',
    roles_considered=['ARG0', 'B-V', 'B-ARGM-NEG', 'B-ARGM-MOD', 'ARG1', 'ARG2'],
    roles_with_entities=['ARG0', 'ARG1', 'ARG2'],
    list_of_known_entities=list(known_entities),
    assignment_to_known_entities='character_matching',
    roles_with_embeddings=[['ARG0'], ['ARG1', 'ARG2']],  # alternative: [['ARG0','ARG1','ARG2']]
    embeddings_model=None,
    threshold=1,
    n_clusters=[10, 20],  # alternative: [100]
)

m.train(postproc_roles, progress_bar=True, verbose=0)
narratives = m.predict(postproc_roles, progress_bar=True, prettify=False)

# Tally prettified narratives, skipping any that lack an agent (ARG0),
# a verb (B-V) or a patient (ARG1).
pretty_narratives = Counter()
for narrative in narratives:
    if any(narrative.get(role) in ["", None] for role in ('ARG0', 'B-V', 'ARG1')):
        continue
    pretty_narratives[prettify(narrative)] += 1

# Print the ten most frequent narratives with their counts.
for narrative_count in pretty_narratives.most_common(10):
    print(narrative_count)

Focus on roles: ARG0
Ignoring known entities...
Embedding relevant phrases...







Clustering phrases into 10 clusters...
Labeling the clusters by the most frequent phrases...





Focus on roles: ARG1-ARG2
Ignoring known entities...
Embedding relevant phrases...
Clustering phrases into 20 clusters...










Labeling the clusters by the most frequent phrases...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5169/5169 [00:10<00:00, 475.33it/s]

('nate simington have senate', 2)
('brann not allow would second amendment', 2)
('dc police get signature', 2)
('dc police do signature', 2)
('steve have complete endorsement', 2)
('democrats steal signature', 2)
('joe|joe biden|biden outsource signature', 2)
('joe|joe biden|biden open signature', 2)
('joe|joe biden|biden sacrifice ballot', 2)
('number destroy would america', 2)





In [10]:
# Dynamic narrative model over the same roles and known entities.
m = NarrativeModel(
    model_type='dynamic',
    roles_considered=['ARG0', 'B-V', 'B-ARGM-NEG', 'B-ARGM-MOD', 'ARG1', 'ARG2'],
    roles_with_entities=['ARG0', 'ARG1', 'ARG2'],
    list_of_known_entities=list(known_entities),
    assignment_to_known_entities='character_matching',
    roles_with_embeddings=[['ARG0', 'ARG1', 'ARG2']],
    threshold=1,
)

m.train(postproc_roles, progress_bar=True)
narratives = m.predict(postproc_roles, progress_bar=True, prettify=False)


def _is_filled(value):
    """Return True when a role value is neither missing nor an empty string."""
    return value not in ["", None]


# Keep narratives whose ARG0, B-V and ARG1 are all filled, then count them.
pretty_narratives = Counter(
    prettify(narrative)
    for narrative in narratives
    if _is_filled(narrative.get('ARG0'))
    and _is_filled(narrative.get('B-V'))
    and _is_filled(narrative.get('ARG1'))
)

# Print the ten most frequent narratives with their counts.
for narrative_count in pretty_narratives.most_common(10):
    print(narrative_count)



























































100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5169/5169 [00:07<00:00, 671.08it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5169/5169 [00:10<00:00, 497.85it/s]

('election have election', 2)
('nate simington have senate', 2)
('brann not allow would election', 2)
('election allow election', 2)
('steve have complete endorsement', 2)
('democrats steal election', 2)
('joe|joe biden|biden outsource election', 2)
('joe|joe biden|biden open election', 2)
('joe|joe biden|biden sacrifice second amendment', 2)
('biden want country', 2)





In [11]:
# To-do

# Add user-written functions for the preprocessor
# Add complete narratives filter
# Add option to save and load the SRL 

In [12]:
# Current differences with the previous wrapper

# handling of verbs (dimension_reduce_verbs)
# fit multiple kmeans models (n_clusters as a list of lists) --> Will see with Elliott and Philine
# document tracking (doc, sentence, statement, narrative)