In [1]:
# Browse list of available datasets

from relatio.datasets import list_datasets

print(list_datasets())

# Load an available dataset

from relatio.datasets import load_trump_data

df = load_trump_data("raw")

# Split into sentences (example on 100 tweets)

from relatio.utils import split_into_sentences

split_sentences = split_into_sentences(
    df.iloc[0:100], output_path=None, progress_bar=True
)

# As sentence splitting and SRL is time-consuming, we download the results from the datasets module.

split_sentences = load_trump_data("split_sentences")
srl_res = load_trump_data("srl_res")


    List of available datasets:

    Trump Tweet Archive
    - function call: load_trump_data()
    - format: 'raw', 'split_sentences', 'srl_res'
    - allennlp version: 0.9
    - srl model: srl-model-2018.05.25.tar.gz
    
Splitting into sentences...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 367.28it/s]


In [2]:
# Process SRL 

from relatio.preprocessing import *

p = Preprocessor(spacy_model = "en_core_web_lg")

roles, sentence_index = p.extract_roles(
    srl_res[0:1000], 
    used_roles = ["ARG0","B-V","B-ARGM-NEG","B-ARGM-MOD","ARG1","ARG2"],
    progress_bar = True
)

print(roles[0:5])

postproc_roles = p.process_roles(roles, 
                                 remove_punctuation = True,
                                 remove_digits = True,
                                 lowercase = True,
                                 lemmatize = True,
                                 stop_words = [],
                                 dict_of_pos_tags_to_keep = {"ARG0": ['NOUN', 'PROPN'], 
                                                     "ARG1": ['NOUN', 'PROPN'], 
                                                     "ARG2": ['NOUN', 'PROPN']}, 
                                 progress_bar = True)

print(postproc_roles[0:5])

named_entities = p.mine_entities(split_sentences[1][0:1000], progress_bar = True)
print(named_entities.most_common(10))

Extracting semantic roles...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 32963.46it/s]


[{'B-V': 'have'}, {'ARG0': 'Republicans and Democrats', 'ARG1': 'our economic problems', 'B-V': 'created'}, {'ARG1': 'I', 'ARG2': 'thrilled to be back in the Great city of Charlotte , North Carolina with thousands of hardworking American Patriots who love our Country , cherish our values , respect our laws , and always put AMERICA FIRST', 'B-V': 'was'}, {'ARG1': 'I', 'ARG2': 'to be back in the Great city of Charlotte , North Carolina with thousands of hardworking American Patriots who love our Country , cherish our values , respect our laws , and always put AMERICA FIRST', 'B-V': 'thrilled'}, {'ARG1': 'I', 'ARG2': 'back in the Great city of Charlotte , North Carolina', 'B-V': 'be'}]
Cleaning semantic roles...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2088/2088 [00:12<00:00, 167.03it/s]


[{'B-V': 'have'}, {'ARG0': 'republicans democrats', 'ARG1': 'problem', 'B-V': 'create'}, {'ARG1': '', 'ARG2': 'city charlotte north carolina thousand patriots country value law america', 'B-V': 'be'}, {'ARG1': '', 'ARG2': 'city charlotte north carolina thousand patriots country value law america', 'B-V': 'thrill'}, {'ARG1': '', 'ARG2': 'city charlotte north carolina', 'B-V': 'be'}]
Mining named entities...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:04<00:00, 246.68it/s]

[('Pennsylvania', 23), ('Biden', 20), ('BreitbartNews', 19), ('Republicans', 17), ('Joe Biden', 17), ('Georgia', 17), ('Trump', 16), ('Democrats', 13), ('Wisconsin', 10), ('Arizona', 10)]





In [3]:
known_entities = [e[0].lower() for e in list(named_entities.most_common(100))]

In [4]:
# Build the narrative model
# This will take several minutes to run. You might want to grab a coffee.

from relatio.narrative_models import *

m = NarrativeModel(model_type = 'static',
                   roles_considered = ['ARG0', 'B-V', 'B-ARGM-NEG', 'ARGM-MOD', 'ARG1', 'ARG2'],
                   roles_with_entities = ['ARG0','ARG1','ARG2'],
                   list_of_known_entities = known_entities,
                   assignment_to_known_entities = 'embeddings',
                   roles_with_embeddings = [['ARG0','ARG1','ARG2']],
                   threshold = 1,
                   n_clusters = [10])    

In [5]:
m.train(postproc_roles[0:1000], progress_bar = True, verbose = 0)

Focus on roles: ARG0-ARG1-ARG2
Ignoring known entities...
Embedding relevant phrases...
Clustering phrases into 10 clusters...
Labeling the clusters by the most frequent phrases...




In [8]:
m.predict(postproc_roles[0:1000], progress_bar = True, prettify = True)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:02<00:00, 435.23it/s]


['have',
 'democrats create carney',
 'be mike pence',
 'thrill mike pence',
 'be north carolina',
 'foxnews hardworking',
 'foxnews love state',
 'foxnews cherish mark',
 'foxnews respect court',
 'foxnews put america',
 'thank state',
 'be witch hunt witch hunt',
 'democrats know',
 'carney use carney',
 'count carney',
 'have',
 'end clark county state',
 'be clark county state',
 'miss carney',
 'get adamlaxalt',
 'thank',
 'elise',
 '',
 'be',
 'turn clark county',
 'be clark county',
 'anticipate',
 'be',
 'impact clark county',
 'stay',
 'tune',
 'm',
 'run senate',
 '',
 'go',
 'get',
 'not be state',
 'will',
 'china own',
 '',
 'have',
 'sudan agree clark county',
 's mike pence',
 'have',
 'do',
 'will',
 'follow',
 'thank',
 '',
 'be',
 'turn joe biden',
 'be nevada witch hunt',
 'be',
 'adamlaxalt find carney',
 'release',
 'will',
 'be carney',
 'be',
 'wisconsin look',
 'need carney',
 'will',
 'happen',
 '',
 'be',
 'show dominion',
 'hate dominion',
 'laced dominion',
