# An introduction to `relatio` 
**Runtime $\sim$ 5min**

Original paper: ["Text Semantics Capture Political and Economic Narratives"](https://arxiv.org/abs/2108.01720)

----------------------------

This is a short demo of the package `relatio`. It takes a text corpus as input and outputs a list of narrative statements. The pipeline is unsupervised: the user does not need to specify narratives beforehand. Narrative statements are defined as tuples of semantic roles with an (agent, verb, patient, attribute) structure.

Here, we present the main wrapper functions to quickly obtain narrative statements from a corpus.

----------------------------

In this tutorial, we work with tweets from the candidates in the 2022 French presidential election.

----------------------------

In [1]:
# Route messages at WARNING level and above through relatio's FileLogger so
# they do not clutter the notebook (NOTE(review): presumably written to a log
# file — confirm the destination in the relatio docs).
from relatio import FileLogger

logger = FileLogger(level='WARNING')

2022-04-25 11:10:16.698219: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-25 11:10:16.698239: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Show the datasets that ship with relatio: for each one, its description,
# language, SRL model and download links (see the output below).
from relatio import list_data
list_data()


{
    "trump_tweet_archive": 
    {
        "description": "Tweets from the Trump Tweet Archives (https://www.thetrumparchive.com/)",
        "language": "english",
        "srl_model": "allennlp v0.9 -- srl-model-2018.05.25.tar.gz",
        "links": 
        {
            "raw": "https://www.dropbox.com/s/lxqz454n29iqktn/trump_archive.csv?dl=1",
            "sentences": "https://www.dropbox.com/s/coh4ergyrjeolen/split_sentences.json?dl=1",
            "srl_res": "https://www.dropbox.com/s/54lloy84ka8mycp/srl_res.json?dl=1"
        }
    },
    "tweets_candidates_french_elections": 
    {
        "description": "Tweets of candidates at the French presidential elections (2022)",
        "language": "french",
        "srl_model": "",
        "links": 
        {
            "raw": "https://www.dropbox.com/s/qqlq8xn9x645f79/tweets_candidates_french_elections.csv?dl=1"
        }
    }
}



In [3]:
from relatio import load_data

# Fetch the raw tweet table and keep only the rows whose `candidate` column
# equals 'yjadot'; the rest of the tutorial works on this subset.
df = (
    load_data(dataset='tweets_candidates_french_elections', content='raw')
    .loc[lambda tweets: tweets['candidate'] == 'yjadot']
)
df.head()

Unnamed: 0,id,doc,date,candidate
29238,29237,"Hier, nous étions à #Rennes, place Hoche, pour...",2022-02-09T16:01:00.000Z,yjadot
29239,29238,Pensées à ses proches,2022-02-09T15:31:36.000Z,yjadot
29240,29239,"Un an déjà que Guillaume, militant communiste ...",2022-02-09T15:31:16.000Z,yjadot
29241,29240,Le #OneOceanSummit s'ouvre aujourd'hui à #Bres...,2022-02-09T15:03:00.000Z,yjadot
29242,29241,Pour revoir l'intégralité de mon passage sur @...,2022-02-09T14:00:03.000Z,yjadot


In [None]:
import string

from relatio import Preprocessor

# Stop words: every single lowercase ASCII letter plus 'rt'
# (presumably the retweet marker left over in tweet text).
alphabet_string = string.ascii_lowercase
alphabet_list = [*alphabet_string, 'rt']

# Text-cleaning configuration shared by all preprocessing steps below.
p = Preprocessor(
    spacy_model='fr_core_news_sm',
    remove_punctuation=True,
    remove_digits=True,
    lowercase=True,
    lemmatize=True,
    remove_chars=[
        "\"", '-', "^", ".", "?", "!", ";", "(", ")", ",", ":", "\'", "+", "&", "|", "/", "{", "}",
        "~", "_", "`", "[", "]", ">", "<", "=", "*", "%", "$", "@", "#", "’",
    ],
    stop_words=alphabet_list,
    n_process=-1,
    batch_size=100,
)

# Break the tweets into sentences; nothing is written to disk (output_path=None).
doc_index, sentences = p.split_into_sentences(
    df,
    output_path=None,
    progress_bar=True,
)

In [None]:
# Extract the semantic roles from every sentence, then peek at the first five.
sentence_index, roles = p.extract_svos(sentences, progress_bar=True)

for svo in roles[:5]:
    print(svo)

In [None]:
from relatio.utils import load_roles

# Clean the extracted roles, keeping only the listed part-of-speech tags for
# each role, and save the result to 'postproc_roles.json'.
postproc_roles = p.process_roles(
    roles,
    dict_of_pos_tags_to_keep={
        "ARG0": ['PRON', 'NOUN', 'PROPN'],
        "B-V": ['VERB'],
        "ARG1": ['NOUN', 'PROPN', 'PRON'],
    },
    max_length=50,
    progress_bar=True,
    output_path='postproc_roles.json',
)

# Read the saved file back, demonstrating how to resume from disk.
postproc_roles = load_roles('postproc_roles.json')

for d in postproc_roles[:5]:
    print(d)

In [None]:
from relatio.utils import load_entities

# Mine entities from the sentences and persist them to 'entities.pkl'.
known_entities = p.mine_entities(
    sentences,
    clean_entities=True,
    progress_bar=True,
    output_path='entities.pkl',
)

# Reload the saved entities, demonstrating how to resume from disk.
known_entities = load_entities('entities.pkl')

# Show the ten most frequent entities with their counts.
for n in known_entities.most_common(10):
    print(n)

In [None]:
# Names of the 100 most frequent mined entities, skipping the empty string.
top_known_entities = [
    entity
    for entity, _count in known_entities.most_common(100)
    if entity != ''
]

In [None]:
from relatio import Embeddings

# Embedding model backed by the same spaCy pipeline used for preprocessing;
# the corpus sentences are passed in as well.
nlp_model = Embeddings(
    "spaCy",
    "fr_core_news_sm",
    sentences=sentences,
)

In [None]:
from relatio import NarrativeModel
from relatio.utils import prettify
from collections import Counter

In [None]:
# Model 1: 'deterministic' narrative model. ARG0/ARG1 phrases are matched
# against the top known entities by character matching.
m1 = NarrativeModel(
    model_type='deterministic',
    roles_considered=['ARG0', 'B-V', 'B-ARGM-NEG', 'ARG1'],
    roles_with_known_entities=['ARG0', 'ARG1'],
    known_entities=top_known_entities,
    assignment_to_known_entities='character_matching',
    roles_with_unknown_entities=[['ARG0', 'ARG1']],
    embeddings_model=nlp_model,
    threshold=1,
)

m1.train(postproc_roles)

In [None]:
# Apply the deterministic model (m1) to the post-processed roles.
narratives = m1.predict(postproc_roles, progress_bar=True)

In [None]:
# Keep only statements where ARG0, B-V and ARG1 are all present, prettify
# them, and count how often each resulting narrative occurs.
pretty_narratives = Counter(
    prettify(statement)
    for statement in narratives
    if all(statement.get(role) is not None for role in ('ARG0', 'B-V', 'ARG1'))
)

# Ten most frequent narratives as (narrative, count) pairs.
for t in pretty_narratives.most_common(10):
    print(t)

In [None]:
# Model 2: 'static' narrative model with a fixed number of clusters (100)
# for the ARG0/ARG1 phrases that are not matched to a known entity.
m2 = NarrativeModel(
    model_type='static',
    roles_considered=['ARG0', 'B-V', 'B-ARGM-NEG', 'ARG1'],
    roles_with_known_entities=['ARG0', 'ARG1'],
    known_entities=top_known_entities,
    assignment_to_known_entities='character_matching',
    roles_with_unknown_entities=[['ARG0', 'ARG1']],
    n_clusters=[100],
    embeddings_model=nlp_model,
    threshold=0.3,
)

m2.train(postproc_roles, progress_bar=True)

In [None]:
# Choosing the number of clusters: fit K-means for several candidate sizes on
# the vectors m2 was trained on, then compare the fits with the elbow
# (inertia) criterion and the silhouette score.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# NOTE(review): this reads a private attribute of the narrative model;
# presumably the embeddings for the first group of
# roles_with_unknown_entities — confirm against relatio.
A = m2._model_obj.training_vectors[0]

print(A.shape)

myseed = 1234
candidate_K = [100, 250, 500, 1000, 1500, 2000]

# KMeans requires n_clusters <= n_samples and silhouette_score requires
# 2 <= n_labels <= n_samples - 1. Drop candidates the data cannot support
# instead of crashing halfway through the (slow) loop on a small corpus.
n_samples = A.shape[0]
K = [k for k in candidate_K if 2 <= k < n_samples]
skipped = [k for k in candidate_K if k not in K]
if skipped:
    print('Skipping cluster counts too large for %s samples: %s' % (n_samples, skipped))

kmeans_models = []
for num_clusters in K:
    print('Clustering started for %s clusters.' % num_clusters)
    kmeans = KMeans(n_clusters=num_clusters, random_state=myseed).fit(A)
    kmeans_models.append(kmeans)
    print('Clustering done for %s clusters.' % num_clusters)

# Elbow method: within-cluster sum of squares (inertia) per candidate size.
distortions = [model.inertia_ for model in kmeans_models]

fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(K, distortions, 'bx-')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Distortion')
ax.set_title('The Elbow Method')
plt.show()

# Silhouette method: higher scores indicate better-separated clusters.
silhouette_scores = []
for model in kmeans_models:
    print('Computing silhouette score...')
    silhouette_scores.append(silhouette_score(A, model.labels_, random_state=myseed))

fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(K, silhouette_scores, 'bx-')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Silhouette Score')
ax.set_title('The Silhouette Score Method')
plt.show()

In [None]:
# Apply the static model (m2); this replaces the earlier `narratives`.
narratives = m2.predict(postproc_roles, progress_bar=True)

In [None]:
# Keep only statements where ARG0, B-V and ARG1 are all present, prettify
# them, and count how often each resulting narrative occurs.
pretty_narratives = Counter(
    prettify(statement)
    for statement in narratives
    if all(statement.get(role) is not None for role in ('ARG0', 'B-V', 'ARG1'))
)

# Ten most frequent narratives as (narrative, count) pairs.
for t in pretty_narratives.most_common(10):
    print(t)

In [None]:
# Model 3: 'dynamic' narrative model. Unlike m2, no n_clusters is given here.
m3 = NarrativeModel(
    model_type='dynamic',
    roles_considered=['ARG0', 'B-V', 'B-ARGM-NEG', 'ARG1'],
    roles_with_known_entities=['ARG0', 'ARG1'],
    known_entities=top_known_entities,
    assignment_to_known_entities='character_matching',
    roles_with_unknown_entities=[['ARG0', 'ARG1']],
    embeddings_model=nlp_model,
    threshold=0.5,
)

m3.train(postproc_roles, progress_bar=True)

In [None]:
# Apply the dynamic model (m3); this replaces the earlier `narratives`.
narratives = m3.predict(postproc_roles, progress_bar=True)

In [None]:
# Keep only statements where ARG0, B-V and ARG1 are all present, prettify
# them, and count how often each resulting narrative occurs.
pretty_narratives = Counter(
    prettify(statement)
    for statement in narratives
    if all(statement.get(role) is not None for role in ('ARG0', 'B-V', 'ARG1'))
)

# Ten most frequent narratives as (narrative, count) pairs.
for t in pretty_narratives.most_common(10):
    print(t)

In [None]:
# Inspect the shape of the first entry of the dynamic model's unknown-entity
# vectors. NOTE(review): this reads a private attribute (_model_obj).
m3._model_obj.vectors_unknown_entities[0].shape

In [None]:
# Inspect the first entry of the dynamic model's unknown-entity vocabulary.
# NOTE(review): this reads a private attribute (_model_obj).
m3._model_obj.vocab_unknown_entities[0]

In [None]:
from relatio import build_graph, draw_graph

# Build a network from whatever `narratives` currently holds (the m3
# predictions when the cells are run in order).
G = build_graph(
    narratives,
    top_n=100,
    prune_network=True,
)

# Render the network to a standalone HTML file instead of inline output.
draw_graph(
    G,
    notebook=False,
    show_buttons=False,
    width="1600px",
    height="1000px",
    output_filename='example.html',
)