### Settings and Libraries

In [1]:
import spacy
nlp = spacy.load("en_core_web_sm")

from collections import Counter
from sklearn.cluster import KMeans

import numpy as np 
from numpy.linalg import norm

from typing import Dict, List, NamedTuple, Optional, Tuple, Union
import numpy as np
from copy import deepcopy

import gensim.downloader as api
#model = api.load("glove-wiki-gigaword-300")

### All functions

In [2]:
# Pipeline configuration: which semantic roles are kept, which of them are
# embedded/clustered, and which of them are scanned for named entities.
UsedRoles = dict(
    roles_considered=['ARGO', 'B-V', 'B-ARGM-NEG', 'B-ARGM-MOD', 'ARG1', 'ARG2'],
    roles_with_embeddings=['ARGO', 'ARG1', 'ARG2'],
    roles_with_entities=['ARGO', 'ARG1', 'ARG2'],
)

In [23]:
import sys
sys.path.append('../code')


# Utils
#..................................................................................................................
#..................................................................................................................


def split_into_sentences(
    docs: List[str]
) -> Tuple[List[int], List[str]]:
    
    """
    
    A function that splits a list of documents into sentences (using the SpaCy sentence splitter).
    
    Args:
        docs: list of docs
        
    Returns:
        doc_indices: for each sentence, the position in `docs` of the document it was taken from
        sentences: list of sentences (as strings), in the order they were encountered
        
    """
    
    sentences = []
    doc_indices = []
    
    for index, doc in enumerate(docs):
        for sent in nlp(doc).sents:
            # Append in place: `lst = lst + [x]` copies the whole list for every sentence.
            sentences.append(str(sent))
            doc_indices.append(index)
    
    return doc_indices, sentences


from utils import preprocess


# Semantic Role Labeling
#..................................................................................................................
#..................................................................................................................

# link to choose the SRL model 
# https://storage.googleapis.com/allennlp-public-models/YOUR-PREFERRED-MODEL

# Would be nice to track the semantic role labeling's progress (given how long it takes) +
# code needs to be refactored (remove modals = True)

# Require of the user a model path and the text data, that's all.

from semantic_role_labeling import SRL, extract_roles, extract_role_per_sentence, postprocess_roles


# Named Entity Recognition
#..................................................................................................................
#..................................................................................................................


def mine_entities(
    sentences: List[str],
    ent_labels: Optional[List[str]] = None
) -> List[Tuple[str, int]]:
    
    """
    
    A function that goes through sentences and counts named entities found in the corpus.
    
    Args:
        sentences: list of sentences
        ent_labels: list of entity labels to be considered (see SpaCy documentation).
            None (the default) selects ['PERSON', 'NORP', 'ORG', 'GPE', 'EVENT'].
        
    Returns:
        List of tuples with the named entity and its associated frequency on the corpus,
        sorted by decreasing frequency
    
    """

    if ent_labels is None:
        # Built per call so that no mutable list is shared between calls.
        ent_labels = ['PERSON', 'NORP', 'ORG', 'GPE', 'EVENT']

    entities_all = []

    for sentence in sentences:
        for ent in nlp(sentence).ents:
            if ent.label_ in ent_labels:
                entities_all.append(ent.text)

    # The previous implementation prepended each entity, i.e. it produced the
    # reverse of encounter order. Keep that order (it decides how ties are
    # ranked below) but build it in linear time.
    entities_all.reverse()

    entities_all = preprocess(entities_all) 

    entity_counts = Counter(entities_all)
    # sorted() is stable, so equally frequent entities keep their relative order.
    entities_sorted = sorted(entity_counts.items(), key=lambda x: x[1], reverse=True)

    return entities_sorted


def pick_top_entities(
    entities_sorted: List[Tuple[str,int]],
    top_n: Optional[int] = 0
) -> List[str]:
   
    """
    
    Keep the names of the first `top_n` entries of a frequency-sorted entity list.
    
    Args:
        entities_sorted: list of tuples (named_entity, frequency), most frequent first
        top_n: how many names to keep; 0 (the default) means one tenth of the
            list length, rounded with Python's built-in round()
        
    Returns:
        List with the names of the selected named entities
    
    """
    
    names = [pair[0] for pair in entities_sorted]

    cutoff = round(len(entities_sorted)/10) if top_n == 0 else top_n

    return names[:cutoff]


def is_subsequence(
    v2: list, 
    v1: list
) -> bool:
    
    """
    
    Tell whether the elements of v2 all occur in v1 in the same order
    (they do not have to be adjacent in v1).
    
    Args:
        v2: candidate subsequence
        v1: sequence that is searched
        
    Returns:
        True if v2 is a subsequence of v1, False otherwise
    
    Example:
        >>> v1 = ['the', 'united', 'states', 'of', 'america']
        >>> v2 = ['united', 'states', 'of', 'europe']
        >>> is_subsequence(v2, v1)
        False
    
    """
    # A single iterator over v1 is shared by all lookups: every membership test
    # consumes it up to the match, so the next element of v2 has to be found
    # further along in v1.
    remaining = iter(v1)
    for element in v2:
        if element not in remaining:
            return False
    return True


def map_entities(
    statements: List[dict],
    entities: list,
    UsedRoles: dict
) -> Tuple[dict, List[dict]]:
    
    """
    
    A function that goes through statements and identifies pre-defined named entities within postprocessed semantic roles.
    
    Args:
        statements: list of dictionaries of postprocessed semantic roles
        entities: user-defined list of named entities (strings; split on whitespace for matching)
        UsedRoles: dict with the specifics of the pipeline for each role
            (only UsedRoles['roles_with_entities'] is read here)
        
    Returns:
        entity_index: dictionary {role: {entity: integer numpy array of the indices of the
            statements in which the entity was found for that role}}
        roles_copy: deep copy of `statements` in which every role that matched an entity is
            replaced by an empty list (these roles will not be embedded)
    
    """
    
    roles = UsedRoles['roles_with_entities']
    
    # Tokenize every entity once instead of once per statement and role.
    entity_tokens = [(entity, entity.split()) for entity in entities]
    
    # Collect matches in plain lists; np.append copies the array on every call.
    matches = {role: {entity: [] for entity in entities} for role in roles}
    
    roles_copy = deepcopy(statements)
    
    for i, statement in enumerate(statements):
        for role, tokens in statement.items():
            if role not in roles:
                continue
            for entity, pieces in entity_tokens:
                # An entity without tokens (e.g. an empty string) is a subsequence
                # of everything and would blank out every role, so it is skipped.
                if pieces and is_subsequence(pieces, tokens):
                    matches[role][entity].append(i)
                    roles_copy[i][role] = []
    
    entity_index = {
        role: {entity: np.asarray(found, dtype=int) for entity, found in per_role.items()}
        for role, per_role in matches.items()
    }
    
    return entity_index, roles_copy


# Vectors and Clustering
#..................................................................................................................
#..................................................................................................................

def count_words(
    sentences: List[str]
) -> dict:
    
    """
    
    A function that computes word frequencies in a list of sentences.
    
    Words are obtained by splitting each sentence on whitespace.
    
    Args:
        sentences: list of sentences
        
    Returns:
        A dictionary {"word": frequency}
    
    """
    
    word_counts = Counter()
    
    for sentence in sentences:
        # Update the counter sentence by sentence instead of first building one
        # list of all words by repeated concatenation (quadratic in corpus size).
        word_counts.update(sentence.split())

    word_count_dict = dict(word_counts)
    
    return word_count_dict


def compute_sif_weights(
    word_count_dict: dict,
    alpha: Optional[float] = 0.001
) -> dict:
    
    """
    
    Turn word frequencies into SIF weights, alpha / (alpha + frequency).
    
    Args:
        word_count_dict: a dictionary {"word": frequency}
        alpha: regularization parameter (see original paper)
        
    Returns:
        A dictionary {"word": SIF weight}, with the same keys as the input
    
    """
    
    return {
        word: alpha / (alpha + count)
        for word, count in word_count_dict.items()
    }


class USE:
    """
    
    Embedding model wrapper: joins tokens into one string and embeds it with
    the object loaded by `hub.load(path)`.
    
    NOTE(review): `hub` is not imported in the visible cells of this notebook
    (presumably `tensorflow_hub` -- confirm), so the constructor needs that
    name to be provided elsewhere.
    
    """

    def __init__(self, path: str):
        encoder = hub.load(path)
        self._embed = encoder

    def __call__(self, tokens: List[str]) -> np.ndarray:
        # The encoder is called with a batch holding a single sentence; the
        # first (only) row of the result is returned.
        sentence = " ".join(tokens)
        batch = self._embed([sentence])
        return batch.numpy()[0]


class SIF_word2vec:
    """
    
    SIF-weighted average of word vectors taken from a full gensim Word2Vec model.
    
    Args:
        path: path handed to `Word2Vec.load`
        sentences: corpus (list of sentences) used to compute the word frequencies
            behind the SIF weights; it has to be provided
        alpha: regularization parameter of the SIF weights
        normalize: whether the averaged vector is divided by its norm
    
    Raises:
        ValueError: if `sentences` is not provided
    
    """

    def __init__(
        self, path: str, sentences: Optional[List[str]] = None, alpha: Optional[float] = 0.001, normalize: bool = True
    ):

        # `sentences` used to be declared as `sentences = List[str]`, which made
        # the typing object the default *value*; omitting it is now reported clearly.
        if sentences is None:
            raise ValueError("`sentences` is required to compute the SIF weights")

        # Word2Vec is not imported at notebook level, so import it where it is used.
        from gensim.models import Word2Vec

        self._model = Word2Vec.load(path)

        self._word_count_dict = count_words(sentences)
        
        self._sif_dict = compute_sif_weights(self._word_count_dict, alpha)
        
        # gensim >= 4 exposes the vocabulary as `key_to_index`; older releases as `vocab`.
        wv = self._model.wv
        self._vocab = wv.key_to_index if hasattr(wv, "key_to_index") else wv.vocab

        self._normalize = normalize

    def __call__(self, tokens: List[str]):
        # Mean over tokens of (SIF weight * word vector). Every token has to be
        # present in both the SIF dictionary and the model (get_vector checks this).
        res = np.mean(
            [self._sif_dict[token] * self._model.wv[token] for token in tokens], axis=0
        )
        if self._normalize:
            res = res / norm(res)  
        return res
    

class SIF_keyed_vectors:
    """
    
    SIF-weighted average of word vectors taken from gensim keyed vectors
    obtained through `gensim.downloader`.
    
    Args:
        path: name handed to `api.load` (e.g. "glove-wiki-gigaword-300")
        sentences: corpus (list of sentences) used to compute the word frequencies
            behind the SIF weights; it has to be provided
        alpha: regularization parameter of the SIF weights
        normalize: whether the averaged vector is divided by its norm
    
    Raises:
        ValueError: if `sentences` is not provided
    
    """

    def __init__(
        self, path: str, sentences: Optional[List[str]] = None, alpha: Optional[float] = 0.001, normalize: bool = True
    ):

        # `sentences` used to be declared as `sentences = List[str]`, which made
        # the typing object the default *value*; omitting it is now reported clearly.
        if sentences is None:
            raise ValueError("`sentences` is required to compute the SIF weights")

        self._model = api.load(path)

        self._word_count_dict = count_words(sentences)
        
        self._sif_dict = compute_sif_weights(self._word_count_dict, alpha)
        
        # gensim >= 4 exposes the vocabulary as `key_to_index`; older releases as `vocab`.
        if hasattr(self._model, "key_to_index"):
            self._vocab = self._model.key_to_index
        else:
            self._vocab = self._model.vocab

        self._normalize = normalize

    def __call__(self, tokens: List[str]):
        # Mean over tokens of (SIF weight * word vector). Every token has to be
        # present in both the SIF dictionary and the model (get_vector checks this).
        res = np.mean(
            [self._sif_dict[token] * self._model[token] for token in tokens], axis=0
        )
        if self._normalize:
            res = res / norm(res)  
        return res


def get_vector(
    tokens: List[str],
    model: Union[USE, SIF_word2vec, SIF_keyed_vectors]
):
    
    """
    
    Embed a list of tokens with one of the supported embedding models.
    
    Args:
        tokens: list of string tokens to embed
        model: trained embedding model 
        (e.g. either Universal Sentence Encoders, a full gensim Word2Vec model or gensim Keyed Vectors)
        
    Returns:
        A two-dimensional numpy array (1,dimension of the embedding space).
        For the two SIF models, None is returned instead when `tokens` is empty
        or when a token is missing from the SIF weights or from the model vocabulary.
    
    Raises:
        TypeError: if `model` is not an instance of one of the three supported classes
    
    """
    
    if not isinstance(model, (USE, SIF_word2vec, SIF_keyed_vectors)):
        raise TypeError("Union[USE, SIF_Word2Vec, SIF_keyed_vectors]")
    
    # Only the SIF models look tokens up in dictionaries, so only they need
    # the emptiness / coverage checks before embedding.
    needs_lookup_checks = isinstance(model, (SIF_word2vec, SIF_keyed_vectors))
    if needs_lookup_checks:
        if not tokens:
            return None
        if not all(token in model._sif_dict for token in tokens):
            return None
        if not all(token in model._vocab for token in tokens):
            return None
    
    embedding = model(tokens)
    return np.array([embedding])


from utils import get_role_counts


def train_cluster_model(
    postproc_roles,
    model: Union[USE, SIF_word2vec, SIF_keyed_vectors],
    n_clusters,
    UsedRoles: Optional[dict] = None,
    random_state: Optional[int] = 0
):
    
    """
    
    A function to train a kmeans model on the corpus.
    
    Args:
        postproc_roles: list of statements
        model: trained embedding model 
        (e.g. either Universal Sentence Encoders, a full gensim Word2Vec model or gensim Keyed Vectors)
        n_clusters: number of clusters
        UsedRoles: dict with the specifics of the pipeline for each role
            (only UsedRoles['roles_with_embeddings'] is read here); it has to be provided
        random_state: seed for replication (default is 0)
        
    Returns:
        A sklearn kmeans model
    
    Raises:
        ValueError: if `UsedRoles` is not provided, or if none of the roles could be embedded
    
    """
    
    # `UsedRoles` used to be declared as `UsedRoles = dict`, which made the
    # built-in type the default *value*; omitting it is now reported clearly.
    if UsedRoles is None:
        raise ValueError("`UsedRoles` is required (it must contain 'roles_with_embeddings')")
    
    roles = UsedRoles['roles_with_embeddings']
    
    role_counts = get_role_counts(postproc_roles, roles = roles) 

    # Each entry of role_counts is a whitespace-joined role; split it back into tokens.
    role_tokens = [role.split() for role in list(role_counts)]

    # Gather the (1, dim) vectors in a list and stack them once; growing an
    # array with np.concatenate inside the loop copies it at every step.
    vecs = []
    for tokens in role_tokens:
        vec = get_vector(tokens, model)
        if vec is not None:
            vecs.append(vec)
    
    if not vecs:
        raise ValueError("None of the roles could be embedded, so there is nothing to cluster")
        
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(np.concatenate(vecs, axis=0))
    
    return kmeans


def get_clusters(
    postproc_roles: List[dict],
    model: Union[USE, SIF_word2vec, SIF_keyed_vectors],
    kmeans,
    UsedRoles: Optional[dict] = None
) -> List[dict]:
    
    """
    
    A function which predicts clusters based on a pre-trained kmeans model.
    
    Args:
        postproc_roles: list of statements
        model: trained embedding model 
        (e.g. either Universal Sentence Encoders, a full gensim Word2Vec model or gensim Keyed Vectors)
        kmeans: a pre-trained sklearn kmeans model
        UsedRoles: dict with the specifics of the pipeline for each role
            (only UsedRoles['roles_with_embeddings'] is read here); it has to be provided
        
    Returns:
        A list (one dictionary per statement) with the predicted cluster for each role
        that could be embedded; roles without a vector are left out
    
    Raises:
        ValueError: if `UsedRoles` is not provided
    
    """

    # `UsedRoles` used to be declared as `UsedRoles = dict`, which made the
    # built-in type the default *value*; omitting it is now reported clearly.
    if UsedRoles is None:
        raise ValueError("`UsedRoles` is required (it must contain 'roles_with_embeddings')")

    roles = UsedRoles['roles_with_embeddings']
    
    clustering_res = []
    
    for statement in postproc_roles:
        temp = {}
        for role, tokens in statement.items():
            if role in roles:
                vec = get_vector(tokens, model)
                if vec is not None:
                    clu = kmeans.predict(vec)
                    # predict() is given one row, so take the single label
                    # explicitly; int() on a non-0-d array is deprecated in NumPy.
                    temp[role] = int(clu[0])
        clustering_res.append(temp)

    return clustering_res


def label_clusters_most_freq(
    clustering_res: List[dict],
    postproc_roles: List[dict]
) -> dict:
    
    """
    
    Name every cluster after the phrase that was assigned to it most often.
    
    Args:
        clustering_res: list of dictionaries with the predicted cluster for each role
        postproc_roles: list of statements (same order as clustering_res); the tokens of
            a role are joined with a space to form its phrase
        
    Returns:
        A dictionary {cluster number: most frequent phrase in this cluster}; among equally
        frequent phrases the one encountered first is used
    
    """

    # Gather, per cluster, every phrase that was assigned to it.
    phrases_by_cluster = {}
    for position, assignment in enumerate(clustering_res):
        for role, cluster in assignment.items():
            phrase = ' '.join(postproc_roles[position][role])
            phrases_by_cluster.setdefault(cluster, []).append(phrase)

    # max() returns the first maximal item, i.e. the earliest phrase among ties.
    return {
        cluster: max(Counter(phrases).items(), key=lambda item: item[1])[0]
        for cluster, phrases in phrases_by_cluster.items()
    }


# Wrappers
#..................................................................................................................
#..................................................................................................................


def build_narrative_model(
    srl_res: List[dict],
    sentences: List[str],
    embeddings_type: Optional[str] = None,
    embeddings_path: Optional[str] = None,
    UsedRoles: Optional[dict] = None,
    n_clusters: int = 5
):
    
    """
    
    A wrapper function that fits all the pieces of the narrative model on a corpus:
    the list of top named entities, the embeddings model, the kmeans model and the cluster labels.
    
    Args:
        srl_res: sentences labeled with their semantic roles 
        sentences: list of sentences (used to mine named entities and, once preprocessed,
            to compute word frequencies for the SIF embeddings)
        embeddings_type: one of 'gensim_keyed_vectors', 'gensim_full_model', 'USE'
        embeddings_path: path (or name) handed to the chosen embeddings class
        UsedRoles: dict with the specifics of the pipeline for each role; it has to be provided
        n_clusters: number of kmeans clusters (default is 5)
        
    Returns:
        A dictionary with the keys 'entities', 'embeddings_model', 'cluster_model',
        'cluster_labels' and 'UsedRoles'
    
    Raises:
        TypeError: if embeddings_type is not one of the three accepted values
        ValueError: if UsedRoles is not provided
    
    """
    
    # The parameters above used to be declared as `name = str` / `name = dict`,
    # which made the types themselves the default *values*.
    if embeddings_type not in ['gensim_keyed_vectors', 'gensim_full_model', 'USE']:
        raise TypeError("Only three types of embeddings accepted: gensim_keyed_vectors, gensim_full_model, USE")

    if UsedRoles is None:
        raise ValueError("`UsedRoles` is required")

    # Process SRL
    print('Processing srl output...')
    
    roles, sentence_index = extract_roles(srl_res, start = 0)
    postproc_roles = postprocess_roles(roles)
    
    # Named Entities
    print('Processing named entities...')

    entities_sorted = mine_entities(sentences)
    entities = pick_top_entities(entities_sorted)
    entity_index, postproc_roles_without_entities = map_entities(statements = postproc_roles,
                                                                 entities = entities,
                                                                 UsedRoles = UsedRoles)
    
    # Embeddings and clustering
    print('Loading embeddings model...')
    sentences = preprocess(sentences)
    
    # embeddings_type was validated above, so exactly one branch applies.
    if embeddings_type == 'gensim_keyed_vectors':
        model = SIF_keyed_vectors(path = embeddings_path, sentences = sentences)
    elif embeddings_type == 'gensim_full_model':
        model = SIF_word2vec(path = embeddings_path, sentences = sentences)
    else:
        model = USE(path = embeddings_path)
    
    print('Clustering remaining arguments...')
    kmeans = train_cluster_model(postproc_roles_without_entities, 
                                 model, 
                                 n_clusters = n_clusters, 
                                 UsedRoles=UsedRoles)
    
    clustering_res = get_clusters(postproc_roles_without_entities, 
                                  model, 
                                  kmeans, 
                                  UsedRoles=UsedRoles)
    
    labels = label_clusters_most_freq(clustering_res=clustering_res, 
                                      postproc_roles=postproc_roles_without_entities)
    
    # Wrap up 
    narrative_model = {}
    narrative_model['entities'] = entities
    narrative_model['embeddings_model'] = model
    narrative_model['cluster_model'] = kmeans
    narrative_model['cluster_labels'] = labels
    narrative_model['UsedRoles'] = UsedRoles
                    
    return narrative_model
 
    
def get_narratives(
    srl_res: List[dict],
    doc_index: List[int],
    narrative_model: dict
):
    
    """
    
    A wrapper function to obtain the final mined narratives.
    
    Args:
        srl_res: sentences labeled with their semantic roles 
        doc_index: for each sentence, the index of the document it comes from
            (as returned by split_into_sentences)
        narrative_model: dictionary returned by build_narrative_model (the keys 'entities',
            'embeddings_model', 'cluster_model', 'cluster_labels' and 'UsedRoles' are read)
        
    Returns:
        A list of dictionaries (one per statement) with the mined narratives: the raw text of
        every role under '<role>-RAW', the cluster label or named entity under '<role>',
        and the 'sentence' and 'doc' the statement comes from.
    
    """

    final_statements = []    
    
    # Process SRL
    print('Processing srl output...')
    roles, sentence_index = extract_roles(srl_res)
    postproc_roles = postprocess_roles(roles)
    
    # Named Entities
    print('Processing named entities...')
    entity_index, postproc_roles_without_entities = map_entities(statements = postproc_roles,
                                                                 entities = narrative_model['entities'],
                                                                 UsedRoles = narrative_model['UsedRoles'])
    
    # Embeddings
    print('Clustering remaining arguments...')
    clustering_res = get_clusters(postproc_roles_without_entities, 
                                  narrative_model['embeddings_model'], 
                                  narrative_model['cluster_model'], 
                                  UsedRoles=narrative_model['UsedRoles'])
        
    # Raw statements
    for statement in postproc_roles:
        temp = {}
        for role, tokens in statement.items():
            name = role + '-RAW'
            # Boolean roles are stored as they are; token lists are joined into text.
            if not isinstance(tokens, bool):
                temp[name] = ' '.join(tokens)
            else:
                temp[name] = tokens
        final_statements.append(temp)
    
    # Clusters
    for i,statement in enumerate(clustering_res):
        for role, cluster in statement.items():
            final_statements[i][role] = narrative_model['cluster_labels'][cluster]
       
    # Named entities (written after the clusters, so they take precedence)
    for role in narrative_model['UsedRoles']['roles_with_entities']: 
        for token, indices in entity_index[role].items():
            for index in indices:
                final_statements[index][role] = token
      
    # Original sentence and document.
    # A sentence can yield several statements, so the document has to be looked up
    # through the statement's sentence; using the statement position directly
    # attributes statements to the wrong document.
    # NOTE(review): assumes sentence_index[i] is the 0-based position of the
    # statement's sentence in the list doc_index was built for -- confirm
    # against extract_roles.
    for i, sentence_position in enumerate(sentence_index):
        final_statements[i]['sentence'] = sentence_position
        final_statements[i]['doc'] = doc_index[sentence_position]
                
    return final_statements

### Pipeline in Action

In [4]:
import pandas as pd

df = pd.read_csv('trump_tweets.csv')

# Keep tweets that are not retweets (no standalone 'RT' token). Rows without
# text split to NaN rather than a list and are dropped as well.
splitted_text = df['text'].str.split()
indices = [i for i,value in enumerate(splitted_text) if isinstance(value, list) and 'RT' not in value]
# `indices` are positions, hence iloc; copy() so the assignments below act on
# an independent frame rather than on a slice of the original one.
df = df.iloc[indices].copy()

# Remove links and @mentions. regex=True is required: since pandas 2.0 the
# pattern is otherwise treated as a literal string.
df['text'] = df['text'].str.replace(r"http\S+", "", regex=True)
df['text'] = df['text'].str.replace(r"@\S+", "", regex=True)
df = df[df['text'].str.strip() != '']
df = pd.DataFrame(list(zip(df.index, df.text)), columns = ['id', 'doc'])

docs = list(df['doc'])

  import sys
  


In [5]:
# Split the first 100 documents into sentences; doc_index holds, for each
# sentence, the position of its document within this slice.
doc_index, sentences = split_into_sentences(docs[0:100])

In [6]:
# Load the semantic role labeling model from a local archive and label the
# sentences in batches of 20.
srl = SRL("../srl-model-2018.05.25.tar.gz")
srl_res = srl(sentences=sentences, batch_size = 20)

In [24]:
# Fit the narrative model (top entities, embeddings, kmeans, cluster labels)
# using the GloVe keyed vectors fetched through gensim's downloader.
narrative_model = build_narrative_model(srl_res = srl_res, 
                                        embeddings_type = "gensim_keyed_vectors",
                                        embeddings_path = "glove-wiki-gigaword-300",
                                        sentences = sentences, 
                                        UsedRoles = UsedRoles)

Processing srl output...
Processing named entities...
Loading embeddings model...


100%|██████████| 359/359 [00:00<00:00, 984153.68it/s]

Clustering remaining arguments...





In [25]:
# Apply the fitted narrative model to the same SRL output; the cell displays
# the full list of mined statements.
get_narratives(srl_res = srl_res, doc_index = doc_index, narrative_model = narrative_model)

Processing srl output...
Processing named entities...
Clustering remaining arguments...


[{'ARGO-RAW': 'republicans and democrats',
  'ARG1-RAW': 'our economic problems',
  'B-V-RAW': 'created',
  'ARG1': 'us',
  'ARGO': 'democrats',
  'sentence': 0,
  'doc': 0},
 {'ARG1-RAW': 'i',
  'ARG2-RAW': 'thrilled to be back in the great city of charlotte north carolina with thousands of hardworking american patriots who love our country cherish our values respect our laws and always put america first',
  'B-V-RAW': 'was',
  'ARG1': 'we',
  'ARG2': 'country',
  'sentence': 1,
  'doc': 1},
 {'ARG1-RAW': 'i',
  'ARG2-RAW': 'to be back in the great city of charlotte north carolina with thousands of hardworking american patriots who love our country cherish our values respect our laws and always put america first',
  'B-V-RAW': 'thrilled',
  'ARG1': 'we',
  'ARG2': 'country',
  'sentence': 1,
  'doc': 1},
 {'ARG1-RAW': 'i',
  'ARG2-RAW': 'back in the great city of charlotte north carolina',
  'B-V-RAW': 'be',
  'ARG1': 'we',
  'ARG2': 'hardworking american patriots who',
  'sentence': 