### Settings and Libraries

In [1]:
import spacy
nlp = spacy.load("en_core_web_sm")

from collections import Counter
from sklearn.cluster import KMeans

import numpy as np 
from numpy.linalg import norm

from typing import Dict, List, NamedTuple, Optional, Tuple
import numpy as np
from copy import deepcopy

import gensim.downloader as api
model = api.load("glove-wiki-gigaword-300")

### All functions

In [None]:
blerg = {'ARGO':{'considered': True, 'embeddings': True, 'entities': True},
         'B-V':{'considered': True, 'embeddings': False, 'entities': False},
         'B-ARGM-MOD':{'considered': True, 'embeddings': False, 'entities': False},
         'B-ARGM-NEG':{'considered': True, 'embeddings': False, 'entities': False},
         'ARG1':{'considered': True, 'embeddings': True, 'entities': True},
         'ARG2':{'considered': True, 'embeddings': True, 'entities': True}}

In [None]:
params = {'roles_considered': ['ARGO', 'B-V', 'B-ARGM-NEG', 'B-ARGM-MOD', 'ARG1', 'ARG2'],
          'roles_with_embeddings': ['ARGO', 'ARG1', 'ARG2'],
          'roles_with_entities': ['ARGO', 'ARG1', 'ARG2']}

In [36]:
import sys
sys.path.append('../code')

# Utils
#..................................................................................................................
#..................................................................................................................

def split_into_sentences(
    docs: List[str]
) -> List[str]:
    
    """
    
    A function that splits a list of documents into sentences (using the SpaCy sentence splitter).
    
    Args:
        docs: list of docs
        
    Returns:
        List of sentences
        
    """
    
    sentences = []
    
    for doc in docs:
        temp = [str(i) for i in nlp(doc).sents]
        sentences = temp + sentences
    
    return sentences

from utils import preprocess

# Semantic Role Labeling
#..................................................................................................................
#..................................................................................................................

#link to choose the SRL model 
# https://storage.googleapis.com/allennlp-public-models/YOUR-PREFERRED-MODEL

# Would be nice to track the semantic role labeling's progress (given how long it takes) +
# code needs to be refactored (remove modals = True)

from semantic_role_labeling import SRL, extract_roles, postprocess_roles

# Named Entity Recognition
#..................................................................................................................
#..................................................................................................................

def mine_entities(
    sentences: List[str],
    ent_labels: Optional[List[str]] = ['PERSON', 'NORP', 'ORG', 'GPE', 'EVENT']
) -> List[Tuple[str, int]]:
    
    """
    
    A function that goes through sentences and counts named entities found in the corpus.
    
    Args:
        sentences: list of sentences
        ent_labels: list of entity labels to be considered (see SPaCy documentation)
        
    Returns:
        List of tuples with the named entity and its associated frequency on the corpus
    
    """

    entities_all = []

    for sentence in sentences:
        sentence = nlp(sentence)
        for ent in sentence.ents:
            if ent.label_ in ent_labels:
                entity = [ent.text]
                entities_all = entity + entities_all

    entities_all = preprocess(entities_all) 

    entity_counts = Counter(entities_all)
    entities_sorted = sorted(entity_counts.items(), key=lambda x: x[1], reverse=True)

    return entities_sorted

def pick_top_entities(
    entities_sorted: List[Tuple[str,int]],
    top_n: Optional[int] = 0
) -> List[str]:
   
    """
    
    A function that returns the top n most frequent named entities in the corpus.
    
    Args:
        entities_sorted: list of tuples (named_entity, frequency)
        top_n: number of named entities to keep (default is top 10% and is specified with top_n = 0)
        
    Returns:
        List of most frequent named entities
    
    """
    
    if top_n == 0:
        top_n = round(len(entities_sorted)/10)
    
    entities = []

    for entity in entities_sorted:
        entities = entities + [entity[0]]
    
    return entities[0:top_n]

def is_subsequence(
    v2: list, 
    v1: list
) -> bool:
    
    """
    
    Check whether v2 is a subsequence of v1.
    
    Args:
        v2/v1: lists of elements
        
    Returns:
        a boolean
    
    Example:
        >>> v1 = ['the', 'united', 'states', 'of', 'america']\n
        ... v2 = ['united', 'states', 'of', 'europe']\n
        ... is_subsequence(v2,v1)
        False
    
    """
    it = iter(v1)
    return all(c in it for c in v2) 

# Here, the roles matter. 

def map_entities(
    statements: List[dict],
    entities: list,
    entity_index: Optional[dict] = {},
    roles: Optional[List[str]] = ['ARGO', 'ARG1']
) -> Tuple[int, dict, List[dict]]:
    
    """
    
    A function that goes through statements and identifies pre-defined named entities within postprocessed semantic roles.
    
    Args:
        statements: list of dictionaries of postprocessed semantic roles
        entities: user-defined list of named entities 
        entity_index: a dictionary 
        roles: a list of roles with named entities (default = ARG0 and ARG1)
        
    Returns:
        entity_index: updated dictionary
        roles_copy: new list of postprocessed semantic roles (without the named entities mined since they will not be embedded)
    
    """
    
    # roles = params['roles_with_entities']
    
    if entity_index == {}:
        entity_index = {role:{entity:np.asarray([], dtype=int) for entity in entities} for role in roles}
    
    roles_copy = deepcopy(statements)
    
    for i, statement in enumerate(statements):
        for role, tokens in statements[i].items():
            if role in roles:
                for entity in entities:
                    if is_subsequence(entity.split(), tokens)  == True: 
                        entity_index[role][entity] = np.append(entity_index[role][entity], [i]) 
                        roles_copy[i][role] = []
    
    return entity_index, roles_copy

# Vectors and Clustering
#..................................................................................................................
#..................................................................................................................

def count_words(
    sentences: List[str]
) -> dict:
    
    """
    
    A function that computes word frequencies in a list of sentences.
    
    Args:
        sentences: list of sentences
        
    Returns:
        A dictionary {"word": frequency}
    
    """
    
    words = []
    
    for sentence in processed_sentences:
        words = words + sentence.split()

    word_count_dict = dict(Counter(words))
    
    return word_count_dict

def compute_sif_weights(
    word_count_dict: dict,
    alpha: Optional[float] = 0.001
) -> dict:
    
    """
    
    A function that computes SIF weights based on word frequencies.
    
    Args:
        word_count_dict: a dictionary {"word": frequency}
        alpha: regularization parameter (see original paper)
        
    Returns:
        A dictionary {"word": SIF weight}
    
    """
    
    sif_dict = {}
    
    for word, count in word_count_dict.items():
        sif_dict[word] = alpha / (alpha + count)

    return sif_dict


# Here, the roles matter.

from utils import get_role_counts

def get_vector(
    tokens: List[str],
    sif_dict: dict,
    normalize: Optional[bool] = True
):
    
    """
    
    A function that computes an embedding vector for a list of tokens.
    
    Args:
        sif_dict: a dictionary {"word": SIF weight}
        
    Returns:
        A two-dimensional numpy array (1,dimension of the embedding space)
    
    """
    
    if not tokens:
        res = None
    elif any(token not in sif_dict for token in tokens):
        res = None
    elif any(token not in model.vocab for token in tokens): 
        res = None 
    else:
        res = np.mean(
                [sif_dict[token] * model[token] for token in tokens], axis=0 
            )
        if normalize:
            res = res / norm(res)
        
        res = np.array([res])
        
    return res

def train_cluster_model(
    postproc_roles,
    sif_dict,
    n_clusters,
    random_state: Optional[int] = 0
):
    
    """
    
    A function to train a kmeans model on the corpus.
    
    Args:
        postproc_roles: list of statements
        sif_dict: a dictionary {"word": SIF weight}
        n_clusters: number of clusters
        random_state: seed for replication (default is 0)
        
    Returns:
        A sklearn kmeans model
    
    """
    
    role_counts = get_role_counts(postproc_roles, roles = ['ARGO', 'ARG1']) # params['roles_with_embeddings']
    role_counts = list(role_counts)
    role_counts = [role.split() for role in role_counts]

    vecs = None
    for role in role_counts:
        if vecs is None:
            vecs = get_vector(role, sif_dict)
        else:
            temp = get_vector(role, sif_dict)
            if temp is not None:
                vecs = np.concatenate((vecs, temp), axis=0)
            
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(vecs)
    
    return kmeans

# Here, the roles matter.

def get_clusters(
    postproc_roles: List[dict],
    sif_dict: dict,
    kmeans
) -> List[dict]:
    
    """
    
    A function which predicts clusters based on a pre-trained kmeans model.
    
    Args:
        postproc_roles: list of statements
        sif_dict: a dictionary {"word": SIF weight}
        kmeans = a pre-trained sklearn kmeans model
        
    Returns:
        A list of dictionaries with the predicted cluster for each role
    
    """

    clustering_res = []
    for statement in postproc_roles:
        temp = {}
        for role, tokens in statement.items():
            if role in ['ARGO', 'ARG1']: # params['roles_with_embeddings']
                vec = get_vector(tokens, sif_dict)
                if vec is not None:
                    clu = kmeans.predict(vec)
                    temp[role] = int(clu)
        clustering_res = clustering_res + [temp]

    return clustering_res

def label_clusters_most_freq(
    clustering_res: List[dict],
    postproc_roles: List[dict]
) -> dict:
    
    """
    
    A function which labels clusters by their most frequent term.
    
    Args:
        clustering_res: list of dictionaries with the predicted cluster for each role
        postproc_roles: list of statements
        
    Returns:
        A dictionary associating to each cluster number a label (e.g. the most frequent term in this cluster)
    
    """

    temp = {}
    labels = {}

    for i,statement in enumerate(clustering_res):
        for role, cluster in statement.items():
            tokens = ' '.join(postproc_roles_without_entities[i][role])
            cluster_num = cluster
            if cluster_num not in temp:
                temp[cluster_num] = [tokens]
            else:
                temp[cluster_num] = temp[cluster_num] + [tokens]

    for cluster_num, tokens in temp.items():
        token_counts = Counter(tokens)
        token_freq = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
        most_freq_token = token_freq[0][0]
        labels[cluster_num] = most_freq_token

    return labels

# Final Narratives
#..................................................................................................................
#..................................................................................................................

# Here, the roles matter. 

def get_narratives(
    postproc_roles: List[dict],
    clustering_res: List[dict],
    labels: dict
):
    
    """
    
    A wrapper function to obtain the final mined narratives.
    
    Args:
        postproc_roles: list of statements
        clustering_res: list of dictionaries with the predicted cluster for each role
        labels: dictionary associating to each cluster number a label (e.g. the most frequent term in this cluster)
        
    Returns:
        A list of dictionaries with the mined narratives.
    
    """

    
    final_statements = []
    
    for statement in postproc_roles:
        temp = {}
        for role, tokens in statement.items():
            name = role + '-RAW'
            if type(tokens)!=bool:
                temp[name] = ' '.join(tokens)
            else:
                temp[name] = tokens
        final_statements = final_statements + [temp]
    
    for i,statement in enumerate(clustering_res):
        for role, cluster in statement.items():
            final_statements[i][role] = labels[cluster]
            
    for role in ['ARGO', 'ARG1']: # params['roles_with_entities']
        for token, indices in entity_index[role].items():
            for index in indices:
                final_statements[index][role] = token
                
    return final_statements

### Pipeline in Action

In [3]:
import pandas as pd

df = pd.read_csv('trump_tweets.csv')
splitted_text = df['text'].str.split()
indices = [i for i,value in enumerate(splitted_text) if 'RT' not in value]
df = df.loc[indices]
df['text'] = df['text'].str.replace(r"http\S+", "")
df['text'] = df['text'].str.replace(r"@\S+", "")
df = df[df['text'].str.strip() != '']
df = pd.DataFrame(list(zip(df.index, df.text)), columns = ['id', 'doc'])

docs = list(df['doc'])

  import sys
  


In [4]:
sentences = split_into_sentences(docs[0:1000])

In [5]:
srl = SRL("../srl-model-2018.05.25.tar.gz")
srl_res = srl(sentences=sentences, batch_size = 20)

In [6]:
# process_srl

In [7]:
roles, sentence_index = extract_roles(srl_res, start = 0)

In [8]:
postproc_roles = postprocess_roles(roles)

In [9]:
# get_entities

In [10]:
entities_sorted = mine_entities(sentences)

In [11]:
entities = pick_top_entities(entities_sorted)

In [17]:
entity_index, postproc_roles_without_entities = map_entities(statements = postproc_roles,
                                                                          entities = entities)

In [None]:
# train cluster model

In [18]:
processed_sentences = preprocess(sentences)

In [19]:
word_count_dict = count_words(processed_sentences)

In [20]:
sif_dict = compute_sif_weights(word_count_dict)

In [37]:
kmeans = train_cluster_model(postproc_roles, sif_dict, n_clusters = 5)

100%|██████████| 4233/4233 [00:00<00:00, 260570.45it/s]


In [38]:
clustering_res = get_clusters(postproc_roles_without_entities, sif_dict, kmeans)

In [39]:
labels = label_clusters_most_freq(clustering_res=clustering_res, postproc_roles=postproc_roles_without_entities)

In [None]:
# get narratives

In [40]:
pd.DataFrame(get_narratives(postproc_roles, clustering_res, labels))

Unnamed: 0,ARGO-RAW,ARG1-RAW,B-V-RAW,ARG1,ARG2-RAW,ARGO,B-ARGM-MOD-RAW,B-ARGM-NEG-RAW
0,the rinos that,the state voting apparatus,run,state,,,,
1,the rinos that run the state voting apparatus,this problem of allowing the democrats to so b...,caused,election,us,state,,
2,,the democrats to so blatantly cheat in their a...,allowing,election,,,,
3,the democrats,in their attempt to steal the election which w...,cheat,election,,democrats,,
4,the,the election which we won overwhelmingly,steal,election,,the,,
...,...,...,...,...,...,...,...,...
4228,hardworking american patriots who,our laws,respect,the,,american,,
4229,hardworking american patriots who,america,put,america,,american,,
4230,,you,thank,i,for a wonderful evening,,,
4231,,,,,,,,


### Model Validation and Analysis

- To be discussed later on.
- Add inspect_label()
- Add plot_multgraph()
- Wrapper to determine the amount of dimension reduction required for clustering?