### Trump Tweets Archive

In [1]:
import pandas as pd

# Load the raw tweet archive; only its 'text' column is used below.
raw_tweets = pd.read_csv('trump_tweets.csv')

# Drop retweets: a tweet counts as a retweet when the standalone token 'RT' occurs in it.
# Missing texts are tokenised as empty lists so the membership test cannot raise on NaN.
splitted_text = raw_tweets['text'].fillna('').str.split()
indices = [i for i, value in enumerate(splitted_text) if 'RT' not in value]

# `indices` are positions produced by enumerate(), hence positional selection; the copy
# makes the column assignment below write to an independent frame instead of a slice.
df = raw_tweets.iloc[indices].copy()

# Strip URLs and @-mentions. regex=True is required: without it recent pandas versions
# treat the pattern as a literal string and nothing is removed.
df['text'] = (
    df['text']
    .fillna('')
    .str.replace(r"http\S+", "", regex=True)
    .str.replace(r"@\S+", "", regex=True)
)

# Discard tweets that are empty (or whitespace only) once links and mentions are gone.
df = df[df['text'].str.strip() != '']

# Keep the original row label as 'id' next to the cleaned text as 'doc'.
df = pd.DataFrame(list(zip(df.index, df['text'])), columns = ['id', 'doc'])

docs = list(df['doc'])

  import sys
  


### Settings and Libraries

In [4]:
import spacy
nlp = spacy.load("en_core_web_sm")

from collections import Counter
from sklearn.cluster import KMeans

import numpy as np 
from numpy.linalg import norm

from typing import Dict, List, NamedTuple, Optional, Tuple
import numpy as np
from copy import deepcopy

### Utils

- Change UsedRoles() for more flexibility regarding what is and isn't embeddable.
- Verbs should not be embeddable (by default).
- Should be able to group roles for the clustering.

In [5]:
import sys
sys.path.append('../code')

from utils import preprocess, UsedRoles

In [6]:
# Build the role configuration with its defaults. The displayed value below maps each
# role name to a boolean flag (NOTE(review): meaning of the flag is defined in utils — confirm).
used_roles=UsedRoles() 
used_roles

{'ARGO': True, 'ARG1': True, 'ARG2': False, 'B-V': True, 'B-ARGM-MOD': True, 'B-ARGM-NEG': True}

### Split into sentences

Switch to SpaCy splitter as default (see below).

In [7]:
# Sentence splitting with spaCy (meant to become the default splitter: good performance).
# Only the first 100 documents are processed. The sentences of each document are inserted
# in front of those already collected, so documents end up in reverse order while the
# sentences of one document keep their relative order.

sentences = []

for doc in docs[:100]:
    temp = [str(sent) for sent in nlp(doc).sents]
    sentences[:0] = temp

sentences

['A brilliant woman of courage!',
 'RINO Mitt Romney should read this.',
 'I’m sure, however, that he feels he got slaughtering by Obama “fair and square”.',
 'Pennsylvania Party Leadership votes are this week.',
 'I hope they pick very tough and smart fighters.',
 'We will WIN!!',
 'Nate Simington, a very smart and qualified individual, is having his Senate hearing today.',
 'Republicans will hopefully confirm him to the FCC ASAP!',
 'We need action NOW on this very important nomination!!\xa0  ',
 'Happy 245th Birthday to the  #HappyBirthdayMarines  ',
 'So much TRUTH!',
 ' There will be no lockdowns other than those done by certain Democrat governors!',
 'Just happened to have found another 4000 ballots from Fulton County.',
 'Here we go!',
 'Republicans, don’t let Andrew McCabe continue to get away with totally criminal activity.',
 'What he did should never be allowed to happen to our Country again.',
 'FIGHT FOR JUSTICE!',
 'Will go down much further.',
 'Weekend daytime even wors

### Run SRL

Works fine. No changes to be made.

In [None]:
# provide link to choose the SRL model 

In [8]:
from semantic_role_labeling import SRL
# Instantiate the labeler from a model archive given as a relative path
# (it must exist one directory above the notebook's working directory).
srl = SRL("../srl-model-2018.05.25.tar.gz")

In [9]:
# Label every sentence. batch_size is forwarded to the labeler
# (presumably the number of sentences per inference batch — confirm in semantic_role_labeling).
srl_res = srl(sentences=sentences, batch_size = 20)
# Per the output below: one dict per sentence with a 'verbs' list and the tokenised 'words'.
srl_res

[{'verbs': [], 'words': ['A', 'brilliant', 'woman', 'of', 'courage', '!']},
 {'verbs': [{'verb': 'should',
    'description': 'RINO Mitt Romney [V: should] read this .',
    'tags': ['O', 'O', 'O', 'B-V', 'O', 'O', 'O']},
   {'verb': 'read',
    'description': '[ARG0: RINO Mitt Romney] [ARGM-MOD: should] [V: read] [ARG1: this] .',
    'tags': ['B-ARG0',
     'I-ARG0',
     'I-ARG0',
     'B-ARGM-MOD',
     'B-V',
     'B-ARG1',
     'O']}],
  'words': ['RINO', 'Mitt', 'Romney', 'should', 'read', 'this', '.']},
 {'verbs': [{'verb': '’m',
    'description': '[ARG1: I] [V: ’m] [ARG2: sure , however , that he feels he got slaughtering by Obama “ fair and square ”] .',
    'tags': ['B-ARG1',
     'B-V',
     'B-ARG2',
     'I-ARG2',
     'I-ARG2',
     'I-ARG2',
     'I-ARG2',
     'I-ARG2',
     'I-ARG2',
     'I-ARG2',
     'I-ARG2',
     'I-ARG2',
     'I-ARG2',
     'I-ARG2',
     'I-ARG2',
     'I-ARG2',
     'I-ARG2',
     'I-ARG2',
     'I-ARG2',
     'O']},
   {'verb': 'feels',
    

### Process SRL

Works fine. No changes to be made.

In [10]:
from semantic_role_labeling import extract_roles, postprocess_roles

In [11]:
# extract_roles returns two values: the extracted roles and a sentence index.
# Per the output below, `roles` holds dicts mapping a role name to its list of tokens
# (a sentence with several verbs yields several dicts; a sentence without verbs yields {}).
# NOTE(review): `start = 0` looks like the offset used for numbering — confirm in semantic_role_labeling.
roles,sentence_index = extract_roles(srl_res, start = 0)
roles

[{},
 {'B-ARGM-MOD': ['should'],
  'ARGO': ['RINO', 'Mitt', 'Romney'],
  'ARG1': ['this'],
  'B-V': ['read']},
 {'ARG1': ['I'],
  'ARG2': ['sure',
   ',',
   'however',
   ',',
   'that',
   'he',
   'feels',
   'he',
   'got',
   'slaughtering',
   'by',
   'Obama',
   '“',
   'fair',
   'and',
   'square',
   '”'],
  'B-V': ['’m']},
 {'ARGO': ['he'],
  'ARG1': ['he',
   'got',
   'slaughtering',
   'by',
   'Obama',
   '“',
   'fair',
   'and',
   'square',
   '”'],
  'B-V': ['feels']},
 {'ARGO': ['he'],
  'ARG1': ['slaughtering', 'by', 'Obama', '“', 'fair', 'and', 'square'],
  'B-V': ['got']},
 {'ARGO': ['by', 'Obama'], 'B-V': ['slaughtering']},
 {'ARG1': ['Pennsylvania', 'Party', 'Leadership', 'votes'],
  'ARG2': ['this', 'week'],
  'B-V': ['are']},
 {'ARGO': ['I'],
  'ARG1': ['they', 'pick', 'very', 'tough', 'and', 'smart', 'fighters'],
  'B-V': ['hope']},
 {'ARGO': ['they'],
  'ARG1': ['very', 'tough', 'and', 'smart', 'fighters'],
  'B-V': ['pick']},
 {'ARGO': ['We'], 'ARG2': ['W

In [12]:
# Post-process the extracted roles. In the output below the tokens are lower-cased and
# comma tokens are gone; other rules applied by postprocess_roles are not visible here.
postproc_roles = postprocess_roles(roles)
postproc_roles

[{},
 {'B-ARGM-MOD': ['should'],
  'ARGO': ['rino', 'mitt', 'romney'],
  'ARG1': ['this'],
  'B-V': ['read']},
 {'ARG1': ['i'],
  'ARG2': ['sure',
   'however',
   'that',
   'he',
   'feels',
   'he',
   'got',
   'slaughtering',
   'by',
   'obama',
   '“',
   'fair',
   'and',
   'square',
   '”'],
  'B-V': ['’m']},
 {'ARGO': ['he'],
  'ARG1': ['he',
   'got',
   'slaughtering',
   'by',
   'obama',
   '“',
   'fair',
   'and',
   'square',
   '”'],
  'B-V': ['feels']},
 {'ARGO': ['he'],
  'ARG1': ['slaughtering', 'by', 'obama', '“', 'fair', 'and', 'square'],
  'B-V': ['got']},
 {'ARGO': ['by', 'obama'], 'B-V': ['slaughtering']},
 {'ARG1': ['pennsylvania', 'party', 'leadership', 'votes'],
  'ARG2': ['this', 'week'],
  'B-V': ['are']},
 {'ARGO': ['i'],
  'ARG1': ['they', 'pick', 'very', 'tough', 'and', 'smart', 'fighters'],
  'B-V': ['hope']},
 {'ARGO': ['they'],
  'ARG1': ['very', 'tough', 'and', 'smart', 'fighters'],
  'B-V': ['pick']},
 {'ARGO': ['we'], 'ARG2': ['win'], 'B-V': ['w

### Get Named Entities

New feature to examine.

In [13]:
# spaCy entity types that are kept as named entities.
ent_labels = ['PERSON', 'NORP', 'ORG', 'GPE', 'EVENT']

entities_all = []

for sentence in sentences:
    parsed = nlp(sentence)
    kept = [ent.text for ent in parsed.ents if ent.label_ in ent_labels]
    # Every hit goes to the front of the list, one at a time, so the hits of a
    # sentence are inserted in reverse and the whole list is in reverse discovery order.
    entities_all[:0] = kept[::-1]

# Apply the same preprocessing as for the rest of the corpus.
entities_all = preprocess(entities_all)

# Frequency table, ordered from most to least frequent (ties keep first-seen order).
entity_counts = Counter(entities_all)
entities_sorted = entity_counts.most_common()

entities_sorted[:10] # sorted by frequency

[('democrats', 6),
 ('republicans', 6),
 ('georgia', 6),
 ('wisconsin', 5),
 ('ohio', 5),
 ('vaccine', 5),
 ('country', 4),
 ('pennsylvania', 4),
 ('ballots', 3),
 ('joe biden', 3)]

In [14]:
top_n = 10

# Entity names only, still ordered by decreasing frequency.
entities = [pair[0] for pair in entities_sorted]

# Preview of the most frequent names; the full list stays in `entities`.
entities[:top_n]

['democrats',
 'republicans',
 'georgia',
 'wisconsin',
 'ohio',
 'vaccine',
 'country',
 'pennsylvania',
 'ballots',
 'joe biden']

In [15]:
def is_subsequence(
    v2: list, 
    v1: list
) -> bool:
    """
    Tell whether the elements of v2 occur in v1 in the same order.

    The match is order-preserving but not necessarily contiguous: other
    elements of v1 may sit between two matched elements.

    Args:
        v2: candidate subsequence
        v1: sequence that is searched

    Returns:
        True when every element of v2 is found in v1, each one after the
        previous match; False otherwise. An empty v2 always yields True.

    Example:
        >>> v1 = ['the', 'united', 'states', 'of', 'america']
        >>> v2 = ['united', 'states', 'of', 'europe']
        >>> is_subsequence(v2, v1)
        False
    """
    remaining = iter(v1)
    for wanted in v2:
        # The membership test consumes `remaining` up to and including the match,
        # so the next element of v2 can only be found further along v1.
        if wanted not in remaining:
            return False
    return True


def mine_entities(
    statements: List[dict],
    entities: list,
    roles_index: Optional[int] = 0,
    entity_index: Optional[dict] = None,
    roles: Optional[List[str]] = None
) -> Tuple[int, dict, List[dict]]:
    """
    Find pre-defined named entities inside postprocessed semantic roles.

    For every statement and every role listed in `roles`, each entity whose
    tokens appear (in order) among the role's tokens is recorded in
    `entity_index`, and that role is emptied in the returned copy of the
    statements. The function can be called batch after batch by feeding the
    returned `roles_index` and `entity_index` into the next call.

    Args:
        statements: list of dictionaries of postprocessed semantic roles
            (role name -> list of tokens)
        entities: user-defined list of named entities (whitespace-separated strings)
        roles_index: global position of the first statement of this batch
            (None is treated as 0)
        entity_index: mapping role -> entity -> array of statement positions
            from previous batches; None or an empty dict starts a new one.
            A non-empty mapping is updated in place.
        roles: roles in which entities are searched
            (default: ['ARGO', 'ARG1'], the role keys used by this project)

    Returns:
        roles_index: position following the last statement of this batch,
            i.e. the incoming roles_index plus len(statements)
        entity_index: updated mapping
        roles_copy: deep copy of `statements` where every role containing a
            mined entity is replaced by an empty list (those roles are not embedded)
    """
    if roles is None:
        roles = ['ARGO', 'ARG1']
    if roles_index is None:
        roles_index = 0
    if not entity_index:
        entity_index = {role: {entity: np.asarray([], dtype=int) for entity in entities} for role in roles}

    # Tokenise every entity once. An entity without tokens would trivially be a
    # subsequence of any role and blank out everything, so such entries are skipped.
    entity_tokens = [(entity, entity.split()) for entity in entities]
    entity_tokens = [(entity, words) for entity, words in entity_tokens if words]

    roles_copy = deepcopy(statements)

    for i, statement in enumerate(statements):
        for role, tokens in statement.items():
            if role not in roles:
                continue
            for entity, words in entity_tokens:
                if is_subsequence(words, tokens):
                    # Tolerate an index from earlier batches that lacks this role or entity.
                    per_role = entity_index.setdefault(role, {})
                    hits = per_role.get(entity, np.asarray([], dtype=int))
                    per_role[entity] = np.append(hits, [i + roles_index])
                    roles_copy[i][role] = []

    # Advance the running offset; returning len(statements) alone would restart
    # the numbering at every batch after the first.
    roles_index = roles_index + len(statements)

    return roles_index, entity_index, roles_copy

In [16]:
# Start with an empty index so mine_entities builds a new one.
entity_index = {}

# The complete `entities` list is searched, starting the statement numbering at 0.
roles_index, entity_index, postproc_roles_without_entities = mine_entities(
    statements=postproc_roles,
    entities=entities,
    roles_index=0,
    entity_index=entity_index,
)

postproc_roles_without_entities[:10]

[{},
 {'B-ARGM-MOD': ['should'], 'ARGO': [], 'ARG1': ['this'], 'B-V': ['read']},
 {'ARG1': ['i'],
  'ARG2': ['sure',
   'however',
   'that',
   'he',
   'feels',
   'he',
   'got',
   'slaughtering',
   'by',
   'obama',
   '“',
   'fair',
   'and',
   'square',
   '”'],
  'B-V': ['’m']},
 {'ARGO': ['he'], 'ARG1': [], 'B-V': ['feels']},
 {'ARGO': ['he'], 'ARG1': [], 'B-V': ['got']},
 {'ARGO': [], 'B-V': ['slaughtering']},
 {'ARG1': [], 'ARG2': ['this', 'week'], 'B-V': ['are']},
 {'ARGO': ['i'],
  'ARG1': ['they', 'pick', 'very', 'tough', 'and', 'smart', 'fighters'],
  'B-V': ['hope']},
 {'ARGO': ['they'],
  'ARG1': ['very', 'tough', 'and', 'smart', 'fighters'],
  'B-V': ['pick']},
 {'ARGO': [], 'ARG2': ['win'], 'B-V': ['will']}]

In [17]:
entity_index

{'ARGO': {'democrats': array([111, 135, 202, 203, 343, 359]),
  'republicans': array([ 12,  51, 296, 359]),
  'georgia': array([190]),
  'wisconsin': array([312]),
  'ohio': array([], dtype=int64),
  'vaccine': array([], dtype=int64),
  'country': array([137]),
  'pennsylvania': array([305]),
  'ballots': array([], dtype=int64),
  'joe biden': array([], dtype=int64),
  'joe': array([], dtype=int64),
  'senate': array([], dtype=int64),
  'democrat': array([18]),
  'voter fraud': array([], dtype=int64),
  'the republican party': array([111]),
  'biden': array([ 70, 241, 269]),
  'win': array([], dtype=int64),
  'abcwapo': array([], dtype=int64),
  'dc': array([], dtype=int64),
  'republican': array([111, 171]),
  'washington': array([], dtype=int64),
  'trump': array([116]),
  'eastern': array([], dtype=int64),
  'american patriots': array([352, 353, 354, 355, 356]),
  'north carolina': array([], dtype=int64),
  'charlotte': array([], dtype=int64),
  'ballots amp': array([], dtype=int64)

In [18]:
roles_index

360

### Get Vectors

Lots of little things to modify:
- Use gensim as default library to read and train embeddings.
- The user has three options: use default pre-trained embeddings (Keyed Vectors), train their own embeddings (Full Gensim Model) or use Universal Sentence Encoders.
- Pre-trained vectors can easily be downloaded and should be our default setting (see below)
- SIF weights should be computed based on token frequencies in the training sample (see below)
- Need to account for the special case when a token is not in the pre-trained embedding (see in loop below the extra if statement)
- Vectors are very demanding in RAM/memory. Perhaps in future versions we should stream them from disk (mmap).

In [19]:
import gensim.downloader as api
# Load the pre-trained vectors registered under this name in gensim's downloader
# (per the name: GloVe trained on Wikipedia + Gigaword, 300 dimensions).
model = api.load("glove-wiki-gigaword-300")

In [20]:
processed_sentences = preprocess(sentences)

# All whitespace-separated tokens of the preprocessed sentences, in corpus order.
words = [token for sentence in processed_sentences for token in sentence.split()]

# Plain dict mapping each token to its number of occurrences.
word_count_dict = dict(Counter(words))

In [21]:
# Smoothing constant of the SIF weighting.
alpha = 0.001

# Weight of a token: alpha / (alpha + raw count), so frequent tokens weigh less.
sif_dict = {word: alpha / (alpha + count) for word, count in word_count_dict.items()}

In [22]:
statements = postproc_roles_without_entities # Only consider roles for which there wasn't a named entity

normalize = True  # scale every role vector to unit length
start = 0         # number given to the first statement

embed_roles = used_roles.embeddable
not_embed_roles = used_roles.not_embeddable

# Per embeddable role:
#   statements_index         - statement numbers for which a vector was computed
#   roles_vectors            - the vectors, aligned with statements_index
#   not_found_or_empty_index - statement numbers whose role could not be embedded
statements_index = {el: [] for el in embed_roles}
roles_vectors = {el: [] for el in embed_roles}
not_found_or_empty_index = {el: [] for el in embed_roles}

for i, statement in enumerate(statements, start=start):
    for role_name, tokens in statement.items():
        if role_name not in embed_roles or role_name in not_embed_roles:
            continue

        # A role is embeddable only if it is non-empty and every token has both a SIF
        # weight and a vector. Membership is tested on the vectors object itself rather
        # than on its `.vocab` attribute, which no longer exists in gensim 4.
        embeddable = bool(tokens) and all(
            token in word_count_dict and token in model for token in tokens
        )

        res = None
        if embeddable:
            # SIF-weighted average of the token vectors.
            res = np.mean(
                [sif_dict[token] * model[token] for token in tokens], axis=0
            )
            if normalize:
                length = norm(res)
                # A zero (or NaN) norm cannot be normalised; treat the role as not embeddable
                # instead of storing a vector full of NaNs.
                res = res / length if length > 0 else None

        if res is None:
            not_found_or_empty_index[role_name].append(i)
            continue

        statements_index[role_name].append(i)
        roles_vectors[role_name].append(res)

# Freeze the collected lists into arrays.
for role_name in embed_roles:
    roles_vectors[role_name] = np.asarray(
        roles_vectors[role_name], dtype=np.float32
    )
    for el in [statements_index, not_found_or_empty_index]:
        el[role_name] = np.asarray(el[role_name], dtype=np.uint32)

### Get Clusters

Things to change:
- By default, verbs are not clustered.
- By default, agents and patients are clustered together.

In [23]:
from clustering import Clustering
# random_state=0 fixes the seed of the k-means initialisation; n_init = 1 runs a single
# initialisation. The number of clusters is not set here but on the Clustering wrapper.
kmeans=KMeans(random_state=0, n_init = 1)

In [24]:
# Cluster counts per role: 5 for 'ARGO', 5 for 'ARG1' and a single cluster for 'B-V',
# which puts every verb in the same cluster.
clustering = Clustering(cluster=kmeans,n_clusters={'ARGO':5, 'ARG1': 5, 'B-V':1}, used_roles=used_roles)

In [25]:
# Fit on `roles_vectors`, the dict of per-role vector arrays.
clustering.fit(vectors=roles_vectors)

In [26]:
# Assign a cluster to every vector. Per the output below, the result maps each role
# to a uint8 array of cluster ids.
clustering_res = clustering.predict(vectors=roles_vectors)
clustering_res

{'ARGO': array([4, 4, 2, 1, 4, 1, 2, 1, 1, 2, 2, 0, 2, 1, 4, 4, 4, 1, 0, 1, 0, 0,
        1, 3, 3, 1, 1, 4, 2, 2, 2, 1, 0, 1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1,
        0, 2, 1, 2, 1, 4, 2, 2, 1, 1, 2, 3, 1], dtype=uint8),
 'ARG1': array([2, 0, 4, 4, 1, 1, 4, 1, 1, 0, 1, 3, 2, 1, 3, 0, 4, 4, 3, 3, 1, 0,
        1, 3, 2, 3, 4, 4, 4, 3, 2, 2, 0, 4, 1, 4, 4, 3, 1, 3, 4, 2, 0, 2,
        2, 0, 0, 2, 4, 4, 1, 1, 1, 4, 4, 0, 2, 3, 1, 4, 4, 3, 3, 3, 3, 2,
        0, 4, 1, 3, 1, 3, 3, 2, 2, 4, 1, 1, 0, 1, 2, 1, 4, 1, 4, 1, 0, 1,
        1, 2, 2, 4, 0, 4, 0, 0, 4, 2, 2, 1, 3, 0, 2, 1, 0, 1, 1, 2, 0, 1,
        1, 0, 1, 3, 4, 2, 0, 1, 1, 3, 1, 3, 0, 2, 0, 1, 0, 3, 0, 2, 1, 0,
        0, 4, 4, 2, 2, 1, 0, 0, 1, 2, 1, 0, 0, 0, 1, 1, 2, 0, 1],
       dtype=uint8),
 'B-V': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 

In [27]:
from clustering import label_clusters_most_freq

# Name every cluster. Per the output below, the result maps role -> cluster id ->
# [label, count], and when several candidates share the top count the first one is picked.
labels = label_clusters_most_freq(clustering_res=clustering_res, postproc_roles=postproc_roles_without_entities, statement_index=statements_index, clustering_mask=True)
labels

  labels[ARGO][0]=[('dems', 1), ('dominion_machines', 1)]. First one is picked.
  labels[ARGO][4]=[('he', 3), ('who', 3)]. First one is picked.
  labels[ARG1][1]=[('that', 2), ('us', 2)]. First one is picked.
  labels[ARG1][3]=[('the_election', 2), ('election', 2)]. First one is picked.


{'ARGO': {0: ['dems', 1],
  1: ['they', 13],
  2: ['i', 11],
  3: ['the_vice_president', 2],
  4: ['he', 3]},
 'ARG1': {0: ['you', 16],
  1: ['that', 2],
  2: ['it', 14],
  3: ['the_election', 2],
  4: ['they', 9]},
 'B-V': {0: ['is', 13]}}

### Get Narratives

- Currently a mishmash of small functions and manipulations to get what we want. 
- We need to make a clean wrapper.

In [33]:
def display_label(x, labels, arg):
    """
    Look up the label of a cluster.

    Args:
        x: cluster identifier
        labels: mapping role -> cluster identifier -> sequence whose first item is the label
        arg: role whose labels are consulted

    Returns:
        The first item stored for cluster `x` under role `arg`, or pd.NA when
        `x` is not a known cluster of that role.
    """
    role_labels = labels[arg]
    if x not in role_labels:
        return pd.NA
    return role_labels[x][0]

def clean_list(l):
    """
    Turn a list of strings into one space-separated string.

    Any value that is not a list is handed back unchanged.
    """
    return " ".join(l) if isinstance(l, list) else l

In [34]:
from cooccurrence import build_df

# Assemble one row per statement. Per the output below, the 'ARGO', 'ARG1' and 'B-V'
# columns hold cluster ids while 'B-ARGM-MOD' and 'B-ARGM-NEG' hold the role content.
# NOTE: this rebinds `df`, which previously held the tweet DataFrame.
df = build_df(
    clustering_res=clustering_res,
    postproc_roles=postproc_roles_without_entities,
    statement_index=statements_index,
    used_roles=used_roles,
    clustering_mask=True
    )

df

Unnamed: 0,ARGO,ARG1,B-V,B-ARGM-MOD,B-ARGM-NEG
0,,,,,
1,,2,0,should,
2,,0,,,
3,4,,0,,
4,4,,0,,
...,...,...,...,...,...
355,,1,0,,
356,,2,0,,
357,,0,0,,
358,,,,,


In [30]:
# First verb token of every statement; '' when the statement has no 'B-V' role or an
# empty one. The comprehension keeps its loop variable local, so the global `roles`
# list returned by extract_roles is no longer overwritten by the last statement.
verbs = [(statement.get('B-V') or [''])[0] for statement in postproc_roles]

roles_considered = ['ARGO', 'ARG1']

# Replace cluster ids by their cluster label (pd.NA when there is no cluster).
for role in roles_considered:
    df[role] = df[role].apply(lambda x: display_label(x, labels, role))

# Verbs are shown as the raw verb token rather than a cluster label.
df['B-V'] = verbs

# np.nan is used instead of the alias np.NaN, which was removed in NumPy 2.0.
df = df.replace({np.nan: ''})

# Where a named entity was mined for a role, the entity name takes precedence.
for role in roles_considered:
    for key, value in entity_index[role].items():
        df.loc[value, role] = key

In [31]:
# Tabulate the postprocessed roles (before entity mining): one row per statement,
# one column per role, each cell holding the role's token list.
temp = pd.DataFrame(postproc_roles)
temp

Unnamed: 0,B-ARGM-MOD,ARGO,ARG1,B-V,ARG2,B-ARGM-NEG
0,,,,,,
1,[should],"[rino, mitt, romney]",[this],[read],,
2,,,[i],[’m],"[sure, however, that, he, feels, he, got, slau...",
3,,[he],"[he, got, slaughtering, by, obama, “, fair, an...",[feels],,
4,,[he],"[slaughtering, by, obama, “, fair, and, square]",[got],,
...,...,...,...,...,...,...
355,,"[hardworking, american, patriots, who]","[our, laws]",[respect],,
356,,"[hardworking, american, patriots, who]",[america],[put],,
357,,,[you],[thank],"[for, a, wonderful, evening]",
358,,,,,,


In [32]:
# Keep the raw agent/patient token lists. The explicit copy makes the assignments
# below write to an independent frame, which avoids SettingWithCopyWarning.
temp = temp[['ARGO', 'ARG1']].copy()

# Token lists become plain space-separated strings.
temp['ARGO'] = temp['ARGO'].apply(clean_list)
temp['ARG1'] = temp['ARG1'].apply(clean_list)
temp.columns = ['ARGO-RAW', 'ARG1-RAW']

# np.nan is used instead of the alias np.NaN, which was removed in NumPy 2.0.
temp = temp.replace({np.nan: ''})

# Put the labelled roles and the raw text side by side, matching rows on the index.
final_df = df.merge(temp, left_index=True, right_index=True)
final_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,ARGO,ARG1,B-V,B-ARGM-MOD,B-ARGM-NEG,ARGO-RAW,ARG1-RAW
0,,,,,,,
1,mitt romney,it,read,should,,rino mitt romney,this
2,,you,’m,,,,i
3,he,obama,feels,,,he,he got slaughtering by obama “ fair and square ”
4,he,obama,got,,,he,slaughtering by obama “ fair and square
...,...,...,...,...,...,...,...
355,american patriots,that,respect,,,hardworking american patriots who,our laws
356,american patriots,it,put,,,hardworking american patriots who,america
357,,you,thank,,,,you
358,,,,,,,


### Model Validation and Analysis

- To be discussed later on.
- Add inspect_label()
- Add plot_multgraph()
- Wrapper to determine the amount of dimension reduction required for clustering?