In [1]:
from pipeline import *
stop_words = list(stopwords.words('english'))

In [2]:
# Load the Trump tweets data

df = pd.read_csv('TrumpTweets/trump_tweets_cleaned.csv')
df = df[['id', 'text_clean']]
df.columns = ['id', 'doc']

In [3]:
# Split into sentences (example on 100 trump tweets)

split_sentences = split_into_sentences(df.iloc[0:100], 
                                       save_to_disk = None, 
                                       progress_bar = True)

Splitting into sentences...


100%|██████████| 100/100 [00:00<00:00, 101.73it/s]


In [4]:
# Run SRL (example on a 100 trump tweets)

srl_res = run_srl(path = "../srl-model-2018.05.25.tar.gz",
                  sentences=split_sentences[1],
                  save_to_disk = None,
                  batch_size = 20, 
                  progress_bar = True)



Running SRL...


100%|██████████| 11/11 [01:16<00:00,  6.96s/it]


In [5]:
# Build narrative_model based on srl_res, embeddings and entities (example on a 100 Trump tweets).
# Full modularity for clustering. List of lists: arguments in each sublist are clustered together.
# Default: we cluster all arguments together.

narrative_model = build_narrative_model(srl_res = srl_res,
                                        sentences = split_sentences[1], # list of sentences
                                        roles_considered = ['ARGO', 'B-V', 'B-ARGM-NEG', 'B-ARGM-MOD', 'ARG1', 'ARG2'],
                                        roles_with_embeddings = [['ARGO','ARG1', 'ARG2']],
                                        embeddings_type = 'gensim_keyed_vectors',
                                        embeddings_path = 'glove-wiki-gigaword-300',
                                        n_clusters = [2],
                                        verbose = 0,
                                        roles_with_entities = ['ARGO', 'ARG1', 'ARG2'],
                                        top_n_entities = 10,
                                        dimension_reduce_verbs = True,
                                        save_to_disk = None,
                                        max_length = 4,
                                        remove_punctuation = True,
                                        remove_digits = True,
                                        remove_chars = '',
                                        stop_words = stop_words,
                                        lowercase = True,
                                        strip = True,
                                        remove_whitespaces = True,
                                        lemmatize = True,
                                        stem = False,
                                        tags_to_keep = None,
                                        remove_n_letter_words = 1, 
                                        progress_bar = True)     

Processing SRL...


100%|██████████| 220/220 [00:00<00:00, 7485.33it/s]


Cleaning SRL...


100%|██████████| 439/439 [00:01<00:00, 261.14it/s]


Computing role frequencies...


100%|██████████| 439/439 [00:00<00:00, 354778.32it/s]


Mining named entities...


100%|██████████| 220/220 [00:01<00:00, 150.07it/s]


Mapping named entities...


100%|██████████| 439/439 [00:00<00:00, 32702.82it/s]


In [6]:
# Get narratives based on the narrative_model and srl_res.

final_statements = get_narratives(srl_res = srl_res,
                                  doc_index = split_sentences[0], # doc names
                                  narrative_model = narrative_model,
                                  save_to_disk = None,
                                  cluster_labeling = 'most_frequent',
                                  progress_bar = True)

final_statements

Processing SRL...


100%|██████████| 220/220 [00:00<00:00, 6973.60it/s]


Cleaning SRL...


100%|██████████| 439/439 [00:00<00:00, 1644.09it/s]


Processing raw arguments...


100%|██████████| 439/439 [00:00<00:00, 325490.45it/s]


Cleaning verbs...


100%|██████████| 439/439 [00:00<00:00, 2382.30it/s]


Mapping named entities...


100%|██████████| 439/439 [00:00<00:00, 19474.35it/s]


Assigning clusters to roles...


100%|██████████| 439/439 [00:00<00:00, 3717.84it/s]


Unnamed: 0,doc,sentence,statement,narrative-CLEANED,narrative-RAW,ARGO-RAW,ARGO,B-V-RAW,B-V-CLEANED,B-ARGM-NEG-RAW,B-ARGM-NEG-CLEANED,B-ARGM-MOD-RAW,ARG1-RAW,ARG1,ARG2-RAW,ARG2
1,98454970654916608,0,1,election make hardworking american patriot,republican democrat create economic problem,republican democrat,election,create,make,,,,economic problem,hardworking american patriot,,
6,1234653427789070336,1,6,hardworking american patriot love country,hardworking american patriot love country,hardworking american patriot,hardworking american patriot,love,love,,,,country,country,,
7,1234653427789070336,1,7,hardworking american patriot cherish hardworki...,hardworking american patriot cherish value,hardworking american patriot,hardworking american patriot,cherish,cherish,,,,value,hardworking american patriot,,
8,1234653427789070336,1,8,hardworking american patriot respect hardworki...,hardworking american patriot respect law,hardworking american patriot,hardworking american patriot,respect,respect,,,,law,hardworking american patriot,,
9,1234653427789070336,1,9,hardworking american patriot put hardworking a...,hardworking american patriot put america first,hardworking american patriot,hardworking american patriot,put,put,,,,america first,hardworking american patriot,,
13,1304875170860015617,4,13,hardworking american patriot use hardworking a...,almost recent election use system,almost recent election,hardworking american patriot,use,use,,,,system,hardworking american patriot,,
41,1319683876046934016,17,41,hardworking american patriot agree hardworking...,sudan agree peace normalization agreement israel,sudan,hardworking american patriot,agree,agree,,,,peace normalization agreement israel,hardworking american patriot,,
53,1325889532840062976,23,53,get hardworking american patriot,adamlaxalt find thing release absolutely shock,adamlaxalt,,find,get,,,,thing release absolutely shock,hardworking american patriot,,
58,1325891490636320768,24,58,wisconsin see hardworking american patriot,wisconsin look good,wisconsin,wisconsin,look,see,,,,good,hardworking american patriot,,
108,1317063921375842305,49,108,election see hardworking american patriot,poll number look strong,poll number,election,look,see,,,,strong,hardworking american patriot,,


In [7]:
df_sents = pd.DataFrame({'doc': split_sentences[0], 'sentence': split_sentences[1]})

In [8]:
inspect_label(final_statements, 
              label = 'election', 
              role = 'ARGO')

embolden radical left democrat    1
biden                             1
tweet                             1
poll number                       1
dominion                          1
vice president                    1
bureaucracy                       1
republican democrat               1
democrat                          1
andrew mccabe                     1
Name: ARGO-RAW, dtype: int64

In [9]:
inspect_narrative(final_statements, narrative = 'election cherish election')

Series([], Name: narrative-RAW, dtype: int64)

In [10]:
# Building alternative narratives (with raw verbs for instance)

narrative_format = ['ARGO', 'B-V-RAW', 'B-ARGM-NEG-RAW', 'ARG1']
final_statements['narrative'] = final_statements[narrative_format].agg(' '.join, axis=1)
final_statements['narrative'] = final_statements['narrative'].apply(remove_extra_whitespaces)
final_statements['narrative'].value_counts().head(n=20)

hardworking american patriot agree hardworking american patriot      1
election say hardworking american patriot                            1
find hardworking american patriot                                    1
election stole election                                              1
election look hardworking american patriot                           1
election run election                                                1
hardworking american patriot get election                            1
hardworking american patriot use hardworking american patriot        1
join hardworking american patriot                                    1
hardworking american patriot cherish hardworking american patriot    1
wisconsin look hardworking american patriot                          1
georgia mean hardworking american patriot                            1
hardworking american patriot end hardworking american patriot        1
election create hardworking american patriot                         1
electi

In [11]:
# Loading output previously saved to disk (for the entire Trump tweets corpus)

with open('TrumpTweets/split_sentences.json', 'r') as f:
    split_sentences = json.load(f)
    
with open('TrumpTweets/srl_res.json', 'r') as f:
    srl_res = json.load(f)

In [12]:
import pandas as pd
df = pd.read_csv('TrumpTweets/final_df.csv')
df

Unnamed: 0,doc,sentence,statement,narrative-CLEANED,narrative-RAW,ARGO-RAW,ARGO,B-V-RAW,B-ARGM-NEG-RAW,B-ARGM-MOD-RAW,ARG1-RAW,ARG1,ARG2-RAW,ARG2
0,98454970654916608,0,1,democrat create economic,republican democrat create economic problem,republican democrat,democrat,create,,,economic problem,economic,,
1,1234653427789070336,1,6,patriot love country,hardworking american patriot love country,hardworking american patriot,patriot,love,,,country,country,,
2,1234653427789070336,1,7,patriot cherish value,hardworking american patriot cherish value,hardworking american patriot,patriot,cherish,,,value,value,,
3,1234653427789070336,1,8,patriot respect law,hardworking american patriot respect law,hardworking american patriot,patriot,respect,,,law,law,,
4,1234653427789070336,1,9,patriot put america first,hardworking american patriot put america first,hardworking american patriot,patriot,put,,,america first,america first,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12495,1213145418093223937,68595,150158,trump question work others,trump question aid number,trump,trump,question,,,aid,work,number,others
12496,1213149809378578432,68602,150179,case dismiss trial,motion dismiss impeachment trial,motion,case,dismiss,,,impeachment trial,trial,,
12497,1213078681750573056,68606,150194,iran lose not negotiation,iran lose not negotiation,iran,iran,lose,not,,negotiation,negotiation,,
12498,1319384118849949702,68612,150209,biden fail delphi,obama biden fail worker delphi,obama biden,biden,fail,,,worker delphi,delphi,,


In [13]:
set(df['ARGO'])

{nan,
 'ag',
 'work',
 'texas',
 'cardinaldolan',
 'tony fauci',
 'njtransit',
 'dirty',
 'rinos',
 'reporting',
 'today',
 'weekend daytime',
 'abe',
 'roy',
 'due process',
 'libya',
 'rivlin',
 'huffpost',
 'rally',
 'gaga',
 'obrien',
 'celebrity apprenticenbc',
 'filthy',
 'pelosis',
 'danny',
 'complete',
 'frankluntz',
 'crooked',
 'clinton puppet',
 'steel',
 'cia',
 'frank vandersloot',
 'jamie',
 'greece',
 'world health',
 'nyjets',
 'walter',
 'salmond',
 'la david johnson',
 'miamiherald',
 'jimhagedornmn',
 'scarborough',
 'angelina jolie',
 'american hero',
 'forever',
 'next',
 'state election',
 'forbes',
 'trumpgolfdc',
 'state supreme court',
 'dummy',
 'bobbyjindal',
 'atlantic city',
 'dominion',
 'putin',
 'voter',
 'david',
 'tony',
 'constitutional',
 'kimberly klacik',
 'obama gang',
 'fall',
 'trade',
 'columbia',
 'lebron',
 'richhudson',
 'caronavirus',
 'pataki',
 'natl bedminster',
 'rep amp',
 'usfda',
 'tony romo',
 'voter identification',
 'bipartisan',