In [1]:
from pipeline import *
# English stop words, materialised as a plain list.
stop_words = [*stopwords.words('english')]

In [2]:
# Read the cleaned Trump tweets, keeping only the tweet id and the cleaned text;
# the 'text_clean' column is exposed under the name 'doc'.

df = (
    pd.read_csv('TrumpTweets/trump_tweets_cleaned.csv')[['id', 'text_clean']]
    .rename(columns={'text_clean': 'doc'})
)

In [3]:
# Sentence splitting, demonstrated on the first 100 Trump tweets only.

split_sentences = split_into_sentences(
    df.iloc[:100],
    progress_bar=True,
    save_to_disk=None,
)

Splitting into sentences...


100%|██████████| 100/100 [00:01<00:00, 84.95it/s]


In [4]:
# Semantic role labeling (SRL) on the split sentences
# (still the 100-tweet example).

srl_res = run_srl(
    path="../srl-model-2018.05.25.tar.gz",
    sentences=split_sentences[1],
    batch_size=20,
    save_to_disk=None,
    progress_bar=True,
)

Running SRL...


100%|██████████| 11/11 [01:29<00:00,  8.17s/it]


In [5]:
# Build the narrative model from srl_res, embeddings and entities
# (example on 100 Trump tweets).
# Clustering is fully modular: roles_with_embeddings is a list of lists, and the
# arguments inside each sublist are clustered together.
# Default: all arguments are clustered together.

narrative_model = build_narrative_model(
    # inputs
    srl_res=srl_res,
    sentences=split_sentences[1],  # list of sentences
    # roles, embeddings, clustering and entities
    roles_considered=['ARGO', 'B-V', 'B-ARGM-NEG', 'B-ARGM-MOD', 'ARG1', 'ARG2'],
    roles_with_embeddings=[['ARGO', 'ARG1', 'ARG2']],
    embeddings_type='gensim_keyed_vectors',
    embeddings_path='glove-wiki-gigaword-300',
    n_clusters=[0],
    verbose=0,
    roles_with_entities=['ARGO', 'ARG1', 'ARG2'],
    top_n_entities=0,
    dimension_reduce_verbs=True,
    save_to_disk=None,
    # text-cleaning options
    max_length=4,
    remove_punctuation=True,
    remove_digits=True,
    remove_chars='',
    stop_words=stop_words,
    lowercase=True,
    strip=True,
    remove_whitespaces=True,
    lemmatize=True,
    stem=False,
    tags_to_keep=None,
    remove_n_letter_words=1,
    progress_bar=True,
)

Processing SRL...


100%|██████████| 220/220 [00:00<00:00, 8949.93it/s]


Cleaning SRL...


100%|██████████| 439/439 [00:01<00:00, 254.00it/s]


Computing role frequencies...


100%|██████████| 439/439 [00:00<00:00, 308373.72it/s]


Mining named entities...


100%|██████████| 220/220 [00:01<00:00, 134.68it/s]


Mapping named entities...


100%|██████████| 439/439 [00:00<00:00, 6105.95it/s]


In [6]:
# Derive the narratives from srl_res using the narrative_model built above.

final_statements = get_narratives(
    srl_res=srl_res,
    doc_index=split_sentences[0],  # doc names
    narrative_model=narrative_model,
    cluster_labeling='most_frequent',
    save_to_disk=None,
    progress_bar=True,
)

final_statements

Processing SRL...


100%|██████████| 220/220 [00:00<00:00, 10317.63it/s]


Cleaning SRL...


100%|██████████| 439/439 [00:00<00:00, 1540.03it/s]


Mapping named entities...


100%|██████████| 439/439 [00:00<00:00, 6774.84it/s]


Assigning clusters to roles...


100%|██████████| 439/439 [00:00<00:00, 4858.75it/s]


Unnamed: 0,doc,sentence,statement,narrative-CLEANED,narrative-RAW,ARGO-RAW,ARGO,B-V-RAW,B-V-CLEANED,B-ARGM-NEG-RAW,B-ARGM-NEG-CLEANED,B-ARGM-MOD-RAW,ARG1-RAW,ARG1,ARG2-RAW,ARG2
1,98454970654916608,0,1,republican make election,republican democrat create economic problem,republican democrat,republican,create,make,,,,economic problem,election,,
6,1234653427789070336,1,6,election love country,hardworking american patriot love country,hardworking american patriot,election,love,love,,,,country,country,,
7,1234653427789070336,1,7,election cherish election,hardworking american patriot cherish value,hardworking american patriot,election,cherish,cherish,,,,value,election,,
8,1234653427789070336,1,8,election respect election,hardworking american patriot respect law,hardworking american patriot,election,respect,respect,,,,law,election,,
9,1234653427789070336,1,9,election put election,hardworking american patriot put america first,hardworking american patriot,election,put,put,,,,america first,election,,
13,1304875170860015617,4,13,election use election,almost recent election use system,almost recent election,election,use,use,,,,system,election,,
41,1319683876046934016,17,41,sudan agree israel,sudan agree peace normalization agreement israel,sudan,sudan,agree,agree,,,,peace normalization agreement israel,israel,,
53,1325889532840062976,23,53,adamlaxalt get election,adamlaxalt find thing release absolutely shock,adamlaxalt,adamlaxalt,find,get,,,,thing release absolutely shock,election,,
58,1325891490636320768,24,58,wisconsin see election,wisconsin look good,wisconsin,wisconsin,look,see,,,,good,election,,
108,1317063921375842305,49,108,election see election,poll number look strong,poll number,election,look,see,,,,strong,election,,


In [7]:
# One row per sentence: first element of split_sentences as 'doc',
# second element as 'sentence'.
df_sents = pd.DataFrame(
    dict(doc=split_sentences[0], sentence=split_sentences[1])
)

In [10]:
# Inspect the 'election' label for the ARGO role.
inspect_label(final_statements, label='election', role='ARGO')

hardworking american patriot    4
vice president                  1
bureaucracy                     1
tweet                           1
almost recent election          1
great discovery                 1
poll number                     1
dominion                        1
Name: ARGO-RAW, dtype: int64

In [11]:
# Inspect a single narrative.
inspect_narrative(
    final_statements,
    narrative='election cherish election',
)

hardworking american patriot cherish value    1
Name: narrative-RAW, dtype: int64

In [12]:
# Building alternative narratives (with raw verbs, for instance).
# Missing role values are replaced by '' before joining: ' '.join raises a
# TypeError on NaN, which is what empty cells become after e.g. a CSV round-trip.

narrative_format = ['ARGO', 'B-V-RAW', 'B-ARGM-NEG-RAW', 'ARG1']
final_statements['narrative'] = (
    final_statements[narrative_format]
    .fillna('')
    .agg(' '.join, axis=1)
    .apply(remove_extra_whitespaces)
)
final_statements['narrative'].value_counts().head(n=20)

fake mean election             1
radical left steal election    1
election reject election       1
republican create election     1
election respect election      1
sudan agree israel             1
election look election         1
election love country          1
wisconsin look election        1
adamlaxalt find election       1
biden say election             1
democrat stole election        1
election use election          1
election run election          1
andrew mccabe get election     1
election put election          1
election expose fox news       1
election end election          1
election cherish election      1
election destroyed election    1
Name: narrative, dtype: int64

In [None]:
# Loading output previously saved to disk (for the entire Trump tweets corpus).
# The files are opened as UTF-8 explicitly so that decoding does not depend on
# the platform's default text encoding.

with open('TrumpTweets/split_sentences.json', 'r', encoding='utf-8') as f:
    split_sentences = json.load(f)

with open('TrumpTweets/srl_res.json', 'r', encoding='utf-8') as f:
    srl_res = json.load(f)