#### In this notebook, we demonstrate our pipeline on speeches related to taxation from the US Congress records.

#### 1. Import Packages

In [None]:
# NOTE(review): machine-specific absolute path; it is not referenced again in the
# visible part of this notebook — confirm whether it is still needed.
my_path = '/cluster/work/lawecon/Projects/Ash_Gauthier_Widmer/germain/narrative-nlp'

import sys
sys.path.append("../Code/")

import glob
import json
import re
import time

import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle

from utils import (
    tokenize_into_sentences,
    filter_sentences,
    preprocess,
    UsedRoles,
    Document,
    dict_concatenate,
    get_verb_counts,
    clean_verbs
)

from word_embedding import run_word2vec, compute_embedding, USE, SIF_Word2Vec
from semantic_role_labeling import SRL, extract_roles, postprocess_roles, extract_role_per_sentence
from clustering import Clustering, label_clusters, label_clusters_most_freq
from sklearn.cluster import KMeans
from cooccurrence import build_df, CoOccurrence

# Start from the default role selection and additionally switch on ARG2.
# NOTE(review): which roles are enabled by default is decided in utils.UsedRoles,
# which is not visible here.
used_roles = UsedRoles()
used_roles["ARG2"] = True

#### 2. Run SRL

Make sure you correctly downloaded the pre-trained SRL model from AllenNLP. 

This step takes a bit of time. You may want to make yourself some coffee and run it only once. If you have multiple cores on your machine, you may also change n_cores to speed up the process.

We save results in "../Output/SRL/"

In [None]:
# Number of parallel jobs handed to joblib when the SRL step is run over the input files.
n_cores = 1

In [None]:
# Every .txt file directly under ../Data is one input document for the SRL step.
filepaths = glob.glob('../Data/*.txt')

In [None]:
# Load the pre-trained SRL model from the local archive.
# `predictor` is then read as a module-level global inside process_text().
from allennlp.predictors.predictor import Predictor
predictor = Predictor.from_path("../srl-model-2018.05.25.tar.gz")

In [None]:
def process_text(filepath, batch_size = 20, max_chars = 500):
    """Run SRL on one text file and save the predictions as JSON.

    The file is split into sentences, which are sent to the module-level
    ``predictor`` in batches. The concatenated predictions are written to
    ``../Output/SRL/<input file name without .txt>.json``.

    Args:
        filepath: Path to the input ``.txt`` file.
        batch_size: Number of sentences taken per batch (before the length
            filter is applied).
        max_chars: Sentences with this many characters or more are skipped.
    """
    import os  # local import so this cell does not depend on another cell's imports

    print(filepath)

    # Derive the output name from the base name with plain string operations.
    # The previous regex-based version used unescaped dots ('.txt', '../Data'),
    # which also matched e.g. "atxt" inside a file name and mangled it.
    name = os.path.basename(str(filepath))
    if name.endswith('.txt'):
        name = name[:-len('.txt')]
    outfilename = os.path.join('../Output/SRL', name + '.json')

    with open(filepath, "r") as f:
        speech = f.read()

    res = []
    sentences = tokenize_into_sentences(speech)
    for start in range(0, len(sentences), batch_size):
        batch = [
            {'sentence': str(S)}
            for S in sentences[start:start + batch_size]
            if len(S) < max_chars
        ]
        if not batch:
            # Every sentence in this slice was too long: nothing to predict.
            continue
        # extend() avoids rebuilding the whole result list for every batch.
        res.extend(predictor.predict_batch_json(batch))

    # Open the output only once all predictions exist, so a failure above
    # does not leave an empty or truncated JSON file behind.
    with open(outfilename, 'w') as json_file:
        json.dump(res, json_file)

In [None]:
# Time the whole SRL run: one process_text() call per input file, dispatched through joblib.
t0 = time.time()

print('Starting SRL job...')

from joblib import Parallel, delayed
import multiprocessing

# Lazily describe one task per file, then hand them all to the joblib runner.
srl_tasks = (delayed(process_text)(filepath) for filepath in filepaths)
srl_runner = Parallel(n_jobs=n_cores)
srl_runner(srl_tasks)

t1 = time.time() - t0
print('SRL job took %s seconds.' %t1)

#### 3. Train Word Embeddings

We save the model in '../Output/Embeddings'

In [None]:
# Write every preprocessed sentence of every input file to a single text file,
# one sentence per line; this file is the training corpus for the embeddings.
filepaths = glob.glob('../Data/*.txt')

print('Processing speeches to train embeddings...')

# Short pause before the progress bar starts
# (presumably so the message above is displayed first).
time.sleep(1)

with open('../Output/Text/clean_speeches.txt', 'w') as f1:
    for filepath in tqdm(filepaths):
        with open(filepath, "r") as f2:
            speech = f2.read()
        sentences = preprocess(tokenize_into_sentences(speech))
        f1.writelines(sentence + '\n' for sentence in sentences)

print('Processing Done.')

In [None]:
# First option: take existing word embeddings and train the model some more
# We have a small corpus in this example, so we start with pretrained embeddings.
# NOTE(review): `size=` and `.vocab` below look like the gensim 3.x API (gensim 4
# uses `vector_size` / `key_to_index`) — confirm the gensim version this runs on.

import gensim
from gensim.models import KeyedVectors, Word2Vec
from gensim.models.word2vec import LineSentence

# Corpus file written by the preprocessing cell (one sentence per line).
sentences = LineSentence('../Output/Text/clean_speeches.txt')

# Pretrained vectors in word2vec text format; only their vocabulary is used directly below.
pretrained_model = KeyedVectors.load_word2vec_format("../glove_2_word2vec.6B.300d.txt", binary = False)

# Build the vocabulary from our own corpus first and remember its size for training.
model = Word2Vec(size = 300, window = 8, min_count = 1, workers = 1)
model.build_vocab(sentences)
total_examples = model.corpus_count
# Add every pretrained word to the vocabulary, then read the pretrained vectors into the model.
# NOTE(review): lockf = 1.0 presumably leaves the imported vectors trainable — confirm against gensim docs.
model.build_vocab([list(pretrained_model.vocab.keys())], update = True)
model.intersect_word2vec_format("../glove_2_word2vec.6B.300d.txt", binary = False, lockf = 1.0)
# Continue training on our corpus only, then save the resulting model.
model.train(sentences, total_examples = total_examples, epochs = model.epochs)
model.save('../Output/Embeddings/my_word2vec_model.model')

In [None]:
# # Second option: train your own word embeddings (if your corpus is large enough)

# import gensim
# from gensim.models import Word2Vec
# from gensim.models.word2vec import LineSentence

# t0 = time.time()

# print('Training word embeddings...')

# sentences = LineSentence('../Output/Text/clean_speeches.txt', max_sentence_length=30000)

# model = Word2Vec(sentences, size = 300, window = 8, min_count = 1, workers = 1)

# model.save('../Output/Embeddings/my_word2vec_model.model')

# t1 = time.time() - t0

# print('Train the Word2Vec model took %s seconds.' %t1)

#### 4. Process SRL output

Processing of the text data is minimal by default: we convert the text to lowercase and remove punctuation and digits. Many additional options are available (see documentation).

We save results in "../Output/Processed_files/"

In [None]:
# Instantiate SIF_Word2Vec from the Word2Vec model file saved in the embeddings step;
# this object is what compute_embedding() receives when the SRL output is processed.
sif_w2v = SIF_Word2Vec("../Output/Embeddings/my_word2vec_model.model")

In [None]:
# SRL output files produced by the SRL step (one JSON per input text).
filenames = glob.glob("../Output/SRL/*.json")

# Module-level accumulators filled by do_all(). All of them receive one entry per
# file, except postproc_roles_all, which do_all() extends into one flat list.
# NOTE(review): the trailing comments look like earlier initial values kept for reference.
documents_all = []
postproc_roles_all = []
sentence_index_all = []  # np.array([], dtype=np.uint32)
vectors_all = []  # None
statement_index_all = []  # {}
funny_index_all = []  # {}


def do_all(filenames):
    """Post-process every SRL output file and accumulate the results.

    For each JSON file: load the SRL output, extract and post-process the
    roles, compute the embeddings, and store everything in the module-level
    ``*_all`` accumulators. A running offset is passed as ``start`` to the
    extraction and embedding helpers and advanced by the size of each file's
    sentence index.

    Args:
        filenames: Paths of the SRL JSON files to process.
    """
    global documents_all, postproc_roles_all, sentence_index_all
    global vectors_all, statement_index_all, funny_index_all

    offset = 0

    for path in tqdm(filenames):
        with open(path) as handle:
            srl_output = json.load(handle)

        raw_roles, sent_idx = extract_roles(srl_output, start=offset)

        # NOTE(review): `my_stopwords` is not defined anywhere in the visible
        # notebook; it must exist in the session before this function is called.
        cleaned_roles = postprocess_roles(raw_roles, stop_words = my_stopwords)

        embedded = compute_embedding(
            sif_w2v,
            statements=cleaned_roles,
            used_roles=used_roles,
            start=offset,
        )
        vecs, stmt_idx, funny_idx = embedded

        # Record the document together with the offset at which its sentences start.
        documents_all.append(Document(path, offset))
        postproc_roles_all.extend(cleaned_roles)
        sentence_index_all.append(sent_idx)
        vectors_all.append(vecs)
        statement_index_all.append(stmt_idx)
        funny_index_all.append(funny_idx)

        offset += sent_idx.size

# Process all files, then merge the per-file results into single structures.
do_all(filenames)
sentence_index_all = np.concatenate(sentence_index_all)
# dict_concatenate combines the per-file entries collected above
# (its exact merge semantics live in utils and are not visible here).
vectors_all = dict_concatenate(vectors_all)
statement_index_all = dict_concatenate(statement_index_all)
funny_index_all = dict_concatenate(funny_index_all)

In [None]:
# Count the verbs over all statements, then clean the statements' verbs using those counts.
verb_counts = get_verb_counts(postproc_roles_all)
postproc_roles_all = clean_verbs(postproc_roles_all, verb_counts = verb_counts)
# Bare expression: displayed as the cell output in the notebook.
postproc_roles_all

In [None]:
# Summary statistics. Each role's tokens are joined with spaces so that distinct
# phrases can be counted with a set. argO, arg1 and arg2 are reused later to
# choose the number of clusters.
print('There are %s documents.' % len(documents_all))
print('There are %s statements.' % len(postproc_roles_all))

argO = {' '.join(stmt['ARGO']) for stmt in postproc_roles_all if 'ARGO' in stmt}
print('There are %s unique agents.' % len(argO))

arg1 = {' '.join(stmt['ARG1']) for stmt in postproc_roles_all if 'ARG1' in stmt}
print('There are %s unique patients.' % len(arg1))

BV = {' '.join(stmt['B-V']) for stmt in postproc_roles_all if 'B-V' in stmt}
print('There are %s unique verbs.' % len(BV))

arg2 = {' '.join(stmt['ARG2']) for stmt in postproc_roles_all if 'ARG2' in stmt}
print('There are %s unique attributes.' % len(arg2))

In [None]:
# Persist the processed results so the clustering step can start from disk.
with open("../Output/Processed_files/postproc_roles_all.json", 'w') as f:
    json.dump(postproc_roles_all, f)

# NOTE(review): this relies on Document instances being JSON-serializable; the loading
# cell rebuilds them from two positional items, so they presumably serialize as
# 2-element lists — confirm in utils.
with open("../Output/Processed_files/documents_all.json", 'w') as f:
    json.dump(documents_all, f)

# The single array is stored positionally; the dict-like results are stored with
# one named array per key.
np.savez('../Output/Processed_files/sentence_index_all.npz', sentence_index_all)
np.savez('../Output/Processed_files/statement_index_all.npz', **statement_index_all)
np.savez('../Output/Processed_files/vectors_all.npz', **vectors_all)
np.savez('../Output/Processed_files/funny_index_all.npz', **funny_index_all)

#### 5. Cluster Roles

We save results in "../Output/Clustering/"

In [None]:
# Reload everything saved by the processing step.
with open("../Output/Processed_files/documents_all.json") as f:
    documents_all_raw = json.load(f)

with open("../Output/Processed_files/postproc_roles_all.json") as f:
    postproc_roles_all = json.load(f)

# np.load() on an .npz archive returns a lazy NpzFile that keeps the file open.
# Read the arrays inside a `with` block so each handle is closed once the data
# has been copied into memory.
with np.load('../Output/Processed_files/sentence_index_all.npz') as npz:
    # A positionally saved array is stored under the default name 'arr_0'.
    sentence_index_all = npz['arr_0']

with np.load('../Output/Processed_files/statement_index_all.npz') as npz:
    statement_index_all = {fi: npz[fi] for fi in npz.files}

with np.load('../Output/Processed_files/funny_index_all.npz') as npz:
    funny_index_all = {fi: npz[fi] for fi in npz.files}

with np.load('../Output/Processed_files/vectors_all.npz') as npz:
    vectors_all = {fi: npz[fi] for fi in npz.files}

# Format them in the narratives-nlp Document format
documents_all = [Document(i[0], i[1]) for i in documents_all_raw]

In [None]:
# The threshold hyperparameter lets the user control cluster coherence.
# If the distance between a role vector and its cluster centroid is above the threshold,
# the observation is labeled as noise.
# threshold = 2 assumes no noise and forces a label onto every semantic role in the data.

threshold = 2

In [None]:
# Define the number of clusters for each role. Here, we divide the total number of
# unique tokens for each role by 100.
# NOTE: argO, arg1 and arg2 come from the statistics cell in section 4; they are not
# reloaded from disk, so that cell must have been run in this session.

bins = 100

# round() yields 0 when a role has about bins/2 unique phrases or fewer (small corpora).
# Clamp to at least one cluster, since a clustering with zero clusters cannot be fit.
num_argO_clu = max(1, round(len(argO)/bins))
num_arg1_clu = max(1, round(len(arg1)/bins))
num_arg2_clu = max(1, round(len(arg2)/bins))

print(num_argO_clu, num_arg1_clu, num_arg2_clu)

In [None]:
# K-means with a fixed seed and a single initialization, so runs are repeatable.
kmeans=KMeans(random_state=0, n_init = 1)

# One cluster count per role, taken from the cell above.
clustering = Clustering(
    cluster=kmeans,
    n_clusters={"ARGO": num_argO_clu, "ARG1": num_arg1_clu, "ARG2": num_arg2_clu},
    used_roles=used_roles,
)

# Fit on a resampled set of vectors, then predict on all vectors.
# NOTE(review): sample_size=1 presumably means "use the whole sample" — confirm in clustering.py.
sample_vectors = clustering.resample(vectors=vectors_all, sample_size=1)
# NOTE(review): clustering_fit is not used again in the visible notebook.
clustering_fit = clustering.fit(vectors=sample_vectors)
clustering_res = clustering.predict(vectors=vectors_all)
# Distances for the predicted clusters, turned into a mask using the threshold chosen earlier.
distance = clustering.compute_distance(vectors=vectors_all, predicted_cluster = clustering_res)
clustering_mask = clustering.distance_mask(distance, threshold=threshold)

In [None]:
# Save the fitted clustering object and the predicted clusters.
# The pickle file is opened in a `with` block so the handle is flushed and closed
# deterministically (it was previously left open after pickle.dump()).
with open("../Output/Clustering/clustering.pickle", 'wb') as f:
    pickle.dump(clustering, f)
np.savez('../Output/Clustering/clustering_res.npz', **clustering_res)

#### 6. Label Clusters and Explore Resulting Narratives

We save results in "../Output/Narratives"

In [None]:
# Reload the clustering object saved in the clustering step.
# NOTE: unpickling runs arbitrary code — only load pickle files you produced yourself.
with open("../Output/Clustering/clustering.pickle", 'rb') as f:
    clustering = pickle.load(f)

In [None]:
# Label the clusters using the computed clustering mask.
# NOTE(review): judging by how `labels` is indexed further down, it maps
# role -> cluster -> list of (label, count) pairs — confirm in clustering.py.
labels = label_clusters_most_freq(
    clustering_res=clustering_res,
    postproc_roles=postproc_roles_all,
    statement_index=statement_index_all,
    clustering_mask=clustering_mask
)

# Bare expression: displayed as the cell output in the notebook.
labels

In [None]:
# Build the dataframe of statements from the clustering results.
# NOTE(review): clustering_mask is the literal True here, whereas the labeling cell
# passes the computed `clustering_mask` — confirm that True is the intended value.
df = build_df(
    clustering_res=clustering_res,
    postproc_roles=postproc_roles_all,
    statement_index=statement_index_all,
    used_roles=used_roles,
    clustering_mask=True
)

In [None]:
import copy

# Work on copies so the original `labels` and `df` stay untouched.
# labels.copy() was only a shallow copy: the per-role mappings were shared with
# `labels`, so adding the pd.NA entries below also modified `labels`.
labels_bis = copy.deepcopy(labels)
df_bis = df.copy()

# Give every role an explicit entry for missing values, whose label is pd.NA.
for role in ('ARGO', 'ARG1', 'ARG2'):
    labels_bis[role][pd.NA] = [(pd.NA, 0)]

def display_label(x, labels_bis, arg):
    """Return the first label stored for entry `x` of role `arg`.

    Args:
        x: Key to look up in the mapping of role `arg`.
        labels_bis: Mapping role -> {key: sequence of tuples}; the first item
            of the first tuple is used as the label.
        arg: Role name, e.g. 'ARGO'.

    Returns:
        The label, or ``pd.NA`` when `x` has no entry for that role.
    """
    role_labels = labels_bis[arg]
    if x not in role_labels:
        return pd.NA
    first_candidate = role_labels[x][0]
    return first_candidate[0]

# Replace the values in each clustered role column with their display label.
for role in ('ARGO', 'ARG1', 'ARG2'):
    df_bis[role] = df_bis[role].apply(display_label, args=(labels_bis, role))

In [None]:
# Show missing values as empty strings.
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
df_bis = df_bis.replace({np.nan: ''})
df_bis

In [None]:
# Keep only the rows where ARGO, ARG1 and the verb (B-V) are all non-empty
# (missing values were replaced by '' in the previous cell).
df_bis[(df_bis.ARGO != '') & (df_bis.ARG1 != '') & (df_bis['B-V'] != '')]

In [None]:
# Save the labeled dataframe (all rows, not just the filtered view shown above).
df_bis.to_csv('../Output/Narratives/df_with_labels.csv')