In [None]:
import sys

# Add the sibling ../code directory to the module search path so the imports
# in the cells below can be resolved (presumably that is where the project
# modules live -- confirm against the repository layout).
# The membership check keeps sys.path free of duplicate entries when this
# cell is executed more than once in the same kernel.
if "../code" not in sys.path:
    sys.path.append("../code")

In [None]:
# Sample English passage used as the input document for this notebook; it is
# split into sentences further down with tokenize_into_sentences(text).
text = """Sub-module available for the above is sent_tokenize.
            An obvious question in your mind would be why sentence tokenization is needed when we have the option of word tokenization. 
            Imagine you need to count average words per sentence, how you will calculate? 
            For accomplishing such a task, you need both sentence tokenization as well as words to calculate the ratio. 
            Such output serves as an important feature for machine training as the answer would be numeric. 
            Check the below example to learn how sentence tokenization is different from words tokenization.
            The taxes the President announced will not lower work incentives. Evrika!
            John can’t keep up with Mary’s rapid mood swings.
            """

In [None]:
from utils import tokenize_into_sentences, filter_sentences, preprocess, UsedRoles
from word_embedding import run_word2vec, compute_embedding, USE, SIF_Word2Vec
from semantic_role_labeling import SRL, extract_roles, postprocess_roles
from clustering import Clustering
from sklearn.cluster import KMeans
from cooccurrence import build_df, CoOccurrence

In [None]:
# Role configuration object that is later handed to compute_embedding,
# Clustering, build_df and CoOccurrence. The "ARG2" entry is set to True
# explicitly, then the three views exposed by the object are printed.
used_roles = UsedRoles()
used_roles["ARG2"] = True
print(f"{used_roles.used}\n{used_roles.embeddable}\n{used_roles.not_embeddable}\n")

In [None]:
# Load the SRL model from the 2018 archive and run it on a one-sentence batch.
srl = SRL("./srl-model-2018.05.25.tar.gz")
srl(["What are you doing?"])

In [None]:
# Re-bind `srl` to the model from the BERT-based 2020 archive and run it on
# the same question (this time without the trailing question mark).
srl = SRL("./bert-base-srl-2020.03.24.tar.gz")
srl(["What are you doing"])


In [None]:
# Re-bind `srl` to the 2018 archive (same path as the first SRL cell) and run
# it on a declarative example sentence.
# NOTE(review): this loads the same archive again; consider keeping one
# instance per archive if loading is expensive.
srl = SRL("./srl-model-2018.05.25.tar.gz")
srl(["the government increases spending without raising taxes."])


In [None]:
# Same example sentence as the previous cell, run through the model from the
# BERT-based 2020 archive so the two outputs can be inspected side by side.
srl = SRL("./bert-base-srl-2020.03.24.tar.gz")
srl(["the government increases spending without raising taxes."])

In [None]:
# Final binding of `srl`: the 2018 archive. This is the instance called by
# srl(sentences=sentences) further down, since `srl` is not re-bound again.
srl = SRL("./srl-model-2018.05.25.tar.gz")


In [None]:
# Build the USE embedder from the local ./USE-4 directory and look at the
# shape of what it returns for a four-token input.
# (USE is presumably a Universal Sentence Encoder wrapper -- confirm.)
use = USE('./USE-4')
use(["What","are","you","doing"]).shape

In [None]:
# Build the SIF/word2vec embedder from the local model file and look at the
# shape of what it returns for a four-token (lower-cased) input.
sif_w2v = SIF_Word2Vec("./nytimes_word2vec.model")
sif_w2v(["what","are","you","doing"]).shape

In [None]:
# KMeans estimator with a fixed random_state so repeated runs are seeded
# identically. The number of clusters is not set here; per-role counts are
# passed separately to Clustering(n_clusters=...) further down.
kmeans=KMeans(random_state=0)

In [None]:
# Split the sample passage into sentences and display the result.
sentences = tokenize_into_sentences(text)
sentences

In [None]:
# Filter the sentence list with max_sentence_length=350 and display it.
# (Unit of the limit -- characters vs. tokens -- is not visible here; confirm.)
# Note: this re-binds `sentences`, so the unfiltered list from the previous
# cell is no longer reachable under that name.
sentences = filter_sentences(sentences, max_sentence_length=350)
sentences

In [None]:
# Run the SRL model (last bound to the 2018 archive) on the filtered
# sentences and display the raw result.
srl_res = srl(sentences=sentences)
srl_res

In [None]:
# Extract roles from the raw SRL output. extract_roles returns two values;
# the second one (`sentence_index`) is displayed here, `roles` in the next cell.
# (`sentence_index` presumably maps each statement back to its sentence -- confirm.)
roles,sentence_index = extract_roles(srl_res)
sentence_index

In [None]:
# Display the roles returned by extract_roles, before post-processing.
roles

In [None]:
# Post-process the extracted roles and display the result; this is the
# version passed to the SIF embedding step and to build_df below.
postproc_roles = postprocess_roles(roles)
postproc_roles

In [None]:
# Embed the post-processed roles with the SIF/word2vec embedder.
# compute_embedding returns three values: the vectors, a statements index and
# a "funny" index; each one is inspected in the cells that follow.
sif_vectors, sif_statements_index, sif_funny_index = compute_embedding(
    sif_w2v,
    statements=postproc_roles,
    used_roles=used_roles,
)

In [None]:
# Display the statements index returned alongside the SIF vectors
# (later passed to build_df as statement_index).
sif_statements_index

In [None]:
# Shape of the value stored under each key of sif_vectors
# (keys are presumably role names -- confirm).
{key: vectors.shape for key, vectors in sif_vectors.items()}

In [None]:
# Display the third value returned by compute_embedding for the SIF run.
# NOTE(review): the meaning of the "funny" index is not visible in this
# notebook -- presumably statements that could not be embedded; confirm.
sif_funny_index

In [None]:
# Look at the "ARG2" entry of the first post-processed statement.
# NOTE(review): this lookup depends on the data -- it fails if the first
# statement has no "ARG2" entry.
postproc_roles[0]["ARG2"]

In [None]:
# Embed with the USE embedder; same three return values as the SIF run above.
# NOTE(review): this call passes the raw `roles`, whereas the SIF call above
# passes `postproc_roles` -- confirm whether skipping the post-processing
# step is intentional for USE.
USE_vectors, USE_statements_index, USE_funny_index = compute_embedding(use,roles,used_roles)


In [None]:
# Display the statements index returned alongside the USE vectors.
USE_statements_index

In [None]:
# Shape of the value stored under each key of USE_vectors
# (keys are presumably role names -- confirm).
{key: vectors.shape for key, vectors in USE_vectors.items()}

In [None]:
# Display the third value returned by compute_embedding for the USE run.
USE_funny_index

In [None]:
# Wrap the seeded KMeans estimator in the project's Clustering helper, with
# the requested number of clusters given per role key.
# NOTE(review): the first key is spelled 'ARGO' (capital letter O), not
# 'ARG0' (zero). The same spelling is used for cooc.subset below; confirm it
# matches the role keys the project code expects before changing it.
clustering = Clustering(
    cluster=kmeans,
    n_clusters={'ARGO': 2, 'ARG1': 1, 'ARG2': 1, 'B-V': 2},
    used_roles=used_roles,
)

In [None]:
# Fit the clustering on the SIF vectors (the USE vectors are not used here).
clustering.fit(vectors=sif_vectors)

In [None]:
# labels_ of the object stored under each key of clustering._cluster.
# `_cluster` is a private attribute of Clustering, read here for inspection only.
{key: fitted.labels_ for key, fitted in clustering._cluster.items()}

In [None]:
# Predict on the same SIF vectors the clustering was fitted on and display
# the result; it is passed to build_df below.
clustering_res = clustering.predict(vectors=sif_vectors)
clustering_res

In [None]:
# Derive cluster labels using the SIF/word2vec embedder and display them.
# (Going by the method name, each cluster is presumably labelled with its
# most similar word in the word2vec model -- confirm in the Clustering class.)
labels = clustering.label_most_similar_in_w2v(sif_w2v)
labels

In [None]:
# Assemble a dataframe from the cluster predictions, the post-processed roles
# and the SIF statements index, restricted by the shared role configuration.
# (Exact column layout is defined by build_df and not visible here.)
df = build_df(
    clustering_res=clustering_res,
    postproc_roles=postproc_roles,
    statement_index=sif_statements_index,
    used_roles=used_roles,
)

In [None]:
# Display the object returned by build_df.
df

In [None]:
# Display the cluster labels again, directly before they are passed to
# CoOccurrence together with df.
labels

In [None]:
# Build the co-occurrence object from the dataframe, the cluster labels and
# the shared role configuration.
cooc = CoOccurrence(df, labels, used_roles)

In [None]:
# By convention, None means "take all roles" (no restriction).
cooc.subset = None
# Read the attribute back to display the value now held by cooc.subset.
cooc.subset

In [None]:
# Display narratives_counts while cooc.subset is None (all roles).
cooc.narratives_counts

In [None]:
# Display narratives_pmi while cooc.subset is None (all roles).
# (Presumably pointwise mutual information -- confirm in CoOccurrence.)
cooc.narratives_pmi

In [None]:
# Restrict the co-occurrence object to five roles, print normal_order and
# display display_order for that subset.
# NOTE(review): "ARGO" is spelled with a capital letter O here as well;
# confirm it matches the role keys used by the project code.
cooc.subset = {
    "ARGO",
    "ARG1",
    "B-V",
    "B-ARGM-MOD",
    "B-ARGM-NEG",
}
print(cooc.normal_order)
cooc.display_order

In [None]:
# Display narratives_counts again, now read after cooc.subset was set to the
# five-role set (presumably reflecting that subset -- confirm).
cooc.narratives_counts

In [None]:
# Display narratives_pmi again, now read after cooc.subset was set to the
# five-role set.
cooc.narratives_pmi

In [None]:
# Narrow the subset further to three roles and display narratives_counts.
cooc.subset = {"ARGO", "ARG1", "B-V"}
cooc.narratives_counts

In [None]:
# Display narratives_pmi, read after cooc.subset was set to the three-role set.
cooc.narratives_pmi