In [1]:
import sys

# Make the project's modules under ../code importable.
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
CODE_DIR = "../code"
if CODE_DIR not in sys.path:
    sys.path.append(CODE_DIR)

In [2]:
# Sample passage used as input for this notebook; it is split into sentences
# by tokenize_into_sentences in a later cell.
text = """Sub-module available for the above is sent_tokenize.
            An obvious question in your mind would be why sentence tokenization is needed when we have the option of word tokenization. 
            Imagine you need to count average words per sentence, how you will calculate? 
            For accomplishing such a task, you need both sentence tokenization as well as words to calculate the ratio. 
            Such output serves as an important feature for machine training as the answer would be numeric. 
            Check the below example to learn how sentence tokenization is different from words tokenization.
            The taxes the President announced will not lower work incentives. Evrika!
            John can’t keep up with Mary’s rapid mood swings.
            """

In [3]:
from utils import tokenize_into_sentences, filter_sentences, preprocess, UsedRoles
from word_embedding import run_word2vec, compute_embedding, USE, SIF_Word2Vec
from semantic_role_labeling import SRL, extract_roles, postprocess_roles
from clustering import Clustering
from sklearn.cluster import KMeans
from cooccurrence import CoOccurence

In [4]:
# Build the role configuration, switch the ARG2 role on, then print the
# used / embeddable / not-embeddable role lists (one list per line).
used_roles = UsedRoles()
used_roles['ARG2'] = True
print(f"{used_roles.used}\n{used_roles.embeddable}\n{used_roles.not_embeddable}\n")

['ARGO', 'ARG1', 'ARG2', 'B-V', 'B-ARGM-MOD', 'B-ARGM-NEG']
['ARGO', 'ARG1', 'ARG2', 'B-V']
['B-ARGM-MOD', 'B-ARGM-NEG']



In [5]:
# Load the SRL model from the local archive and smoke-test it on a single
# short sentence (passed as a one-element list of strings).
srl = SRL("./srl-model-2018.05.25.tar.gz")
srl(["What are you doing"])

[{'verbs': [{'verb': 'are',
    'description': 'What [V: are] [ARG1: you doing]',
    'tags': ['O', 'B-V', 'B-ARG1', 'I-ARG1']},
   {'verb': 'doing',
    'description': '[ARG1: What] are [ARG0: you] [V: doing]',
    'tags': ['B-ARG1', 'O', 'B-ARG0', 'B-V']}],
  'words': ['What', 'are', 'you', 'doing']}]

In [6]:
# Load the USE embedder from ./USE-4 and check the shape of the vector it
# returns for one token list (a single 512-dimensional vector).
use = USE('./USE-4')
use(["What", "are", "you", "doing"]).shape

(512,)

In [7]:
# Load the SIF_Word2Vec embedder from the local word2vec model and check the
# shape of the vector it returns for one token list (a single 300-dimensional vector).
sif_w2v = SIF_Word2Vec("./nytimes_word2vec.model")
sif_w2v(["what", "are", "you", "doing"]).shape

(300,)

In [8]:
# Base estimator handed to Clustering below.
# The seed is fixed so cluster assignments are reproducible across
# Restart & Run All; an unseeded KMeans uses a random initialisation.
kmeans = KMeans(random_state=0)

In [9]:
# Split the raw sample text into a list of sentence strings and display it.
sentences = tokenize_into_sentences(text)
sentences

['Sub-module available for the above is sent_tokenize.',
 'An obvious question in your mind would be why sentence tokenization is needed when we have the option of word tokenization.',
 'Imagine you need to count average words per sentence, how you will calculate?',
 'For accomplishing such a task, you need both sentence tokenization as well as words to calculate the ratio.',
 'Such output serves as an important feature for machine training as the answer would be numeric.',
 'Check the below example to learn how sentence tokenization is different from words tokenization.',
 'The taxes the President announced will not lower work incentives.',
 'Evrika!',
 'John can’t keep up with Mary’s rapid mood swings.']

In [10]:
# Apply the sentence filter with max_sentence_length=350; for this sample the
# displayed list is unchanged, i.e. no sentence is dropped.
# NOTE(review): the unit of max_sentence_length (characters vs. tokens) is not
# visible from this notebook — confirm in utils.filter_sentences.
sentences = filter_sentences(sentences, max_sentence_length=350)
sentences

['Sub-module available for the above is sent_tokenize.',
 'An obvious question in your mind would be why sentence tokenization is needed when we have the option of word tokenization.',
 'Imagine you need to count average words per sentence, how you will calculate?',
 'For accomplishing such a task, you need both sentence tokenization as well as words to calculate the ratio.',
 'Such output serves as an important feature for machine training as the answer would be numeric.',
 'Check the below example to learn how sentence tokenization is different from words tokenization.',
 'The taxes the President announced will not lower work incentives.',
 'Evrika!',
 'John can’t keep up with Mary’s rapid mood swings.']

In [11]:
# Run semantic role labeling on all sentences. Each result holds the tokenized
# 'words' plus a 'verbs' list with one entry (verb, description, per-token tags)
# for every detected verb.
srl_res = srl(sentences=sentences)
srl_res

[{'verbs': [{'verb': 'is',
    'description': 'Sub - module [ARG1: available for the above] [V: is] [ARG2: sent_tokenize] .',
    'tags': ['O',
     'O',
     'O',
     'B-ARG1',
     'I-ARG1',
     'I-ARG1',
     'I-ARG1',
     'B-V',
     'B-ARG2',
     'O']}],
  'words': ['Sub',
   '-',
   'module',
   'available',
   'for',
   'the',
   'above',
   'is',
   'sent_tokenize',
   '.']},
 {'verbs': [{'verb': 'would',
    'description': 'An obvious question in your mind [V: would] be why sentence tokenization is needed when we have the option of word tokenization .',
    'tags': ['O',
     'O',
     'O',
     'O',
     'O',
     'O',
     'B-V',
     'O',
     'O',
     'O',
     'O',
     'O',
     'O',
     'O',
     'O',
     'O',
     'O',
     'O',
     'O',
     'O',
     'O',
     'O']},
   {'verb': 'be',
    'description': '[ARG1: An obvious question in your mind] [ARGM-MOD: would] [V: be] [ARG2: why sentence tokenization is needed when we have the option of word tokenization] .

In [12]:
# Turn the SRL output into a flat list of per-statement role dicts.
# sentence_index holds one entry per statement; it appears to be the index of
# the sentence the statement came from — TODO confirm in extract_roles.
roles, sentence_index = extract_roles(srl_res)
sentence_index

array([0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6, 6, 7, 8],
      dtype=uint32)

In [13]:
# One dict per extracted statement, mapping a role name to its list of tokens.
roles

[{'ARG1': ['available', 'for', 'the', 'above'],
  'ARG2': ['sent_tokenize'],
  'B-V': ['is']},
 {'B-ARGM-MOD': ['would'],
  'ARG1': ['An', 'obvious', 'question', 'in', 'your', 'mind'],
  'ARG2': ['why',
   'sentence',
   'tokenization',
   'is',
   'needed',
   'when',
   'we',
   'have',
   'the',
   'option',
   'of',
   'word',
   'tokenization'],
  'B-V': ['be']},
 {'ARG1': ['sentence', 'tokenization'], 'B-V': ['needed']},
 {'ARGO': ['we'],
  'ARG1': ['the', 'option', 'of', 'word', 'tokenization'],
  'B-V': ['have']},
 {'ARG1': ['you',
   'need',
   'to',
   'count',
   'average',
   'words',
   'per',
   'sentence',
   ',',
   'how',
   'you',
   'will',
   'calculate'],
  'B-V': ['Imagine']},
 {'ARGO': ['you'],
  'ARG1': ['to', 'count', 'average', 'words', 'per', 'sentence'],
  'B-V': ['need']},
 {'ARGO': ['you'],
  'ARG1': ['average', 'words', 'per', 'sentence'],
  'B-V': ['count']},
 {'B-ARGM-MOD': ['will'], 'ARGO': ['you'], 'B-V': ['calculate']},
 {'ARGO': ['you'], 'ARG1': ['s

In [14]:
# Normalize the role tokens. In the displayed output tokens are lower-cased,
# ',' and '_' are removed, and 'words' becomes 'word'.
postproc_roles = postprocess_roles(roles)
postproc_roles

[{'ARG1': ['available', 'for', 'the', 'above'],
  'ARG2': ['senttokenize'],
  'B-V': ['is']},
 {'B-ARGM-MOD': ['would'],
  'ARG1': ['an', 'obvious', 'question', 'in', 'your', 'mind'],
  'ARG2': ['why',
   'sentence',
   'tokenization',
   'is',
   'needed',
   'when',
   'we',
   'have',
   'the',
   'option',
   'of',
   'word',
   'tokenization'],
  'B-V': ['be']},
 {'ARG1': ['sentence', 'tokenization'], 'B-V': ['needed']},
 {'ARGO': ['we'],
  'ARG1': ['the', 'option', 'of', 'word', 'tokenization'],
  'B-V': ['have']},
 {'ARG1': ['you',
   'need',
   'to',
   'count',
   'average',
   'word',
   'per',
   'sentence',
   'how',
   'you',
   'will',
   'calculate'],
  'B-V': ['imagine']},
 {'ARGO': ['you'],
  'ARG1': ['to', 'count', 'average', 'word', 'per', 'sentence'],
  'B-V': ['need']},
 {'ARGO': ['you'],
  'ARG1': ['average', 'word', 'per', 'sentence'],
  'B-V': ['count']},
 {'B-ARGM-MOD': ['will'], 'ARGO': ['you'], 'B-V': ['calculate']},
 {'ARGO': ['you'], 'ARG1': ['such', 'a', '

In [15]:
# Embed the post-processed statements with the SIF_Word2Vec embedder.
# Returns the per-role vectors plus two per-role index dicts, inspected in the
# following cells.
sif_vectors, sif_statements_index, sif_funny_index = compute_embedding(
    sif_w2v,
    statements=postproc_roles,
    used_roles=used_roles,
)

In [16]:
# Per-role arrays of statement indices returned by compute_embedding.
# Presumably the statements that received a vector for that role (the array
# lengths match the per-role vector counts shown below) — TODO confirm.
sif_statements_index

{'ARGO': array([ 3,  5,  6,  7,  8,  9, 10, 11, 16, 17, 19], dtype=uint32),
 'ARG1': array([ 0,  1,  2,  3,  4,  5,  6,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        19], dtype=uint32),
 'ARG2': array([ 1, 12, 15], dtype=uint32),
 'B-V': array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 19], dtype=uint32)}

In [17]:
# Shape of the embedding matrix held for each role.
{role: matrix.shape for role, matrix in sif_vectors.items()}

{'ARGO': (11, 300), 'ARG1': (18, 300), 'ARG2': (3, 300), 'B-V': (19, 300)}

In [18]:
# Per-role indices flagged by compute_embedding. Here only statement 0 is
# flagged, for ARG2, and that index is absent from sif_statements_index['ARG2'].
# Presumably these are roles that could not be embedded — TODO confirm.
sif_funny_index

{'ARGO': array([], dtype=uint32),
 'ARG1': array([], dtype=uint32),
 'ARG2': array([0], dtype=uint32),
 'B-V': array([], dtype=uint32)}

In [19]:
# Inspect the ARG2 tokens of statement 0, the entry flagged in sif_funny_index.
postproc_roles[0]["ARG2"]

['senttokenize']

In [20]:
# Same embedding step, this time with the USE embedder.
# NOTE(review): this call passes the raw `roles`, whereas the SIF call passes
# `postproc_roles` — confirm that skipping post-processing is intentional.
USE_vectors, USE_statements_index, USE_funny_index = compute_embedding(
    use,
    statements=roles,
    used_roles=used_roles,
)


In [21]:
# Per-role arrays of statement indices returned by compute_embedding for USE.
# Unlike the SIF result, ARG2 includes statement 0 here.
USE_statements_index

{'ARGO': array([ 3,  5,  6,  7,  8,  9, 10, 11, 16, 17, 19], dtype=uint32),
 'ARG1': array([ 0,  1,  2,  3,  4,  5,  6,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        19], dtype=uint32),
 'ARG2': array([ 0,  1, 12, 15], dtype=uint32),
 'B-V': array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 19], dtype=uint32)}

In [22]:
# Shape of the USE embedding matrix held for each role.
{role: matrix.shape for role, matrix in USE_vectors.items()}

{'ARGO': (11, 512), 'ARG1': (18, 512), 'ARG2': (4, 512), 'B-V': (19, 512)}

In [23]:
# Per-role flagged indices for the USE embedder; every array is empty here.
USE_funny_index

{'ARGO': array([], dtype=uint32),
 'ARG1': array([], dtype=uint32),
 'ARG2': array([], dtype=uint32),
 'B-V': array([], dtype=uint32)}

In [24]:
# Configure per-role clustering on top of the shared KMeans estimator:
# two clusters each for ARGO / ARG1 / ARG2 and a single cluster for B-V.
clustering = Clustering(
    cluster=kmeans,
    n_clusters={'ARGO': 2, 'ARG1': 2, 'ARG2': 2, 'B-V': 1},
    used_roles=used_roles,
)

In [25]:
# Fit the clustering on the SIF vectors, with sample_size left at None.
clustering.fit(vectors=sif_vectors, sample_size=None)

In [26]:
# Training-time cluster assignment of every vector, per role
# (reads the private _cluster mapping for inspection only).
{role: model.labels_ for role, model in clustering._cluster.items()}

{'ARGO': array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], dtype=int32),
 'ARG1': array([0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0], dtype=int32),
 'ARG2': array([0, 1, 0], dtype=int32),
 'B-V': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       dtype=int32)}

In [27]:
# Predict a cluster id for every SIF vector; the result is one array of
# cluster ids per role.
clustering_res = clustering.predict(vectors=sif_vectors)
clustering_res

{'ARGO': array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], dtype=uint8),
 'ARG1': array([0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0], dtype=uint8),
 'ARG2': array([0, 1, 0], dtype=uint8),
 'B-V': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       dtype=uint8)}

In [28]:
# Derive a label for each cluster using the SIF_Word2Vec embedder; the output
# maps role -> cluster id -> (word, score).
# NOTE(review): the score looks like a similarity value — confirm in Clustering.label.
labels = clustering.label(sif_w2v)
labels

{'ARGO': {0: ('you', 0.9963462948799133), 1: ('president', 0.704796314239502)},
 'ARG1': {0: ('ratio', 0.6853821277618408), 1: ('tokenization', 1.0)},
 'ARG2': {0: ('tokenization', 1.0), 1: ('numeric', 1.0)},
 'B-V': {0: ('apply', 0.7709030508995056)}}

In [29]:
# Combine statements, cluster assignments, cluster labels, and the SIF
# statement indices into a co-occurrence object.
cooc = CoOccurence(
    postproc_roles,
    clustering_res,
    labels,
    sif_statements_index,
    used_roles,
)

In [30]:
# Peek at the table held by the co-occurrence object (private attribute,
# inspection only). Columns are the roles; cells hold cluster ids for the
# clustered roles and raw tokens for B-ARGM-MOD.
cooc._df

Unnamed: 0,ARGO,ARG1,ARG2,B-V,B-ARGM-MOD,B-ARGM-NEG
0,,0.0,,0.0,,
1,,0.0,0.0,0.0,would,
2,,1.0,,0.0,,
3,0.0,1.0,,0.0,,
4,,0.0,,0.0,,
5,0.0,0.0,,0.0,,
6,0.0,0.0,,0.0,,
7,0.0,,,0.0,will,
8,0.0,0.0,,0.0,,
9,0.0,1.0,,0.0,,


In [31]:
# By convention, None means "take all roles"; reading the attribute back
# shows the expanded set of role names.
cooc.subset = None
cooc.subset

{'ARG1', 'ARG2', 'ARGO', 'B-ARGM-MOD', 'B-ARGM-NEG', 'B-V'}

In [32]:
# Narrative counts for the currently selected subset; empty when all roles
# (including ARG2) are selected on this sample.
cooc.narratives_counts

{}

In [33]:
# PMI score per narrative for the current subset (presumably pointwise mutual
# information — TODO confirm); empty here, like the counts.
cooc.narratives_pmi

{}

In [34]:
# Restrict the narrative to every role except ARG2, then count narratives.
cooc.subset = {"ARGO", "ARG1", "B-V", "B-ARGM-MOD", "B-ARGM-NEG"}
cooc.narratives_counts

{('president', 'ratio', ('apply', 'will', True)): 1,
 ('president', 'ratio', ('apply', 'ca', True)): 1}

In [35]:
# PMI score per narrative for the subset without ARG2 (same keys as the
# counts displayed in the previous cell).
cooc.narratives_pmi

{('president', 'ratio', ('apply', 'will', True)): -1.3862943611198906,
 ('president', 'ratio', ('apply', 'ca', True)): -1.3862943611198906}

In [36]:
# Restrict the narrative to the core agent / patient / verb roles, then count.
cooc.subset = {"ARGO", "ARG1", "B-V"}
cooc.narratives_counts

{('you', 'ratio', 'apply'): 4,
 ('president', 'ratio', 'apply'): 4,
 ('you', 'tokenization', 'apply'): 2}

In [37]:
# PMI score per narrative for the ARGO / ARG1 / B-V subset (same keys as the
# counts displayed in the previous cell).
cooc.narratives_pmi

{('you', 'ratio', 'apply'): -4.787491742782046,
 ('president', 'ratio', 'apply'): -4.382026634673881,
 ('you', 'tokenization', 'apply'): -4.0943445622221}