In [109]:
import numpy as np
import os
import pandas as pd

In [6]:
RESULTS_DIR = "results-clean/xlnet-base-cased_pad30_2020-08-07-09-12/"
xlnet_pmi_matrices = np.load(os.path.join(RESULTS_DIR, 'pmi_matrices.npz'))
lstm_logp_matrices = np.load('lstm_logp_matrices.npz')
onlstm_logp_matrices = np.load('onlstm_logp_matrices.npz')

assert len(xlnet_pmi_matrices) == len(lstm_logp_matrices) == len(onlstm_logp_matrices)

We are calculating pmi matrix, where this is

$$\mathrm{PMI}({\bf w}_I;{\bf w}_J\mid {\bf w}_{0:J-1,J+1:I-1}) = 
\log \frac{p({\bf w}_I\mid  {\bf w}_{0:I-1})}
          {p({\bf w}_I\mid {\bf w}_{0:J-1,J+1:I-1})} = \log(unmasked) - \log(masked)$$
where ${\bf w}_{0:J-1,J+1:I-1}$ is the sentence up to ${\bf w}_I$, without ${\bf w}_J$.

--

The logp matrices are interpreted as follows: the value in row $r$, column $c$ is the estimated logp of word c+1 when word $r$ is masked.

that is

- in row $r$ we have masked the token in position $r$.
- in column $c$ are logprobabilities for the word in position $c+1$ (so there is one fewer columns than rows).
- if sentence length (without punctuation) $=n$ tokens, then the logp matrix is of size $(n+2)$-by-$(n+1)$, because an `<eos>` token is appended to the beginning and end of the sentence.


So, the final row in the logp matrix is the row of logps with _no masking_ (bc it is masking last element, which isn't seen by model).  So this is our pmi numerator. The entry at position $r,c$ is the logp of word at position $c+1$ when word at position $r$ is masked, so that gives the numerator for PMI. Note we don't care about predictions of `<eos>`, or masking of `<eos>`, so we will ignore the first row, as well as the last, and also the last column.

For consistency with the other PMI matrices, where rows correspond to target word, columns to masked word, we should transpose this result (so bottom triangle is nonzero).

In [79]:
def pmi_matrix_from_logp_matrix(input_logp_matrix, transpose=True):
    """Convert nparray of logps to pmi matrix"""
    logp_matrix = input_logp_matrix[:,:-1] # drop last column
    logp_unmasked = logp_matrix[-1] # last row is unmasked logps
    # dropping first and last row, subtract from unmasked logps
    pmimat = logp_unmasked - logp_matrix[1:-1]
    if transpose:
        pmimat = pmimat.T
    return(pmimat)

print('for example')
with np.printoptions(precision=2, suppress=True):
    testsen = lstm_logp_matrices.files[1276]
    testmat = lstm_logp_matrices[testsen]
    print(
        testsen, 
        pmi_matrix_from_logp_matrix(testmat), 
        sep='\n')

for example
1276 <eos> source telerate systems inc <eos>
[[ 0.    0.    0.    0.  ]
 [-0.06  0.    0.    0.  ]
 [ 0.35  3.09  0.    0.  ]
 [ 0.68  2.35  2.42  0.  ]]


In [107]:
counter=0
for i_sen in lstm_logp_matrices:
    if len(i_sen.split(' ')) < 6 or len(i_sen.split(' ')) > 7:
        continue
    counter+=1
    logp_matrix = lstm_logp_matrices[i_sen]
    print(logp_matrix.shape)
    pmi_matrix = pmi_matrix_from_logp_matrix(logp_matrix)
    with np.printoptions(precision=2, suppress=True):
        print(pmi_matrix, i_sen)
    if counter==4:
        break

(6, 5)
[[0.   0.   0.   0.  ]
 [1.82 0.   0.   0.  ]
 [1.16 3.27 0.   0.  ]
 [2.44 3.46 4.05 0.  ]] 69 <eos> terms were n't disclosed <eos>
(6, 5)
[[ 0.    0.    0.    0.  ]
 [ 3.74  0.    0.    0.  ]
 [-0.59 -0.47  0.    0.  ]
 [ 0.13 -0.05  0.2   0.  ]] 102 <eos> i love 'em both <eos>
(5, 4)
[[ 0.    0.    0.  ]
 [ 0.54  0.    0.  ]
 [-1.49 -0.11  0.  ]] 123 <eos> that 's baseball <eos>
(6, 5)
[[0.   0.   0.   0.  ]
 [1.91 0.   0.   0.  ]
 [2.4  2.   0.   0.  ]
 [0.52 0.53 1.25 0.  ]] 179 <eos> it 's very frustrating <eos>


In [96]:
i=1276
print("Dev sentence number", i, '\n')
with np.printoptions(precision=2, suppress=True):
    for npz in [lstm_logp_matrices, lstm_logp_matrices]:
        sentence = npz.files[i]
        idx = sentence.split(' ')[0]
        assert int(idx) == i
        matrix = npz[sentence]
        print(sentence, np.shape(matrix), matrix, sep='\n')
        print()

Dev sentence number 1276 

1276 <eos> source telerate systems inc <eos>
(6, 5)
[[ -9.82 -11.66  -5.44  -2.26  -9.53]
 [-10.4  -11.27  -5.67  -2.9   -7.43]
 [-10.4  -11.33  -8.41  -4.57  -7.85]
 [-10.4  -11.33  -5.33  -4.64  -8.2 ]
 [-10.4  -11.33  -5.33  -2.21  -3.51]
 [-10.4  -11.33  -5.33  -2.21  -9.55]]

1276 <eos> source telerate systems inc <eos>
(6, 5)
[[ -9.82 -11.66  -5.44  -2.26  -9.53]
 [-10.4  -11.27  -5.67  -2.9   -7.43]
 [-10.4  -11.33  -8.41  -4.57  -7.85]
 [-10.4  -11.33  -5.33  -4.64  -8.2 ]
 [-10.4  -11.33  -5.33  -2.21  -3.51]
 [-10.4  -11.33  -5.33  -2.21  -9.55]]



In [125]:
s1 = pd.Series(lstm_logp_matrices.files, name='simplified')
s2 = pd.Series(xlnet_pmi_matrices.files, name='not simplified')
sent_comparison = pd.concat([s1, s2], axis=1)
sent_comparison.loc[390:400]

Unnamed: 0,simplified,not simplified
390,390 <eos> the fdic which in N provided N milli...,"390 The FDIC , which in 1986 provided $ 130 mi..."
391,391 <eos> in exchange for the other N million ...,"391 In exchange for the other $ 40 million , t..."
392,392 <eos> in exchange for the N million they a...,392 In exchange for the $ 76 million they are ...
393,393 <eos> also under the agreement debenture h...,"393 Also under the agreement , debenture holde..."
394,394 <eos> bear stearns 's chief economist lawr...,"394 Bear Stearns 's chief economist , Lawrence..."
395,395 <eos> were it true that a weak currency pa...,395 Were it true that a weak currency paves th...
396,396 <eos> bsn corp. said it will begin an offe...,396 BSN Corp. said it will begin an offer tomo...
397,397 <eos> under terms of the offer the sportin...,"397 Under terms of the offer , the sporting go..."
398,398 <eos> each warrant allows the holder to bu...,398 Each warrant allows the holder to buy one ...
399,399 <eos> bsn currently has N.N million common...,399 BSN currently has 4.6 million common share...
