In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import ndcg_score
from textsimilarity.get_similarity import get_sent_similarity, get_graph_similarity
from textsimilarity.preprocess import get_coref_and_dp
from textsimilarity.get_w2v_features import get_word2vec_embeddings
from textsimilarity.build_semantic_graph.build_semantic_graph import get_graph_from_sent
from textsimilarity import draw_graph
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jiajinghu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [2]:
# Load AllenNLP's pretrained BERT-based semantic role labelling (SRL) model from its public archive.
# NOTE(review): the recorded run of this cell failed with
#   'Missing key(s) in state_dict: "bert_model.embeddings.position_ids"'
# presumably an allennlp/transformers version mismatch with this 2020.03.24 archive; TODO confirm
# and pin compatible versions. `srl()` further down needs `predictor` to exist.
from allennlp.predictors.predictor import Predictor
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/bert-base-srl-2020.03.24.tar.gz")


RuntimeError: Error(s) in loading state_dict for SrlBert:
	Missing key(s) in state_dict: "bert_model.embeddings.position_ids". 

In [5]:
import spacy
from spacy.tokens import Token

# Custom per-token attributes that hold the ARG0 / ARG1 spans found for a verb token.
# force=True keeps this cell re-runnable: without it spaCy raises a ValueError
# when the extension has already been registered in the running kernel.
Token.set_extension('srl_arg0', default=None, force=True)
Token.set_extension('srl_arg1', default=None, force=True)

In [2]:
# Sentence encoder used further down to embed whole sentences for the `bert_sim` baseline.
from sentence_transformers import SentenceTransformer
import sentence_transformers
# Pretrained checkpoint selected by name.
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

In [6]:
# spaCy English pipeline; `srl()` uses it to tokenise sentences and read each token's POS tag.
nlp = spacy.load("en_core_web_sm")

In [7]:
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Args:
        vec1: 1-D array-like of numbers.
        vec2: 1-D array-like of numbers, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``. When either vector has
        zero norm the cosine is undefined; ``0.0`` is returned in that case
        instead of letting the 0/0 division produce ``nan`` and a
        RuntimeWarning.
    """
    dot = np.dot(vec1, vec2)
    denom = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denom == 0:
        # At least one all-zero vector: no direction to compare.
        return 0.0
    return dot / denom


In [8]:
def _srl_span_end(tags, role, start):
    """Return the exclusive end index of the ``role`` span whose ``B-`` tag is at ``start``."""
    inside = "I-" + role
    end = start + 1
    # Only follow the contiguous run of I-<role> tags; a stray or second span
    # later in the sentence must not be merged into this one.
    while end < len(tags) and tags[end] == inside:
        end += 1
    return end


def srl(sent):
    """Print the ARG0 / ARG1 spans the SRL model finds for every verb in ``sent``.

    Uses the notebook-level ``nlp`` (spaCy pipeline) and ``predictor``
    (AllenNLP SRL predictor). For each token whose ``pos_`` is ``"VERB"`` the
    model is run with a one-hot verb indicator; the first ``ARG0`` and ``ARG1``
    spans in the returned tag sequence are stored on that token as the
    ``srl_arg0`` / ``srl_arg1`` extensions, then one block per verb is printed.

    Args:
        sent: the sentence text to analyse.

    Returns:
        None. Results are printed.

    NOTE(review): relies on the predictor's private ``_dataset_reader`` and
    ``_model`` attributes, so it is sensitive to the installed allennlp version.
    """
    doc = nlp(sent)
    for position, token in enumerate(doc):
        if token.pos_ != "VERB":
            continue

        # One-hot indicator telling the model which token is the predicate.
        verb_labels = [0] * len(doc)
        verb_labels[position] = 1
        instance = predictor._dataset_reader.text_to_instance(doc, verb_labels)
        output = predictor._model.forward_on_instance(instance)
        tags = output['tags']

        for role, attribute in (("ARG0", "srl_arg0"), ("ARG1", "srl_arg1")):
            begin_tag = "B-" + role
            if begin_tag in tags:
                start = tags.index(begin_tag)
                end = _srl_span_end(tags, role, start)
                token._.set(attribute, doc[start:end])

    for w in doc:
        if w.pos_ == "VERB":
            print("ARG0:", w._.srl_arg0)
            print("VERB:", w)
            print("ARG1:", w._.srl_arg1)
            print("-----------------")

In [6]:
# Reference sentence used by the SRL demo and by the query-vs-sentence similarity cells.
# Note: the opening triple quote is followed by a fourth `"`, so the string itself
# begins with a double-quote character and contains embedded newlines.
query = """"What might be supporting the dollar: expectations of Fed rate hikes have really increased over the last week 
or so," said Daniel Ghali, commodity strategist at TD Securities, after new Chair Jerome Powell's testimony indicated 
his optimistic outlook on the U.S. economy and opened the door for four interest rate hikes.
"""

In [7]:
# Print the ARG0 / VERB / ARG1 blocks for every verb in the query sentence.
srl(query)

ARG0: None
VERB: might
ARG1: None
-----------------
ARG0: None
VERB: be
ARG1: None
-----------------
ARG0: What
VERB: supporting
ARG1: the dollar
-----------------
ARG0: None
VERB: have
ARG1: None
-----------------
ARG0: None
VERB: increased
ARG1: expectations of Fed rate hikes
-----------------
ARG0: Daniel Ghali, commodity strategist at TD Securities
VERB: said
ARG1: What might be supporting the dollar: expectations of Fed rate hikes have really increased over the last week 
or so
-----------------
ARG0: new Chair Jerome Powell's testimony
VERB: indicated
ARG1: 
his optimistic outlook on the U.S. economy
-----------------
ARG0: new Chair Jerome Powell's testimony
VERB: opened
ARG1: the door
-----------------


In [8]:
# Load the sentence pairs for query 1. Per the preview, each row holds `sentence1`, `sentence2`
# and four similarity labels (whole / subject / action / object); presumably these are
# human annotations — TODO confirm.
q1_sents = pd.read_csv('../data/financial_news/interim/query_1_sentences_random.csv')
# q1_sents = q1_sents.rename(columns = {'Sentence':'sentences'})
q1_sents.head()

Unnamed: 0,sentence1,sentence2,whole_sim,subject_sim,action_sim,obj_sim
0,He has generally supported the existing outloo...,"""As we approach next week's FOMC day, we shoul...",0,0,0,0.0
1,He has generally supported the existing outloo...,Treasury yields shrank further on Wednesday af...,3,0,0,4.0
2,He has generally supported the existing outloo...,TD Securities: Bearish view on the US dollar 1...,0,0,0,0.0
3,He has generally supported the existing outloo...,The U.S. economy should grow at a good clip th...,0,0,0,1.0
4,"""It shows how sensitive the markets are around...","""It seems like the market is positioned for so...",3,4,5,4.0


In [3]:
# NDCG of the `srl_sim` ranking against the `whole_sim` labels (y_true first, scores second).
# NOTE(review): no visible cell creates `srl_sim` and the freshly loaded frame has no such
# column, so the recorded value comes from an earlier kernel state — TODO restore or remove.
ndcg_score(np.asarray([q1_sents.whole_sim.values.tolist()]), np.asarray([q1_sents.srl_sim.values.tolist()]))

0.6798728627737926

In [9]:
# Correlation-coefficient matrix between the `whole_sim` labels and `srl_sim`.
# NOTE(review): the recorded run raised AttributeError because `srl_sim` is not a column of `q1_sents`.
np.corrcoef(q1_sents.whole_sim.values.tolist(), q1_sents.srl_sim.values.tolist())

AttributeError: 'DataFrame' object has no attribute 'srl_sim'

In [10]:
# Pull the two sides of each sentence pair out as arrays, ready for batch encoding.
sentences1 = q1_sents.sentence1.values
sentences2 = q1_sents.sentence2.values

In [11]:
# Encode both sides with the sentence-transformer model; the next cell pairs
# embeddings1[i] with embeddings2[i], i.e. one pair per row of `q1_sents`.
embeddings1 = model.encode(sentences1)
embeddings2 = model.encode(sentences2)

In [12]:
# Cosine similarity between the two sentence embeddings of every row.
q1_sents['bert_sim'] = [
    cosine_similarity(emb1, emb2)
    for emb1, emb2 in zip(embeddings1, embeddings2)
]
q1_sents.head()

Unnamed: 0,sentence1,sentence2,whole_sim,subject_sim,action_sim,obj_sim,bert_sim
0,He has generally supported the existing outloo...,"""As we approach next week's FOMC day, we shoul...",0,0,0,0.0,0.426786
1,He has generally supported the existing outloo...,Treasury yields shrank further on Wednesday af...,3,0,0,4.0,0.541551
2,He has generally supported the existing outloo...,TD Securities: Bearish view on the US dollar 1...,0,0,0,0.0,0.303528
3,He has generally supported the existing outloo...,The U.S. economy should grow at a good clip th...,0,0,0,1.0,0.629319
4,"""It shows how sensitive the markets are around...","""It seems like the market is positioned for so...",3,4,5,4.0,0.535848


In [68]:
# Inspect a single sentence by position.
# NOTE(review): the frame loaded in this notebook has `sentence1` / `sentence2` columns, not
# `sentences`; this cell (In [68]) looks left over from an earlier single-column version of
# the data — TODO confirm which column is meant.
q1_sents.sentences.iloc[3]

'"I think you\'re going to continue to see higher rates until Powell shoots something across the bow. He either gets rates going higher or he slows it down," said Andrew Brenner of National Alliance.'

In [79]:
# NDCG of the `bert_sim` ranking against the `whole_sim` labels (y_true first, scores second).
ndcg_score(np.asarray([q1_sents.whole_sim.values.tolist()]), np.asarray([q1_sents.bert_sim.values.tolist()]))

0.8827762878195183

In [13]:
# Correlation-coefficient matrix between the `whole_sim` labels and `bert_sim`.
np.corrcoef(q1_sents.whole_sim.values.tolist(), q1_sents.bert_sim.values.tolist())

array([[1.        , 0.31470882],
       [0.31470882, 1.        ]])

In [14]:
# query_coref = get_coref_and_dp.get_neural_coreference(query)
query_graph, _,  query_coref= get_graph_from_sent(query)
query_w2v = get_word2vec_embeddings(' '.join(query_coref), with_coref=False)


NameError: name 'query' is not defined

In [15]:
# Score every (sentence1, sentence2) pair three ways and store each as a new column:
#   wms            - get_sent_similarity on the coref tokens + word2vec features only
#   graph_sim_0    - same call with both semantic graphs, threshold=0
#   graph_sim_0.9  - same call with both semantic graphs, threshold=0.9
wms_sents, graph_sents0, graph_sents1 = [], [], []
for sent1, sent2 in zip(q1_sents.sentence1, q1_sents.sentence2):
    graph1, _, coref1 = get_graph_from_sent(sent1)
    w2v1 = get_word2vec_embeddings(' '.join(coref1), with_coref=False)

    graph2, _, coref2 = get_graph_from_sent(sent2)
    w2v2 = get_word2vec_embeddings(' '.join(coref2), with_coref=False)

    wms_sents.append(get_sent_similarity(coref1, coref2, w2v1, w2v2))
    for bucket, threshold in ((graph_sents0, 0), (graph_sents1, 0.9)):
        bucket.append(
            get_sent_similarity(coref1, coref2, w2v1, w2v2, graph1, graph2, 0, threshold=threshold)
        )

for column, values in (
    ('wms', wms_sents),
    ('graph_sim_0', graph_sents0),
    ('graph_sim_0.9', graph_sents1),
):
    q1_sents[column] = values

q1_sents.head()

  node_features.append(embeddings[[node['index']]].mean(axis=0))


Unnamed: 0,sentence1,sentence2,whole_sim,subject_sim,action_sim,obj_sim,bert_sim,wms,graph_sim_0,graph_sim_0.9
0,He has generally supported the existing outloo...,"""As we approach next week's FOMC day, we shoul...",0,0,0,0.0,0.426786,0.662154,3.074411,2.430974
1,He has generally supported the existing outloo...,Treasury yields shrank further on Wednesday af...,3,0,0,4.0,0.541551,0.554631,1.612061,1.612061
2,He has generally supported the existing outloo...,TD Securities: Bearish view on the US dollar 1...,0,0,0,0.0,0.303528,0.759466,0.07613,0.07613
3,He has generally supported the existing outloo...,The U.S. economy should grow at a good clip th...,0,0,0,1.0,0.629319,0.779174,2.597353,2.326073
4,"""It shows how sensitive the markets are around...","""It seems like the market is positioned for so...",3,4,5,4.0,0.535848,0.861834,3.134896,1.936227


In [88]:
# NDCG of the `wms` ranking against the `whole_sim` labels.
ndcg_score(np.asarray([q1_sents.whole_sim.values.tolist()]), np.asarray([q1_sents.wms.values.tolist()]))

0.7815523535956608

In [89]:
# NDCG of the `graph_sim_0` ranking (graph similarity, threshold=0) against the `whole_sim` labels.
ndcg_score(np.asarray([q1_sents.whole_sim.values.tolist()]), np.asarray([q1_sents.graph_sim_0.values.tolist()]))

0.7519870780795614

In [90]:
# NDCG of the `graph_sim_0.9` ranking (graph similarity, threshold=0.9) against the `whole_sim` labels.
# Bracket access is required because the column name contains a dot.
ndcg_score(np.asarray([q1_sents.whole_sim.values.tolist()]), np.asarray([q1_sents['graph_sim_0.9'].values.tolist()]))

0.7247981349632608

In [16]:
# Correlation-coefficient matrix between the `whole_sim` labels and `wms`.
np.corrcoef(q1_sents.whole_sim.values.tolist(), q1_sents.wms.values.tolist())

array([[1.        , 0.03441451],
       [0.03441451, 1.        ]])

In [17]:
# Correlation-coefficient matrix between the `whole_sim` labels and `graph_sim_0`.
np.corrcoef(q1_sents.whole_sim.values.tolist(), q1_sents.graph_sim_0.values.tolist())

array([[1.        , 0.00430855],
       [0.00430855, 1.        ]])

In [18]:
# Correlation-coefficient matrix between the `whole_sim` labels and `graph_sim_0.9`.
np.corrcoef(q1_sents.whole_sim.values.tolist(), q1_sents['graph_sim_0.9'].values.tolist())

array([[ 1.        , -0.02064296],
       [-0.02064296,  1.        ]])

In [12]:
# Print each sentence followed by its SRL output, separated by two blank lines.
# NOTE(review): `sentences` is not defined in any visible cell (only `sentences1` / `sentences2`
# are); it presumably survives from an earlier kernel state — TODO define it explicitly.
for sent in sentences:
    print(sent)
    srl(sent)
    print()
    print()

He has generally supported the existing outlook by the Fed for three rate hikes but if the stimulus from tax cuts boosts both growth and inflation, Powell could soon face a question of whether he needs to do more or do it faster.
ARG0: None
VERB: has
ARG1: None
-----------------
ARG0: He
VERB: supported
ARG1: the existing outlook by the Fed
-----------------
ARG0: None
VERB: existing
ARG1: outlook
-----------------
ARG0: the stimulus from tax cuts
VERB: boosts
ARG1: both growth and inflation
-----------------
ARG0: None
VERB: could
ARG1: None
-----------------
ARG0: Powell
VERB: face
ARG1: a question of whether he needs to do more or do it faster
-----------------
ARG0: he
VERB: needs
ARG1: to do more or do it faster
-----------------
ARG0: he
VERB: do
ARG1: more
-----------------
ARG0: he
VERB: do
ARG1: it
-----------------


But while the immediate horizon appears clear, Powell faces the risk that today's sturdy economy, low inflation and rising stock market could reverse course in t

ARG0: None
VERB: should
ARG1: None
-----------------
ARG0: None
VERB: grow
ARG1: The U.S. economy
-----------------
ARG0: by tax cuts
VERB: boosted
ARG1: The U.S. economy
-----------------
ARG0: two Fed officials
VERB: said
ARG1: None
-----------------
ARG0: they
VERB: disagreed
ARG1: None
-----------------
ARG0: None
VERB: is
ARG1: the economy
-----------------
ARG0: None
VERB: overheating
ARG1: None
-----------------


 Gold prices held on to losses from the previous session early on Thursday, after the precious metal fell to four-week lows on a firmer dollar amid expectations of more U.S. interest rate hikes.
ARG0: None
VERB: held
ARG1:  Gold prices
-----------------
ARG0: None
VERB: fell
ARG1: the precious metal
-----------------


The dramatic moves in equities and bonds this week were stoked by concerns about signs of inflation amid an improving global backdrop and speculation whether the Federal Reserve and other major central banks would act quicker to raise interest rates.
ARG

ARG0: None
VERB: is
ARG1: None
-----------------
ARG0: None
VERB: expected
ARG1: investment-grade corporate supply
-----------------
ARG0: None
VERB: ’re
ARG1: None
-----------------
ARG0: we
VERB: seeing
ARG1: some of that come through this week
-----------------
ARG0: None
VERB: come
ARG1: some of that
-----------------
ARG0: Priya Misra, head of global rates strategy at TD Securities in New York
VERB: said
ARG1: There’s a lot of investment-grade corporate supply that is normally expected in January, and we’re seeing some of that come through this week
-----------------


TD Securities: Bearish view on the US dollar 11 Hours Ago U.S. Treasury yields are being driven by global economic growth rather than the dollar, Richard Kelly, head of global strategy at TD Securities, said.
ARG0: None
VERB: are
ARG1: None
-----------------
ARG0: None
VERB: being
ARG1: None
-----------------
ARG0: by global economic growth rather than the dollar
VERB: driven
ARG1: U.S. Treasury yields
-------------

ARG0: you
VERB: get
ARG1: much market reaction to this
-----------------
ARG0: Daniel Ghali, commodities strategist at TD Securities in Toronto
VERB: said
ARG1: Historically you donâ€™t really get much market reaction to this
-----------------
ARG0: Daniel Ghali, commodities
VERB: strategist
ARG1: TD Securities
-----------------


"I would expect the recent correction in equity prices to show a little bit more bid (in gold), but at the moment it doesn't seem to be the case," said Daniel Ghali, commodities strategist at TD Securities in Toronto
ARG0: None
VERB: would
ARG1: None
-----------------
ARG0: I
VERB: expect
ARG1: the recent correction in equity prices to show a little bit more bid (in gold)
-----------------
ARG0: the recent correction in equity prices
VERB: show
ARG1: a little bit more bid (in gold)
-----------------
ARG0: None
VERB: does
ARG1: None
-----------------
ARG0: None
VERB: seem
ARG1: to be the case
-----------------
ARG0: None
VERB: be
ARG1: it
-----------------
ARG

In [13]:
# Hand-kept tallies derived from reading the SRL output above; presumably each `1` stands for
# one sentence counted in that category — TODO confirm the counting rule.
original_mention = 1+1+1+1+1+1+1+1+1+1+1+1+1+1+1+1+1+1+1
original_mention_srl_no_mention = 1+1+1+1
increase_verb_interest_rate_arg1 = 1+1+1
# NOTE(review): this tally is defined but not included in the printout cell that follows.
increase_verb_expectation_interest_rate_arg1 = 1+1+1

expectation_rate_increase_arg0 = 1
# NOTE(review): "increaes" is a typo for "increase"; the printout cell references the name as
# spelled here, so both places must be renamed together.
expectation_rate_increaes_arg1 = 1+1+1+1+1
increase_rates_arg1 = 1+1+1+1+1

In [14]:
# Report the manual tallies, one "label  value" line per category.
tallies = (
    ('original_mention: ', original_mention),
    ('original_mention_srl_no_mention: ', original_mention_srl_no_mention),
    ('increase_verb_interest_rate_arg1: ', increase_verb_interest_rate_arg1),
    ('expectation_rate_increase_arg0: ', expectation_rate_increase_arg0),
    ('expectation_rate_increaes_arg1: ', expectation_rate_increaes_arg1),
    ('increase_rates_arg1: ', increase_rates_arg1),
)
for label, count in tallies:
    print(label, count)


original_mention:  19
original_mention_srl_no_mention:  4
increase_verb_interest_rate_arg1:  3
expectation_rate_increase_arg0:  1
expectation_rate_increaes_arg1:  5
increase_rates_arg1:  5


In [15]:
# Share of the 19 original mentions; presumably original_mention_srl_no_mention / original_mention — TODO confirm.
4/19

0.21052631578947367

In [16]:
# Share of the 19 original mentions; two tallies above equal 3, so which one is meant is unclear — TODO confirm.
3/19

0.15789473684210525

In [17]:
# Share of the 19 original mentions; two tallies above equal 5, so which one is meant is unclear — TODO confirm.
5/19

0.2631578947368421

In [30]:
# Three probe sentences: a statement, its negation, and its antonym.
txt = [
    'The interest rate will increase',
    'The interest rate will not increase',
    'The interest rate will decrease',
]

In [4]:
# Score each probe sentence against the query: once without graphs, then with both
# semantic graphs at threshold 0 and at threshold 0.9.
wms_txt, graph_txt0, graph_txt1 = [], [], []
for sent in txt:
    graph, _, coref = get_graph_from_sent(sent)
    w2v = get_word2vec_embeddings(' '.join(coref), with_coref=False)
    wms_txt.append(get_sent_similarity(query_coref, coref, query_w2v, w2v))
    for bucket, threshold in ((graph_txt0, 0), (graph_txt1, 0.9)):
        bucket.append(
            get_sent_similarity(query_coref, coref, query_w2v, w2v, query_graph, graph, 0, threshold=threshold)
        )

In [31]:
def _sentence_features(sentence):
    """Return ``(graph, coref, w2v)`` for one sentence.

    ``graph`` and ``coref`` are the first and third values returned by
    ``get_graph_from_sent``; ``w2v`` is ``get_word2vec_embeddings`` applied to
    the space-joined ``coref`` tokens with ``with_coref=False``.
    """
    graph, _, coref = get_graph_from_sent(sentence)
    w2v = get_word2vec_embeddings(' '.join(coref), with_coref=False)
    return graph, coref, w2v


# One helper call per probe sentence instead of three copy-pasted stanzas. The unused middle
# return value now stays local to the helper, so the notebook-level `_` is not reassigned.
graph0, coref0, w2v0 = _sentence_features(txt[0])
graph1, coref1, w2v1 = _sentence_features(txt[1])
graph2, coref2, w2v2 = _sentence_features(txt[2])

In [32]:
# Similarity without graphs: txt[0] ("will increase") vs txt[1] ("will not increase").
get_sent_similarity(coref0, coref1, w2v0, w2v1)

0.9617747217416763

In [33]:
# Similarity without graphs: txt[0] ("will increase") vs txt[2] ("will decrease").
get_sent_similarity(coref0, coref2, w2v0, w2v2)

0.9553444087505341

In [34]:
# Similarity without graphs: txt[1] ("will not increase") vs txt[2] ("will decrease").
get_sent_similarity(coref1, coref2, w2v1, w2v2)

0.9264016300439835

In [35]:
# Graph-based similarity (threshold=0): txt[0] ("will increase") vs txt[1] ("will not increase").
get_sent_similarity(coref0, coref1, w2v0, w2v1, graph0, graph1, 0, threshold=0)

  node_features.append(embeddings[[node['index']]].mean(axis=0))


0.916428416967392

In [36]:
# Graph-based similarity (threshold=0): txt[0] ("will increase") vs txt[2] ("will decrease").
get_sent_similarity(coref0, coref2, w2v0, w2v2, graph0, graph2, 0, threshold=0)

0.9341770708560944

In [37]:
# Graph-based similarity (threshold=0): txt[1] ("will not increase") vs txt[2] ("will decrease").
get_sent_similarity(coref1, coref2, w2v1, w2v2, graph1, graph2, 0, threshold=0)

0.900338351726532

In [63]:
# Split the sentence embeddings into two k-means clusters (seed fixed for repeatability)
# and show the cluster label assigned to each row.
# NOTE(review): `embeddings` is not defined in any visible cell (only `embeddings1` /
# `embeddings2` are); presumably left over from an earlier single-column run — TODO confirm.
kmeans = KMeans(n_clusters=2, random_state=0).fit(embeddings)
kmeans.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int32)

In [64]:
# Sentences assigned to cluster 1.
# NOTE(review): the frame loaded in this notebook has no `sentences` column — TODO confirm the intended column.
q1_sents[kmeans.labels_==1].sentences.values

array(['What might be supporting the dollar: expectations of Fed rate hikes have really increased over the last week or so," said Daniel Ghali, commodity strategist at TD Securities, after new Chair Jerome Powell\'s testimony indicated his optimistic outlook on the U.S. economy and opened the door for four interest rate hikes. ',
       'He has generally supported the existing outlook by the Fed for three rate hikes but if the stimulus from tax cuts boosts both growth and inflation, Powell could soon face a question of whether he needs to do more or do it faster.',
       "But while the immediate horizon appears clear, Powell faces the risk that today's sturdy economy, low inflation and rising stock market could reverse course in the coming months or years, forcing him to come up with the right remedies.",
       '"I think you\'re going to continue to see higher rates until Powell shoots something across the bow. He either gets rates going higher or he slows it down," said Andrew Brenn

In [65]:
# Mean `whole_sim` label of the rows in cluster 1.
q1_sents[kmeans.labels_==1].whole_sim.mean()

1.5833333333333333

In [66]:
# Sentences assigned to cluster 0.
# NOTE(review): the frame loaded in this notebook has no `sentences` column — TODO confirm the intended column.
q1_sents[kmeans.labels_==0].sentences.values

array(['"It shows how sensitive the markets are around the discussion on U.S. dollar and trade in general," said Mark McCormick, North America head of foreign exchange strategy at TD Securities.',
       '"It\'s not good or bad. It\'s a surprise and markets aren\'t priced for it. There\'s this pivot away from the U.S. dollar bull market. Part of it is driven by global reflation and part of it is driven by Trump, " said Mark McCormick, North American head of foreign exchange strategy at TD Securities.',
       'The report "added pressure on the U.S. dollar and helped gold," said Bart Melek, head of commodity strategy at TD Securities in Toronto. ',
       'TD Securities: Bearish view on the US dollar 11 Hours Ago U.S. Treasury yields are being driven by global economic growth rather than the dollar, Richard Kelly, head of global strategy at TD Securities, said.',
       '"Obviously, equity markets stabilized a bit into the North American session, but the dollar is starting to stage a li

In [67]:
# Mean `whole_sim` label of the rows in cluster 0.
q1_sents[kmeans.labels_==0].whole_sim.mean()

0.5882352941176471

In [47]:
# Symmetric matrix of pairwise cosine similarities between sentence embeddings:
# ones on the diagonal, each off-diagonal pair computed once and mirrored.
n_sents = q1_sents.shape[0]
bert_sim_matrix = np.identity(n_sents)
for i in range(n_sents):
    for j in range(i + 1, n_sents):
        pair_sim = cosine_similarity(embeddings[i], embeddings[j])
        bert_sim_matrix[i, j] = pair_sim
        bert_sim_matrix[j, i] = pair_sim
bert_sim_matrix

array([[1.        , 0.5873307 , 0.34986106, ..., 0.51824355, 0.43585911,
        0.47856343],
       [0.5873307 , 1.        , 0.55838662, ..., 0.42678618, 0.36078858,
        0.4242098 ],
       [0.34986106, 0.55838662, 1.        , ..., 0.34495449, 0.37720671,
        0.32313287],
       ...,
       [0.51824355, 0.42678618, 0.34495449, ..., 1.        , 0.58515263,
        0.53605115],
       [0.43585911, 0.36078858, 0.37720671, ..., 0.58515263, 1.        ,
        0.64701819],
       [0.47856343, 0.4242098 , 0.32313287, ..., 0.53605115, 0.64701819,
        1.        ]])