In [74]:
import pickle
import numpy as np
from util import tfidf_score, textrank_info, remove_dict_val_less_than_K, sort_dict_by_value

In [2]:
with open('news_list.pkl', "rb") as f:
    news_list = pickle.load(f)

In [3]:
content_list = [news['content'] for news in news_list]
tfidf_dict_list = tfidf_score(content_list)
tfidf_dict = remove_dict_val_less_than_K(tfidf_dict_list[0], 0)

In [4]:
G, vocab, tr_dict = textrank_info(content_list[0])

In [5]:
G

array([[0., 1., 1., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 1.],
       [0., 0., 0., ..., 1., 0., 1.],
       [0., 0., 0., ..., 1., 1., 0.]])

In [6]:
def group_by_std(tfidf_dict, lower, upper):
    g_low = dict()
    g_mid = dict()
    g_hig = dict()

    for kw in tfidf_dict:
        kw_score = tfidf_dict[kw]
        if kw_score < lower:
            g_low[kw] = kw_score
        elif kw_score > upper:
            g_hig[kw] = kw_score
        else:
            g_mid[kw] = kw_score
    return g_low, g_mid, g_hig

def expect_boundary(g_mid, tr_dict):
    g_mid_kw = [kw for kw in g_mid]
    g_mid_tr_score = []
    for kw in g_mid_kw:
        g_mid_tr_score.append(tr_dict[kw])

    s_min = min(g_mid_tr_score)
    s_max = max(g_mid_tr_score)
    return s_min, s_max

def creat_expect_dict(g_low, g_hig, g_mid, s_min, s_max):
    expect_dict = dict()
    for kw in g_low:
        if tr_dict[kw] > s_min:
            expect_dict[kw] = s_min
        else:
            expect_dict[kw] = np.nan
    for kw in g_hig:
        if tr_dict[kw] < s_max:
            expect_dict[kw] = s_max
        else:
            expect_dict[kw] = np.nan
    for kw in g_mid:
        expect_dict[kw] = np.nan
    return expect_dict

def get_expect_dict(tfidf_dict, tr_dict):
    tfidf_dict = remove_dict_val_less_than_K(tfidf_dict)
    std_tfidf = np.std(  [tfidf_dict[k] for k in tfidf_dict])
    avg_tfidf = np.array([tfidf_dict[k] for k in tfidf_dict]).mean()
    print("std:{:.5f},\t avg:{:.5f}".format(std_tfidf, avg_tfidf))

    lower = avg_tfidf-(0.3 * std_tfidf)
    upper = avg_tfidf+(0.6 * std_tfidf)
    g_low, g_mid, g_hig = group_by_std(tfidf_dict, lower, upper)

    s_min, s_max = expect_boundary(g_mid, tr_dict)
    expect_dict = creat_expect_dict(g_low, g_hig, g_mid, s_min, s_max)
    return expect_dict

In [58]:
# ## 1. Compute the difference between expected score and current score
# 
# There are two cases [Eq.35]

def find_vocab_by_idx(vocab, idx):
    return list(vocab.items())[nk_idx][0] # ex:('storm',31) only get the node part

def get_g_norm(G):
    norm = np.sum(G, axis=0)
    g_norm = np.divide(G, norm, where=norm!=0) # this is ignore the 0 element in norm
    return g_norm

def calculateDifferentials_init(expect_dict, textrank_dict):
    difference_dict = dict()
    for node, T_j in expect_dict.items():
        if(T_j is not np.nan):
            A_j = textrank_dict[node]
            d_j = T_j - A_j
            difference_dict[node] = d_j
        else:
            difference_dict[node] = 0
    return difference_dict

def calculateDifferentials(G, expect_dict, difference_dict):
    g_norm = get_g_norm(G)

    TEXTRANK_DAMPING_FACTOR = 0.85
    is_converge = False
    while(is_converge == False):
        is_converge = True
        for node, T_j in expect_dict.items():
            if(T_j == np.nan):
                previous_d_j = difference_dict[node]
                d_j = 0
                node_idx = vocab[node]
                node_k_idxs = np.where(G[node_idx] != 0)[0]
                for nk_idx in node_k_idxs:
                    node_k = find_vocab_by_idx(vocab, nk_idx) 
                    d_j += difference_dict[node_k] * g_norm[node_idx][nk_idx]
                d_j *= TEXTRANK_DAMPING_FACTOR
                difference_dict[node] = round(d_j,6)
                if(previous_d_j != d_j):
                    is_converge = False
    return difference_dict

In [50]:
# ## 2. Calculate delta weight [Eq.36]
def creat_np_array(weight_dict, vocab, transpose=False):
    if transpose:
        return np.array([weight_dict[v] for v in vocab])[:,None]
    else:
        return np.array([weight_dict[v] for v in vocab])

def get_delta_weight(G, textrank_dict, difference_dict, learningRate):
    delta_weight = np.zeros( (len(G),len(G)), dtype=float )
    TEXTRANK_DAMPING_FACTOR = 0.85
    
    A_i = creat_np_array(textrank_dict, vocab, transpose=True)
    d_j = creat_np_array(difference_dict, vocab)
    delta_normalized_w_ij = learningRate * d_j * TEXTRANK_DAMPING_FACTOR * A_i  # [Eq.34]
    
    denormalizationDenominator = np.sum(G, axis=0)
    delta_weight = delta_normalized_w_ij * denormalizationDenominator # [Eq.36]
    return delta_weight
    
#     for node in vocab:
#         A_i = textrank_dict[node]
#         denormalizationDenominator = np.sum(G, axis=0)
# ##         for node, out_node in G.out_edges(node):
# ##             denormalizationDenominator += G[node][out_node]['weight']
# #         if(node == 'NN=bifurcation'):
# #             print(textRank_score_dict[node])
# #             print(denormalizationDenominator)
#         for node, out_node in G.out_edges(node):
#             d_j = difference_dict[out_node]
#             delta_normalized_w_ij = learningRate * d_j * TEXTRANK_DAMPING_FACTOR * A_i  # [Eq.34]
#             delta_normalized_w_ij *= denormalizationDenominator # [Eq.36]
#             delta_weight_graph_dict[(node, out_node)] = delta_normalized_w_ij
#     return delta_weight_graph_dict

In [52]:
text = "Robin is so happy to see Bob. But he are not happy to see Robin."
G, vocab, tr_dict = textrank_info(text)
vocab

OrderedDict([('robin', 0), ('happy', 1), ('bob', 2)])

In [57]:
G

array([[0., 2., 1.],
       [2., 0., 1.],
       [1., 1., 0.]])

In [56]:
G * s

array([[0.        , 2.20305556, 0.79694444],
       [2.20305556, 0.        , 0.79694444],
       [1.10152778, 1.10152778, 0.        ]])

In [55]:
s = creat_np_array(tr_dict, vocab)
s

array([1.10152778, 1.10152778, 0.79694444])

In [44]:

#A_i = textrank_dict[node]
np.array([tr_dict[v] for v in vocab])[:,None]

array([[ 3.52778479,  7.1572738 ,  2.16662736, ...,  1.71178077,
         1.46910709,  1.24136093],
       [ 7.1572738 , 14.52088813,  4.39571748, ...,  3.47291131,
         2.98056778,  2.51850965],
       [ 2.16662736,  4.39571748,  1.33065774, ...,  1.05130876,
         0.90226808,  0.7623953 ],
       ...,
       [ 1.71178077,  3.47291131,  1.05130876, ...,  0.83060436,
         0.71285223,  0.60234337],
       [ 1.46910709,  2.98056778,  0.90226808, ...,  0.71285223,
         0.61179345,  0.51695107],
       [ 1.24136093,  2.51850965,  0.7623953 , ...,  0.60234337,
         0.51695107,  0.4368115 ]])

In [51]:
# init setting
learningRate = 0.5

# tfidf_dict
content_list = [news['content'] for news in news_list]
tfidf_dict_list = tfidf_score(content_list)
tfidf_dict = remove_dict_val_less_than_K(tfidf_dict_list[0], 0)

# init textrank_info
G, vocab, tr_dict = textrank_info(content_list[0])



expect_dict = get_expect_dict(tfidf_dict, tr_dict)

difference_dict = calculateDifferentials_init(expect_dict, tr_dict)        # [Eq.35]
difference_dict = calculateDifferentials(G, expect_dict, difference_dict)  # [Eq.35]
delta_weight = get_delta_weight(G, tr_dict, difference_dict, learningRate) # [Eq.36+34]
delta_weight # add this to textrank

std:0.00682,	 avg:0.00959


array([[20.89858785,  0.        , 16.96188282, ..., -2.0729294 ,
        -1.21175959, -0.5822392 ],
       [42.39967129,  0.        , 34.41276803, ..., -4.20562029,
        -2.45845358, -1.18126405],
       [12.83509483,  0.        , 10.41732465, ..., -1.27311212,
        -0.74421532, -0.35758853],
       ...,
       [10.14058482,  0.        ,  8.2303844 , ..., -1.00584387,
        -0.58797996, -0.2825189 ],
       [ 8.70298655,  0.        ,  7.06358913, ..., -0.8632486 ,
        -0.50462392, -0.2424671 ],
       [ 7.35381889,  0.        ,  5.96856664, ..., -0.72942476,
        -0.42639534, -0.20487899]])

In [80]:
### main RankUP code
# init setting
min_diff = 1e-5
learningRate = 0.5
steps = 1000
TEXTRANK_DAMPING_FACTOR = 0.85

# tfidf_dict
content_list = [news['content'] for news in news_list]
tfidf_dict_list = tfidf_score(content_list)
tfidf_dict = remove_dict_val_less_than_K(tfidf_dict_list[0], 0)

# init textrank_info
G, vocab, tr_dict = textrank_info(content_list[0])

# Initionlization for weight(pagerank value)
tr = creat_np_array(tr_dict, vocab)


# Iteration
previous_tr = 0
for epoch in range(steps):
    expect_dict = get_expect_dict(tfidf_dict, tr_dict)  # tr_dict changes

    difference_dict = calculateDifferentials_init(expect_dict, tr_dict)        # [Eq.35]
    difference_dict = calculateDifferentials(G, expect_dict, difference_dict)  # [Eq.35]
    delta_weight = get_delta_weight(G, tr_dict, difference_dict, learningRate) # [Eq.36+34]
    G = G + delta_weight # [Eq.37]
    g_norm = get_g_norm(G)
    
    tr = (1-TEXTRANK_DAMPING_FACTOR) + ( TEXTRANK_DAMPING_FACTOR * np.dot(g_norm, tr) )

    tr_dict = dict()
    for word, index in vocab.items():
        tr_dict[word] = tr[index]
    
    if abs(previous_tr - sum(tr))  < min_diff:
        break
    else:
        previous_tr = sum(tr)


std:0.00682,	 avg:0.00959
std:0.00682,	 avg:0.00959


In [81]:
sort_dict_by_value(tr_dict)

{'said': 5.533734981327617,
 'surigae': 5.091921946355722,
 'taiwan': 2.963463627496687,
 'yesterday': 2.5663388725174183,
 'strength': 2.3509992899135463,
 'moving': 2.2346531613995935,
 'storm': 2.051535444087973,
 'southern': 2.048311512299463,
 'km': 1.9663524015807363,
 'ming': 1.965449582868652,
 '’s': 1.9062252154446244,
 'weatherrisk': 1.7873421086264925,
 'occurring': 1.7486548359183218,
 'cheng': 1.7166370745975512,
 'adding': 1.7045432099865048,
 'lin': 1.6953798995904683,
 'eye': 1.694837337985641,
 'typhoons': 1.6530518707105766,
 'typhoon': 1.620167002328306,
 'counties': 1.579531604774346,
 'kinmen': 1.5588467994492259,
 'wrote': 1.5201420744686462,
 'luzon': 1.519243518643469,
 'china': 1.49815278334441,
 'rain': 1.489147365062295,
 'generally': 1.457845169074417,
 'april': 1.4501373169113756,
 'convection': 1.447029434777456,
 'winds': 1.439214413000486,
 'frequently': 1.4350058320349852,
 'bureau': 1.388037711316866,
 'environmental': 1.377925378688016,
 'cwb': 1.3681

In [82]:
_, _, original_tr_dict = textrank_info(content_list[0])
original_tr_dict

{'said': 5.224050512162467,
 'surigae': 3.8106283122532814,
 'yesterday': 3.514517417922049,
 'taiwan': 2.685721672261592,
 'southern': 2.2376117387724643,
 'areas': 2.113508176071431,
 'rain': 1.9928880445117647,
 'occur': 1.9338474843671252,
 '’s': 1.886328372799257,
 'typhoon': 1.8782398103134406,
 'east': 1.8742826249957494,
 'storm': 1.8654533551632917,
 'air': 1.837869453368031,
 'penghu': 1.836532597685705,
 'strength': 1.8241136233440742,
 'counties': 1.8173803139462184,
 'kinmen': 1.7171887108305577,
 'luzon': 1.6849480843968556,
 'coast': 1.681886439453201,
 'typhoons': 1.6686492289786767,
 'moving': 1.6091942060915274,
 'mountainous': 1.5573043321173108,
 'lin': 1.5381412871644322,
 'km': 1.4828942775974023,
 'cwb': 1.4752738271367063,
 'central': 1.4306058539036477,
 'april': 1.4291161106222283,
 'ming': 1.413052775966276,
 'cheng': 1.412775087683456,
 'philippines': 1.4108467782606513,
 'generally': 1.3935253452711374,
 'strong': 1.3220619750529203,
 'bureau': 1.2960773698

In [20]:
test = np.array([[1,0,1],
                 [1,0,1],
                 [1,0,1]])
np.sum(test,axis=1)