In [1]:
import pickle
import numpy as np
from util import tfidf_score, textrank_info, remove_dict_val_less_than_K

In [2]:
# Load the pickled list of news items that every later cell works from.
# NOTE(review): pickle.load can execute arbitrary code while unpickling —
# only open 'news_list.pkl' when it comes from a trusted source.
with open('news_list.pkl', "rb") as f:
    news_list = pickle.load(f)

In [3]:
# Collect the 'content' entry of every news item.
content_list = [news['content'] for news in news_list]
# presumably one TF-IDF score dict per document — TODO confirm against util.tfidf_score
tfidf_dict_list = tfidf_score(content_list)
# Keep only the first document's TF-IDF dict, filtered with a threshold argument of 0
# (the exact filtering rule lives in util.remove_dict_val_less_than_K).
tfidf_dict = remove_dict_val_less_than_K(tfidf_dict_list[0], 0)

In [4]:
# TextRank data for the first document. `vocab` and `tr_dict` are read as
# module-level globals by functions defined in later cells.
G, vocab, tr_dict = textrank_info(content_list[0])

In [5]:
# Display the matrix returned by textrank_info.
G

array([[0., 1., 1., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 1.],
       [0., 0., 0., ..., 1., 0., 1.],
       [0., 0., 0., ..., 1., 1., 0.]])

In [6]:
def group_by_std(tfidf_dict, lower, upper):
    """Partition keyword scores into three dicts around a [lower, upper] band.

    Args:
        tfidf_dict: mapping of keyword -> score.
        lower: scores strictly below this value go to the low group.
        upper: scores strictly above this value go to the high group.

    Returns:
        Tuple ``(g_low, g_mid, g_hig)`` of dicts. Scores that are neither
        below ``lower`` nor above ``upper`` (boundaries included) land in
        the middle group. Keyword order of the input is preserved.
    """
    below, within, above = {}, {}, {}

    for keyword, score in tfidf_dict.items():
        # Pick the destination bucket first, then store once.
        if score < lower:
            bucket = below
        elif score > upper:
            bucket = above
        else:
            bucket = within
        bucket[keyword] = score

    return below, within, above

def expect_boundary(g_mid, textrank_dict=None):
    """Return the smallest and largest TextRank score among the mid-group keywords.

    Args:
        g_mid: dict whose keys are the mid-group keywords (values are not read).
        textrank_dict: mapping of keyword -> TextRank score. When omitted, the
            notebook-level global ``tr_dict`` is used (the original behaviour).

    Returns:
        Tuple ``(s_min, s_max)``.

    Raises:
        ValueError: if ``g_mid`` is empty, since no boundary can be derived.
        KeyError: if a mid-group keyword has no TextRank score.
    """
    scores_by_kw = tr_dict if textrank_dict is None else textrank_dict
    if not g_mid:
        # min()/max() would raise ValueError anyway; say why instead.
        raise ValueError("g_mid is empty: cannot derive TextRank score boundaries")

    mid_scores = [scores_by_kw[kw] for kw in g_mid]
    return min(mid_scores), max(mid_scores)


def creat_expect_dict(g_low, g_hig, g_mid, s_min, s_max, textrank_dict=None):
    """Build the expected-score dict for every keyword of the three groups.

    A low-group keyword whose TextRank score is above ``s_min`` is expected to
    be ``s_min``; a high-group keyword whose TextRank score is below ``s_max``
    is expected to be ``s_max``. Every other keyword (including the whole mid
    group) gets ``np.nan``, i.e. no expectation.

    Args:
        g_low, g_hig, g_mid: dicts keyed by keyword (values are not read).
        s_min, s_max: boundaries as returned by ``expect_boundary``.
        textrank_dict: mapping of keyword -> TextRank score. When omitted, the
            notebook-level global ``tr_dict`` is used (the original behaviour).

    Returns:
        dict of keyword -> expected score or ``np.nan``, filled in the order
        low group, high group, mid group.
    """
    scores_by_kw = tr_dict if textrank_dict is None else textrank_dict

    expect_dict = dict()
    for kw in g_low:
        expect_dict[kw] = s_min if scores_by_kw[kw] > s_min else np.nan
    for kw in g_hig:
        expect_dict[kw] = s_max if scores_by_kw[kw] < s_max else np.nan
    for kw in g_mid:
        expect_dict[kw] = np.nan
    return expect_dict


def get_expect_dict(tfidf_dict, tr_dict, lower_factor=0.3, upper_factor=0.6):
    """Derive expected TextRank scores from the spread of TF-IDF scores.

    Keywords are split into low / mid / high groups using the band
    ``[avg - lower_factor * std, avg + upper_factor * std]`` of the TF-IDF
    scores; the mid group's TextRank range then supplies the expected values.

    Args:
        tfidf_dict: mapping of keyword -> TF-IDF score. It is first passed
            through ``remove_dict_val_less_than_K`` with that helper's default
            threshold (defined in util, not visible here).
        tr_dict: mapping of keyword -> TextRank score. It is forwarded to the
            helpers explicitly so that the argument, not a global, is used.
        lower_factor: multiple of the standard deviation below the mean that
            bounds the low group (default 0.3, the previously hard-coded value).
        upper_factor: multiple of the standard deviation above the mean that
            bounds the high group (default 0.6, the previously hard-coded value).

    Returns:
        dict of keyword -> expected score or ``np.nan``.

    Raises:
        ValueError: if no keyword falls into the mid group.
    """
    tfidf_dict = remove_dict_val_less_than_K(tfidf_dict)

    tfidf_scores = np.array([tfidf_dict[k] for k in tfidf_dict])
    std_tfidf = np.std(tfidf_scores)
    avg_tfidf = tfidf_scores.mean()
    print("std:{:.5f},\t avg:{:.5f}".format(std_tfidf, avg_tfidf))

    lower = avg_tfidf - (lower_factor * std_tfidf)
    upper = avg_tfidf + (upper_factor * std_tfidf)
    g_low, g_mid, g_hig = group_by_std(tfidf_dict, lower, upper)

    s_min, s_max = expect_boundary(g_mid, tr_dict)
    expect_dict = creat_expect_dict(g_low, g_hig, g_mid, s_min, s_max, tr_dict)
    return expect_dict

In [7]:
# ## 1. Compute the difference between expected score and current score
# 
# There are two cases [Eq.35]

def find_vocab_by_idx(vocab, idx):
    """Return the node (word) that ``vocab`` maps to index ``idx``.

    Args:
        vocab: mapping of node -> integer index, e.g. ``{'storm': 31, ...}``.
        idx: the index to look up.

    Returns:
        The node whose stored index equals ``idx``.

    Raises:
        IndexError: if no node carries that index.
    """
    # Match on the stored index rather than on dict position, so the result
    # stays correct even if vocab was not inserted in index order.
    for node, node_idx in vocab.items():
        if node_idx == idx:
            return node
    raise IndexError("no vocab entry with index {}".format(idx))

def calculateDifferentials_init(expect_dict, textrank_dict):
    """Initial differential per node: expected score minus current score.

    Args:
        expect_dict: mapping of node -> expected score ``T_j``, or NaN when the
            node has no expectation.
        textrank_dict: mapping of node -> current TextRank score ``A_j``.

    Returns:
        dict of node -> ``T_j - A_j`` for nodes with an expectation, and ``0``
        for nodes whose expectation is NaN.
    """
    difference_dict = dict()
    for node, T_j in expect_dict.items():
        # np.isnan recognises every NaN value; an identity test against np.nan
        # only matches that one singleton object (not float('nan') or a NaN
        # pulled out of an array).
        if np.isnan(T_j):
            difference_dict[node] = 0
        else:
            difference_dict[node] = T_j - textrank_dict[node]
    return difference_dict

def calculateDifferentials(G, expect_dict, difference_dict, vocab_map=None, max_iter=1000):
    """Propagate differentials to the nodes that have no expected score.

    For every node whose expectation is NaN, the differential is recomputed as
    ``0.85 * sum_k(d_k * G[j][k] / colsum_k)`` over its neighbours ``k`` and
    rounded to 6 decimals. Sweeps repeat until a full pass changes nothing.
    Nodes that do have an expectation keep the value already stored in
    ``difference_dict``.

    Args:
        G: square adjacency/weight matrix (array-like), indexed by vocab index.
        expect_dict: mapping of node -> expected score or NaN.
        difference_dict: mapping of node -> current differential. It is updated
            in place and also returned.
        vocab_map: mapping of node -> row/column index in ``G``. When omitted,
            the notebook-level global ``vocab`` is used (the original behaviour).
        max_iter: upper bound on the number of sweeps, guarding against an
            endless loop if rounding makes values oscillate.

    Returns:
        ``difference_dict`` (the same object that was passed in).
    """
    node_to_idx = vocab if vocab_map is None else vocab_map
    # Reverse lookup built once instead of rebuilding a list per neighbour.
    idx_to_node = {idx: node for node, idx in node_to_idx.items()}

    G = np.array(G, dtype=float)
    norm = np.sum(G, axis=0)
    # `out` is required together with `where`; otherwise entries in columns
    # whose sum is 0 are left uninitialised.
    g_norm = np.divide(G, norm, out=np.zeros(G.shape, dtype=float), where=norm != 0)

    TEXTRANK_DAMPING_FACTOR = 0.85
    for _ in range(max_iter):
        is_converge = True
        for node, T_j in expect_dict.items():
            # `T_j == np.nan` is always False, so NaN must be tested with isnan.
            if not np.isnan(T_j):
                continue

            previous_d_j = difference_dict[node]
            node_idx = node_to_idx[node]
            d_j = 0.0
            for nk_idx in np.where(G[node_idx] != 0)[0]:
                node_k = idx_to_node[int(nk_idx)]
                # A neighbour with no recorded differential contributes nothing.
                d_j += difference_dict.get(node_k, 0.0) * g_norm[node_idx][nk_idx]

            # Round before comparing so the stored value and the convergence
            # test agree on the same number.
            d_j = round(float(d_j * TEXTRANK_DAMPING_FACTOR), 6)
            difference_dict[node] = d_j
            if previous_d_j != d_j:
                is_converge = False
        if is_converge:
            break
    return difference_dict

In [8]:
# ## 2. Calculate delta weight [Eq.36]
def get_delta_weight(G, textrank_dict, difference_dict, learningRate, vocab_map=None):
    """Compute the weight update for every outgoing edge of every node.

    For an edge ``i -> j`` the update is
    ``learningRate * d_j * 0.85 * A_i`` multiplied by the sum of node ``i``'s
    outgoing weights (the de-normalisation denominator).

    Args:
        G: square adjacency/weight matrix (array-like); row ``i`` holds the
            outgoing weights of the node with vocab index ``i``.
        textrank_dict: mapping of node -> TextRank score ``A_i``.
        difference_dict: mapping of node -> differential ``d_j``.
        learningRate: scalar step size.
        vocab_map: mapping of node -> row/column index in ``G``. When omitted,
            the notebook-level global ``vocab`` is used (the original behaviour).

    Returns:
        dict mapping ``(node, out_node)`` -> weight delta, one entry per
        non-zero entry of ``G``.
    """
    node_to_idx = vocab if vocab_map is None else vocab_map
    idx_to_node = {idx: node for node, idx in node_to_idx.items()}

    # G is a NumPy adjacency matrix in this notebook, so edges are read from
    # its rows rather than through a graph-object API.
    G = np.array(G, dtype=float)
    TEXTRANK_DAMPING_FACTOR = 0.85

    delta_weight_graph_dict = dict()
    for node, node_idx in node_to_idx.items():
        A_i = textrank_dict[node]
        out_weights = G[node_idx]
        denormalizationDenominator = out_weights.sum()

        for out_idx in np.where(out_weights != 0)[0]:
            out_node = idx_to_node[int(out_idx)]
            # A target node with no recorded differential yields a zero update.
            d_j = difference_dict.get(out_node, 0.0)
            delta_normalized_w_ij = learningRate * d_j * TEXTRANK_DAMPING_FACTOR * A_i  # [Eq.34]
            delta_normalized_w_ij *= denormalizationDenominator
            delta_weight_graph_dict[(node, out_node)] = delta_normalized_w_ij
    return delta_weight_graph_dict

array([[0., 1., 1., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 1.],
       [0., 0., 0., ..., 1., 0., 1.],
       [0., 0., 0., ..., 1., 1., 0.]])

In [18]:
# init setting
# NOTE(review): learningRate is assigned here but not read anywhere else in this cell.
learningRate = 0.5


# tfidf_dict
# These three statements repeat the earlier TF-IDF cell verbatim, so this cell
# can be run without relying on that cell's results.
content_list = [news['content'] for news in news_list]
tfidf_dict_list = tfidf_score(content_list)
tfidf_dict = remove_dict_val_less_than_K(tfidf_dict_list[0], 0)

# textrank_info
# Rebinds the globals G, vocab and tr_dict for the first document.
G, vocab, tr_dict = textrank_info(content_list[0])

# Expected score (or NaN) per keyword; this call also prints the std/avg line.
expect_dict = get_expect_dict(tfidf_dict, tr_dict)

# Initial differentials first, then the propagation step over the graph G.
difference_dict = calculateDifferentials_init(expect_dict, tr_dict)
difference_dict = calculateDifferentials(G, expect_dict, difference_dict)
# Last expression of the cell: display the resulting dict.
difference_dict

std:0.00682,	 avg:0.00959


{'“': -0.42651722732013786,
 'april': -0.9505471457302341,
 'china': -0.7008711606937543,
 'poor': -0.05006828638012539,
 'hualien': -0.4462784873198846,
 'weather': -0.18797277682605318,
 '2003': -0.2700437529822368,
 'level': -0.30360330038002936,
 'reaching': -0.12672093667469309,
 'hsinchu': -0.2890032791151528,
 'eastern': -0.3129099748929841,
 'began': -0.21306444033325656,
 'alert': -0.43280601631488813,
 'citing': -0.32466860722720003,
 'residents': -0.10247016007816923,
 'taitung': -0.3636208858800809,
 'speed': -0.34588242399689473,
 'chances': -0.21767409066356125,
 'direction': -0.32740338501686883,
 'beginning': -0.2477384999662079,
 'peak': -0.383318812112747,
 'yu': -0.3710249086506888,
 'north': -0.3236713555562555,
 'indicating': -0.33875709679409816,
 'likely': -0.28100172203907803,
 'county': -0.25669333759468493,
 'wind': -0.4818824239968946,
 'sign': -0.22972056535724006,
 'cause': -0.33538503868141045,
 'agency': -0.18234819833921312,
 'month': -0.6361968328565724

In [14]:
# Column sums of G — the same expression calculateDifferentials uses as its normalisation denominator.
np.sum(G, axis=0)

array([16., 34.,  9., 16.,  6., 17., 17., 17., 14., 19., 14., 18., 24.,
       12.,  6.,  6., 12., 14., 49., 33.,  3.,  4.,  6.,  6., 12., 10.,
        6.,  6.,  4.,  3.,  3., 15.,  5., 12.,  6.,  6.,  6.,  5.,  4.,
        3.,  4.,  6.,  6.,  6.,  6., 15., 12., 12.,  6., 15.,  6.,  6.,
        6., 11.,  5.,  4.,  3.,  3.,  4.,  5.,  6.,  6.,  6.,  6., 18.,
        4., 14.,  6.,  6., 11., 14., 12.,  6.,  5.,  4.,  3., 10., 19.,
        6.,  5.,  3.,  4.,  5.,  6., 17., 17.,  6.,  5.,  7.,  4., 11.,
        6.,  6.,  6.,  9.,  6.,  6., 10., 10.,  6.,  6.,  6.,  6.,  5.,
        4.,  3.,  3.,  5.,  5.,  4.,  3.,  4.,  6.,  8.,  9.,  6.,  5.,
        6.,  6.,  6.,  6.,  3.,  4.,  5.,  5.,  3.,  4.,  5., 12.,  6.,
        6.,  6.,  6., 12.,  6.,  6.,  5.,  4.,  4.,  5.,  6.,  6.,  5.,
        4.,  4.,  9.,  6.,  6.,  6.,  6.,  6.,  6.,  5.,  4.,  3.,  3.,
        4.,  5.,  9.,  3.,  4.,  4.,  8.,  5.,  6.,  3., 14., 10.,  6.,
        6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  5.,  4.,  3.,  4

In [20]:
# Scratch check of the axis convention: axis=1 adds up the entries of each row.
test = np.array([[1, 0, 1]] * 3)
test.sum(axis=1)