In [1]:
import numpy as np
from numpy import matrix

In [2]:
# Load the raw click log. Every line is stripped and split on ',' into a
# tuple; later cells read field 0 as the query and field 1 as the ad.
path = './data/sample1.txt'
with open(path, 'r') as log_fp:
    logs = [log.strip() for log in log_fp]
# Skip blank lines (for example a trailing newline at the end of the file):
# ''.split(',') yields a 1-tuple, and the later `log[1]` lookup would then
# raise IndexError.
logs_tuple = [tuple(log.split(',')) for log in logs if log]
logs_tuple[:2]

[('pc', 'hp.com'), ('pc', 'hp.com')]

In [3]:
# Distinct query strings (field 0 of every log entry).
# Going through a set removes duplicates; the resulting order is arbitrary.
# The position of a query in this list is used as its row index in `graph`.
queries = list(set(entry[0] for entry in logs_tuple))
queries

['tv', 'digital camera', 'camera', 'pc', 'flower']

In [4]:
# Distinct ad strings (field 1 of every log entry).
# Going through a set removes duplicates; the resulting order is arbitrary.
# The position of an ad in this list is used as its column index in `graph`.
ads = list(set(entry[1] for entry in logs_tuple))
ads

['teleflora.com', 'orchids.com', 'bestbuy.com', 'hp.com']

In [5]:
# Query-by-ad count matrix: one row per entry of `queries`, one column per
# entry of `ads`. It starts at zero and is filled from the log in a later cell.
graph_shape = (len(queries), len(ads))
graph = np.matrix(np.zeros(graph_shape))
graph

matrix([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])

In [6]:
# Count how often each (query, ad) pair occurs in the log.
# Reset the matrix first so that re-running this cell does not add the
# counts on top of the previous run.
graph.fill(0)
# Name -> position maps, built once, replace a linear `list.index` scan per
# log line. `queries` and `ads` hold unique values, so positions match.
query_pos = {name: pos for pos, name in enumerate(queries)}
ad_pos = {name: pos for pos, name in enumerate(ads)}
for log in logs_tuple:
    query, ad = log[0], log[1]
    q_i = query_pos[query]
    a_j = ad_pos[ad]
    graph[q_i, a_j] += 1
print(graph)

# Both similarity matrices start as the identity: every item has
# similarity 1 with itself and 0 with everything else.
query_sim = matrix(np.identity(len(queries)))
ad_sim = matrix(np.identity(len(ads)))
print('query_sim: \n', query_sim)
print('ad_sim: \n', ad_sim)

[[ 0.  0. 15.  0.]
 [ 0.  0.  7. 30.]
 [ 0.  0.  5. 20.]
 [ 0.  0.  0. 10.]
 [16. 15.  0.  0.]]
query_sim: 
 [[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]
ad_sim: 
 [[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]


In [7]:
def get_ads_num(query):
    """Return the row of `graph` that belongs to `query`.

    The row is located through the position of `query` in `queries`, so
    `list.index` raises ValueError when the query is not in that list.
    The result is a single-row matrix with one count per entry of `ads`.
    """
    return graph[queries.index(query)]

get_ads_num('camera')

matrix([[ 0.,  0.,  5., 20.]])

In [8]:
def get_queries_num(ad):
    """Return the counts of `ad` for every query as a single-row matrix.

    The column of `graph` that belongs to `ad` is read from the transposed
    matrix, so the result is a row (one count per entry of `queries`)
    instead of the column that `graph[:, a_j]` would produce. Raises
    ValueError (from `list.index`) when `ad` is not in `ads`.
    """
    return graph.T[ads.index(ad)]

get_queries_num('orchids.com')

matrix([[ 0.,  0.,  0.,  0., 15.]])

In [9]:
def get_ads(query):
    """List the ads whose count for `query` is greater than zero.

    Ads are returned in the order in which they appear in `ads`.
    """
    # Single-row matrix -> nested list -> the one inner list of counts.
    counts = get_ads_num(query).tolist()[0]
    return [ads[col] for col, count in enumerate(counts) if count > 0]

get_ads('camera')

['bestbuy.com', 'hp.com']

In [10]:
def get_queries(ad):
    """List the queries whose count for `ad` is greater than zero.

    Queries are returned in the order in which they appear in `queries`.
    """
    # Single-row matrix -> nested list -> the one inner list of counts.
    counts = get_queries_num(ad).tolist()[0]
    return [queries[row] for row, count in enumerate(counts) if count > 0]

get_queries('hp.com')

['digital camera', 'camera', 'pc']

In [11]:
def query_simrank(q1, q2, C):
    """Compute the similarity of two queries from the current `ad_sim`.

    Args:
        q1, q2: query strings; both must be present in `queries`.
        C: constant factor applied to the result.

    Returns:
        1 when `q1 == q2`. Otherwise
        C / (sum of q1's counts * sum of q2's counts) multiplied by the sum
        of `ad_sim[i, j]` over every pair (ad i of q1, ad j of q2).
    """
    if q1 == q2:
        return 1

    # Penalty (normalisation) term.
    # NOTE(review): the textbook SimRank formula divides by the number of
    # neighbours of each node; this divides by the summed counts instead.
    # Confirm that this weighting is intended.
    total_q1 = get_ads_num(q1).sum()
    total_q2 = get_ads_num(q2).sum()
    scale = C / (total_q1 * total_q2)

    # Resolve neighbour ads to their positions in `ad_sim` once, up front.
    rows = [ads.index(name) for name in get_ads(q1)]
    cols = [ads.index(name) for name in get_ads(q2)]

    pair_total = 0
    for r in rows:
        for c in cols:
            pair_total += ad_sim[r, c]
    return scale * pair_total

query_simrank('tv', 'camera', 0.8)

0.0021333333333333334

In [15]:
def ad_simrank(a1, a2, C):
    """Compute the similarity of two ads from the current `query_sim`.

    Args:
        a1, a2: ad strings; both must be present in `ads`.
        C: constant factor applied to the result.

    Returns:
        1 when `a1 == a2`. Otherwise
        C / (sum of a1's counts * sum of a2's counts) multiplied by the sum
        of `query_sim[i, j]` over every pair (query i of a1, query j of a2).
    """
    if a1 == a2:
        return 1

    # Penalty (normalisation) term.
    # NOTE(review): the textbook SimRank formula divides by the number of
    # neighbours of each node; this divides by the summed counts instead.
    # Confirm that this weighting is intended.
    total_a1 = get_queries_num(a1).sum()
    total_a2 = get_queries_num(a2).sum()
    scale = C / (total_a1 * total_a2)

    # Resolve neighbour queries to their positions in `query_sim` once.
    rows = [queries.index(name) for name in get_queries(a1)]
    cols = [queries.index(name) for name in get_queries(a2)]

    pair_total = 0
    for r in rows:
        for c in cols:
            pair_total += query_sim[r, c]
    return scale * pair_total

ad_simrank('orchids.com', 'hp.com', C=0.8)

0.0

In [19]:
def simrank(C=0.8, times=1):
    """Run `times` update rounds on the global `query_sim` and `ad_sim`.

    In each round a new query matrix is computed with `query_simrank` and a
    new ad matrix with `ad_simrank`, and then both globals are replaced.

    Args:
        C: constant factor forwarded to `query_simrank` / `ad_simrank`.
        times: number of rounds to run. The default of 1 performs a single
            round; 0 leaves both matrices untouched.

    Returns:
        None. The results are stored in the globals `query_sim` and `ad_sim`.
    """
    global query_sim, ad_sim
    for _ in range(times):
        new_query_sim = matrix(np.identity(len(queries)))
        for i, qi in enumerate(queries):
            for j, qj in enumerate(queries):
                new_query_sim[i, j] = query_simrank(qi, qj, C)

        new_ad_sim = matrix(np.identity(len(ads)))
        for i, ai in enumerate(ads):
            for j, aj in enumerate(ads):
                new_ad_sim[i, j] = ad_simrank(ai, aj, C)

        # Replace the globals only after both matrices are complete, so that
        # every value of this round is derived from the previous round's
        # matrices and never from a half-updated mix.
        query_sim = new_query_sim
        ad_sim = new_ad_sim

simrank()


In [20]:
# Query-to-query similarity matrix after the simrank() call above;
# rows and columns follow the order of `queries`.
query_sim

matrix([[1.        , 0.00144144, 0.00213333, 0.        , 0.        ],
        [0.00144144, 1.        , 0.00172973, 0.00216216, 0.        ],
        [0.00213333, 0.00172973, 1.        , 0.0032    , 0.        ],
        [0.        , 0.00216216, 0.0032    , 1.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 1.        ]])

In [21]:
# Ad-to-ad similarity matrix after the simrank() call above;
# rows and columns follow the order of `ads`.
ad_sim

matrix([[1.00000000e+00, 3.33333333e-03, 0.00000000e+00, 0.00000000e+00],
        [3.33333333e-03, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 9.87654321e-04],
        [0.00000000e+00, 0.00000000e+00, 9.87654321e-04, 1.00000000e+00]])