In [37]:
from collections import defaultdict
import numpy as np

In [38]:
def read(filename):
    users = set()
    user_directed_graph =  defaultdict(lambda: defaultdict(float))

    ads = set()
    ads_directed_graph = defaultdict(lambda: defaultdict(float))

    input = open(filename, 'r')
    num_entries = int(input.readline())

    for i in range(num_entries):
        unformatted = input.readline()[:-1].split(',')
        score = float(unformatted[-1])
        unformatted.pop()
        cur_user, cur_ad = [int(entry) for entry in unformatted]
        users.add(cur_user)
        user_directed_graph[cur_user][cur_ad] = score

        ads.add(cur_ad)
        ads_directed_graph[cur_ad][cur_user] = score

    predict_user, predict_ad = [int(entry) for entry in input.readline().split(',')]
    
    return users, user_directed_graph, ads, ads_directed_graph, predict_user, predict_ad



In [39]:
def simple_simrank(users, user_directed_graph, ads, ads_directed_graph):
    # note that this runs simrank with the partial sums memoization trick!
    C1 = C2 = 0.8
    similarity_user = defaultdict(lambda: defaultdict(float))
    similarity_ads = defaultdict(lambda: defaultdict(float))

    for u in users:
        similarity_user[u][u] = 1.0
    
    for a in ads:
        similarity_ads[a][a] = 1.0

    partial_user = defaultdict(lambda: defaultdict(float))
    partial_ads = defaultdict(lambda: defaultdict(float))


    for iter in range(10):
        for u in users:
            for a in ads:
                temp = 0.0
                for u_ads in user_directed_graph[u]:
                    temp += similarity_ads[u_ads][a]
                partial_user[u][a] = temp
        
        list_users = list(users)
        for i in range(len(list_users)):
            for j in range(i + 1, len(list_users)):
                temp = 0.0
                u1, u2 = list_users[i], list_users[j]
                for a in user_directed_graph[u2]:
                    temp += partial_user[u1][a]
                similarity_user[u1][u2] = similarity_user[u2][u1] =  C1 / (len(user_directed_graph[u1])*len(user_directed_graph[u2])) * temp
    
        for a in ads:
            for u in users:
                temp = 0.0
                for a_user in ads_directed_graph[a]:
                    temp += similarity_user[a_user][u]
                partial_ads[a][u] = temp
        
        list_ads = list(ads)
        for i in range(len(list_ads)):
            for j in range(i+1, len(list_ads)):
                temp = 0.0
                a1, a2 = list_ads[i], list_ads[j]
                for k in ads_directed_graph[a2]:
                    temp += partial_ads[a1][k]
                similarity_ads[a1][a2] = similarity_ads[a2][a1] = C2 / (len(ads_directed_graph[a1])*len(ads_directed_graph[a2])) * temp
            
    return similarity_user, similarity_ads
                

In [105]:
def geometric_evidence(similarity_ads, ads_directed_graph, ads, similarity_users, user_directed_graph, users):
    for _ in range(10):
        copy_ads = list(ads)
        for i in range(len(copy_ads)):
            for j in range(i + 1, len(copy_ads)):
                a1, a2 = copy_ads[i], copy_ads[j]
                a1_neighbors, a2_neighbors = ads_directed_graph[a1].keys(), ads_directed_graph[a2].keys()

                evidence_sum = 0.0
                # starting adn ending 1 later to account for offset
                for power in range(1, len(set(a1_neighbors).intersection(set(a2_neighbors))) + 1):
                    evidence_sum += (1/2)**power
                
                similarity_ads[a1][a2] *= evidence_sum
                similarity_ads[a2][a1] = similarity_ads[a1][a2]
        
        copy_users = list(users)
        for i in range(len(copy_users)):
            for j in range(i + 1, len(copy_users)):
                u1, u2 = copy_users[i], copy_users[j]
                if u1 == 1 and u2 == 4:
                    print('hello')
                u1_neighbors, u2_neighbors = user_directed_graph[u1].keys(), user_directed_graph[u2].keys()

                evidence_sum = 0.0
                # starting adn ending 1 later to account for offset
                for power in range(1, len(set(u1_neighbors).intersection(set(u2_neighbors))) + 1):
                    evidence_sum += (1/2)**power
                
                similarity_users[u1][u2] *= evidence_sum
                similarity_users[u2][u1] = similarity_users[u1][u2]
        
    
    return similarity_users, similarity_ads

In [106]:
def exponential_evidence(similarity_ads, ads_directed_graph, ads, similarity_users, user_directed_graph, users):
    for _ in range(10):
        copy_ads = list(ads)
        for i in range(len(copy_ads)):
            for j in range(i + 1, len(copy_ads)):
                a1, a2 = copy_ads[i], copy_ads[j]
                a1_neighbors, a2_neighbors = ads_directed_graph[a1].keys(), ads_directed_graph[a2].keys()

                evidence_sum = 1-np.exp(-len(set(a1_neighbors).intersection(set(a2_neighbors))))
                
                similarity_ads[a1][a2] *= evidence_sum
                similarity_ads[a2][a1] = similarity_ads[a1][a2]
        
        copy_users = list(users)
        for i in range(len(copy_users)):
            for j in range(i + 1, len(copy_users)):
                u1, u2 = copy_users[i], copy_users[j]
                u1_neighbors, u2_neighbors = user_directed_graph[u1].keys(), user_directed_graph[u2].keys()

                evidence_sum = 1-np.exp(-len(set(u1_neighbors).intersection(set(u2_neighbors))))
                # starting adn ending 1 later to account for offset
                
                similarity_users[u1][u2] *= evidence_sum
                similarity_users[u2][u1] = similarity_users[u1][u2]
    
    
    return similarity_users, similarity_ads

In [107]:
def fetch_top3(to_predict, similarity):
    res = []
    temp = similarity[to_predict].items()

    value_key = {}
    for k,v in temp:
        if k != to_predict:
            if v not in list(value_key.keys()):
                value_key[v] = k
                res.append((k,v))
            else:
                if k < value_key[v]:
                    res.remove((value_key[v], v))
                    value_key[v] = k
                    res.append((k,v))
                    
    res = sorted(res, reverse=True, key= lambda x: (x[1], -x[0]))

    return res[:5]

In [108]:
users, user_directed_graph, ads, ads_directed_graph, predict_user, predict_ad = read(filename='input_b.txt')
# similarity_user, similarity_ads = simrank(users, ads, user_directed_graph, ads_directed_graph, 10, 0.8, 0.8)
similarity_user, similarity_ads = simple_simrank(users, user_directed_graph, ads, ads_directed_graph)
# similarity_user, similarity_ads = geometric_evidence(similarity_ads, ads_directed_graph, ads, similarity_user, user_directed_graph, users)

top3_users = fetch_top3(predict_user, similarity_user)
top3_ads = fetch_top3(predict_ad, similarity_ads)
top3_users, top3_ads

([(24481, 0.32294552747512184),
  (76584, 0.27223586898312374),
  (5786, 0.25225715544238314),
  (3374, 0.25169640940503096),
  (661, 0.24582639351308352)],
 [(0, 0.5345862231972374),
  (3, 0.4374262162314706),
  (5, 0.43574833499259097),
  (17, 0.4353242145982706),
  (77, 0.43469158436577154)])

In [109]:
users, user_directed_graph, ads, ads_directed_graph, predict_user, predict_ad = read(filename='input_b.txt')
similarity_user, similarity_ads = simple_simrank(users, user_directed_graph, ads, ads_directed_graph)
similarity_user, similarity_ads = geometric_evidence(similarity_ads, ads_directed_graph, ads, similarity_user, user_directed_graph, users)
# similarity_user = evidence_geometric(similarity_user, user_directed_graph)

top3_users = fetch_top3(predict_user, similarity_user)
top3_ads = fetch_top3(predict_ad, similarity_ads)
top3_users, top3_ads

([(24481, 0.32294552747512184),
  (76584, 0.27223586896727753),
  (5786, 0.2483156373885959),
  (661, 0.2419853561144416),
  (3374, 0.23596538381721652)],
 [(10, 0.400939667397928),
  (3, 0.32806966217360295),
  (5, 0.3268112512444432),
  (17, 0.32649316094870295),
  (77, 0.32601868827432867)])

In [104]:
users, user_directed_graph, ads, ads_directed_graph, predict_user, predict_ad = read(filename='input_b.txt')
similarity_user, similarity_ads = simple_simrank(users, user_directed_graph, ads, ads_directed_graph)
similarity_user, similarity_ads = exponential_evidence(similarity_ads, ads_directed_graph, ads, similarity_user, user_directed_graph, users)
# similarity_user = evidence_geometric(similarity_user, user_directed_graph)

top3_users = fetch_top3(predict_user, similarity_user)
top3_ads = fetch_top3(predict_ad, similarity_ads)
top3_users, top3_ads

([(24481, 0.32294552747512184),
  (76584, 0.2722358689831193),
  (5786, 0.24607361314927864),
  (661, 0.2398004875347824),
  (579, 0.22079804258353997)],
 [(10, 0.12488064665232755),
  (3, 0.1021838318596408),
  (5, 0.10179187470656657),
  (17, 0.10169279914718456),
  (77, 0.10154501518063602)])

In [98]:
similarity_ads[predict_ad][0]

0.33792294214948576