## Baseline Algorithm (Common Neighbors)

In [57]:
import numpy as np
import networkx as nx

# read in item-item graph
G_train = nx.Graph()
file = open('amazon-meta_item_item_graph.txt')
for line in file:
    cell = line.split(' ')
    G_train.add_edge(cell[0], cell[1])
file.close()

print G_train.number_of_nodes()
print G_train.number_of_edges()

833
59978


In [58]:
# calculate common neighbors 
# sort according to the number of common neighbors
core = G_train.nodes()
common = {}
for i in range(len(core)):
    for j in range(i+1, len(core)):
        x = G_train[core[i]]
        y = G_train[core[j]]
        tmp = [val for val in x if val in y]
        common[(core[i], core[j])] = len(tmp)
sorted_common = sorted(common.iteritems(), key=lambda d:d[1], reverse = True)
#print sorted_common

In [59]:
# read in ground truth graph to calculate measurements
G_test = nx.Graph()
all_nodes_inorder = []
file = open('amazon-meta_filtered_ground_truth.txt')
for line in file:
    line = line[:-1]
    cell = line.split(',')
    if G_train.has_node(cell[0]):
        G_test.add_node(cell[0])
        all_nodes_inorder.append(cell[0])
    if len(cell) > 1:
        for i in range(len(cell)-1):
            if G_train.has_node(cell[0]) and G_train.has_node(cell[i+1]):
                G_test.add_edge(cell[0], cell[i+1])
file.close()

print G_test.number_of_nodes()
print G_test.number_of_edges()

833
572


In [60]:
# generate predictions based on common neighbors
prediction = sorted_common[:G_test.number_of_edges()]
tp = 0
for i in range(len(prediction)):
    node_x = prediction[i][0][0]
    node_y = prediction[i][0][1]
    if G_test.has_edge(node_x, node_y) == True:
        tp += 1
#print tp

15


In [61]:
# generate similarity graph from common neighbors
G_sim = nx.Graph()
for node in G_test.nodes():
    G_sim.add_node(node)
for p in prediction:
    G_sim.add_edge(p[0][0],p[0][1])

In [62]:
# derive classifications of ground truth and common neighbor
compare = []
testing_list = G_test.nodes()
for node1 in testing_list:
    for node2 in testing_list:
        if node1 != node2:
            tmp = [node1,node2]
            if not nx.has_path(G_test,node1,node2):
                tmp.append(-1)
            elif nx.shortest_path_length(G_test,node1,node2) == 1:
                tmp.append(1)
            else:
                tmp.append(0)
            if not nx.has_path(G_sim,node1,node2):
                tmp.append(-1)
            elif nx.shortest_path_length(G_sim,node1,node2) == 1:
                tmp.append(1)
            else:
                tmp.append(0)
            compare.append(tmp)

In [63]:
# Measurement 1, classification accuracy
accuracy = 0
for c in compare:
    if c[2] == c[3]:
        accuracy += 1
accuracy = accuracy*1.0 / len(compare)
print(accuracy)

0.860242173793


In [64]:
# Measurement 2, label 1 classification accuracy
accuracy = 0
for c in compare:
    if c[2] == c[3] and c[3] == 1:
        accuracy += 1
print(accuracy*1.0 / (2*len(G_test.edges())))

0.0262237762238


In [67]:
# Top K recommendations, Measurement 3,4
# change value of K
accuracy = []
counter = 0
accuracy_sum = 0
K = 10
for nodes in all_nodes_inorder:
    ground_truth_set = set(G_test.neighbors(nodes))
    prediction_set = set(G_sim.neighbors(nodes)[:K])
    if len(ground_truth_set) == 0:
        accuracy.append(2.0)
    else:
        accuracy_t = len(ground_truth_set.intersection(prediction_set))*1.0\
                     / len(ground_truth_set)
        accuracy.append([nodes,accuracy_t])
        accuracy_sum += accuracy_t
        counter += 1

In [68]:
accuracy_sum / counter

0.019493269493269495