## Imports

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
import operator
import random
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error as mse
import itertools

%matplotlib inline 

## Graph Creation

In [None]:
# Module-level graph that open_source_file() fills with user nodes and friendship edges.
facebook = nx.Graph()

## Read the dataset and build the graph

In [None]:
def open_source_file(filename):
    """Load friendship links from the Facebook links file into ``facebook``.

    Each line is expected to hold at least two tab-separated user ids; any
    further columns are ignored. Every pair is added as an edge of the
    module-level ``facebook`` graph, with the ids kept as strings.

    Only ``"../data/facebook-links.txt"`` is accepted. Any other filename
    prints a message and leaves the graph untouched.

    Args:
        filename: Path of the links file to read.
    """
    # Validate before opening, so a wrong path gets the friendly message
    # instead of a FileNotFoundError.
    if filename != "../data/facebook-links.txt":
        print ("Not a valid file, please check again")
        return

    with open(filename, "r") as file:
        # Stream the file line by line instead of loading it all at once.
        for line in file:
            # Drop the line terminator so a two-column line does not leave
            # "\n" glued to the second user id.
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) < 2:
                # Blank or malformed line: there is no pair to link.
                continue
            point1, point2 = fields[0], fields[1]
            # add_edge creates missing endpoints itself, so no add_node calls.
            facebook.add_edge(point1, point2)

In [None]:
# Populate the global `facebook` graph from the tab-separated links file.
open_source_file("../data/facebook-links.txt")

## Show the basic info about the Facebook Graph

In [None]:
def show_basic_info(graph):
    """Print a short summary of *graph*.

    Uses ``nx.info`` where the installed NetworkX still provides it. That
    helper was removed in NetworkX 3.0, so on newer releases the graph's own
    string form is printed instead.

    Args:
        graph: The NetworkX graph to summarise.
    """
    info = getattr(nx, "info", None)
    if info is not None:
        print (info(graph))
    else:
        print (graph)

In [None]:
# Summary of the graph as loaded, before any users are removed.
print ("Original Dataset")
show_basic_info(facebook)

## Remove users who have only one friend

In [None]:
# NOTE: run this only once -- every call prunes the graph again.
def remove_user_with_one_friend(graph):
    """Remove, in place, every user that has exactly one friend.

    The pruning is a single pass: users whose friend count drops to one
    *because of* the removal are kept. Users left with no friends at all
    (including any that were isolated to begin with) are removed afterwards.

    Only graph methods that behave the same across NetworkX versions are
    used (iteration over the graph, ``neighbors``, ``has_edge``,
    ``remove_edge``, ``has_node``, ``remove_node``).

    Args:
        graph: Graph to prune; it is modified in place.
    """
    # Map {user_id: id of that user's only friend}. Collected up front
    # because the graph must not change while it is being iterated.
    edge_to_remove = {}
    for node in list(graph):
        # neighbors() may return an iterator, so materialise it before len().
        friends = list(graph.neighbors(node))
        if len(friends) == 1:
            edge_to_remove[node] = friends[0]
        # Users with no friends are handled by the clean-up pass below;
        # indexing friends[0] for them would raise IndexError.

    first10pairs = {k: edge_to_remove[k] for k in list(edge_to_remove)[:10]}
    print ("First 10 pairs",first10pairs)

    for node, friend in edge_to_remove.items():
        # The edge may already be gone when both endpoints had one friend.
        if graph.has_edge(node, friend):
            graph.remove_edge(node, friend)
        if graph.has_node(node):
            graph.remove_node(node)

    # After the removal some users have no edges left; delete them as well.
    further_node_to_remove = [
        node for node in list(graph) if not list(graph.neighbors(node))
    ]
    print ("After preliminary removing, further remove users with no friends",further_node_to_remove)
    for node in further_node_to_remove:
        graph.remove_node(node)

In [None]:
# Prunes `facebook` in place; re-running this cell prunes the graph a second time.
remove_user_with_one_friend(facebook)

In [None]:
# Summary of the graph after the single-friend users were removed.
print ("Dataset after removing")
show_basic_info(facebook)

## Adjacency Matrix

In [None]:
# Adjacency list keyed by integer user id: {user_id: [friend ids as stored in the graph]}.
# facebook.nodes() is used because nx.nodes_iter no longer exists in NetworkX >= 2.0.
adjacency_matrix = {}
for node in facebook.nodes():
    adjacency_matrix[int(node)] = list(nx.all_neighbors(facebook, node))
# Print the five lowest user ids with their friends for testing purposes.
# The ids are taken from the dictionary itself: users 1..5 may have been
# pruned, and a hard-coded lookup would then raise KeyError.
for user_id in sorted(adjacency_matrix)[:5]:
    print ("adjacency_matrix",[user_id],":",adjacency_matrix[user_id])

## Create a dictionary of each user's number of friends

In [None]:
# {user_id: number of friends}, derived from the length of each adjacency list.
num_of_friends = {
    int(user_id): len(friends) for user_id, friends in adjacency_matrix.items()
}

In [None]:
# (user_id, friend_count) pairs ordered by friend count descending; ties are broken by ascending user id.
updat_num_of_friends = sorted(num_of_friends.items(), key=lambda x: (-x[1], x[0]))

In [None]:
# Every key of adjacency_matrix is one user, so its length is the user count.
num_users = len(adjacency_matrix)
print ("number of users in total:",num_users)

## Calculate the sparsity of the dataset

In [None]:
# Map every user id to a dense row/column position. User ids need not be the
# contiguous range 1..num_users (users may have been pruned from the graph),
# so `user_id - 1` is not a safe matrix index. For contiguous ids the
# positions below are exactly `user_id - 1`.
user_index = {user_id: pos for pos, user_id in enumerate(sorted(adjacency_matrix))}

# utility[i, j] == 1 when the user at position i lists the user at position j as a friend.
utility = np.zeros((num_users, num_users))
for user_id, user_friend_ids in adjacency_matrix.items():
    row = user_index[user_id]
    for friend_id in user_friend_ids:
        # Friend ids may be stored as strings; normalise before the lookup.
        col = user_index.get(int(friend_id))
        if col is not None:
            utility[row, col] = 1

# Percentage of non-zero cells in the matrix.
# NOTE(review): this is the fill ratio, although it is reported as "Sparsity".
total_cells = utility.shape[0] * utility.shape[1]
if total_cells:
    sparsity = float(np.count_nonzero(utility)) / total_cells * 100
else:
    # An empty matrix has no cells; avoid dividing by zero.
    sparsity = 0.0
print('Sparsity: {:.2f}%'.format(sparsity))

## Evaluation via Mean Squared Error (MSE)

In [None]:
def mse_utility(u1, u2):
    """Return the mean squared error between the non-zero entries of two arrays.

    Each array is filtered by its *own* non-zero mask before the comparison,
    so both inputs must have the same number of non-zero entries, in
    matching order, for the pairing to be meaningful.

    Args:
        u1: Array passed as the first argument of ``mse``.
        u2: Array passed as the second argument of ``mse``.

    Returns:
        The value returned by ``mse`` for the two flattened selections.
    """
    first_values = u1[u1.nonzero()].flatten()
    second_values = u2[u2.nonzero()].flatten()
    return mse(first_values, second_values)

## Similarity via Cosine Distance

In [None]:
def cosine_sim(v1, v2):
    """Return the cosine similarity of two equal-length 1-D numeric vectors.

    Args:
        v1: First vector (list, tuple or 1-D array of numbers).
        v2: Second vector, same length as *v1*.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)`` as a float, or ``0.0`` when either
        vector has zero length or zero norm (the ratio is undefined there and
        would otherwise come out as NaN).

    Raises:
        ValueError: If the vectors differ in shape. Pairing them up anyway
            would truncate the dot product while the norms still used the
            full vectors.
    """
    a = np.asarray(v1, dtype=float)
    b = np.asarray(v2, dtype=float)
    if a.shape != b.shape:
        raise ValueError("v1 and v2 must have the same length")

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return float(np.dot(a, b) / denominator)

In [None]:
def sim_matrix(u, eps=1.0e-9):
    """Return the pairwise similarity between the rows of *u*.

    The row-by-row dot products (plus *eps* on every entry) are divided by
    the square roots of the matching diagonal entries, once per row and once
    per column.

    Args:
        u: 2-D array whose rows are the vectors to compare.
        eps: Constant added to every dot product; keeps the diagonal
            non-zero for all-zero rows.

    Returns:
        Square array with one row and one column per row of *u*.
    """
    gram = np.dot(u, u.T) + eps
    # Row vector of per-row magnitudes, shape (1, n).
    magnitudes = np.sqrt(np.diagonal(gram))[np.newaxis, :]
    scaled_by_columns = gram / magnitudes
    return scaled_by_columns / magnitudes.T

In [None]:
%timeit -n 10 -r 3 sim_matrix(utility[:50,:])

In [None]:
def sim_users(u):
    """Return the user-to-user similarity matrix for the utility matrix *u*.

    Delegates to ``sim_matrix`` with its default ``eps``.
    """
    user_similarity = sim_matrix(u)
    return user_similarity

# Sanity check: similarity matrix for the top-left 50x50 slice of `utility`.
print(sim_users(utility[:50,:50]))

# K neighborhood

In [None]:
def top_k(arr, self_idx, k):
    """Return the *k* largest entries of *arr*, skipping position *self_idx*.

    Positions are ranked directly, so entries with equal values are all kept
    (the lower position first). Asking for more entries than exist returns
    every position except *self_idx* instead of raising.

    Args:
        arr: Sequence of comparable values, e.g. one column of a similarity
            matrix.
        self_idx: Position to exclude from the result.
        k: Maximum number of entries to return.

    Returns:
        Dict mapping position to value, in descending order of value.
    """
    result = {}
    if k <= 0:
        return result

    # sorted() is stable even with reverse=True, so ties keep ascending position.
    ranked = sorted(range(len(arr)), key=lambda idx: arr[idx], reverse=True)
    for idx in ranked:
        if idx == self_idx:
            continue
        result[idx] = arr[idx]
        if len(result) == k:
            break
    return result

## Recommend via Similar Users

In [None]:
def rec_via_users(m_utility, m_sim_users, user_idx, frd_idx, k):
    """Predict the utility entry (user_idx, frd_idx) from similar users.

    Takes the ``k`` users most similar to ``user_idx`` (via ``top_k``), keeps
    those whose entry in column ``frd_idx`` is non-zero, and returns the
    similarity-weighted average of their entries.

    Args:
        m_utility: Utility matrix, indexed ``[user, item]``.
        m_sim_users: User-to-user similarity matrix.
        user_idx: Position of the user to predict for.
        frd_idx: Column of ``m_utility`` to predict.
        k: Number of similar users to consult.

    Returns:
        The weighted average, or ``0`` when the weights of the contributing
        users sum to zero (including when no user contributes).
    """
    column = m_utility[:, frd_idx]
    neighbours = top_k(m_sim_users[:, user_idx], user_idx, k)
    contributors = [idx for idx in neighbours if column[idx] != 0]

    weight_total = sum(neighbours[idx] for idx in contributors)
    if weight_total == 0:
        return 0
    weighted_sum = sum(neighbours[idx] * column[idx] for idx in contributors)
    return weighted_sum / weight_total

## Evaluation

In [None]:
# Fix the RNG seed so the random.sample() call in recs_via_users is reproducible across runs.
random.seed(12345)

def recs_via_users(m_utility, m_sim_users, k, test_n):
    """Return the MSE of user-based predictions on a random sample of users.

    ``test_n`` users are drawn with ``random.sample``. For every non-zero
    entry in a sampled user's utility row, the entry is predicted with
    ``rec_via_users`` and rounded. True and predicted values are then
    compared with ``mse_utility``.

    Args:
        m_utility: Utility matrix, indexed ``[user, item]``.
        m_sim_users: User-to-user similarity matrix.
        k: Number of similar users consulted per prediction.
        test_n: Number of users to sample.

    Returns:
        The value of ``mse_utility`` for the collected true/predicted pairs.
    """
    sampled_users = random.sample(range(m_sim_users.shape[0]), test_n)
    actual = []
    predicted = []
    for user_idx in sampled_users:
        user_row = m_utility[user_idx]
        for item_idx in range(m_utility.shape[1]):
            observed = user_row[item_idx]
            if observed == 0:
                continue
            actual.append(observed)

            estimate = round(rec_via_users(m_utility, m_sim_users, user_idx, item_idx, k))
            # mse_utility drops zeros, so a zero prediction is replaced by a
            # tiny non-zero value to keep both lists aligned.
            predicted.append(estimate if estimate != 0 else 1.0e-9)

    actual_arr = np.array([actual], dtype=np.float64)
    predicted_arr = np.array([predicted], dtype=np.float64)
    return mse_utility(actual_arr, predicted_arr)
    
# Similarity between every pair of users, computed once and reused by every run below.
similarity_users = sim_users(utility)

# Evaluate neighbourhood sizes k = 1..50, each on a sample of 100 users.
# ks[j] is the neighbourhood size whose error is stored in mses[j].
ks = []
mses = []
for i in range(50):
    k = i + 1
    ks.append(k)
    score = recs_via_users(utility, similarity_users, k, 100)
    mses.append(score)
    print("{}/50".format(k), score)