In [1]:
import json  
import pandas as pd 
from pandas import json_normalize 
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import random

In [3]:
# get restaurant, user and review data 
rest_df = pd.read_pickle('C:\\Users\\ineso\\FEUP-3ano\\gulbenkian-ai\\data\\vegas-restaurants\\restaurant_in_vegas.pickle')
user_df = pd.read_pickle('C:\\Users\\ineso\\FEUP-3ano\\gulbenkian-ai\\data\\vegas-restaurants\\vegas_users.pickle')
review_df = pd.read_pickle('C:\\Users\\ineso\\FEUP-3ano\\gulbenkian-ai\\data\\vegas-restaurants\\vegas_review.pickle')

In [4]:
review_df = review_df.sample(10000)

In [5]:
# fill category column cells without values
rest_df['categories'] = rest_df['categories'].fillna("")

In [6]:
# initialize graph
G = nx.Graph()

categories = []
restaurants = []
rest_category = []
users = []
friends = []
reviews = []

In [7]:
# add edges between users and the restaurants they reviewed
for index, row in review_df.iterrows():
    if row['user_id'] not in users:
        G.add_node(row['user_id'], type="user")
        users.append(row['user_id'])
    if row['business_id'] not in restaurants:
        G.add_node(row['business_id'], type="restaurant")
        restaurants.append(row['business_id'])
    G.add_edge(row['user_id'], row['business_id'], rating=row['stars'])
    reviews.append((row['user_id'], row['business_id']))

In [29]:
# add restaurant names
for index, row in rest_df.iterrows():
    if row['business_id'] in restaurants:
        G.nodes[row['business_id']]['name'] = row['name']
    
    # add category nodes and edges between those and restaurants
    cat = row['categories'].split(", ")
    for c in cat:
        if c not in categories:
            G.add_node(c, type="category")
            categories.append(c)
        G.add_edge(row['business_id'], c)
        rest_category.append((row['business_id'], c))

In [30]:
# add user nodes
for index, row in user_df.iterrows():
    if row['user_id'] in users:
        G.nodes[row['user_id']]['name'] = row['name']

In [36]:
def shared_neighbors(G, user1, user2):
    nbrs1 = G.neighbors(user1)
    nbrs2 = G.neighbors(user2)

    overlap = set(nbrs1).intersection(nbrs2)
    return overlap

In [173]:
def user_similarity(G, user1, user2):
    shared_nodes = shared_neighbors(G, user1, user2)

    #nbrs1 = G.neighbors(user1)
    #nbrs2 = G.neighbors(user2)
    #total = set(nbrs1).union(set(nbrs2))
    return len(shared_nodes) / len(restaurants)

In [171]:
def restaurant_similarity(G, rest1, rest2):
    shared_nodes = shared_neighbors(G, rest1, rest2)
    shared_nodes = list(filter(lambda x: G.nodes[x]['type'] == 'category', shared_nodes))

    nbrs1 = G.neighbors(rest1)
    nbrs2 = G.neighbors(rest2)
    total = set(nbrs1).union(set(nbrs2))
    total = list(filter(lambda x: G.nodes[x]['type'] == 'category', total))
    return len(shared_nodes) / len(total)

In [71]:
from collections import defaultdict

def most_similar_users(G, user):
    nbrs = G.neighbors(user)

    user_nodes = []
    for r in nbrs:
        user_nbrs = G.neighbors(r)
        user_nbrs = list(filter(lambda x: G.nodes[x]['type'] == 'user', user_nbrs))
        for u in user_nbrs:
            if u not in user_nodes:
                user_nodes.append(u)

    similarities = defaultdict(list)
    for n in user_nodes:
        similarity = user_similarity(G, user, n)
        similarities[similarity].append(n)

    max_similarity = max(similarities.keys())
    if(max_similarity == 0): return []

    return similarities[max_similarity]

In [55]:
def recommend_restaurants(G, from_user, to_user):
    from_rests = set(G.neighbors(from_user))
    to_rests = set(G.neighbors(to_user))

    return from_rests.difference(to_rests)

In [178]:
test_user = 'jkBOSB5CBB_omGWIHKCIpg'
#test_user = random.choice(users)
print(test_user)
similar_users = most_similar_users(G, test_user)
print(similar_users)
recommendations = set()
for u in similar_users:
    recommendations.update(recommend_restaurants(G, u, test_user))

nbrs = G.neighbors(test_user)

similarities = []
for n in nbrs:
    for r in recommendations:
        similarities.append((rest_df.loc[rest_df['business_id'] == n,]['name'].item(),rest_df.loc[rest_df['business_id'] == r]['name'].item(), restaurant_similarity(G, n, r)))

print(similarities)

jkBOSB5CBB_omGWIHKCIpg
['4zicS2kEKUAYpRZDAAY36Q', 'o0JxB9pFY-683xN3qiHs7A', 'QMvKdCyjwCzqQey1Y32nKQ', 'jkBOSB5CBB_omGWIHKCIpg', 'cfmNF0zeQKz8NRJ-Uj_ofQ', 'eBCd6cQsEywwjai53WZrKQ', 'Hd6pD2G26ObK3gtvdhyHGg', 'VFmgdFI4iEwbxNSXu-3RZg', 'ljexhO2z36Mt3WXAjed-Kw']
[('Cafe Americano', 'Spago by Wolfgang Puck', 0.2857142857142857), ('Cafe Americano', 'Beijing Noodle No. 9', 0.14285714285714285), ('Cafe Americano', 'Las Vegas National Golf Club', 0.1)]


In [179]:
nx.write_gpickle(G, "DatasetSamples/vegas_graph_small.gpickle")