### Python libs & Configs

In [1]:
import pickle # Serialization and deserialization
import pandas as pd # Package to manage dataframe like in R
import numpy as np # Scientific computing package
import random

import matplotlib.pyplot as plt # Basic package for statistical data visualization
import seaborn as sns # Advanced package for statistical data visualization

# In order to display figures inside the notebook:
%matplotlib inline

# Display trick to display all columns of large dataframes
from IPython.display import display
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
SAVED_FOLDER = './data'

### Preparation

#### Distance function

In [3]:
from math import radians, cos, sin, asin, sqrt

AVG_EARTH_RADIUS = 6371  # in km

def haversine(point1, lat2,lng2):
    """ Calculate the great-circle distance bewteen two points on the Earth surface.

    :input: two 2-tuples, containing the latitude and longitude of each point
    in decimal degrees.

    Example: haversine((45.7597, 4.8422), (48.8567, 2.3508))

    :output: Returns the distance bewteen the two points.
    The default unit is kilometers. Miles can be returned
    if the ``miles`` parameter is set to True.

    """
    # unpack latitude/longitude
    lat1, lng1 = point1

    # convert all latitudes/longitudes from decimal degrees to radians
    lat1, lng1, lat2, lng2 = map(radians, (lat1, lng1, lat2, lng2))

    # calculate haversine
    lat = lat2 - lat1
    lng = lng2 - lng1
    d = sin(lat * 0.5) ** 2 + cos(lat1) * cos(lat2) * sin(lng * 0.5) ** 2
    h = 2 * AVG_EARTH_RADIUS * asin(sqrt(d))

    return h  # in kilometers

#### Data import

In [4]:
business_df = pd.read_pickle('{}/business.pickle'.format(SAVED_FOLDER))
users_df = pd.read_pickle('{}/user.pickle'.format(SAVED_FOLDER))
review_df = pd.read_pickle('{}/review.pickle'.format(SAVED_FOLDER))

In [5]:
# ordonner les villes par nombre de restaurants
business_df_test = business_df.copy()
print(business_df_test.groupby(['city']).agg({'business_id':'count'}).reset_index().sort_values(['business_id'], ascending = False))

                    city  business_id
169            Las Vegas        17423
270              Phoenix        10629
59             Charlotte         5189
326           Scottsdale         5139
224             Montréal         3891
280           Pittsburgh         3337
101            Edinburgh         3272
205                 Mesa         3190
140            Henderson         2839
366                Tempe         2773
55              Chandler         2425
190              Madison         2104
125             Glendale         1823
122              Gilbert         1716
266               Peoria          929
156            Karlsruhe          898
253      North Las Vegas          819
362             Surprise          587
54             Champaign          462
131             Goodyear          459
12              Avondale          386
288          Queen Creek          348
196             Matthews          346
375               Urbana          262
386             Waterloo          262
117         

In [6]:
category = 'Restaurants'
dist_max = 10 #en km
rating_min = 4 #Sur 5 étoiles max
city = 'Las Vegas' # prendre une ville

In [7]:
#Premier filtrage des business, général

# Choisir les users qui ont des amis
users_df['nb_friends'] = users_df['friends'].apply(len)
users_df = users_df[users_df['nb_friends'] > 0]

#On travaille sur les colones qui nous intéressent
selected_businesses = business_df[['business_id','categories','city','latitude','longitude','stars']]

#On crée un champ pour filtrer sur la catégorie
selected_businesses['right_category'] = selected_businesses.apply(lambda x : category in x['categories'],axis=1)
del(selected_businesses['categories'])

#on filtre par catégorie et la note
selected_businesses = selected_businesses[selected_businesses['right_category'] == True]
selected_businesses = selected_businesses[selected_businesses['stars'] >= rating_min]
del(selected_businesses['stars'])

# On filtre par ville
selected_businesses = selected_businesses[selected_businesses['city'] == city]

print(selected_businesses.shape)
selected_businesses.head()

(1674, 5)


Unnamed: 0,business_id,city,latitude,longitude,right_category
94,7YnyiTdXPyuDkeM_VED7XQ,Las Vegas,36.104435,-115.136785,True
136,94HshaJUV3DLvXkvbCbZqg,Las Vegas,36.136924,-115.163492,True
143,P8gXyZy1ZS7B4S20q8hzPQ,Las Vegas,36.119112,-115.279733,True
152,er6gWRnG0YgUav-d8MZ9Zw,Las Vegas,36.219223,-115.242704,True
190,tQQxehL2ZVGp7MqAxtSU_g,Las Vegas,36.143146,-115.259983,True


In [8]:
# Filtre reviews par selected business
selected_businesses_list = selected_businesses['business_id'].tolist()
selected_review_df = review_df[review_df['business_id'].isin(selected_businesses_list)]
selected_review_df.shape

(277065, 8)

In [10]:
pickle.dump( selected_review_df, open( "./output/selected_review_df.pickle", "wb" ) )

In [9]:
lat_min = selected_businesses['latitude'].min()
lat_max = selected_businesses['latitude'].max()
long_min = selected_businesses['longitude'].min()
long_max = selected_businesses['longitude'].max()

In [22]:
#Second filtrage des businesses, dépendant de l'utilisateur
import time

#Le test set sous forme de dataframe
testset_df = pd.DataFrame()

user_list_full = users_df['user_id'].reset_index()
user_list = user_list_full['user_id'].iloc[:100].tolist()
user_id_code = users_df['id'].tolist()

time_start = time.time()
for u_id in user_list:
    # Random user in Las Vegas:
    user_pos = (lat_min + random.random() * (lat_max - lat_min), random.random() * (long_max - long_min) + long_min)
    
    #On crée la liste des businesses déjà visités par l'utilisateur
#     visited_businesses = list(checkin_byhourday_df[checkin_byhourday_df['id'] == user_id_code[user_list.index(u_id)]]['business_id'])
    
    selected_businesses_for_user = selected_businesses.copy()

    #On crée un champ pour filtrer sur la distance
    selected_businesses_for_user['distance_user'] = selected_businesses_for_user.apply(lambda x:haversine(user_pos,x['latitude'],x['longitude']),axis=1)
    del(selected_businesses_for_user['latitude'])
    del(selected_businesses_for_user['longitude'])

    #on filtre par la distance et si l'utilisateur y est déjà allé
    selected_businesses_for_user = selected_businesses_for_user[selected_businesses_for_user['distance_user'] < dist_max]
#     selected_businesses_for_user = selected_businesses_for_user[-selected_businesses_for_user['business_id'].isin(visited_businesses)]
    
    #on cree un dataframe pour l'utilisateur u_id
    testset_user_df = pd.DataFrame()
    testset_user_df['distance'] = selected_businesses_for_user['distance_user']
    testset_user_df['businesses'] = selected_businesses_for_user['business_id']
    testset_user_df['user'] = u_id
    
    testset_df = testset_df.append(testset_user_df)

print('Time spent:', round(time.time() - time_start,2),'s')
testset_df.shape

00037460-4f2b-4268-b268-449ac1edfbe1
00003901-b68b-4a4d-8db5-33fb365f773a
000087c7-1694-4ffc-9d62-a44450369fac
00028f10-82a5-4327-aa5d-e338ff7be456
0000c74a-cdbd-47e3-a1dc-e5f486883e0a
0003a2ea-3f7a-49ec-9a18-4e0361f6c38a
0000f17e-08d5-4d97-8380-12034b329472
0000b0c3-e82d-4ad4-8aa9-214ff3c5eac9
0003cfb5-b0f2-45ff-8ae9-a975d62ff0c1
00012a10-b0c5-458c-8953-72895a124681
0004b3ff-f90f-413c-8d56-f9675b9de1db
00059d79-7003-410d-a97e-6a4062146b61
000479e6-1cb7-4b08-97a8-a77b79579567
0001aeb2-dc1c-4b36-80bd-b574b32235bb
00077447-8604-426d-a918-0108c3ec9796
000816ff-33aa-4536-a808-ecd0cef60909
0005d836-a94d-4161-8086-f0b4e0dad511
0005310c-ef83-4f75-be92-366350160012
00040cd8-4b78-4a07-ab2c-068360acc175
0006028c-30af-401f-845b-5ca50d296178
00053f07-27ad-41c5-b832-b09e6d2baad8
0005282d-2050-4e6e-940e-5c8ac05e34aa
0007ffa2-f50e-4d79-86c3-53742fbe787b
00056207-c667-437c-b4bb-176a43bf911b
000832aa-83c4-40d7-b6e8-da852f2840f7
0007eb95-6a8a-44ea-8b64-2612b33f1736
000ac1d1-3cb8-4c34-84f5-122e2c4006cf
0

(43290, 3)

#### Create testset

In [10]:
# Ajouter les scores entre 1 et 5
testset_df['star'] = random.randint(1, 5)
# Reinitialiser index
testset_df = testset_df.reset_index()
# Faciliter la recherche
index_list = user_list_full['index'].tolist()
user_id_list = user_list_full['user_id'].tolist()

In [11]:
testset_df.head()

Unnamed: 0,index,distance,businesses,user,star
0,2493,8.573528,rR8zCmZCalQTX6AQ_9iDyg,rbWWVwvO1729FRTc9SuKLQ,5
1,3226,8.132945,-584fn2GxYe9sLsgN2WeQA,rbWWVwvO1729FRTc9SuKLQ,5
2,3365,8.623651,P1BK2ke-N4XLsBSNWpSeLQ,rbWWVwvO1729FRTc9SuKLQ,5
3,3427,8.125041,kxAQStRhLvv8fM02OlAZxg,rbWWVwvO1729FRTc9SuKLQ,5
4,3892,8.456015,rKD-phI46Uc-sQ14lv258w,rbWWVwvO1729FRTc9SuKLQ,5


In [12]:
LS_user_100 = []
for row in testset_df.itertuples():
    LS_user_100.append([index_list[user_id_list.index(row[4])], row[1], row[5]])

#### Save

In [13]:
# Save the list in format 'a b c'
thefile = open('./output/LS_user_100.txt', 'w')
for item in LS_user_100: 
    thefile.write("%s\n" % (str(item[0]) + ' ' + str(item[1]) + ' ' + str(item[2])))

In [14]:
# Save file
pickle.dump( testset_df, open( "./output/testset_df.pickle", "wb" ) )
pickle.dump( index_list, open( "./output/index_list.pickle", "wb" ) )
pickle.dump( user_id_list, open( "./output/user_id_list.pickle", "wb" ) )