### Python Modules & configurations

In [1]:
import pickle # Serialization and deserialization
import pandas as pd # Package to manage dataframe like in R
import numpy as np # Scientific computing package

import matplotlib.pyplot as plt # Basic package for statistical data visualization
import seaborn as sns # Advanced package for statistical data visualization

# In order to display figures inside the notebook:
%matplotlib inline

# Display trick to display all columns of large dataframes
from IPython.display import display
pd.set_option('display.max_columns', None)

In [2]:
# Shared folder to read saved data
SAVED_FOLDER = './data'

# Your folder to write new data and create notebooks.
MY_FOLDER = './output'

### Data Import

In [3]:
# Let's load data
review_df = pd.read_pickle('{}/review.pickle'.format(SAVED_FOLDER))
business_df = pd.read_pickle('{}/business.pickle'.format(SAVED_FOLDER))
users_df = pd.read_pickle('{}/user.pickle'.format(SAVED_FOLDER))
categories_df = pd.read_pickle('{}/categories.pickle'.format(SAVED_FOLDER))
tip_df = pd.read_pickle('{}/tip.pickle'.format(SAVED_FOLDER))

In [4]:
# Filter the users without friends
users_df['nb_friends'] = users_df['friends'].apply(len)
users_df = users_df[users_df['nb_friends'] > 0]
users_df.shape

(249440, 13)

#### Save / Load pickle

In [10]:
# Save file
# pickle.dump( users_df, open( "./output/users_df.pickle", "wb" ) )
# Load file
# pickle.load( open( "users_df.pickle", "rb" ) )

In [12]:
# Obtain the friendship mapping
user_friends_list = users_df.reset_index()[['user_id','friends']].copy()
user_friends_list.head(3)

Unnamed: 0,user_id,friends
0,rbWWVwvO1729FRTc9SuKLQ,"[dCzr2bpptYDbOQ-gSU6iXA, HCpHFbxbMRUvRW-ujHdfF..."
1,rN6txAs4mgyCY3tJD5CVsw,"[JYEWTor9qkCeZC7U0bHChg, NCcmBw0bktP8p-XEVMTrs..."
2,fMyqYxU082FtesxvNXAFpg,[30jAE1Xz2bN1mu-1Q46ejg]


### Friendship

#### All data (warning: long time to execute)

In [8]:
# Simpler mapping
sample_user = []
cklist = ufl['user_id'].tolist()
for row in ufl.itertuples():
    rowIndex = ufl.loc[ufl['user_id'] == row[1]].index[0]
    for col in row[2]:
        if col in cklist:
            colIndex = cklist.index(col)
            sample_user.append([rowIndex, colIndex, 1])
    if len(sample_user) % 1000 == 0:
        print(len(sample_user))

16000
19000
31000
34000
38000
39000
56000
64000
68000
69000
87000
97000
121000
123000
132000
137000
152000
173000
184000
185000
187000
189000
204000
210000
244000
262000
263000
301000
303000
322000
333000
384000
395000
401000
402000
411000
412000
427000
431000
432000
438000
446000
472000
486000
490000
500000
501000
503000
512000
513000
517000
536000
544000
547000
591000
680000
699000
705000
708000
723000
758000
771000
815000
825000
827000
841000
845000
849000
853000
862000
875000
881000
889000
893000
899000
908000
922000
927000
932000
936000
945000
973000
978000
1003000
1004000
1006000
1035000
1039000
1040000
1042000
1064000
1065000
1066000
1072000
1085000
1099000
1114000
1118000
1128000
1150000
1187000
1209000
1220000
1225000
1252000
1254000
1271000
1272000
1276000
1279000
1283000
1292000
1293000
1297000
1303000
1320000
1345000
1378000
1389000
1405000
1418000
1428000
1473000
1476000
1505000
1508000
1509000
1524000
1548000
1562000
1568000
1593000
1599000
1634000
1635000
1649000
1669000

#### Save friendship

In [10]:
# Save the sample list in format 'a b c'
thefile = open('./output/sample_user.txt', 'w')
for item in sample_user: 
    thefile.write("%s\n" % (str(item[0]) + ' ' + str(item[1]) + ' ' + str(item[2])))

### Ratings

#### Preparing for categories list

In [34]:
# Get the unique categories and save. Filter manually.
categories_list_raw = business_user_df['categories'].tolist()
categories_list = set()
for i in categories_list_raw:
    for j in i:
        categories_list.add(j)
categories_list = list(categories_list)
thefile = open('./output/categories_list.txt', 'w')
for item in categories_list: 
    thefile.write("%s\n" % item)

In [5]:
# All business list
business_user_df = business_df.copy()
business_list = business_user_df['business_id'].tolist()
len(business_list)

77445

In [6]:
business_user_df = business_user_df.merge(review_df, on='business_id', how='inner', suffixes=['','_review'])

In [7]:
business_user_df_simple = business_user_df.copy()[['user_id','business_id','stars']]
business_user_df_simple.head(5)

Unnamed: 0,user_id,business_id,stars
0,_ASpgUPAEw4WR2m954ukcw,w_vBsXaz-XwyN5O_uYRh8Q,4.5
1,3CHnbZSJle7KnfTvC8i8Sw,w_vBsXaz-XwyN5O_uYRh8Q,4.5
2,DeqxMIjuPUZHRRP06TLzWA,w_vBsXaz-XwyN5O_uYRh8Q,4.5
3,dRttS0i4XATOxXCUCJ3oKg,w_vBsXaz-XwyN5O_uYRh8Q,4.5
4,9sHDVqIHPfVAJIehwg7yXA,OCmmPs_5NBt65ZY7OuG-lA,4.0


#### Get matrix: user-item-rate (warning: long to execute)

In [16]:
sample_business = []
for row in business_user_df_simple.itertuples():
    if row[1] not in cklist:
        continue
    rowIndex = cklist.index(row[1])
    colIndex = business_list.index(row[2])
    sample_business.append([rowIndex, colIndex, row[3]])
len(sample_business)

1547873

#### Save ratings

In [17]:
# # Save the user business list
# thefile = open('D:/Jupyter/TB/Projet308/temp/sample_business.txt', 'w')
# for item in sample_business:
#     thefile.write("%s\n" % item)
# Save the user business in format 'a b c'
thefile = open('D:/Jupyter/TB/Projet308/temp/sample_business.txt', 'w')
for item in sample_business: 
    thefile.write("%s\n" % (str(item[0]) + ' ' + str(item[1]) + ' ' + str(item[2])))