# **Collaborative Filtering**


Collaborative filtering is a recommendation approach that filters information using the interactions and data the system has collected from other users.
In other words, it suggests trails based on the ratings of other users whose preferences are similar to those of the target user.



### **Packages**

In [None]:
import pandas as pd
import random
from six.moves import reduce
import numpy as np
import re

In [None]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
# The Colab sign-in runs first; presumably it is what makes the
# application-default credentials requested below available (TODO confirm).
auth.authenticate_user()
gauth = GoogleAuth()
# Hand the application-default credentials to PyDrive.
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the PyDrive client object used to fetch files by their Drive ID.
drive = GoogleDrive(gauth)

### **Data**


Retrieve trails dataframe which has been filtered by distance (diameter around set location) and difficulty (hiking expertise of the user).

In [None]:
# Each entry pairs a local CSV filename with the Google Drive file ID it is
# downloaded from.
data = [
    ['UserWIth5difficulty_SFOLocation_Within500M.csv', "14U_kZ6gsqIKWmLtkkug-en1j9ICctrzL"],
    ['UserWith3difficulty_NeYork_Location_Within450Miles.csv', '1N-j8XmTDkntWcMhilBJQPRRaMvjDQIkJ'],
    ['UserWith3difficulty_NeYork_Location_Within500Miles.csv', '1-4Coql1lpPc08QdmIsG18dnjYJPuYVoW'],
    ['UserWith7difficulty_LALocation_Within150Miles.csv', '1Uc5bKFLG5wlGaQICVlVHbSpKhrLCA67S'],
]
# Work with the fourth dataset (index 3): unpack its filename and Drive ID.
file, link = data[3]
# Download the Drive file to the local filename, then load it with pandas.
downloaded = drive.CreateFile({'id': link})
downloaded.GetContentFile(file)
df = pd.read_csv(file)
df.head()

Unnamed: 0.2,Unnamed: 0,trail_id,Unnamed: 0.1,name,area_name,city_name,state_name,country_name,popularity,length,...,activity_bike-touring,activity_whitewater-kayaking,activity_rails-trails,activity_ice-climbing,activity_surfing,activity_snowboarding,state_name_cat,city_name_cat,condition_cat,distance_from_location
0,76,10031888,235,Barker Dam Nature Trail,Joshua Tree National Park,Twentynine Palms,California,United States,30.1796,2896.812,...,0.0,0.0,0.0,0.0,0.0,0.0,3,258,0,120.307771
1,79,10035554,239,Hidden Valley Nature Trail,Joshua Tree National Park,Twentynine Palms,California,United States,27.9587,1609.34,...,0.0,0.0,0.0,0.0,0.0,0.0,3,258,0,118.82685
2,82,10019827,247,Arch Rock Nature Trail,Joshua Tree National Park,Twentynine Palms,California,United States,24.81,1126.538,...,0.0,0.0,0.0,0.0,0.0,0.0,3,258,0,127.579574
3,84,10023775,259,Skull Rock Nature Trail,Joshua Tree National Park,Twentynine Palms,California,United States,21.7608,2735.878,...,0.0,0.0,0.0,0.0,0.0,0.0,3,258,0,125.05112
4,89,10235899,269,Cholla Cactus Garden Nature Trail,Joshua Tree National Park,Twentynine Palms,California,United States,20.2298,321.868,...,0.0,0.0,0.0,0.0,0.0,0.0,3,258,11,132.848897


In [None]:
user_ratings = pd.DataFrame()
# Seed BOTH random number generators. The synthetic ratings in this notebook
# are drawn with the standard-library `random` module, which
# np.random.seed() does not affect; without random.seed() every run
# produces different data.
np.random.seed(42)
random.seed(42)

Generate (or obtain, if real data is available) a userTrail dataframe containing the ratings that each user assigned to the trails they visited and rated.

In [None]:
def gen_avg(expected_avg, n, a=0, b=5):
    """Draw ``n`` random integer ratings in ``[a, b]`` whose mean is ``expected_avg``.

    Candidate lists are rejection-sampled with ``random.randint`` until one
    has the requested mean.

    Args:
        expected_avg: Target mean of the returned ratings.
        n: Number of ratings to generate (must be positive).
        a: Smallest allowed rating, inclusive.
        b: Largest allowed rating, inclusive.

    Returns:
        A list of ``n`` integers between ``a`` and ``b`` whose mean equals
        ``expected_avg``.

    Raises:
        ValueError: If no such list can exist (``n`` is not positive, the
            target is NaN/infinite, ``expected_avg * n`` is not a whole
            number, or the mean lies outside ``[a, b]``). Without this check
            the sampling loop would never terminate.
    """
    import math

    if n <= 0:
        raise ValueError(f"n must be positive, got {n}")

    target_total = float(expected_avg) * n
    if not math.isfinite(target_total):
        raise ValueError(f"expected_avg must be finite, got {expected_avg}")

    # n integers can only sum to a whole number within [a * n, b * n].
    nearest_total = int(round(target_total))
    reachable = (
        math.isclose(target_total, nearest_total, abs_tol=1e-9)
        and a * n <= nearest_total <= b * n
    )
    if not reachable:
        raise ValueError(
            f"no {n} integers in [{a}, {b}] have a mean of {expected_avg}"
        )

    while True:
        l = [random.randint(a, b) for _ in range(n)]
        # Compare integer totals rather than float means to avoid rounding issues.
        if sum(l) == nearest_total:
            return l
    
        
# Build the synthetic user-trail ratings table: every trail receives
# 2 * N_BASE_RATINGS ratings, each attributed to a distinct random user.
N_BASE_RATINGS = 4   # size of each gen_avg() draw
N_USERS = 50         # user IDs are sampled from range(N_USERS)

ratings_per_trail = []
Uid = []

for i in range(len(df)):
    trail_avg = df['avg_rating'].iloc[i]
    # First batch: ratings whose mean equals this trail's average rating.
    X = gen_avg(trail_avg, n=N_BASE_RATINGS)
    # Second batch: one rating from each of N_BASE_RATINGS further draws for
    # the SAME trail. (The previous code indexed df with the inner counter,
    # so it used the averages of the first four trails instead and raised
    # IndexError on dataframes with fewer than four rows.)
    for j in range(N_BASE_RATINGS):
        X1 = gen_avg(trail_avg, n=N_BASE_RATINGS)
        X.append(X1[j])
    ratings_per_trail.append(X)
    # One distinct user per rating of this trail.
    Uid.append(random.sample(range(N_USERS), len(X)))

# Assemble the frame in one step instead of chained `.iloc` assignments,
# which pandas does not guarantee to write back to the frame.
trail_ratings = pd.DataFrame({
    'Trails': df['name'].values,
    'Ratings': ratings_per_trail,
    'User ID': '',
    'trail_id': df['trail_id'].values,
})

# One row per (trail, rating).
trail_ratings = trail_ratings.explode('Ratings').reset_index(drop=True)
trail_ratings['Ratings'] = trail_ratings['Ratings'].astype(int)

# Flatten the per-trail user lists; they are in the same order as the
# exploded ratings, so no hard-coded group size is needed.
trail_ratings['User ID'] = [uid for ids in Uid for uid in ids]
trail_ratings.head()

Unnamed: 0,Trails,Ratings,User ID,trail_id
0,Barker Dam Nature Trail,4,38,10031888
1,Barker Dam Nature Trail,5,29,10031888
2,Barker Dam Nature Trail,4,49,10031888
3,Barker Dam Nature Trail,5,23,10031888
4,Barker Dam Nature Trail,5,32,10031888


Obtain all ratings from the target user. <br>
*Note: in this case we assign ratings to randomly sampled trails; we have yet to incorporate the scores resulting from clustering.

In [None]:
# Simulated ratings of the target user: one trail is sampled per seed and
# paired with a fixed rating.
# The trail name is read directly with `.iloc[0]`. Parsing it out of
# `str(Series)` with a regex returned names truncated with "..." (pandas
# shortens long values when printing) and dropped leading digits, so those
# trails never matched any row of `trail_ratings` afterwards.
seed_and_rating = [(1, 3), (2, 3), (3, 4), (4, 5)]
userInput = [
    {
        'Trails': trail_ratings['Trails'].sample(n=1, random_state=seed).iloc[0],
        'rating': rating,
    }
    for seed, rating in seed_and_rating
]
inputTrails = pd.DataFrame(userInput)
inputTrails

Unnamed: 0,Trails,rating
0,Morbid Mound Trail,3
1,Hidden Valley Campground to Barker Dam Nature ...,3
2,Quail Mountain from Covington Crest Trailhead,4
3,Negropolis Trail,5


For every other user, extract their ratings of the trails that the target user has also rated. <br>

In [None]:
# Keep only the rating rows whose trail also appears in the target user's
# rated trails.
rated_trail_names = inputTrails['Trails'].tolist()
shares_trail = trail_ratings['Trails'].isin(rated_trail_names)
userSubset = trail_ratings[shares_trail]
userSubset.head()

Unnamed: 0,Trails,Ratings,User ID,trail_id
288,Negropolis Trail,0,13,10269467
289,Negropolis Trail,0,23,10269467
290,Negropolis Trail,0,25,10269467
291,Negropolis Trail,0,16,10269467
292,Negropolis Trail,5,41,10269467


Group them by user

In [None]:
# Group by the scalar key 'User ID' rather than the one-element list
# ['User ID']: with a list, pandas >= 2.0 yields 1-tuples such as (2,) as
# group keys, which would no longer match the plain user ids used in later
# joins on 'User ID'.
userSubsetGroup = userSubset.groupby('User ID')
# Turn the groups into a list of (user id, rows) pairs, ordered so that users
# sharing the most trails with the target user come first.
userSubsetGroup = sorted(userSubsetGroup, key=lambda x: len(x[1]), reverse=True)
userSubsetGroup

[(2,
                                              Trails Ratings User ID  trail_id
  295                               Negropolis Trail       2       2  10269467
  817  Quail Mountain from Covington Crest Trailhead       3       2  10311964),
 (36,                  Trails Ratings User ID  trail_id
  293    Negropolis Trail       5      36  10269467
  805  Morbid Mound Trail       3      36  10269469),
 (42,
                                              Trails Ratings User ID  trail_id
  804                             Morbid Mound Trail       3      42  10269469
  819  Quail Mountain from Covington Crest Trailhead       4      42  10311964),
 (5,                  Trails Ratings User ID  trail_id
  806  Morbid Mound Trail       5       5  10269469),
 (8,
                                              Trails Ratings User ID  trail_id
  821  Quail Mountain from Covington Crest Trailhead       5       8  10311964),
 (9,                  Trails Ratings User ID  trail_id
  800  Morbid Mound 

Most similar user, trails shared and reviews


In [None]:
# First entry of the list: show its key on its own, followed by the whole
# (key, rows) pair.
most_similar = userSubsetGroup[0]
most_similar[0], most_similar

(2,
 (2,
                                              Trails Ratings User ID  trail_id
  295                               Negropolis Trail       2       2  10269467
  817  Quail Mountain from Covington Crest Trailhead       3       2  10311964))

Get similarity scores between users by using Pearson correlation

In [None]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#Sort the target user's ratings once; the result does not depend on the loop below
inputTrails = inputTrails.sort_values(by='Trails')

#For every user group in our subset
for name, group in userSubsetGroup:

    #Pair each rating of the current user with the target user's rating of the
    #same trail. Joining on the trail name keeps the two rating lists aligned
    #and equally long, instead of relying on two separately sorted lists
    #happening to line up position by position.
    paired = group.merge(inputTrails, on='Trails', how='inner').sort_values(by='Trails')

    #Get the N (total similar trails visited) for the formula
    nRatings = len(paired)

    #No trail in common: the correlation is undefined, store 0
    if nRatings == 0:
        pearsonCorrelationDict[name] = 0
        continue

    #Target user's scores (x) and current user's scores (y) in list format
    tempRatingList = paired['rating'].astype(float).tolist()
    tempGroupList = paired['Ratings'].astype(float).tolist()

    #Now let's calculate the pearson correlation between two users, so called, x and y

    #For package based
    #scipy.stats import pearsonr
    #pearsonr(tempRatingList,tempGroupList)[0]

    #For hard code based
    sum_x = sum(tempRatingList)
    sum_y = sum(tempGroupList)
    Sxx = sum(i**2 for i in tempRatingList) - pow(sum_x, 2)/float(nRatings)
    Syy = sum(i**2 for i in tempGroupList) - pow(sum_y, 2)/float(nRatings)
    Sxy = sum(i*j for i, j in zip(tempRatingList, tempGroupList)) - sum_x*sum_y/float(nRatings)

    #If both variances are positive, divide; otherwise (constant ratings or a
    #single shared trail) the coefficient is undefined and 0 is stored.
    if Sxx > 0 and Syy > 0:
        pearsonCorrelationDict[name] = Sxy/np.sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [None]:
# Convert the {user id: coefficient} mapping into a table with one row per
# user: the coefficient in 'similarityIndex' and the id in 'User ID'.
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['User ID'] = pearsonDF.index
# Replace the user-id index with a plain 0..n-1 index.
pearsonDF = pearsonDF.reset_index(drop=True)
# Most similar users first.
topUsers = pearsonDF.sort_values(by='similarityIndex', ascending=False)
topUsers.head()

Unnamed: 0,similarityIndex,User ID
2,1.0,42
1,1.0,36
10,0.0,21
12,0.0,23
19,0.0,46


Incorporate the similarityIndex of all users to each of the corresponding users' reviews.

In [None]:
# Attach each user's similarity index to every rating that user gave.
topUsersRating = pd.merge(topUsers, trail_ratings, how='inner', on='User ID')
topUsersRating.head()

Unnamed: 0,similarityIndex,User ID,Trails,Ratings,trail_id
0,1.0,42,Arch Rock Nature Trail,5,10019827
1,1.0,42,Split Rock Loop Trail,4,10252968
2,1.0,42,Cap Rock and Gram Parsons Nature Trail,1,10027342
3,1.0,42,Hall of Horrors,5,10282040
4,1.0,42,Lucky Boy Vista Trail to Elton Mine,5,10029617


The weighted rating is obtained by multiplying each user's similarity index by that user's rating.

In [None]:
# Weight every rating by the similarity index of the user who gave it.
ratings_as_float = topUsersRating['Ratings'].astype(float)
topUsersRating['weightedRating'] = topUsersRating['similarityIndex'].mul(ratings_as_float)
topUsersRating.head()

Unnamed: 0,similarityIndex,User ID,Trails,Ratings,trail_id,weightedRating
0,1.0,42,Arch Rock Nature Trail,5,10019827,5.0
1,1.0,42,Split Rock Loop Trail,4,10252968,4.0
2,1.0,42,Cap Rock and Gram Parsons Nature Trail,1,10027342,1.0
3,1.0,42,Hall of Horrors,5,10282040,5.0
4,1.0,42,Lucky Boy Vista Trail to Elton Mine,5,10029617,5.0


Sum the similarity indices and the weighted ratings of the top users after grouping the rows by trail_id.

In [None]:
# Per trail, total the similarity indices and the weighted ratings.
# The two needed columns are selected BEFORE summing so that pandas does not
# also aggregate the unused columns (e.g. concatenating trail-name strings).
tempTopUsersRating = topUsersRating.groupby('trail_id')[['similarityIndex', 'weightedRating']].sum()
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
trail_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10003613,1.0,5.0
10003872,-1.0,-4.0
10005777,1.0,2.0
10011170,0.0,0.0
10014917,1.0,5.0


Calculate the weighted average recommendation score

In [None]:
recommendation_df = pd.DataFrame()
# Weighted average = sum of weighted ratings / sum of similarity indices.
# A similarity sum of exactly zero (e.g. a +1 and a -1 neighbour cancelling
# out) would give +/-inf, which then sorts to the top of the recommendations.
# Mask those denominators so the score becomes NaN (no usable evidence).
similarity_total = tempTopUsersRating['sum_similarityIndex']
safe_similarity_total = similarity_total.where(similarity_total != 0)
recommendation_df['weighted average recommendation score'] = (
    tempTopUsersRating['sum_weightedRating'] / safe_similarity_total
)
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score
trail_id,Unnamed: 1_level_1
10003613,5.0
10003872,4.0
10005777,2.0
10011170,
10014917,5.0


In [None]:
# Rank trails from highest to lowest score, then look up each trail's name
# and restore trail_id as the index.
ranked_scores = recommendation_df.sort_values(
    by='weighted average recommendation score',
    ascending=False,
)
trail_names = df[['trail_id', 'name']]
recommendation_df = ranked_scores.merge(trail_names, on='trail_id').set_index('trail_id')

Done! These are the trail recommendations, sorted by weighted average recommendation score.

In [None]:
# Display the recommendation table as the cell output.
recommendation_df

Unnamed: 0_level_0,weighted average recommendation score,name
trail_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10311964,inf,Quail Mountain from Covington Crest Trailhead
10269467,inf,Negropolis Trail
10003613,5.0,Elephant Seal Cove Trail
10272192,5.0,Bajada Nature Walk
10235899,5.0,Cholla Cactus Garden Nature Trail
...,...,...
10483854,,Rattlesnake Canyon from Indian Cove Picnic Area
10485496,,Goler Wash and Mengal Pass Route
10541099,,"5 Tanks Including Twin Tanks, Ivanpah and Live..."
10541144,,Covington Loop Segment and Deer Horn Trail


Save file

In [None]:
from os.path import exists
# Import the Colab helper under an alias: the bare name `drive` is already
# bound to the PyDrive client used for downloading the dataset, and rebinding
# it here would break that download cell if it is run again afterwards.
from google.colab import drive as colab_drive
colab_drive.mount('/drive')

# Write the recommendations next to the other Drive files, prefixed with
# "CF_" and named after the dataset that was loaded.
recommendation_df.to_csv(f'/drive/My Drive/CF_{file}')

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).
