In [1]:
import numpy as np
import pandas as pd
import scipy.sparse
from scipy.spatial.distance import correlation

In [12]:
data=pd.read_csv('data_collaborative.csv')
placeInfo=pd.read_csv('data_content.csv')

In [13]:
data=pd.merge(data, placeInfo, left_on='itemId', right_on="itemId")
userIds=data.userId

In [14]:
data.loc[0:10,['userId']]
data=pd.DataFrame.sort_values(data, ['userId','itemId'], ascending=[0,1])

In [15]:
data.head()

Unnamed: 0,userId,itemId,rating,timestamp,category,distance,duration,nearby_places,title,url,p_rating,count
8,9,1,,878089716,Wildlife,6 Kms,1-2 Hours,,Jaipur Zoo,/rajasthan/jaipur/jaipur-zoo,2.2,158
17,9,2,3.0,890881335,Heritage,6 Kms,1-2 Hours,,Nahargarh Fort,/rajasthan/jaipur/nahargarh-fort,4.6,652
26,9,3,3.0,889502324,Heritage,13 Kms,2-3 Hours,"Jaigarh Fort (1 km by walk), Srijagat Siromani...",Amer Fort / Amber Fort,/rajasthan/jaipur/amer-fort-amber-fort,5.0,783
35,9,4,3.0,879525876,Pilgrimage,6 Kms,1-2 Hours,,Birla Mandir,/rajasthan/jaipur/birla-mandir,4.9,482
44,9,5,,879485532,Heritage,6 Kms,30 Mins,,Hawa Mahal,/rajasthan/jaipur/hawa-mahal,4.2,890


In [16]:
def favoritePlace(activeUser,N):
    topPlace=pd.DataFrame.sort_values(data[data.userId==activeUser],['rating'],ascending=[0])[:N]
    return list(topPlace.title)

In [17]:
userItemRatingMatrix=pd.pivot_table(data, values='rating', index=['userId'], columns=['itemId'])
userItemRatingMatrix.head()

itemId,1,2,3,4,5,6,7,8,9,10,...,23,24,25,26,27,28,29,30,31,32
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,3.0,1.0,,1.0,,,5.0,3.0,,...,,2.0,4.0,,5.0,,4.0,,3.0,
2,,4.0,,4.0,1.0,,1.0,4.0,,4.0,...,5.0,,,5.0,5.0,,3.0,,4.0,3.0
3,4.0,5.0,,5.0,,4.0,,3.0,1.0,3.0,...,5.0,3.0,,4.0,,4.0,4.0,,2.0,
4,4.0,,4.0,,,5.0,4.0,3.0,,,...,,4.0,4.0,,4.0,5.0,,2.0,5.0,
5,3.0,3.0,,3.0,4.0,,,3.0,5.0,5.0,...,4.0,3.0,3.0,,3.0,4.0,4.0,,4.0,4.0


In [18]:
def similarity(user1,user2):
    #print(user1, user2)
    try:
        user1=np.array(user1)-np.nanmean(user1)
        user2=np.array(user2)-np.nanmean(user2)
        commonItemIds=[i for i in range(len(user1)) if user1[i]>0 and user2[i]>0]
        #print(commonItemIds)
        if len(commonItemIds)==0:
           return 0
        else:
           user1=np.array([user1[i] for i in commonItemIds])
           user2=np.array([user2[i] for i in commonItemIds])
           #print(user1, user2)
           #print(correlation(user1, user2))
           return correlation(user1,user2)

    except ZeroDivisionError:
        print("You can't divide by zero!")

In [19]:
def nearestNeighbourRatings(activeUser,K):
    try:
        similarityMatrix=pd.DataFrame(index=userItemRatingMatrix.index,columns=['Similarity'])
        for i in userItemRatingMatrix.index:
            similarityMatrix.loc[i]=similarity(userItemRatingMatrix.loc[activeUser],userItemRatingMatrix.loc[i])
        #print(similarityMatrix)
        similarityMatrix=pd.DataFrame.sort_values(similarityMatrix,['Similarity'],ascending=[0])
        #print(similarityMatrix)
        nearestNeighbours=similarityMatrix[:K]
        #print(nearestNeighbours)
        neighbourItemRatings=userItemRatingMatrix.loc[nearestNeighbours.index]
        #print(neighbourItemRatings)
        predictItemRating=pd.DataFrame(index=userItemRatingMatrix.columns, columns=['Rating'])
        #print(predictItemRating)
        for i in userItemRatingMatrix.columns:
            predictedRating=np.nanmean(userItemRatingMatrix.loc[activeUser])
            #print(predictedRating)
            for j in neighbourItemRatings.index:
                if userItemRatingMatrix.loc[j,i]>0:
                   predictedRating += (userItemRatingMatrix.loc[j,i]-np.nanmean(userItemRatingMatrix.loc[j]))*nearestNeighbours.loc[j,'Similarity']
            predictItemRating.loc[i,'Rating']=predictedRating
    except ZeroDivisionError:
        print("You can't divide by zero!")            
    return predictItemRating

In [20]:
def topNRecommendations(activeUser,N):
    try:
        predictItemRating = nearestNeighbourRatings(activeUser,10)
        placeAlreadyWatched = list(userItemRatingMatrix.loc[activeUser].loc[userItemRatingMatrix.loc[activeUser]>0].index)
        predictItemRating = predictItemRating.drop(placeAlreadyWatched)
        topRecommendations = pd.DataFrame.sort_values(predictItemRating, ['Rating'],ascending=[0])[:N]
        topRecommendationTitles = (placeInfo.loc[placeInfo.itemId.isin(topRecommendations.index)])
    except ZeroDivisionError:
        print("You can't divide by zero!")
    return list(topRecommendationTitles.title)

In [21]:
activeUser=int(input("Enter userid: "))
print("The user's favorite places are: ")
print(favoritePlace(activeUser,5))
print("The recommended places for you are: ")
print(topNRecommendations(activeUser,3))

Enter userid: 5
The user's favorite places are: 
['City Palace / Sawai Man Singh II Museum', 'Akshardham Temple', 'Jaigarh Fort', 'Rambagh Palace', 'Albert Hall Museum']
The recommended places for you are: 
['Amer Fort / Amber Fort', 'Maharani Ki Chhatri', 'Moti Dungri Ganesh Temple']


  dist = 1.0 - uv / np.sqrt(uu * vv)


In [22]:
accuracy = metrics.accuracy_score(topNRecommendations(predictItemRating[User], Rating[User]))

Accuracy is : 83.7%
