> Note: KNN is a memory-based model, which means it memorizes the observed patterns rather than learning a generalized model. It is a simple yet powerful technique and can compete with SOTA models like BERT4Rec.

In [None]:
import os
project_name = "reco-tut-itr"; branch = "main"; account = "sparsh-ai"
project_path = os.path.join('/content', project_name)

if not os.path.exists(project_path):
    !cp /content/drive/MyDrive/mykeys.py /content
    import mykeys
    !rm /content/mykeys.py
    path = "/content/" + project_name; 
    !mkdir "{path}"
    %cd "{path}"
    import sys; sys.path.append(path)
    !git config --global user.email "recotut@recohut.com"
    !git config --global user.name  "reco-tut"
    !git init
    !git remote add origin https://"{mykeys.git_token}":x-oauth-basic@github.com/"{account}"/"{project_name}".git
    !git pull origin "{branch}"
    !git checkout main
else:
    %cd "{project_path}"

In [None]:
import os
import numpy as np
import pandas as pd
import scipy.sparse
from scipy.spatial.distance import correlation

In [None]:
# Load the ratings table from the silver data layer and print its column
# summary. Later cells read the `userId`, `itemId` and `rating` columns.
df = pd.read_parquet('./data/silver/rating.parquet.gz')
df.info()

In [None]:
# Load the items table from the silver data layer and print its column
# summary. It is joined to the ratings on `itemId` in the next cell
# (presumably it carries the per-item metadata -- verify against the data).
df2 = pd.read_parquet('./data/silver/items.parquet.gz')
df2.info()

In [None]:
# Attach the item columns to every rating row (default inner join on itemId),
# then print the column summary of the combined table.
df = df.merge(df2, on='itemId')
df.info()

In [None]:
# Build the user x item matrix: one row per userId, one column per itemId,
# cells hold the rating (averaged if a pair occurs more than once) and are
# NaN where the user has not rated the item.
rating_matrix = df.pivot_table(
    values='rating',
    index=['userId'],
    columns=['itemId'],
)
rating_matrix

In [None]:
def similarity(user1, user2):
    """Return the Pearson correlation of two users over their co-rated items.

    Args:
        user1: Rating vector of the first user (array-like of floats), with
            NaN marking items the user has not rated.
        user2: Rating vector of the second user, aligned item-by-item with
            ``user1``.

    Returns:
        A float in ``[-1, 1]`` where higher means more similar. ``0`` is
        returned when the correlation is undefined: fewer than two co-rated
        items, or one of the users gave the same rating to all of them.
    """
    ratings1 = np.asarray(user1, dtype=float)
    ratings2 = np.asarray(user2, dtype=float)

    # Co-rated items are those where neither vector is NaN. (Testing "> 0"
    # on mean-centred values would instead keep only above-average ratings.)
    both_rated = ~(np.isnan(ratings1) | np.isnan(ratings2))
    common1 = ratings1[both_rated]
    common2 = ratings2[both_rated]

    # Guard the zero-variance cases up front; otherwise the correlation
    # divides by zero and yields NaN.
    if (common1.size < 2
            or common1.min() == common1.max()
            or common2.min() == common2.max()):
        return 0

    # scipy's correlation() is a *distance* (1 - r); convert it back into a
    # similarity so that larger values mean closer neighbours. It centres the
    # vectors itself, so no explicit mean subtraction is needed here.
    return float(1 - correlation(common1, common2))

In [None]:
def nearestNeighbourRatings(activeUser, K):
    """Predict the active user's rating for every item from K neighbours.

    Every other user in ``rating_matrix`` is scored against the active user
    with ``similarity``; the K highest-scoring users are the neighbours. The
    prediction for an item is the active user's mean rating plus, for each
    neighbour that rated the item, that neighbour's deviation from their own
    mean rating multiplied by their score.

    Args:
        activeUser: Row label of the user in ``rating_matrix``.
        K: Number of neighbours to use.

    Returns:
        DataFrame indexed like ``rating_matrix.columns`` with a single
        ``'Rating'`` column holding the predicted ratings.

    Raises:
        KeyError: If ``activeUser`` is not a row of ``rating_matrix``.
    """
    active_ratings = rating_matrix.loc[activeUser]

    # A user must not be picked as their own neighbour: they add nothing for
    # items they have not rated and would only waste one of the K slots.
    candidates = rating_matrix.index[rating_matrix.index != activeUser]
    scores = pd.Series(
        [similarity(active_ratings, rating_matrix.loc[user])
         for user in candidates],
        index=candidates,
        dtype=float,
        name='Similarity',
    ).fillna(0.0)  # an undefined score carries no information -> weight 0

    nearest = scores.sort_values(ascending=False, kind='mergesort').iloc[:K]

    # Vectorised form of the per-item / per-neighbour accumulation: centre
    # each neighbour on their own mean, keep only cells holding a positive
    # rating, weight by the neighbour's score and add up per item.
    neighbour_ratings = rating_matrix.loc[nearest.index]
    neighbour_means = neighbour_ratings.mean(axis=1)
    deviations = neighbour_ratings.sub(neighbour_means, axis=0)
    deviations = deviations.where(neighbour_ratings > 0)
    weighted = deviations.mul(nearest, axis=0)

    # NOTE(review): the sum is not normalised by the total absolute score, so
    # predictions can leave the rating scale -- confirm this is intended.
    # Items without any neighbour rating fall back to the active user's mean.
    predicted = np.nanmean(active_ratings) + weighted.sum(axis=0)
    return predicted.to_frame('Rating')

In [None]:
def topNRecommendations(activeUser, N, K=None):
    """Return details of the N unrated items with the highest predictions.

    Args:
        activeUser: Row label of the user in ``rating_matrix``.
        N: Number of items to recommend.
        K: Number of neighbours used for the predictions. Defaults to ``N``,
            which is what this function always used before.

    Returns:
        A list of four Series taken from ``df`` -- ``location``, ``place``,
        ``state`` and ``location_rating`` -- restricted to the recommended
        items, without repeated rows, ordered from best to worst prediction.

    Raises:
        KeyError: If ``activeUser`` is not a row of ``rating_matrix``.
    """
    if K is None:
        K = N
    predictions = nearestNeighbourRatings(activeUser, K)

    # Items the user already rated are not candidates for recommendation.
    active_ratings = rating_matrix.loc[activeUser]
    already_rated = active_ratings[active_ratings > 0].index
    candidates = predictions.drop(already_rated)

    top = candidates.sort_values('Rating', ascending=False).iloc[:N]

    # df holds one row per rating, so a recommended item appears once per
    # user who rated it; collapse rows that repeat the returned details.
    places = df.loc[df.itemId.isin(top.index)].drop_duplicates(
        subset=['itemId', 'location', 'place', 'state', 'location_rating'])

    # Restore the ranking, which the isin() filter does not preserve.
    rank = {item: position for position, item in enumerate(top.index)}
    order = places.itemId.map(rank).to_numpy().argsort(kind='stable')
    places = places.iloc[order]

    return [places.location,
            places.place,
            places.state,
            places.location_rating]

In [None]:
def favoritePlace(activeUser, N):
    """Return details of the N entries the given user rated highest.

    Filters ``df`` to the rows whose ``userId`` equals ``activeUser``, orders
    them by ``rating`` from high to low and keeps the first N.

    Returns a list of four Series: ``location``, ``place``, ``state`` and
    ``location_rating``.
    """
    user_rows = df[df.userId == activeUser]
    ranked = user_rows.sort_values(by=['rating'], ascending=False)
    best = ranked.iloc[:N]
    return [best.location, best.place, best.state, best.location_rating]

In [None]:
# User to inspect. The cells that use it pass str(activeUser) to the lookup
# functions (presumably because user ids are stored as strings -- verify).
activeUser = 4

In [None]:
# Show the user's four top-rated places as a table: favoritePlace() returns
# one Series per column, so the frame is transposed to get one row per place,
# then ordered by the place's own rating.
print("Your favorite places are: ")
fav_place = (
    pd.DataFrame(favoritePlace(str(activeUser), 4))
    .T
    .sort_values(by='location_rating', ascending=False)
)
fav_place

In [None]:
# Show four recommendations as a table: one row per place, ordered by the
# place's own rating, with repeated rows removed and a fresh 0..n-1 index.
print("The recommended places for you are: ")
topN = pd.DataFrame(topNRecommendations(str(activeUser), 4)).T
topN = topN.sort_values(by='location_rating', ascending=False)
topN = topN.drop_duplicates()
topN = topN.reset_index(drop=True)
topN