# Nearest Neighbor ***Item based*** collaborative filtering

<img src = "https://miro.medium.com/max/950/1*0ygb2uJknG6FdYT1vZqnsg.png"> </img>

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import scipy

In [None]:
# Read just the columns used in this notebook, with explicit 32-bit numeric dtypes.
movies_df = pd.read_csv(
    '/kaggle/input/movie-lens-small-latest-dataset/movies.csv',
    usecols=['movieId', 'title'],
    dtype={'movieId': 'int32', 'title': 'str'},
)
rating_df = pd.read_csv(
    '/kaggle/input/movie-lens-small-latest-dataset/ratings.csv',
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
)

In [None]:
# Preview the first rows of the movies table (columns: movieId, title).
movies_df.head()

In [None]:
# Preview the first rows of the ratings table (columns: userId, movieId, rating).
rating_df.head()

In [None]:
# Attach the movie title to every rating by joining on the shared movieId key
# (default inner join: ratings without a matching movie row are dropped).
df = rating_df.merge(movies_df, on='movieId')
df.head()

Let's count how many ratings each movie has received

In [None]:
# Drop rows whose title is missing (`subset` limits the NaN check to that column),
# then count the ratings each title received.
combine_movie_rating = df.dropna(subset=['title'])
movie_ratingCount = (
    combine_movie_rating
    .groupby('title')['rating']
    .count()
    .rename('totalRatingCount')
    .reset_index()
)
movie_ratingCount.head()

We will now merge this result to the 'combine_movie_rating' dataframe that we created

In [None]:
# Left join on `title`: every rating row is kept and gains the total rating
# count of its movie from movie_ratingCount.
rating_with_totalRatingCount = pd.merge(
    combine_movie_rating,
    movie_ratingCount,
    on='title',
    how='left',
)
rating_with_totalRatingCount.head()

Let's look at a random sample of rows, which gives a more representative view than the first few rows

In [None]:
# Show 5 random rows; the fixed random_state makes the same rows appear on every
# Restart & Run All instead of a different sample each time.
rating_with_totalRatingCount.sample(5, random_state=42)

In [None]:
# Summary statistics of the per-movie rating counts, with floats shown to 3 decimals.
pd.set_option('display.float_format', '{:.3f}'.format)
print(movie_ratingCount.loc[:, 'totalRatingCount'].describe())

In [None]:
# Histogram of ratings-per-movie on a log-scaled x axis, with the popularity
# cut-off of 50 ratings marked by a red vertical line.
fig, ax = plt.subplots(figsize=(17, 8))
ax.axvline(x=50, ymax=0.95, c='red', label='Threshold (50)')
sns.histplot(ax=ax, data=movie_ratingCount['totalRatingCount'], log_scale=True)
# Set labels after histplot so they are not overwritten by seaborn's defaults.
ax.set(
    title='Distribution of rating counts per movie',
    xlabel='Number of ratings per movie (log scale)',
    ylabel='Number of movies',
)
ax.legend(fontsize=25);

***Let's use a popularity threshold of 50: only movies with at least 50 ratings are considered for recommendations***

In [None]:
# Keep only the ratings of movies that have at least `popularity_threshold` ratings.
popularity_threshold = 50
rating_popular_movie = rating_with_totalRatingCount.loc[
    lambda ratings: ratings['totalRatingCount'] >= popularity_threshold
]
rating_popular_movie.head()

In [None]:
# Show 5 random rows of the filtered ratings; the fixed random_state keeps the
# displayed sample identical across re-runs.
rating_popular_movie.sample(5, random_state=42)

In [None]:
# Dimensions of the filtered ratings table as (number of rows, number of columns)
rating_popular_movie.shape

## Creating Pivot Table

In [None]:
# Build the movie x user rating matrix: one row per title, one column per userId.
# Cells with no rating come out of the pivot as NaN and are replaced by 0.
movie_features_df = (
    rating_popular_movie
    .pivot_table(values='rating', index='title', columns='userId')
    .fillna(0)
)
movie_features_df.head()

In [None]:
from scipy.sparse import csr_matrix # Compressed Sparse Row matrix type
# Convert the pivot table's values (mostly zeros after fillna(0)) into a sparse
# matrix; rows stay in the same order as movie_features_df.index.
movie_features_df_matrix = csr_matrix(movie_features_df.values)

## Concept of Cosine Similarity

<img src="https://dataaspirant.com/wp-content/uploads/2015/04/cosine.png"> </img>

***For cos (0 deg), similarity is 1***

***For cos (90 deg), similarity is 0***

***Each movie is represented as a vector of 606 user ratings, and we compute the (item-item) cosine similarity between these movie vectors***

## Training Nearest Neighbors ML Model

In [None]:
# NearestNeighbors is the unsupervised neighbour search, not the K-Nearest-Neighbors
# classifier or regressor.
from sklearn.neighbors import NearestNeighbors
# Neighbour searches often use euclidean distance; here the cosine metric is used
# instead, evaluated by brute force over all rows.
model_knn = NearestNeighbors(algorithm='brute', metric='cosine')
model_knn.fit(movie_features_df_matrix)

[Sklearn distance metrics documentation](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.DistanceMetric.html#sklearn.neighbors.DistanceMetric)

In [None]:
# (number of movie rows, number of user columns) of the pivot table
movie_features_df.shape

In [None]:
# Pick one movie to query. movie_features_df.shape[0] is the number of movie rows,
# so the draw is a row position in movie_features_df (not a movieId).
# A seeded generator keeps the pick, and therefore the recommendations, the same
# on every Restart & Run All.
rng = np.random.default_rng(42)
query_index = rng.choice(movie_features_df.shape[0])
print(query_index)
# Cosine distances to the nearest rows, and the row positions of those neighbours.
# Both results are 2-D with a single row because one query vector is passed.
# Six neighbours are requested because the query movie is part of the fitted data
# and is normally returned as its own closest match, leaving five recommendations.
distances, indices = model_knn.kneighbors(movie_features_df.iloc[query_index, :].values.reshape(1, -1), n_neighbors=6)

### ***Here we randomly select a row index of the pivot table (447 in the recorded run; note that this is a row position, not a movieId). We then find its Nearest Neighbors (Item-Item collaborative filtering).***

### ***In a real scenario, our API would receive a GET request containing a movieId, and our model would have to find that movie's Nearest Neighbors***

[Differences between .flatten() and .ravel()](https://www.geeksforgeeks.org/differences-flatten-ravel-numpy/)

In [None]:
# .flatten() returns a 1-D copy of the (1, n_neighbors) arrays; do it once up front
# instead of on every loop iteration.
# The query movie is excluded by its row position rather than by assuming it is the
# first neighbour: when another movie ties at the same distance, the query is not
# guaranteed to come back first.
recommendations = [
    (neighbor_index, neighbor_distance)
    for neighbor_index, neighbor_distance in zip(indices.flatten(), distances.flatten())
    if neighbor_index != query_index
]

print('Recommendations for {0}:\n'.format(movie_features_df.index[query_index]))
# Neighbours are already ordered by increasing distance, so the rank is the list position.
for rank, (neighbor_index, neighbor_distance) in enumerate(recommendations, start=1):
    print('{0}: {1}, with distance of {2}'.format(rank, movie_features_df.index[neighbor_index], neighbor_distance))

# ***Results***

## ***Recommendations for Being John Malkovich (1999):***

### ***1: Knocked Up (2007), with distance of 0.353415846824646***
### ***2: 40-Year-Old Virgin, The (2005), with distance of 0.4131535291671753***
### ***3: Meet the Parents (2000), with distance of 0.4245836138725281***
### ***4: Anchorman: The Legend of Ron Burgundy (2004), with distance of 0.4388437271118164***
### ***5: Zoolander (2001), with distance of 0.4485795497894287***