In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file below the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import seaborn as sns

# Content-based Filtering
This filtration strategy is based on the data provided about the items. The algorithm recommends products that are similar to the ones that a user has liked in the past. This similarity (generally cosine similarity) is computed from the data we have about the items as well as the user’s past preferences.
For example, if a user likes a movie such as ‘The Prestige’, we can recommend other movies starring ‘Christian Bale’, movies in the ‘Thriller’ genre, or movies directed by ‘Christopher Nolan’. The recommendation system checks the user’s past preferences, finds the film “The Prestige”, and then looks for similar movies using the information available in the database, such as the lead actors, the director, the genre of the film, the production house, etc.

- Disadvantages

Different products do not get much exposure to the user.
Businesses cannot be expanded as the user does not try different types of products.

# Collaborative Filtering
This filtration strategy is based on the user’s behavior, compared and contrasted with other users’ behavior in the database. The history of all users plays an important role in this algorithm. The main difference between content-based filtering and collaborative filtering is that in the latter, the interaction of all users with the items influences the recommendation algorithm, while for content-based filtering only the concerned user’s data is taken into account.
There are multiple ways to implement collaborative filtering, but the main concept to grasp is that multiple users’ data influences the outcome of the recommendation; it doesn’t depend on only one user’s data for modeling.

There are 2 types of collaborative filtering algorithms:

### User-based Collaborative filtering
The basic idea here is to find users that have similar past preference patterns as the user ‘A’ has had and then recommending him or her items liked by those similar users which ‘A’ has not encountered yet. This is achieved by making a matrix of items each user has rated/viewed/liked/clicked depending upon the task at hand, and then computing the similarity score between the users and finally recommending items that the concerned user isn’t aware of but users similar to him/her are and liked it.

For example, if the user ‘A’ likes ‘Batman Begins’, ‘Justice League’ and ‘The Avengers’ while the user ‘B’ likes ‘Batman Begins’, ‘Justice League’ and ‘Thor’, then they have similar interests, because we know that these movies belong to the super-hero genre. So, there is a high probability that the user ‘A’ would like ‘Thor’ and the user ‘B’ would like ‘The Avengers’.

- Disadvantages

People are fickle-minded, i.e. their tastes change from time to time, and as this algorithm is based on user similarity it may pick up initial similarity patterns between 2 users who after a while may have completely different preferences.
There are many more users than items, so the user-similarity matrices are very large, difficult to maintain, and need to be recomputed very regularly.
This algorithm is very susceptible to shilling attacks where fake users profiles consisting of biased preference patterns are used to manipulate key decisions.



### Item-based Collaborative Filtering
The concept in this case is to find similar movies instead of similar users, and then recommend movies similar to those that ‘A’ has had in his/her past preferences. This is executed by finding every pair of items that were rated/viewed/liked/clicked by the same user, then measuring the similarity of those items across all users who rated/viewed/liked/clicked both, and finally recommending them based on similarity scores.

Here, for example, we take 2 movies ‘A’ and ‘B’ and check their ratings by all users who have rated both movies; based on the similarity of these ratings we decide how similar the movies are. So if most of the common users have rated ‘A’ and ‘B’ similarly, it is highly probable that ‘A’ and ‘B’ are similar; therefore, if someone has watched and liked ‘A’ they should be recommended ‘B’ and vice versa.

- Advantages over User-based Collaborative Filtering


Unlike people’s taste, movies don’t change.
There are usually a lot fewer items than people, therefore easier to maintain and compute the matrices.
Shilling attacks are much harder because items cannot be faked.

In [None]:

# Read only the first two comma-separated fields of every line as Cust_Id and Rating.
# Lines without a second field come through with Rating = NaN; later cells use those
# NaN rows as the per-movie separator rows.
rating = pd.read_csv(
    '/kaggle/input/netflix-prize-data/combined_data_4.txt',
    header=None,
    names=['Cust_Id', 'Rating'],
    usecols=[0, 1],
)

# Make sure the Rating column is stored as float.
rating = rating.astype({'Rating': float})
rating.head()

In [None]:
# Work with the first 2,000,000 rows of the file only.
rating = rating.head(2000000)
rating.shape

In [None]:
# NOTE(review): the original cell carried a bare "(24058263, 2)" comment here, presumably
# a pasted shape output from an earlier run; it does not describe this cell.
#
# Load the movie titles into `movie` with columns Movie_Id (int), Year (float, NaN when
# missing or non-numeric) and Name (str).
#
# NOTE(review): the file is assumed to hold unquoted "id,year,title" lines in which the
# title itself may contain commas - confirm against the dataset. A fixed three-column
# read_csv cannot represent such rows, so every line is split on its first two commas
# only and the remainder is kept verbatim as the title.
movie_rows = []
with open('/kaggle/input/netflix-prize-data/movie_titles.csv', encoding='ISO-8859-1') as titles_file:
    for raw_line in titles_file:
        stripped = raw_line.rstrip('\r\n')
        if stripped:
            movie_rows.append(stripped.split(',', 2))

movie = pd.DataFrame(movie_rows, columns=['Movie_Id', 'Year', 'Name'])
movie['Movie_Id'] = movie['Movie_Id'].astype(int)
# Non-numeric year markers become NaN, which keeps the column numeric.
movie['Year'] = pd.to_numeric(movie['Year'], errors='coerce')

movie.head(10)

In [None]:
movie.sample(5)

In [None]:
movie.shape

In [None]:
# One-column boolean frame: True where the Rating of a row in `rating` is missing.
df_nan = rating['Rating'].isna().to_frame()

df_nan.head()

In [None]:
# df1: boolean Series flagging missing ratings; df2: the same as a one-column frame;
# df3: only the flagged rows (their index labels are the row labels in `rating`).
df1 = rating['Rating'].isna()
df2 = df1.to_frame()
df3 = df2.loc[df2['Rating']]
df3

In [None]:
# Move the row labels of the flagged rows into a regular `index` column and keep an
# independent copy of the result as `df_nan`.
df3 = df3.reset_index(drop=False)
df_nan = df3.copy(deep=True)
df_nan.head()

In [None]:
# Build `movie_np`: one movie id for every real rating row of `rating` (the NaN-rated
# header rows are excluded), in file order.
#
# The previous version numbered the movies 1, 2, 3, ... in order of appearance. That is
# only right for a file whose first movie has id 1; this notebook loads
# combined_data_4.txt, so the sequential numbers do not match the ids used by the titles
# table and every title joined downstream would be wrong. The real id is taken from the
# header row itself instead.
# NOTE(review): header rows are assumed to look like "<movie id>:" in the Cust_Id column
# - confirm against the data file.
header_mask = rating['Rating'].isna().to_numpy()
if not header_mask.any():
    # Happens when this cell is re-run after the header rows were already dropped.
    raise ValueError("no movie header rows found in `rating`; re-run the loading cells first")

header_pos = np.flatnonzero(header_mask)
header_ids = (
    rating.loc[header_mask, 'Cust_Id']
    .astype(str)
    .str.rstrip(':')
    .astype(int)
    .to_numpy()
)

# Number of rating rows that follow each header, up to the next header or the end of
# the frame (this also accounts for the last movie in the frame).
ratings_per_movie = np.diff(np.append(header_pos, len(rating))) - 1

# Vectorised replacement for growing an array with np.append inside a loop.
movie_np = np.repeat(header_ids, ratings_per_movie)

# Kept for backward compatibility: id of the last movie seen.
movie_id = int(header_ids[-1])

In [None]:
# Drop the header rows (Rating is NaN) and attach the per-row movie ids.
# .copy() makes `rating` an independent frame, so the column assignments below do not
# write into a slice of the original (pandas' SettingWithCopyWarning).
rating = rating[pd.notnull(rating['Rating'])].copy()

# Fail with a clear message if the id array and the frame are out of sync, e.g. because
# cells were run out of order.
assert len(movie_np) == len(rating), (
    "movie_np has %d entries but rating has %d rows; re-run from the loading cell"
    % (len(movie_np), len(rating))
)

rating['Movie_Id'] = movie_np.astype(int)
rating['Cust_Id'] = rating['Cust_Id'].astype(int)
print('-Dataset examples-')
rating.head()

In [None]:
# Count of rows per rating value; used to see how rare a rating of 1 is.
# The Series is passed as `x=` explicitly: current seaborn no longer accepts the
# positional form as the x variable.
sns.countplot(x=rating['Rating']);

In [None]:
# Share of each rating value as a percentage of all rows in `rating`; if ratings of 1
# are a very small share they can be ignored.
rating['Rating'].value_counts().div(len(rating)).mul(100)

In [None]:
# Discard every row whose rating is exactly 1, then show the rating values that remain.
not_one = rating['Rating'].ne(1)
rating = rating.loc[not_one]
rating['Rating'].unique()

In [None]:
# Reshape the long `rating` table into a matrix with one row per Movie_Id, one column
# per Cust_Id and the Rating as cell value; (movie, customer) pairs without a rating
# become NaN. pivot raises if the same (Movie_Id, Cust_Id) pair occurs more than once.
final_dataset = rating.pivot(index='Movie_Id',columns='Cust_Id',values='Rating')
# Show five randomly chosen movie rows.
final_dataset.sample(5)

In [None]:
final_dataset.shape

Here, we can see that userId 2649426 has watched movieId 17 and rated it 4.0, but has not rated the other movies. This interpretation is harder to extract from the original long-format dataframe. Therefore, to make things easier to understand and work with, we use a dataframe in which each column represents a unique userId and each row represents a unique movieId.

In [None]:
# Keep only the customers (columns) that have a rating for at least 30% of the movies
# (rows); all other columns are dropped.
thresh = 0.3 * len(final_dataset)
final_dataset = final_dataset.dropna(axis='columns', thresh=thresh)
final_dataset.shape

Let’s fix this and impute NaN with 0 to make things understandable for the algorithm and also making the data more eye-soothing.

In [None]:
# Replace every missing rating with 0 so the matrix holds numbers only.
final_dataset = final_dataset.fillna(0)
final_dataset.head()

## Removing Noise from the data
In the real-world, ratings are very sparse and data points are mostly collected from very popular movies and highly engaged users. 
We wouldn’t want movies that were rated by a small number of users because it’s not credible enough. Similarly, 
users who have rated only a handful of movies should also not be taken into account.

So with all that taken into account and some trial and error experimentations,  we will reduce the noise by adding some filters for the final dataset.



# Let’s visualize what these filters look like
Aggregating the number of users who voted and the number of movies that were voted.

In [None]:
# no_user_voted: number of ratings each movie received.
# no_movies_voted: number of ratings each customer gave.
ratings_by_movie = rating.groupby('Movie_Id')['Rating']
ratings_by_customer = rating.groupby('Cust_Id')['Rating']
no_user_voted = ratings_by_movie.count()
no_movies_voted = ratings_by_customer.count()

In [None]:
# There is a huge difference between the mean and the median of the number of users who voted, so we can remove outliers
no_user_voted.describe()

In [None]:
# Density-normalised histogram with a KDE curve of the number of ratings per movie.
# sns.distplot is deprecated; histplot with kde=True and stat='density' is its
# replacement for this kind of plot.
sns.histplot(no_user_voted, kde=True, stat='density');

In [None]:
no_movies_voted.describe()

In [None]:
# Same view for the number of ratings given per customer.
# sns.distplot is deprecated; histplot with kde=True and stat='density' is its
# replacement for this kind of plot.
sns.histplot(no_movies_voted, kde=True, stat='density');

In [None]:
# Number of ratings per movie, with a red reference line at the 800-ratings threshold.
f, ax = plt.subplots(1, 1, figsize=(16, 4))
ax.scatter(no_user_voted.index, no_user_voted, color='mediumseagreen')
ax.axhline(y=800, color='r')
ax.set_xlabel('MovieId')
ax.set_ylabel('No. of users voted')
plt.show()

In [None]:
# final_dataset = final_dataset.loc[no_user_voted[no_user_voted >200].index,:]
# final_dataset.shape

In [None]:
# Number of ratings per customer, with a red reference line at 10 ratings.
f, ax = plt.subplots(1, 1, figsize=(16, 4))
ax.scatter(no_movies_voted.index, no_movies_voted, color='mediumseagreen')
ax.axhline(y=10, color='r')
ax.set_xlabel('CustId')
ax.set_ylabel('No. of votes by user')
plt.show()

In [None]:
# final_dataset=final_dataset.loc[:,no_movies_voted[no_movies_voted > 10].index]
# final_dataset


## Removing sparsity
- Feature engineering tech

Our final_dataset has dimensions of 75 * 5903, and most of its values are zero (i.e. it is sparse). We are using only a small dataset, but for the original large movie dataset, which has more than 100000 features, our system may run out of computational resources when that is fed to the model. To store such sparse data efficiently we use the csr_matrix function from the scipy library, which keeps only the non-zero entries.

In [None]:
# An example of how it works :
sample = np.array([[0,0,3,0,0],[4,0,0,0,2],[0,0,0,0,1]])
sample

In [None]:
# it will give count of non zero value
np.count_nonzero(sample)

In [None]:
# total number of values
sample.size

In [None]:
# Sparsity = share of zero entries = 1 - (non-zero entries / all entries).
nonzero_share = np.count_nonzero(sample) / sample.size
sparsity = 1.0 - nonzero_share
print(sparsity)

In [None]:
csr_sample = csr_matrix(sample)
print(csr_sample)

As you can see, csr_sample stores no zero values: each non-zero value is listed together with its row and column index. For example, for the 0th row and 2nd column, the value is 3.

In [None]:
# Build the sparse rating matrix and expose Movie_Id as a regular column of
# final_dataset, so that row i of csr_data corresponds to final_dataset.iloc[i].
#
# The cell is written to be safe to re-run: once Movie_Id has become a column it is
# excluded from the matrix (otherwise the ids would be fed to the model as if they were
# ratings) and the index is not reset a second time.
ratings_only = final_dataset.drop(columns='Movie_Id', errors='ignore')
csr_data = csr_matrix(ratings_only.values)
if 'Movie_Id' not in final_dataset.columns:
    final_dataset = final_dataset.reset_index()

## Making the movie recommendation system model
We will be using the KNN algorithm to compute similarity with cosine distance metric which is very fast and more preferable than pearson coefficient.

In [None]:
# Brute-force nearest-neighbour search over the movie rows using cosine distance;
# n_jobs=-1 lets scikit-learn use all available cores.
knn = NearestNeighbors(
    n_neighbors=15,
    metric='cosine',
    algorithm='brute',
    n_jobs=-1,
)
knn.fit(csr_data)

## Making the recommendation function
The working principle is very simple. We first check if the movie name input is in the database, and if it is, we use our recommendation system to find similar movies, sort them based on their similarity distance, and output only the top 6 movies (`n_movies_to_reccomend`) with their distances from the input movie.

## Recommendation by movie name

In [None]:
def movie_recommendation(movie_name, n_movies_to_reccomend=6):
    """Recommend movies whose rating vectors are closest to those of ``movie_name``.

    Uses the notebook globals ``movie`` (titles), ``final_dataset`` (rating matrix with
    a ``Movie_Id`` column), ``csr_data`` (sparse matrix whose rows line up with
    ``final_dataset``) and the fitted ``knn`` model.

    Parameters
    ----------
    movie_name : str
        Title compared with ``==`` against ``movie['Name']``; when several rows match,
        the first one is used.
    n_movies_to_reccomend : int, default 6
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        A frame indexed ``1..n_movies_to_reccomend`` with columns ``Name`` and
        ``Distance``, listed from the largest distance to the smallest, or the string
        ``"No movies found try another movie"`` when the title is unknown or the movie
        has no row in ``final_dataset``.
    """
    not_found = "No movies found try another movie"

    movie_list = movie[movie['Name'] == movie_name]
    print(movie_list)
    if movie_list.empty:
        return not_found

    movie_id = movie_list.iloc[0]['Movie_Id']
    # Positional row(s) of this movie in the matrix. A title can exist in `movie`
    # without having a row in `final_dataset`; querying knn with an empty selection
    # would raise, so that case is reported as "not found" as well.
    row_positions = np.flatnonzero((final_dataset['Movie_Id'] == movie_id).to_numpy())
    if row_positions.size == 0:
        return not_found

    # One extra neighbour is requested because the closest hit is dropped below
    # (it is expected to be the query movie itself).
    distances, indices = knn.kneighbors(
        csr_data[row_positions[:1]], n_neighbors=n_movies_to_reccomend + 1
    )
    ranked = sorted(
        zip(indices.ravel().tolist(), distances.ravel().tolist()),
        key=lambda pair: pair[1],
    )
    # Drop the closest entry and list the remaining ones farthest-first.
    neighbours = ranked[:0:-1]

    recommend_frame = []
    for row_pos, distance in neighbours:
        neighbour_id = final_dataset['Movie_Id'].iloc[row_pos]
        # Boolean-mask lookup: does not rely on `movie` having a default integer index.
        name = movie.loc[movie['Movie_Id'] == neighbour_id, 'Name'].values[0]
        recommend_frame.append({'Name': name, 'Distance': distance})

    return pd.DataFrame(recommend_frame, index=range(1, n_movies_to_reccomend + 1))

In [None]:
movie_recommendation('Dinosaur Planet')

In [None]:
movie_recommendation('DDLJ')

In [None]:
movie_recommendation('Sick')

In [None]:
movie.head()

In [None]:
rating.head()

In [None]:
rating['Rating'].unique()

## Recommendation by Cust_id

In [None]:
def movie_recommendation(Cust_Id, n_movies_to_reccomend=6):
    """Recommend movies similar to the first movie rated by customer ``Cust_Id``.

    Uses the notebook globals ``rating``, ``movie``, ``final_dataset``, ``csr_data``
    and the fitted ``knn`` model. Note that this definition reuses the name of the
    title-based ``movie_recommendation`` defined earlier in the notebook and replaces
    it once this cell has run.

    Parameters
    ----------
    Cust_Id : int
        Customer id looked up in ``rating['Cust_Id']``. The seed movie is the
        ``Movie_Id`` of that customer's first row in ``rating``. (The previous version
        ignored this argument and always used customer 2385003.)
    n_movies_to_reccomend : int, default 6
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        A frame indexed ``1..n_movies_to_reccomend`` with columns ``Name`` and
        ``Distance``, listed from the largest distance to the smallest, or the string
        ``"No movies found try another movie"`` when the customer has no ratings, the
        seed movie has no title, or it has no row in ``final_dataset``.
    """
    not_found = "No movies found try another movie"

    rated_ids = rating.loc[rating['Cust_Id'] == Cust_Id, 'Movie_Id']
    if rated_ids.empty:
        # Unknown customer: nothing to seed the search with.
        return not_found
    seed_id = rated_ids.iloc[0]

    movie_list = movie.loc[movie['Movie_Id'] == seed_id, 'Name']
    print(movie_list)
    if movie_list.empty:
        return not_found

    # Positional row(s) of the seed movie in the matrix; querying knn with an empty
    # selection would raise, so a missing row is reported as "not found".
    row_positions = np.flatnonzero((final_dataset['Movie_Id'] == seed_id).to_numpy())
    if row_positions.size == 0:
        return not_found

    # One extra neighbour is requested because the closest hit is dropped below
    # (it is expected to be the seed movie itself).
    distances, indices = knn.kneighbors(
        csr_data[row_positions[:1]], n_neighbors=n_movies_to_reccomend + 1
    )
    ranked = sorted(
        zip(indices.ravel().tolist(), distances.ravel().tolist()),
        key=lambda pair: pair[1],
    )
    # Drop the closest entry and list the remaining ones farthest-first.
    neighbours = ranked[:0:-1]

    recommend_frame = []
    for row_pos, distance in neighbours:
        neighbour_id = final_dataset['Movie_Id'].iloc[row_pos]
        # Boolean-mask lookup: does not rely on `movie` having a default integer index.
        name = movie.loc[movie['Movie_Id'] == neighbour_id, 'Name'].values[0]
        recommend_frame.append({'Name': name, 'Distance': distance})

    return pd.DataFrame(recommend_frame, index=range(1, n_movies_to_reccomend + 1))
    
# Example call: recommendations seeded from a movie rated by customer 2385003.
movie_recommendation(2385003)

## Decision Tree

In [None]:
# Inner join of titles and ratings on Movie_Id: one row per rating, with the movie's
# Year and Name attached; ratings without a matching title are dropped.
final_df = movie.merge(rating, how='inner', on='Movie_Id')
final_df.shape

In [None]:
final_df.isnull().sum()

In [None]:
final_df = final_df.dropna()
final_df.shape

In [None]:
final_df.head()

In [None]:
final_df.groupby('Name')['Rating'].mean().sort_values(ascending=False).head()

In [None]:
final_df.groupby('Name')['Rating'].count().sort_values(ascending=False).head()

In [None]:
# Mean rating per movie title, as a one-column frame indexed by Name.
mean_rating_by_name = final_df.groupby('Name')['Rating'].mean()
rating_df = mean_rating_by_name.to_frame()
rating_df

In [None]:
# Add the number of ratings each title received (aligned on the Name index).
ratings_per_name = final_df.groupby('Name')['Rating'].agg('count')
rating_df['no_of_rating'] = ratings_per_name
rating_df

In [None]:
#
plt.figure(figsize=(10,4))
rating_df['no_of_rating'].hist(bins=70);

In [None]:
# From the plot below we can see that most movies have a mean rating between 3 and 4, and the distribution looks roughly normal
plt.figure(figsize=(10,4))
rating_df['Rating'].hist(bins=70);

In [None]:
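# Here we can see that the number of ratings has huge outliers, so we will filter on the number of ratings (this note originally said "near 25000"; the next cell keeps titles with fewer than 1000 ratings)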
sns.jointplot(x='Rating',y='no_of_rating',data=rating_df,alpha=0.5);

In [None]:
# Keep only titles with fewer than 1000 ratings and redraw the joint plot for them.
few_ratings = rating_df['no_of_rating'].lt(1000)
filter_df = rating_df.loc[few_ratings]
sns.jointplot(data=filter_df, x='Rating', y='no_of_rating', alpha=0.5);

In [None]:
filter_df.shape

In [None]:
final_df.groupby('Cust_Id')['Rating'].mean().sort_values(ascending=False).head()

In [None]:
final_df.groupby('Cust_Id')['Rating'].count().sort_values(ascending=False).head()

In [None]:
# Per-customer mean of Movie_Id, sorted ascending, as a one-column frame indexed by
# Cust_Id.
# NOTE(review): averaging movie ids looks unintended; the mean *Rating* per customer
# may have been meant. Confirm before relying on the `Movie_Id` column of last_df.
mean_movie_id = final_df.groupby('Cust_Id')['Movie_Id'].mean()
last_df = mean_movie_id.sort_values().to_frame()
last_df

In [None]:
# Number of rating rows per customer. Column assignment aligns on the Cust_Id index,
# so the order of the counts does not matter here.
ratings_per_customer = final_df.groupby('Cust_Id')['Movie_Id'].count()
last_df['No_of_rating'] = ratings_per_customer
last_df

In [None]:
plt.figure(figsize=(10,4))
last_df['No_of_rating'].hist(bins=70);

In [None]:
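# NOTE: the histogram below shows the `Movie_Id` column of last_df (each customer's mean Movie_Id), not rating values, so the original reading ("most ratings given by users are 4, then 5, then 3, and 2 can be treated as an outlier") does not apply to this column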
plt.figure(figsize=(10,4))
last_df['Movie_Id'].hist(bins=70);

In [None]:
sns.jointplot(x='Movie_Id',y='No_of_rating',data=last_df,alpha=0.5);

In [None]:
# Keep customers with fewer than 150 ratings and turn Cust_Id back into a regular
# column. (This comment used to say 100; the threshold in the code is 150.)
# reset_index() is chained rather than applied in place, so the filtered slice of
# last_df is never modified in place.
last_filter_df = last_df[last_df['No_of_rating'] < 150].reset_index()
last_filter_df.head()

In [None]:
last_filter_df.shape

In [None]:
# Mean No_of_rating for every (Cust_Id, Movie_Id) pair of last_filter_df, as a
# one-column frame with a two-level index.
by_customer_and_movie = last_filter_df.groupby(['Cust_Id', 'Movie_Id'])
pivot = by_customer_and_movie[['No_of_rating']].mean()
pivot

In [None]:
# The ten rows of last_filter_df with the highest No_of_rating.
by_count_desc = last_filter_df.sort_values(by='No_of_rating', ascending=False)
sorted_df = by_count_desc.iloc[:10]
sorted_df

In [None]:
## Look up two hard-coded ids, 305344 and 303948, in `pivot`.
## `pivot` is indexed by (Cust_Id, Movie_Id), and .loc with a single value selects on the
## first level, so these are customer ids rather than movie ids.
## NOTE(review): presumably taken from the top-10 table in `sorted_df`; .loc raises
## KeyError if either id is absent from `pivot` - confirm they are still present.
no_of_rating_305344 = pivot.loc[305344]
no_of_rating_303948 = pivot.loc[303948]