In [None]:
# These are the steps we are going to perform in building this model:
# 1. Obtain the data required to build the model
# 2. Create TF-IDF vectors for the genres of every movie
# 3. Compute the pairwise cosine similarity score of every movie
# 4. Write the recommender function that takes in a movie title as an argument and outputs the movies
#    most similar to it based on genre
# This recommender system takes a movie from the user and returns the movies whose genres are most similar to it.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the movie dataset from the CSV file next to the notebook,
# then preview the first two rows to check that it loaded as expected.
data = pd.read_csv('movie_data.csv')
data.head(n=2)

In [None]:
#Add the useful features into the dataframe

#data['overview'], data['id'] = data['tag'], data['movieId']



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing genres become empty strings so the vectorizer receives text for every row
data['genres'] = data['genres'].fillna('')

# TF-IDF vectorizer configured to discard English stop words
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the genres column and build the TF-IDF matrix in one step
tfidf_matrix = tfidf.fit_transform(data['genres'])

# The matrix has one row per movie and one column per distinct term that the
# vectorizer keeps from the genres column.
# (See the book "Recommendation system with python", page 109.)
# Illustrative layout (real cells hold TF-IDF weights, not just 0/1):
# movie_name    comedy thriller western action fantasy
#  AAA            0        0       1       0      1
#  BBB            0        1       0       1      0

# Show the shape of tfidf_matrix as the cell output
tfidf_matrix.shape

In [None]:
# Build the movie-to-movie similarity matrix: a square matrix whose number of
# rows and columns both equal the number of movies.
# For example, the entry at row 3, column 5 is the similarity between those two
# movies. Cosine similarity is bounded by -1 and 1 in general; because TF-IDF
# weights are non-negative, the scores here fall between 0 and 1.
# The larger the value, the more alike the two movies are in genre.

# linear_kernel computes plain dot products. TfidfVectorizer L2-normalises each
# row by default, so the dot product of two rows is their cosine similarity.
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity of every movie with every other movie
cosine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)


In [None]:
# Construct a reverse mapping from movie title to row index.
#
# Series.drop_duplicates() compares the Series *values* (here the row indices,
# which are already unique), so it never removes repeated titles. A repeated
# title would make indices[title] return a Series instead of a single index.
# Deduplicate on the title index instead, keeping the first occurrence of each
# title, so that every lookup yields exactly one row index.

indices = pd.Series(data.index, index=data['movie_title'])
indices = indices[~indices.index.duplicated(keep='first')]


In [None]:
# How the recommender works:
# 1. Take the title of the movie as an argument.
# 2. Look up the row position of the movie in the `indices` reverse mapping.
# 3. Take that movie's row of `cosine_sim` and turn it into a list of
#    (position, similarity score) tuples, leaving out the movie itself.
# 4. Sort this list of tuples by similarity score, highest first.
# 5. Keep the first `top_n` (default 10) tuples.
# 6. Return the titles found at those positions.

# Function that takes a movie title as input and gives recommendations

def content_recommender(title, cosine_sim=cosine_sim, data=data, indices=indices, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title of the movie to find recommendations for; it must be a key of
        ``indices``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the scores of movie ``i``
        against every movie.
    data : pandas.DataFrame
        Movie table with a ``'movie_title'`` column, in the same row order as
        ``cosine_sim``.
    indices : pandas.Series
        Mapping from movie title to row position.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of up to ``top_n`` movies, ordered from most to least similar.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Obtain the row position of the movie that matches the title
    idx = indices[title]

    # A title that occurs more than once in `indices` yields a Series of
    # positions; fall back to the first occurrence so `idx` is a single value.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every other movie's position with its similarity to this movie.
    # The movie itself is excluded by position rather than by dropping the
    # first sorted element: several movies can tie with the top score, so the
    # movie is not guaranteed to be sorted first.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Sort the movies by similarity score, most similar first (stable for ties)
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the positions of the `top_n` most similar movies
    movie_indices = [position for position, _ in sim_scores[:top_n]]

    # Return the titles of the most similar movies
    return data['movie_title'].iloc[movie_indices]


In [None]:
# The content-based recommender is now built.
# Ask it to recommend movies similar to 'Avatar'.

content_recommender(title='Avatar')