## Load Libraries

In [None]:
from math import sqrt

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.neighbors import KNeighborsClassifier

## Load Files

In [None]:
# Read the tags, movies, ratings and links tables (in that order) from the
# input directory into one DataFrame each.
df_tags, df_movies, df_ratings, df_links = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tags.csv',
        '../input/movies.csv',
        '../input/ratings.csv',
        '../input/links.csv',
    )
)

## Exploratory Data Analysis (EDA)

In [None]:
# File structure: preview the first five rows of the movies table.
df_movies.head(5)

### Most popular genres among released movies

In [None]:
# Count how many movies carry each genre label. A movie's `genres` field is a
# "|"-separated string, so one movie can contribute to several genres.
generlist = df_movies['genres'].apply(lambda generlist_movie: str(generlist_movie).split("|"))
geners_count = {}
for generlist_movie in generlist:
    for gener in generlist_movie:
        geners_count[gener] = geners_count.get(gener, 0) + 1

# Drop the "no genre" placeholders. Passing a default to pop() keeps this cell
# re-runnable: a later cell replaces "(no genres listed)" with "" in
# df_movies['genres'], after which the original key no longer exists and a
# bare pop() would raise KeyError.
for placeholder in ("(no genres listed)", ""):
    geners_count.pop(placeholder, None)

fig, ax = plt.subplots(figsize=(20, 7))
ax.bar(list(geners_count.keys()), list(geners_count.values()), color='m')
ax.set_xlabel("Genre")
ax.set_ylabel("Number of movies")
ax.set_title("Number of movies per genre");

In [None]:
# Preview the first five rows of the ratings table.
df_ratings.head(5)

### Distribution of user ratings

In [None]:
# sns.distplot is deprecated; histplot with a KDE overlay and density
# normalisation is its documented replacement for this kind of plot.
ax = sns.histplot(df_ratings["rating"], kde=True, stat="density")
ax.set_title("Distribution of user ratings");

In [None]:
# Preview the first five rows of the tags table.
df_tags.head(5)

In [None]:
# Preview the first five rows of the links table.
df_links.head(5)

In [None]:
# Report the (rows, columns) shape of each loaded DataFrame.
print(
    "Shape of frames: \n"
    f" Rating DataFrame{df_ratings.shape}\n"
    f" Movies DataFrame{df_movies.shape}\n"
    f" Tags DataFrame{df_tags.shape}\n"
    f" Links DataFrame{df_links.shape}"
)

# Content based filtering
#### We treat genres as the key signal: a user is recommended movies whose genres are similar to those of movies they have already watched.

In [None]:
# Took help of datacamp course to write code, understand and build this model.
# Define a TF-IDF vectorizer. The token pattern keeps hyphenated tokens together
# (presumably genre names such as "Sci-Fi"). It is a raw string so that "\-" is
# passed to the regex engine instead of being an invalid Python string escape.
tfidf_movies_genres = TfidfVectorizer(token_pattern=r'[a-zA-Z0-9\-]+')

# Replace missing values and the "(no genres listed)" placeholder with an empty
# string so such movies contribute no genre tokens.
# NOTE: this overwrites df_movies['genres'] in place.
df_movies['genres'] = (
    df_movies['genres']
    .fillna("")
    .replace(to_replace="(no genres listed)", value="")
)

# Construct the TF-IDF matrix by fitting and transforming the genre strings
# (one row per movie, one column per genre token).
tfidf_movies_genres_matrix = tfidf_movies_genres.fit_transform(df_movies['genres'])

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. tolist() prints plain Python strings.
print(tfidf_movies_genres.get_feature_names_out().tolist())
print(tfidf_movies_genres_matrix.shape)
print(tfidf_movies_genres_matrix.dtype)

# Compute the pairwise similarity matrix. TfidfVectorizer L2-normalises rows by
# default, so the plain dot product (linear_kernel) equals cosine similarity.
cosine_sim_movies = linear_kernel(tfidf_movies_genres_matrix, tfidf_movies_genres_matrix)

In [None]:
def get_recommendations_based_on_genres(movie_title, cosine_sim_movies=cosine_sim_movies, top_n=2):
    """
    Recommend the movies whose genres are most similar to those of a given movie.

    :param movie_title: exact title of the movie used as the basis of the recommendation
    :param cosine_sim_movies: square matrix of pairwise similarities between movies, where
        row/column i corresponds to the i-th row of df_movies
    :param top_n: number of titles to return (default 2, the number the previous
        implementation returned)
    :return: Series of recommended titles, most similar first, carrying the df_movies index
    :raises IndexError: if no movie in df_movies has the given title
    """
    # Row position(s) of the matching title. Positions (not index labels) are what
    # address rows of the similarity matrix, so this does not rely on df_movies
    # having a default 0..n-1 index.
    matches = np.flatnonzero((df_movies['title'] == movie_title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"No movie titled {movie_title!r} found in df_movies")
    query_pos = matches[0]  # if the title is duplicated, use the first occurrence

    # Similarity of every movie to the query movie.
    scores = np.asarray(cosine_sim_movies[query_pos]).ravel()

    # Stable descending sort: movies with equal scores keep ascending row order.
    ranked = np.argsort(-scores, kind="stable")

    # Exclude the query movie explicitly. Skipping "the first result" is not
    # enough: many movies can tie with the query at the maximum score, so the
    # query is not guaranteed to sort first and could be recommended back.
    ranked = ranked[ranked != query_pos]

    # Return the top_n most similar movies.
    return df_movies['title'].iloc[ranked[:top_n]]

# print(get_recommendations_based_on_genres('Toy Story (1995)'))

# print(get_recommendations_based_on_genres('Toy Story (1995)'))

# KNN

In [None]:
# Inspect the container type that fit_transform returned for the TF-IDF matrix.
print(type(tfidf_movies_genres_matrix))

In [None]:
def get_movie_label(movie_id):
    """
    Predict the genre label of one or more movies with a 5-nearest-neighbour classifier.

    The classifier is fitted on every row of the TF-IDF genre matrix, using the
    movie's full `genres` string as the class label.

    :param movie_id: row position(s) into tfidf_movies_genres_matrix (despite the
        name, this is used as a matrix row selector, not looked up in the
        `movieId` column)
    :return: array of predicted `genres` strings, one per selected row
    """
    classifier = KNeighborsClassifier(n_neighbors=5)
    # Select the labels by column name rather than by "last column" so the
    # result does not depend on the column order of df_movies.
    # NOTE(review): assumes `genres` was the last column the original code relied on.
    classifier.fit(tfidf_movies_genres_matrix, df_movies['genres'])
    return classifier.predict(tfidf_movies_genres_matrix[movie_id])

In [None]:
# Evaluate on the first 50 movies only, to keep the run short.
temp_df_movies = df_movies.iloc[0:50]


def evaluate_content_based_model(movies=None, verbose=True):
    """
    Check how often the KNN-predicted genres of recommended movies match the query movie.

    For each movie in `movies`, fetch the genre-based recommendations, predict a
    genre label for each recommended movie with get_movie_label, and compare
    that label to the query movie's own `genres` string.

    :param movies: DataFrame with `title` and `genres` columns to evaluate on;
        defaults to temp_df_movies
    :param verbose: when True, print the expected and predicted genres of every mismatch
    :return: tuple (matches, mismatches)
    """
    if movies is None:
        movies = temp_df_movies

    # Local counters instead of module-level globals, so calling the function
    # more than once does not accumulate counts from earlier calls.
    matches = 0
    mismatches = 0
    for _, row in movies.iterrows():
        recommended = get_recommendations_based_on_genres(row["title"])
        for predicted_genre in get_movie_label(recommended.index):
            if predicted_genre == row["genres"]:
                matches += 1
            else:
                if verbose:
                    print(row["genres"])
                    print(predicted_genre)
                mismatches += 1
    return matches, mismatches


true_count, false_count = evaluate_content_based_model()
print(true_count)
print(false_count)

In [None]:
# Show the row(s) of the movies table whose movieId equals 6.
df_movies[df_movies['movieId']== 6]

In [None]:

# Number of rating rows recorded for userId 2.
len(df_ratings[df_ratings["userId"]==2])

In [None]:
def get_recommendation(userId):
    """
    Collect genre-based recommendations for every movie a user has rated.

    :param userId: value of the `userId` column in df_ratings identifying the user
    :return: list of recommended movie titles, in the order of the user's ratings;
        titles may repeat when several rated movies lead to the same recommendation
    """
    # movieIds the user has rated, in rating order.
    rated_movie_ids = df_ratings.loc[df_ratings["userId"] == userId, "movieId"]

    # movieId -> title lookup built once, instead of scanning df_movies for
    # every rating row. Duplicated movieIds keep their first title.
    title_by_movie_id = df_movies.drop_duplicates("movieId").set_index("movieId")["title"]

    # Rated ids that are absent from df_movies come back as NaN and are skipped.
    rated_titles = title_by_movie_id.reindex(rated_movie_ids).dropna()

    recommended_movie_list = []
    for title in rated_titles:
        # Series.iteritems() was removed in pandas 2.0; tolist() yields the
        # recommended titles directly.
        recommended_movie_list.extend(get_recommendations_based_on_genres(title).tolist())
    return recommended_movie_list


# Compute once and report both the number of distinct recommendations and the total.
recommendations_user_1 = get_recommendation(1)
print(len(set(recommendations_user_1)))
print(len(recommendations_user_1))
        

In [None]:

# plt.figure(figsize=(20,7))
# movie_generlist = []
# for key, row in df_ratings.iterrows():
#     movie_generlist.append(df_movies["genres"][row["movieId"]==df_movies["movieId"]])


# movie_generlist = pd.Series(movie_generlist)
# print(movie_generlist)
# # generlist = movie_generlist.apply(lambda generlist_movie : str(generlist_movie).split("|"))
# # geners_count = {}

# # for generlist_movie in generlist:
# #     for gener in generlist_movie:
# #         if(geners_count.get(gener,False)):
# #             geners_count[gener]=geners_count[gener]+1
# #         else:
# #             geners_count[gener] = 1       
# # geners_count.pop("(no genres listed)")
# # plt.bar(geners_count.keys(),geners_count.values(),color='m')
# # print(generlist)