In [1]:
import pandas as pd
import numpy as np
from numpy import linalg as LA
from matplotlib import pyplot as plt
import time
import datetime

In [9]:
# Load the movie table and the ratings table from CSV files; the relative
# paths are resolved against the notebook's working directory.
df_movies = pd.read_csv('movies.csv')
df_ratings = pd.read_csv('ratings.csv')

In [11]:
# Preview the first rows of the movie table (movieId, title, genres).
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [2]:
def get_similarity_matrix(df_movies):
    """Build the movie-to-movie cosine similarity matrix from genre labels.

    Every movie is described by a binary genre vector (1.0 when the movie
    carries the genre, 0.0 otherwise). The vectors are scaled to unit length
    and multiplied together, so entry ``(a, b)`` of the result is the cosine
    of the angle between the genre vectors of movies ``a`` and ``b``.

    Parameters
    ----------
    df_movies : pandas.DataFrame
        Must provide a ``movieId`` column and a ``genres`` column whose values
        are ``|``-separated genre labels. The frame's own index is ignored.

    Returns
    -------
    pandas.DataFrame
        Square float matrix with values in ``[0, 1]``. Both the index and the
        columns hold the ``movieId`` values, in the row order of ``df_movies``,
        and both are named ``movieId``.
    """
    # One-hot encode on exact genre tokens. This replaces a cell-by-cell loop
    # that assigned through chained indexing and tested membership with a
    # substring check (which would treat "War" as present in "Warfare").
    genre_profile = (
        df_movies["genres"].str.get_dummies(sep="|").to_numpy(dtype=float)
    )

    row_norms = LA.norm(genre_profile, axis=1, keepdims=True)
    # A movie without any genre has an all-zero vector; dividing by 1 instead
    # of 0 leaves it at zero similarity to everything rather than NaN.
    row_norms[row_norms == 0] = 1.0
    unit_profile = genre_profile / row_norms

    # Rounding can push a perfect match marginally above 1.0; clip so that the
    # values stay inside the valid cosine range.
    similarity = np.clip(unit_profile @ unit_profile.T, 0.0, 1.0)

    movie_ids = pd.Index(df_movies["movieId"], name="movieId")
    return pd.DataFrame(similarity, index=movie_ids, columns=movie_ids)

In [3]:
def getRequirement(df_ratings,df_movies, min_rating=0, from_date="1996-01-01", end_date="2018-12-31", min_total_rating=0):
    """Summarise ratings per movie inside a date window and apply thresholds.

    Parameters
    ----------
    df_ratings : pandas.DataFrame
        Needs ``userId``, ``movieId``, ``rating`` and ``timestamp`` columns.
        ``timestamp`` is compared against Unix epoch seconds.
    df_movies : pandas.DataFrame
        Movie table with a ``movieId`` column; its columns are carried into
        the result.
    min_rating : float
        Keep only movies whose average rating in the window is at least this.
    from_date, end_date : str
        Window bounds in ``YYYY-MM-DD`` form, interpreted as UTC calendar
        days. Both days are included in full.
    min_total_rating : int
        Keep only movies with at least this many ratings in the window.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df_movies`` followed by ``total rating`` (number of
        ratings in the window) and ``avg rating`` (their mean), restricted to
        movies that pass both thresholds.

    Raises
    ------
    ValueError
        If a date string does not match ``YYYY-MM-DD``.
    """
    utc = datetime.timezone.utc
    # Epoch seconds are counted from a UTC origin, so the day boundaries are
    # built in UTC; converting through local time would shift the window by
    # the UTC offset of whichever machine runs the notebook.
    window_start = datetime.datetime.strptime(from_date, "%Y-%m-%d").replace(tzinfo=utc)
    # The window closes at the first instant of the day after end_date, so
    # ratings made during end_date itself are counted.
    window_end = (
        datetime.datetime.strptime(end_date, "%Y-%m-%d").replace(tzinfo=utc)
        + datetime.timedelta(days=1)
    )
    from_timestamp = window_start.timestamp()
    end_timestamp = window_end.timestamp()

    in_window = (df_ratings["timestamp"] >= from_timestamp) & (df_ratings["timestamp"] < end_timestamp)
    # Named aggregation ties each output column to its source explicitly
    # instead of relying on the positional order of a later rename.
    df_summary = (
        df_ratings.loc[in_window]
        .groupby("movieId")
        .agg(**{"total rating": ("userId", "count"), "avg rating": ("rating", "mean")})
        .reset_index()
    )

    passes_thresholds = (df_summary["avg rating"] >= min_rating) & (df_summary["total rating"] >= min_total_rating)
    df_summary = df_summary.loc[passes_thresholds]

    # Inner join: movies without a qualifying rating summary are dropped.
    return pd.merge(df_movies, df_summary, on="movieId")

In [4]:
def getkSimilar(df_similarity_matrix,df_summary,movieId=1,k=10,min_score=0,max_score=1):
    """Return the ``k`` movies most similar to ``movieId``.

    Parameters
    ----------
    df_similarity_matrix : pandas.DataFrame
        Square similarity matrix whose index and columns are movie ids.
    df_summary : pandas.DataFrame
        Candidate movies; needs ``movieId``, ``avg rating`` and
        ``total rating`` columns. All of its columns are kept in the result.
    movieId : int
        The movie to find neighbours for; it is never part of the result.
    k : int
        Maximum number of rows to return.
    min_score : float
        Exclusive lower bound on the similarity score.
    max_score : float
        Inclusive upper bound on the similarity score.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df_summary`` plus ``cos score``, ordered by
        ``cos score``, then ``avg rating``, then ``total rating``, all
        descending, and truncated to ``k`` rows.

    Raises
    ------
    KeyError
        If ``movieId`` is not a column of ``df_similarity_matrix``.
    """
    if movieId not in df_similarity_matrix.columns:
        raise KeyError("movieId {!r} is not present in the similarity matrix".format(movieId))

    # Turn the movie's column into a two-column frame (movieId, cos score) so
    # the join only appends the score, whatever columns df_summary carries.
    scores = (
        df_similarity_matrix[movieId]
        .rename("cos score")
        .rename_axis("movieId")
        .reset_index()
    )
    scores["cos score"] = scores["cos score"].astype(float)
    df_TopkSimilarMovie = pd.merge(df_summary, scores, on="movieId")

    # Cosine scores of identical genre sets land on 1.0 only up to rounding
    # (e.g. 0.9999999999999998 or 1.0000000000000002), so the upper bound is
    # inclusive with a small tolerance; a strict "<" keeps or drops perfect
    # matches depending on that noise.
    tolerance = 1e-9
    is_other_movie = df_TopkSimilarMovie["movieId"] != movieId
    in_score_range = (df_TopkSimilarMovie["cos score"] > min_score) & (
        df_TopkSimilarMovie["cos score"] <= max_score + tolerance
    )
    df_TopkSimilarMovie = (
        df_TopkSimilarMovie.loc[is_other_movie & in_score_range]
        .sort_values(by=["cos score", "avg rating", "total rating"], ascending=False)
        .head(k)
    )
    return df_TopkSimilarMovie

In [5]:
# User starts from here

In [6]:
# Building the similarity matrix may take a long time on the full movie table,
# because every movie's genre profile is compared with every other movie's.
# It depends only on df_movies, so it has to be built once and can then be
# reused for every query below.
df_similarity_matrix = get_similarity_matrix(df_movies)

In [7]:
# Display the movie-by-movie similarity matrix; the rendered output below is
# truncated with "..." because the frame is large.
df_similarity_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0.774597,0.316228,0.258199,0.447214,0,0.316228,0.632456,0,0.258199,...,0.447214,0.316228,0.316228,0.447214,0,0.67082,0.774597,0,0.316228,0.447214
2,0.774597,1,0,0,0,0,0,0.816497,0,0.333333,...,0,0,0,0,0,0.288675,0.333333,0,0,0
3,0.316228,0,1,0.816497,0.707107,0,1,0,0,0,...,0.353553,0,0.5,0,0,0.353553,0.408248,0,0,0.707107
4,0.258199,0,0.816497,1,0.57735,0,0.816497,0,0,0,...,0.288675,0.408248,0.816497,0,0,0.288675,0.333333,0.57735,0,0.57735
5,0.447214,0,0.707107,0.57735,1,0,0.707107,0,0,0,...,0.5,0,0.707107,0,0,0.5,0.57735,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.67082,0.288675,0.353553,0.288675,0.5,0.288675,0.353553,0,0.5,0.288675,...,0.75,0.353553,0.353553,0.5,0,1,0.866025,0,0.707107,0.5
193583,0.774597,0.333333,0.408248,0.333333,0.57735,0,0.408248,0,0,0,...,0.57735,0.408248,0.408248,0.57735,0,0.866025,1,0,0.408248,0.57735
193585,0,0,0,0.57735,0,0,0,0,0,0,...,0,0.707107,0.707107,0,0,0,0,1,0,0
193587,0.316228,0,0,0,0,0.408248,0,0,0.707107,0.408248,...,0.707107,0.5,0,0.707107,0,0.707107,0.408248,0,1,0


In [8]:
# Summarise the ratings with the default filters, then look up the ten movies
# whose genres are closest to those of movie 193587.
df_summary = getRequirement(df_ratings=df_ratings, df_movies=df_movies)
df_TopkSimilarMovie = getkSimilar(
    df_similarity_matrix=df_similarity_matrix,
    df_summary=df_summary,
    movieId=193587,
    k=10,
)
df_TopkSimilarMovie

Unnamed: 0,movieId,title,genres,total rating,avg rating,cos score
7878,95004,Superman/Doomsday (2007),Action|Animation,1,4.0,1.0
8062,99813,"Batman: The Dark Knight Returns, Part 2 (2013)",Action|Animation,8,3.875,1.0
7363,79274,Batman: Under the Red Hood (2010),Action|Animation,3,3.666667,1.0
5584,26913,Street Fighter II: The Animated Movie (Sutorît...,Action|Animation,1,1.5,1.0
8913,136297,Mortal Kombat: The Journey Begins (1995),Action|Animation,1,0.5,1.0
7885,95149,Superman/Batman: Public Enemies (2009),Action|Animation|Fantasy,1,5.0,0.816497
8130,102084,Justice League: Doom (2012),Action|Animation|Fantasy,1,5.0,0.816497
8030,98607,Redline (2009),Action|Animation|Sci-Fi,1,4.5,0.816497
7905,95475,Dragon Ball Z: Cooler's Revenge (Doragon bôru ...,Action|Adventure|Animation,2,4.0,0.816497
6908,64695,Sword of the Stranger (Sutorejia: Mukô hadan) ...,Action|Adventure|Animation,1,4.0,0.816497
