In [1]:
import pandas as pd
import numpy as np
from numpy import linalg as LA
from matplotlib import pyplot as plt
import time
import datetime
# Read the four CSV tables from the working directory into DataFrames
# (file names match the MovieLens layout -- confirm against the data source).
df_links, df_movies, df_ratings, df_tags = map(
    pd.read_csv,
    ("links.csv", "movies.csv", "ratings.csv", "tags.csv"),
)

In [2]:
def get_similarity_matrix(df_movies):
    """Build a movie-by-movie cosine similarity matrix from genre labels.

    Each movie is represented by a 0/1 vector with one entry per distinct
    genre token found in ``df_movies["genres"]`` (tokens are separated by
    ``"|"``). The vectors are scaled to unit length, and the similarity of two
    movies is the dot product of their unit vectors.

    Parameters
    ----------
    df_movies : pandas.DataFrame
        Must contain a ``"movieId"`` column and a ``"genres"`` column holding
        ``"|"``-separated genre strings.

    Returns
    -------
    pandas.DataFrame
        Square float frame whose index and columns are both the values of
        ``df_movies["movieId"]`` (axis name ``"movieId"``), in input order.
        A movie with no genre tokens gets an all-zero vector and therefore a
        similarity of 0 to every movie, itself included.
    """
    # One 0/1 column per genre token. Splitting on "|" matches whole tokens,
    # so a genre that is a substring of another is not counted by mistake,
    # and the matrix is built in one vectorised step instead of per-cell
    # chained assignment (which is a no-op under pandas Copy-on-Write).
    genre_profile = (
        df_movies["genres"].str.get_dummies(sep="|").to_numpy(dtype=float)
    )

    row_norms = np.linalg.norm(genre_profile, axis=1, keepdims=True)
    # Avoid 0/0 -> NaN for movies without any genre token.
    row_norms[row_norms == 0] = 1.0
    unit_profile = genre_profile / row_norms

    movie_ids = pd.Index(df_movies["movieId"], name="movieId")
    return pd.DataFrame(
        unit_profile @ unit_profile.T, index=movie_ids, columns=movie_ids
    )

In [88]:
def getRequirement(df_ratings,df_movies, min_rating=0, from_date="1996-01-01", end_date="2018-12-31", min_total_rating=0):
    """Summarise ratings per movie within a date window and apply thresholds.

    Parameters
    ----------
    df_ratings : pandas.DataFrame
        Needs ``"userId"``, ``"movieId"``, ``"rating"`` and ``"timestamp"``
        columns. ``timestamp`` is compared against UTC epoch seconds
        (MovieLens documents its timestamps that way -- confirm if another
        data source is used).
    df_movies : pandas.DataFrame
        Movie table with a ``"movieId"`` column; its columns are kept in the
        result.
    min_rating : float
        Keep movies whose mean rating in the window is at least this value.
    from_date, end_date : str
        ``"YYYY-MM-DD"`` bounds of the window. Both days are included in
        full, from 00:00:00 UTC on ``from_date`` up to (but excluding)
        00:00:00 UTC on the day after ``end_date``.
    min_total_rating : int
        Keep movies with at least this many ratings in the window.

    Returns
    -------
    pandas.DataFrame
        ``df_movies`` inner-joined with the per-movie summary, i.e. the
        movie columns followed by ``"total rating"`` (count) and
        ``"avg rating"`` (mean). Movies without a qualifying summary row are
        dropped.

    Raises
    ------
    ValueError
        If a date string does not match ``"%Y-%m-%d"``.
    """
    def _utc_day_start(date_text):
        # Interpret the calendar date in UTC so the window does not depend on
        # the timezone of the machine running the notebook.
        day = datetime.datetime.strptime(date_text, "%Y-%m-%d")
        return day.replace(tzinfo=datetime.timezone.utc).timestamp()

    from_timestamp = _utc_day_start(from_date)
    # Exclusive upper bound one day later, so ratings made during end_date
    # itself are kept.
    end_timestamp = _utc_day_start(end_date) + 24 * 60 * 60

    in_window = (df_ratings["timestamp"] >= from_timestamp) & (df_ratings["timestamp"] < end_timestamp)
    df_summary = (
        df_ratings.loc[in_window]
        .groupby("movieId")
        .agg({"userId": "count", "rating": "mean"})
        .reset_index()
        # Rename by name rather than by position.
        .rename(columns={"userId": "total rating", "rating": "avg rating"})
    )

    meets_thresholds = (df_summary["avg rating"] >= min_rating) & (df_summary["total rating"] >= min_total_rating)
    return pd.merge(df_movies, df_summary.loc[meets_thresholds], on="movieId")

In [89]:
def getkSimilar(df_similarity_matrix,df_summary,movieId=1,k=10,min_score=0,max_score=1):
    """Return the ``k`` movies from ``df_summary`` most similar to ``movieId``.

    Parameters
    ----------
    df_similarity_matrix : pandas.DataFrame
        Square similarity frame with movie ids as both index and columns.
    df_summary : pandas.DataFrame
        Candidate movies; needs ``"movieId"``, ``"avg rating"`` and
        ``"total rating"`` columns. Any other columns are carried through.
    movieId : int
        The reference movie. It is never part of the result.
    k : int
        Maximum number of rows returned.
    min_score, max_score : float
        Similarity window. The lower bound is exclusive (with the default of
        0, movies with zero similarity are dropped); the upper bound is
        inclusive, so the default of 1 keeps perfect matches.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df_summary`` plus ``"cos score"``, sorted by
        ``"cos score"``, then ``"avg rating"``, then ``"total rating"``, all
        descending, truncated to ``k`` rows.

    Raises
    ------
    KeyError
        If ``movieId`` is not a column of ``df_similarity_matrix``.
    """
    scores = (
        df_similarity_matrix[movieId]
        .astype(float)
        # Unit-vector dot products carry rounding noise (0.9999999999999998,
        # 1.0000000000000002). Rounding keeps the bound checks and the
        # tie-break on rating from depending on that noise.
        .round(12)
        .rename("cos score")
        .rename_axis("movieId")
        .reset_index()
    )
    # Name the score column explicitly instead of overwriting all column
    # names by position, so extra columns in df_summary are tolerated.
    candidates = pd.merge(df_summary, scores, on="movieId")

    keep = (
        (candidates["movieId"] != movieId)
        & (candidates["cos score"] > min_score)
        & (candidates["cos score"] <= max_score)
    )
    return (
        candidates.loc[keep]
        .sort_values(by=["cos score", "avg rating", "total rating"], ascending=False)
        .head(k)
    )

In [7]:
# Usage starts here: the cells above only load the data and define the helper functions.

In [None]:
# Building the similarity matrix may take a long time.
# It depends only on df_movies (it covers every movie's genres), so it only
# needs to be computed once and can be reused for every query below.
df_similarity_matrix = get_similarity_matrix(df_movies)

In [96]:
# Per-movie rating summary using getRequirement's default filters
# (min_rating=0, min_total_rating=0, 1996-01-01 to 2018-12-31).
df_summary = getRequirement(df_ratings,df_movies)
# Up to k=10 (the default) movies most similar in genre to movieId 3.
df_TopkSimilarMovie = getkSimilar(df_similarity_matrix,df_summary,movieId=3)
# Last expression of the cell, so the frame is displayed as the cell output.
# NOTE(review): the saved output below shows only well-rated movies with
# 12+ ratings -- it looks like it came from a run with stricter filters than
# the defaults used here; re-run to confirm.
df_TopkSimilarMovie

Unnamed: 0,movieId,title,genres,total rating,avg rating,cos score
49,951,His Girl Friday (1940),Comedy|Romance,14,4.392857,1.0
35,905,It Happened One Night (1934),Comedy|Romance,14,4.321429,1.0
199,4973,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ...",Comedy|Romance,120,4.183333,1.0
31,898,"Philadelphia Story, The (1940)",Comedy|Drama|Romance,29,4.310345,0.816497
94,1235,Harold and Maude (1971),Comedy|Drama|Romance,26,4.288462,0.816497
116,1277,Cyrano de Bergerac (1990),Comedy|Drama|Romance,12,4.125,0.816497
98,1244,Manhattan (1979),Comedy|Drama|Romance,33,4.106061,0.816497
32,899,Singin' in the Rain (1952),Comedy|Musical|Romance,47,4.074468,0.816497
100,1247,"Graduate, The (1967)",Comedy|Drama|Romance,79,4.063291,0.816497
41,916,Roman Holiday (1953),Comedy|Drama|Romance,26,4.057692,0.816497
