### How similar are the ratings of the movies with the most similar plots?

In [20]:
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Loading data
def load_pickle(path):
    """
    Read a single pickled object from disk.

    The file is opened in a ``with`` block so the handle is released even
    when unpickling raises.

    Args:
    path (str): path of the pickle file to read

    Return:
    The object stored in the file
    """
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file -- only point this at pickle files produced by a trusted pipeline.
    with open(path, 'rb') as pkl_file:
        return pickle.load(pkl_file)


col_movies = load_pickle('filtered_movies.pkl')
con_movies = load_pickle('filtered_movies_content.pkl')

col_decomposed = load_pickle('collaborative_decomposed_data.pkl')
con_decomposed = load_pickle('content_decomposed_data.pkl')

con_high = load_pickle('con_highests.pkl')


#### Finding the rating similarity of the movies with the most similar plots

In [28]:
class movie_similarity_lookup(object):
    """
    This class looks up the cosine similarity between the decomposed vectors of two movies.

    Args:
    dataframe (dataframe): a dataframe with the column 'movie_title' containing the movie titles
                           and indexed in the same order as the decomposed_data matrix
    decomposed_data (array): a numpy array with rows for movies and columns for user ratings after dimensionality reduction
    """
    def __init__(self, dataframe, decomposed_data):
        self.dataframe = dataframe
        self.decomposed_data = decomposed_data

    def _row_index(self, title):
        """
        Find the index label of the first row whose 'movie_title' equals title.

        The label is used directly as a row number into decomposed_data, so the
        dataframe index must line up with the matrix rows (see the class Args).

        Args:
        title (str): the movie title to look up

        Return:
        Index label of the first matching row

        Raises:
        IndexError: if no row of the dataframe carries that title
        """
        matches = self.dataframe[self.dataframe.movie_title == title].index
        if len(matches) == 0:
            # Same exception type the bare `.index[0]` would raise, but with
            # a message that names the offending title.
            raise IndexError("movie title {!r} not found in dataframe".format(title))
        return matches[0]

    def similarity(self, row):
        """
        This method finds the cosine similarity between the decomposed vectors of two movies.

        Args:
        row (pandas.Series): a pandas.Series containing values for title1 and title2

        Return:
        Cosine similarity as float, rounded to 3 decimal places

        Raises:
        IndexError: if title1 or title2 is not present in the dataframe
        """
        v1 = self.decomposed_data[self._row_index(row.title1), :].reshape(1, -1)
        v2 = self.decomposed_data[self._row_index(row.title2), :].reshape(1, -1)
        # cosine_similarity yields a pairwise matrix; for two single-row inputs
        # that is a 1x1 array, so unwrap it to return the documented plain float.
        return round(float(cosine_similarity(v1, v2)[0, 0]), 3)

In [29]:
# Similarity lookup over the collaborative (rating-based) decomposition.
m = movie_similarity_lookup(dataframe=col_movies, decomposed_data=col_decomposed)

In [23]:
# One row per movie pair: content similarity, the matching rating-based
# similarity, and the gap between the two (con_sim - col_sim).
df = (
    pd.DataFrame(con_high, columns=['con_sim', 'title1', 'title2'])
    .assign(col_sim=lambda pairs: pairs.apply(m.similarity, axis=1))
    .assign(diff=lambda pairs: pairs['con_sim'] - pairs['col_sim'])
)

### Movies that are very similar based on their plots but different based on their ratings - Remakes

In [37]:
# Largest positive gap first: plot similarity far above rating similarity.
df.sort_values(by='diff', ascending=False).iloc[:20]

Unnamed: 0,con_sim,title1,title2,col_sim,diff
4338,1.0,Curfew (2012),Before I Disappear (2014),0.107,0.893
2222,1.0,Before I Disappear (2014),Curfew (2012),0.107,0.893
2882,0.864,Excalibur (1981),King Arthur (2004),0.01,0.854
3871,0.864,King Arthur (2004),Excalibur (1981),0.01,0.854
506,0.844,The Talented Mr. Ripley (1999),Plein soleil (1960),0.002,0.842
3812,0.844,Plein soleil (1960),The Talented Mr. Ripley (1999),0.002,0.842
5732,0.824,American Pie (1999),Date and Switch (2014),-0.007,0.831
1891,0.824,Date and Switch (2014),American Pie (1999),-0.007,0.831
3657,0.816,Mascots (2016),Surf's Up (2007),0.004,0.812
5283,0.816,Surf's Up (2007),Mascots (2016),0.004,0.812


### Movies with very similar ratings but with different plots - Movie series

In [42]:
# Most negative gap first: rating similarity far above plot similarity.
df.sort_values(by='diff').iloc[:20]

Unnamed: 0,con_sim,title1,title2,col_sim,diff
1976,0.365,The Bourne Supremacy (2004),The Bourne Ultimatum (2007),0.876,-0.511
2549,0.365,The Bourne Ultimatum (2007),The Bourne Supremacy (2004),0.876,-0.511
2529,0.368,Rurôni Kenshin: Kyôto Taika-hen (2014),Rurouni Kenshin: The Legend Ends (2014),0.836,-0.468
5661,0.412,Lethal Weapon 2 (1989),Lethal Weapon 3 (1992),0.86,-0.448
3857,0.389,Indiana Jones and the Temple of Doom (1984),Indiana Jones and the Last Crusade (1989),0.815,-0.426
3038,0.418,Kill Bill: Vol. 2 (2004),Kill Bill: Vol. 1 (2003),0.831,-0.413
3977,0.519,The Lord of the Rings: The Fellowship of the R...,The Lord of the Rings: The Two Towers (2002),0.93,-0.411
576,0.541,The Lord of the Rings: The Two Towers (2002),The Lord of the Rings: The Return of the King ...,0.949,-0.408
575,0.541,The Lord of the Rings: The Return of the King ...,The Lord of the Rings: The Two Towers (2002),0.949,-0.408
849,0.413,Sherlock Holmes (2009),Sherlock Holmes: A Game of Shadows (2011),0.801,-0.388
