### After finding similar movies based on the ratings, I'm going to do the same but based on the plots of the movies

In [1]:
import omdb
import pandas as pd
import numpy as np
import scipy
import re
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Column names are supplied here; because `names` is given, the first line of
# the file is read as data rather than as a header.
cols = ['movie_id', 'movie_title', 'movie_genre']
movies = pd.read_csv(
    'latest/movies.dat',
    sep='::',          # multi-character separator, handled by the python engine
    names=cols,
    index_col=False,
    encoding="UTF-8",
    engine='python',
)

#### To be able to compare the results, the same movies are used as in the Collaborative notebook

In [3]:
# Loading the filtered dataframe.
# The context manager closes the file even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('double_filtered.pkl', 'rb') as pkl_file:
    double_filtered = pickle.load(pkl_file)

In [4]:
# Using only the movies used for the rating based similarity calculation:
# one row per movie_id found in double_filtered, with title and genre attached
# from `movies` (left join, so every such movie_id is kept).
filtered_movies = (
    double_filtered
    .groupby('movie_id')
    .count()
    .reset_index()[['movie_id']]
    .merge(movies, on='movie_id', how='left')
)

#### Retrieving the full plots from the IMDb database through the OMDb API

In [None]:
# Calling the API many times in a row tends to make it freeze, so the plots are
# fetched in batches of 100 and the dictionary is saved to disk after every batch.
# Progress saved by an earlier run is reloaded first, so this cell can be
# interrupted and re-run (also on a fresh kernel) without starting over.
try:
    with open('plot_dict_full.pkl', 'rb') as cached:
        plot_dict = pickle.load(cached)
except IOError:
    # No saved progress yet: start from an empty dictionary
    plot_dict = {}

# Only the movies that do not have an entry yet need an API call
missing_ids = [mid for mid in filtered_movies['movie_id'] if mid not in plot_dict]

for start in range(0, len(missing_ids), 100):
    for mid in missing_ids[start:start + 100]:
        if mid in plot_dict:
            # Already fetched earlier in this run (duplicate id in the list)
            continue
        try:
            # The id passed to the API is 'tt' followed by the movie id,
            # left-padded with zeros to 7 digits
            plot_dict[mid] = omdb.imdbid('tt' + str(mid).zfill(7), fullplot=True)['plot']
        except Exception:
            # Best effort: a failed lookup is stored as 'N/A'.
            # KeyboardInterrupt is deliberately not caught, so a frozen call
            # can be interrupted without recording a bogus 'N/A'.
            plot_dict[mid] = 'N/A'
    # Save the progress after every batch
    with open('plot_dict_full.pkl', 'wb') as output:
        pickle.dump(plot_dict, output)

In [6]:
# Sanity check: number of entries in plot_dict (one per movie looked up,
# including the 'N/A' placeholders stored for failed lookups)
len(plot_dict)

5770

In [7]:
# One row per movie: the dictionary key becomes 'movie_id', the value becomes 'plot'.
# list(...) is needed because on Python 3 dict.items() is a view, which older
# pandas versions do not accept as DataFrame input.
plot_df = pd.DataFrame(list(plot_dict.items()), columns=['movie_id','plot'])

In [8]:
# Movies without a plot carry the 'N/A' placeholder.
# There are only 2 of them, which seems a tolerable loss.
plot_df.loc[plot_df['plot'].eq('N/A')]

Unnamed: 0,movie_id,plot
2360,79417,
2614,80801,


In [9]:
# Keep only the movies that actually have a plot (everything except the 'N/A' rows)
plot_df = plot_df.loc[plot_df['plot'].ne('N/A')]

In [10]:
# Number of movies left after removing the ones without a plot
len(plot_df)

5768

#### Making a vector for each plot, using 1- and 2-word tokens, weighted over this corpus of plots with the tf-idf method

In [11]:
# Word-level TF-IDF over unigrams and bigrams, with English stop words removed.
# min_df=1 keeps every term that occurs in at least one plot. That is what the
# former integer min_df=0 did as well, but 0 is outside the documented integer
# range and is rejected by recent scikit-learn versions.
tf = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 2),
                     min_df=1,
                     stop_words='english')
# Rows follow the order of plot_df, one row per plot
tfidf_matrix = tf.fit_transform(plot_df['plot'])

In [12]:
# (number of plots, number of distinct 1- and 2-word terms)
tfidf_matrix.shape

(5768, 208378)

#### Using the same SVD as in the rating-based analysis for dimensionality reduction

In [13]:
# Reduce the TF-IDF matrix to 500 components.
# random_state is fixed so that repeated runs start the ARPACK solver from the
# same vector and the notebook is reproducible.
svd_500 = TruncatedSVD(n_components=500, algorithm='arpack', random_state=42)
svd_500.fit(tfidf_matrix)
# Share of the total variance kept by the 500 components
svd_500.explained_variance_ratio_.sum()

0.17640226992159991

In [14]:
# Project every plot onto the fitted components: one row per plot, one column per component
decomposed_data = svd_500.transform(tfidf_matrix)

#### Adding the titles to the plots to make the results interpretable

In [15]:
# Left join keeps the row order of plot_df, which is also the row order of the
# TF-IDF matrix built from it.
movies_with_plot = plot_df.merge(filtered_movies, on='movie_id', how='left')
# The similarity lookup matches rows by position, so the merge must not add
# rows (which would happen if a movie_id occurred more than once in filtered_movies).
assert len(movies_with_plot) == len(plot_df), 'merge changed the number of rows'

In [16]:
class movie_lookup(object):
    """
    This class calculates the cosine distance of one row from all other rows in a matrix.
    
    Args:
    dataframe (dataframe): a dataframe with the column 'movie_title' containing the movie titles;
                           its rows must be in the same order as the rows of decomposed_data
                           (rows are matched by position, the index labels are not used)
    decomposed_data (array): a 2-D numpy array with one row per movie and one column per
                             feature after dimensionality reduction
    """
    def __init__(self, dataframe, decomposed_data):
        self.dataframe = dataframe
        self.decomposed_data = decomposed_data
    
    def most_similar_movies(self,title,howmany):
        """
        This method connects the movie titles with the decomposed_data and finds the rows with
        the smallest cosine distance from the row of the given title.
        
        Args:
        title (str): a title from the movie_title column of the dataframe
        howmany (int): a number defining how many results should be returned
        
        Return:
        A list of (movie title, similarity) tuples for the most similar movies, closest first,
        where similarity is 1 - cosine distance rounded to 3 decimals. The row of the given
        title itself is never part of the result. If the title occurs more than once,
        the first occurrence is used as the query row.
        
        Raises:
        ValueError: if the title is not in the movie_title column
        """
        # Plain object array so that titles can be looked up by row position
        titles = np.asarray(self.dataframe['movie_title'], dtype=object)
        matches = np.flatnonzero(titles == title)
        if matches.size == 0:
            raise ValueError('The given title is not among the movies')
        movie_index = matches[0]
        v = self.decomposed_data[movie_index,:].reshape(1, -1)
        distances = scipy.spatial.distance.cdist(self.decomposed_data, v, 'cosine').reshape(-1)
        ranked = np.argsort(distances)
        
        # Remove the query row by position instead of assuming it is sorted first:
        # other movies can tie with it at distance 0, and the order of ties is arbitrary.
        ranked = ranked[ranked != movie_index][:howmany]
        
        # Round after subtracting to avoid artefacts such as 0.8089999999999999
        return [(titles[x], round(1 - distances[x], 3)) for x in ranked]


In [17]:
# For every movie, look up its single closest neighbour and store the pair as
# (similarity, title, closest title)
m = movie_lookup(movies_with_plot, decomposed_data)
highests = []

for title in movies_with_plot['movie_title']:
    closest_title, closest_score = m.most_similar_movies(title, 1)[0]
    highests.append((closest_score, title, closest_title))

In [18]:
# Tuples are compared by their first element first, so this sorts the pairs by
# similarity; the slice shows the 20 pairs with the highest similarity
sorted(highests)[-20:]

[(0.864, 'King Arthur (2004)', 'Excalibur (1981)'),
 (0.867,
  'Halloween 4: The Return of Michael Myers (1988)',
  'Halloween: The Curse of Michael Myers (1995)'),
 (0.867,
  'Halloween: The Curse of Michael Myers (1995)',
  'Halloween 4: The Return of Michael Myers (1988)'),
 (0.869, 'Carrie (1976)', 'Carrie (2013)'),
 (0.869, 'Carrie (2013)', 'Carrie (1976)'),
 (0.885,
  'Star Wars: Episode V - The Empire Strikes Back (1980)',
  'Star Wars: Episode IV - A New Hope (1977)'),
 (0.92, 'True Grit (1969)', 'True Grit (2010)'),
 (0.92, 'True Grit (2010)', 'True Grit (1969)'),
 (0.929,
  'Star Wars: Episode IV - A New Hope (1977)',
  'Star Wars: Episode VI - Return of the Jedi (1983)'),
 (0.929,
  'Star Wars: Episode VI - Return of the Jedi (1983)',
  'Star Wars: Episode IV - A New Hope (1977)'),
 (0.962, "L'ennemi public n\xc2\xb01 (2008)", "L'instinct de mort (2008)"),
 (0.962, "L'instinct de mort (2008)", "L'ennemi public n\xc2\xb01 (2008)"),
 (0.979,
  'The Disappearance of Eleanor Rig

### The most similar movies according to this measure are the ones based on the same story, so basically the remakes of old movies

In [21]:
# The pairs ranked 41st to 60th by similarity (counted from the highest)
sorted(highests)[-60:-40]

[(0.8089999999999999, 'Batman Forever (1995)', 'The Dark Knight (2008)'),
 (0.8089999999999999,
  'Dragon: The Bruce Lee Story (1993)',
  'The Karate Kid (1984)'),
 (0.8089999999999999,
  'Pawn Sacrifice (2014)',
  'Searching for Bobby Fischer (1993)'),
 (0.8089999999999999,
  'Searching for Bobby Fischer (1993)',
  'Pawn Sacrifice (2014)'),
 (0.8089999999999999, 'The Dark Knight (2008)', 'Batman Forever (1995)'),
 (0.8089999999999999,
  'The Karate Kid (1984)',
  'Dragon: The Bruce Lee Story (1993)'),
 (0.81, 'Before Midnight (2013)', 'Before Sunset (2004)'),
 (0.81, 'Before Sunset (2004)', 'Before Midnight (2013)'),
 (0.81, 'The Manchurian Candidate (1962)', 'The Manchurian Candidate (2004)'),
 (0.81, 'The Manchurian Candidate (2004)', 'The Manchurian Candidate (1962)'),
 (0.8160000000000001, 'Mascots (2016)', "Surf's Up (2007)"),
 (0.8160000000000001, "Surf's Up (2007)", 'Mascots (2016)'),
 (0.8180000000000001,
  'Terminator 2: Judgment Day (1991)',
  'Terminator Genisys (2015)'),
 

#### After the remakes, movies from the same series appear here as well

In [22]:
# Saving the dataframe and the decomposed matrix for later usage in another notebook.
# The context managers close (and flush) the files even if pickling fails.
with open('filtered_movies_content.pkl', 'wb') as output:
    pickle.dump(movies_with_plot, output)

with open('content_decomposed_data.pkl', 'wb') as output:
    pickle.dump(decomposed_data, output)

In [23]:
# Saving the list of (similarity, title, closest title) tuples
with open('con_highests.pkl', 'wb') as output:
    pickle.dump(highests, output)

In [None]:
# Loading the plots saved by the retrieval loop.
# NOTE: cells further up read plot_dict, so on a fresh kernel this cell has to
# be run before them.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('plot_dict_full.pkl', 'rb') as pkl_file:
    plot_dict = pickle.load(pkl_file)