In [1]:
import pandas as pd
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the raw IMDB Top-250 export. A forward slash is used as the path separator:
# it works on every OS and avoids the invalid "\I" escape sequence a backslash creates.
df = pd.read_csv('Datasets/IMDB_Top250Engmovies.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,...,tomatoConsensus,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,tomatoURL,DVD,BoxOffice,Production,Website,Response
0,1,The Shawshank Redemption,1994,R,14 Oct 1994,142 min,"Crime, Drama",Frank Darabont,"Stephen King (short story ""Rita Hayworth and S...","Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",...,,,,,http://www.rottentomatoes.com/m/shawshank_rede...,27 Jan 1998,,Columbia Pictures,,True
1,2,The Godfather,1972,R,24 Mar 1972,175 min,"Crime, Drama",Francis Ford Coppola,"Mario Puzo (screenplay), Francis Ford Coppola ...","Marlon Brando, Al Pacino, James Caan, Richard ...",...,,,,,http://www.rottentomatoes.com/m/godfather/,09 Oct 2001,,Paramount Pictures,http://www.thegodfather.com,True
2,3,The Godfather: Part II,1974,R,20 Dec 1974,202 min,"Crime, Drama",Francis Ford Coppola,"Francis Ford Coppola (screenplay), Mario Puzo ...","Al Pacino, Robert Duvall, Diane Keaton, Robert...",...,,,,,http://www.rottentomatoes.com/m/godfather_part...,24 May 2005,,Paramount Pictures,http://www.thegodfather.com/,True
3,4,The Dark Knight,2008,PG-13,18 Jul 2008,152 min,"Action, Crime, Drama",Christopher Nolan,"Jonathan Nolan (screenplay), Christopher Nolan...","Christian Bale, Heath Ledger, Aaron Eckhart, M...",...,,,,,http://www.rottentomatoes.com/m/the_dark_knight/,09 Dec 2008,"$533,316,061",Warner Bros. Pictures/Legendary,http://thedarkknight.warnerbros.com/,True
4,5,12 Angry Men,1957,APPROVED,01 Apr 1957,96 min,"Crime, Drama",Sidney Lumet,"Reginald Rose (story), Reginald Rose (screenplay)","Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",...,,,,,http://www.rottentomatoes.com/m/1000013-12_ang...,06 Mar 2001,,Criterion Collection,http://www.criterion.com/films/27871-12-angry-men,True


In [3]:
# (rows, columns) of the raw dataset as loaded
df.shape

(250, 38)

In [4]:
# Choosing the fixed features for recommendation.
# .copy() makes this an independent frame, so the column assignments in the
# following cells do not trigger SettingWithCopyWarning.
df = df[['Title','Genre','Director','Actors','Plot']].copy()
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...


Transforming the full names of actors and directors into single words so that each person is treated as one unique token.

In [5]:
def _squash_name(name):
    """Lower-case a full name and remove its spaces, e.g. 'Al Pacino' -> 'alpacino'.

    Merging first and last name makes each person a single word, so there is
    no mix up between people sharing a first name.
    """
    return name.lower().replace(' ', '')


# The columns are rebuilt with column-wise map() instead of assigning to the rows
# yielded by iterrows(): writes to those row objects are not guaranteed to reach
# the DataFrame, so the old loop could silently leave the data untouched.
df = df.assign(
    # Discarding the commas between the actors' full names and keeping only the first three
    Actors=df['Actors'].map(lambda x: [_squash_name(actor) for actor in x.split(',')[:3]]),
    # Putting the genres in a list of lower-case words (without the space that follows each comma)
    Genre=df['Genre'].map(lambda x: [genre.strip() for genre in x.lower().split(',')]),
    # Collapsing the director's full name into one word
    Director=df['Director'].map(_squash_name),
)

In [6]:
# Display the frame to check that Genre/Actors are lists and names are single lower-case words
df

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"[crime, drama]",frankdarabont,"[timrobbins, morganfreeman, bobgunton]",Two imprisoned men bond over a number of years...
1,The Godfather,"[crime, drama]",francisfordcoppola,"[marlonbrando, alpacino, jamescaan]",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"[crime, drama]",francisfordcoppola,"[alpacino, robertduvall, dianekeaton]",The early life and career of Vito Corleone in ...
3,The Dark Knight,"[action, crime, drama]",christophernolan,"[christianbale, heathledger, aaroneckhart]",When the menace known as the Joker emerges fro...
4,12 Angry Men,"[crime, drama]",sidneylumet,"[martinbalsam, johnfiedler, leej.cobb]",A jury holdout attempts to prevent a miscarria...
...,...,...,...,...,...
245,The Lost Weekend,"[drama, film-noir]",billywilder,"[raymilland, janewyman, phillipterry]",The desperate life of a chronic alcoholic is f...
246,Short Term 12,[drama],destindanielcretton,"[brielarson, johngallagherjr., stephaniebeatriz]",A 20-something supervising staff member of a r...
247,His Girl Friday,"[comedy, drama, romance]",howardhawks,"[carygrant, rosalindrussell, ralphbellamy]",A newspaper editor uses every trick in the boo...
248,The Straight Story,"[biography, drama]",davidlynch,"[sissyspacek, janegallowayheitz, josepha.carpe...",An old man makes a long journey by lawn-mover ...


Extracting key words from the plot description

In [7]:
def _extract_key_words(plot):
    """Return the list of key words RAKE extracts from one plot description."""
    # instantiating Rake, by default it uses english stopwords from NLTK
    # and discards all punctuation characters as well
    r = Rake()

    # extracting the words by passing the text
    r.extract_keywords_from_text(plot)

    # get_word_degrees() gives a dictionary with key words as keys and their
    # scores as values; only the words themselves are kept
    return list(r.get_word_degrees().keys())


# Build the Key_words column from Plot and drop Plot in a single step.
# The values are assigned column-wise rather than by writing to the rows yielded
# by iterrows(), because such writes are not guaranteed to reach the DataFrame.
df = df.assign(Key_words=df['Plot'].map(_extract_key_words)).drop(columns=['Plot'])


In [8]:
# Display the frame: Plot has been replaced by the extracted Key_words column
df

Unnamed: 0,Title,Genre,Director,Actors,Key_words
0,The Shawshank Redemption,"[crime, drama]",frankdarabont,"[timrobbins, morganfreeman, bobgunton]","[acts, two, imprisoned, men, bond, eventual, r..."
1,The Godfather,"[crime, drama]",francisfordcoppola,"[marlonbrando, alpacino, jamescaan]","[reluctant, son, organized, crime, dynasty, tr..."
2,The Godfather: Part II,"[crime, drama]",francisfordcoppola,"[alpacino, robertduvall, dianekeaton]","[family, crime, syndicate, son, tightens, 1920..."
3,The Dark Knight,"[action, crime, drama]",christophernolan,"[christianbale, heathledger, aaroneckhart]","[wreaks, havoc, physical, tests, fight, injust..."
4,12 Angry Men,"[crime, drama]",sidneylumet,"[martinbalsam, johnfiedler, leej.cobb]","[evidence, miscarriage, prevent, forcing, reco..."
...,...,...,...,...,...
245,The Lost Weekend,"[drama, film-noir]",billywilder,"[raymilland, janewyman, phillipterry]","[four, chronic, alcoholic, day, drinking, bout..."
246,Short Term 12,[drama],destindanielcretton,"[brielarson, johngallagherjr., stephaniebeatriz]","[world, alongside, troubled, waters, longtime,..."
247,His Girl Friday,"[comedy, drama, romance]",howardhawks,"[carygrant, rosalindrussell, ralphbellamy]","[newspaper, editor, uses, every, trick, book, ..."
248,The Straight Story,"[biography, drama]",davidlynch,"[sissyspacek, janegallowayheitz, josepha.carpe...","[long, journey, mend, lawn, relationship, move..."


In [9]:
# From here on the movie title is the row label
df = df.set_index('Title')
df.head()

Unnamed: 0_level_0,Genre,Director,Actors,Key_words
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
The Shawshank Redemption,"[crime, drama]",frankdarabont,"[timrobbins, morganfreeman, bobgunton]","[acts, two, imprisoned, men, bond, eventual, r..."
The Godfather,"[crime, drama]",francisfordcoppola,"[marlonbrando, alpacino, jamescaan]","[reluctant, son, organized, crime, dynasty, tr..."
The Godfather: Part II,"[crime, drama]",francisfordcoppola,"[alpacino, robertduvall, dianekeaton]","[family, crime, syndicate, son, tightens, 1920..."
The Dark Knight,"[action, crime, drama]",christophernolan,"[christianbale, heathledger, aaroneckhart]","[wreaks, havoc, physical, tests, fight, injust..."
12 Angry Men,"[crime, drama]",sidneylumet,"[martinbalsam, johnfiedler, leej.cobb]","[evidence, miscarriage, prevent, forcing, reco..."


In [10]:
# Feature columns to merge, captured before any new column is added so the
# bag of words is never folded into itself.
columns = list(df.columns)


def _row_to_bag(row):
    """Join every feature of one movie into a single space-separated string.

    List-valued features (genres, actors, key words) are joined word by word;
    string-valued features (the director) are used as they are.
    """
    parts = []
    for col in columns:
        value = row[col]
        parts.append(value if isinstance(value, str) else ' '.join(value))
    return ' '.join(parts)


# The rows from iterrows() are only read here; the result is collected in a list
# and placed in a new frame, because writing to those row objects is not
# guaranteed to update the DataFrame.
bags = [_row_to_bag(row) for _, row in df.iterrows()]

# Keep only the merged text, still labelled by the original index
df = pd.DataFrame({'bag_of_words': bags}, index=df.index)

In [11]:
# Preview the single remaining column, bag_of_words
df.head()

Unnamed: 0_level_0,bag_of_words
Title,Unnamed: 1_level_1
The Shawshank Redemption,crime drama frankdarabont timrobbins morganfr...
The Godfather,crime drama francisfordcoppola marlonbrando a...
The Godfather: Part II,crime drama francisfordcoppola alpacino rober...
The Dark Knight,action crime drama christophernolan christia...
12 Angry Men,crime drama sidneylumet martinbalsam johnfied...


In [12]:
# Turn every bag of words into a row of token counts
bag_of_words = df['bag_of_words']
count = CountVectorizer()
count_matrix = count.fit_transform(bag_of_words)

# Series of movie titles keyed by an ordered numerical position; it is used
# later to translate between a title and its position
indices = pd.Series(df.index)
indices.head(7)

0                         The Shawshank Redemption
1                                    The Godfather
2                           The Godfather: Part II
3                                  The Dark Knight
4                                     12 Angry Men
5                                 Schindler's List
6    The Lord of the Rings: The Return of the King
Name: Title, dtype: object

In [13]:
# Cosine similarity of every row of count_matrix against every other row
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim

array([[1.        , 0.15789474, 0.13764944, ..., 0.05263158, 0.05263158,
        0.05564149],
       [0.15789474, 1.        , 0.36706517, ..., 0.05263158, 0.05263158,
        0.05564149],
       [0.13764944, 0.36706517, 1.        , ..., 0.04588315, 0.04588315,
        0.04850713],
       ...,
       [0.05263158, 0.05263158, 0.04588315, ..., 1.        , 0.05263158,
        0.05564149],
       [0.05263158, 0.05263158, 0.04588315, ..., 0.05263158, 1.        ,
        0.05564149],
       [0.05564149, 0.05564149, 0.04850713, ..., 0.05564149, 0.05564149,
        1.        ]])

In [14]:
# function that takes in movie title as input and prints the top n recommended movies
def recommendations(title, num, cosine_sim = cosine_sim):
    """Print the `num` movies most similar to `title`.

    Parameters
    ----------
    title : str
        Movie title; it is matched exactly against the values of `indices`.
    num : int
        Number of recommendations to print.
    cosine_sim : 2-D array
        Similarity matrix; row ``i`` is read for the movie at position ``i`` of
        `indices`. The default is the `cosine_sim` object that existed when
        this function was defined.

    Raises
    ------
    IndexError
        If `title` does not appear in `indices`.
    """
    # getting the index of the movie that matches the title
    matches = indices[indices == title].index
    if len(matches) == 0:
        raise IndexError("Movie title {!r} not found".format(title))
    idx = matches[0]

    # Similarity scores in descending order. The queried movie is removed by its
    # label instead of assuming it sorts to the first position, so a tie with
    # another movie can no longer cause the query itself to be recommended.
    score_series = (
        pd.Series(cosine_sim[idx])
        .drop(idx)
        .sort_values(ascending = False)
        .iloc[:max(num, 0)]
    )
    score_series_list = list(score_series)

    # positions of the most similar movies and their titles
    top_indexes = list(score_series.index)
    titles = df.index
    recommended_movies = [titles[i] for i in top_indexes]

    print ("Recommending {:d} movies similar to {:s} ".format(num,title))
    print ("------------------------------------------")
    print ("Index\t\t" +"Movie Name\t\t\t" + "Score")
    print ("------------------------------------------")
    for movie_index, movie, score in zip(top_indexes, recommended_movies, score_series_list):
        print (str(movie_index)+"\t\t"+movie+ "\t\t\t"+ str(score))


In [15]:
# Example: print the 10 movies most similar to 'Fargo'
recommendations('Fargo',10)

Recommending 10 movies similar to Fargo 
------------------------------------------
Index		Movie Name			Score
------------------------------------------
132		No Country for Old Men			0.23094010767585035
34		The Departed			0.22680460581325726
226		Rope			0.22222222222222224
1		The Godfather			0.22075539284417395
61		Reservoir Dogs			0.21516574145596762
2		The Godfather: Part II			0.1924500897298753
100		On the Waterfront			0.1814436846506058
15		Goodfellas			0.17868542247296582
214		Arsenic and Old Lace			0.17213259316477408
125		The Big Lebowski			0.17213259316477408


In [16]:
def get_title_from_index(index):
    """Return the entry of `df.index` found at position `index`."""
    titles = df.index
    return titles[index]
# Example: look up the title stored at position 132
get_title_from_index(132)

'No Country for Old Men'

In [17]:
def get_index_from_title(title):
    """Return the label in `indices` of the first entry equal to `title`."""
    is_match = indices == title
    matching = indices.loc[is_match]
    return matching.index[0]
# Example: look up the position of a title in `indices`
get_index_from_title('No Country for Old Men')

132