## Part 1 - Read and tranform the movies data to get a column containing all keywords

In [3]:
import pandas as pd

In [4]:
df  = pd.read_csv('../data/movies.csv')

df.shape

(15430, 15)

In [5]:
df.head()

Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Directors,Genres,Country,Language,Runtime
0,1,Inception,2010,13+,8.8,87%,Yes,No,No,No,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller",United States,English,148
1,2,The Matrix,1999,18+,8.7,87%,Yes,No,No,No,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi",United States,English,136
2,3,Avengers: Infinity War,2018,13+,8.5,84%,Yes,No,No,No,"Anthony Russo,Joe Russo","Action,Adventure,Sci-Fi",United States,English,149
3,4,Back to the Future,1985,7+,8.5,96%,Yes,No,No,No,Robert Zemeckis,"Adventure,Comedy,Sci-Fi",United States,English,116
4,5,"The Good, the Bad and the Ugly",1966,18+,8.8,97%,Yes,No,Yes,No,Sergio Leone,Western,Italy,Italian,161


### Selecting the required columns
> We will use Genre, Age rating, Directors and Imdb rating to find similarity between the movies

In [6]:
df_new = df.loc[:,['Title','Age','IMDb', 'Directors', 'Genres']]

In [7]:
df_new.head()

Unnamed: 0,Title,Age,IMDb,Directors,Genres
0,Inception,13+,8.8,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller"
1,The Matrix,18+,8.7,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi"
2,Avengers: Infinity War,13+,8.5,"Anthony Russo,Joe Russo","Action,Adventure,Sci-Fi"
3,Back to the Future,7+,8.5,Robert Zemeckis,"Adventure,Comedy,Sci-Fi"
4,"The Good, the Bad and the Ugly",18+,8.8,Sergio Leone,Western


### Creating a function to remove spaces

> - We will apply this function to the Directors column to get the full name of the director in a single word.
> - This is done so that similarity is not detected just based on the first name or last name of two different directors.

In [8]:
def removeSpaces(inputString):
    if(inputString):
        return str(inputString).replace(" ","")
    else:
        return None

In [9]:
df_new.loc[:,'Directors']= df_new.loc[:,'Directors'].copy().apply(removeSpaces)

In [10]:
df_new['Directors'].head(10)

0                            ChristopherNolan
1                LanaWachowski,LillyWachowski
2                       AnthonyRusso,JoeRusso
3                              RobertZemeckis
4                                 SergioLeone
5    BobPersichetti,PeterRamsey,RodneyRothman
6                               RomanPolanski
7                            QuentinTarantino
8                             StevenSpielberg
9                            QuentinTarantino
Name: Directors, dtype: object

### Creating a function to classify the IMdb rating as "great", "okay" or "bad"
> We will use this function to get a new column "Comment"

In [11]:
def getComment(rating):
    if rating == None:
        return ""
    elif rating >= 7.6:
        return "great"
    elif rating >= 5.9 and rating <= 7.5:
        return "okay"
    elif  rating < 5.9:
        return "bad"

In [12]:
df_new.loc[:,'Comment'] = df_new.loc[:,"IMDb"].apply(getComment)

In [151]:
df_new["Comment"]

0        great
1        great
2        great
3        great
4        great
         ...  
15425     okay
15426     okay
15427      bad
15428     okay
15429      bad
Name: Comment, Length: 15430, dtype: object

### Finally concatenating values of "Genres", "Directors", "Comment" and "Age" columns to get "bagofwords"
> This column contains all the keywords related to the movie in comma separated format which we will use to generate the similarity matrix

In [17]:
df_new.loc[:,'bagofwords'] = df_new['Genres'].map(str) + ','  + df_new['Directors'].map(str)  + ',' + df_new['Comment'].map(str)+ ',' + df_new['Age'].map(str)

### Removing any "None" or "nan"  values if present

In [19]:
def removeInvalidValue (input):
    arr = input.split(",")
    arr = [i for i in arr if i != "None" and i != "nan"]
    return ",".join(arr).lower()

In [21]:
df_new.loc[:,'bagofwords'] = df_new.loc[:,'bagofwords'].apply(removeInvalidValue)

In [24]:
df_new['bagofwords']

0        action,adventure,sci-fi,thriller,christopherno...
1        action,sci-fi,lanawachowski,lillywachowski,gre...
2        action,adventure,sci-fi,anthonyrusso,joerusso,...
3          adventure,comedy,sci-fi,robertzemeckis,great,7+
4                            western,sergioleone,great,18+
                               ...                        
15425           adventure,drama,family,edwardm.abroms,okay
15426                drama,family,bernardmceveety,okay,all
15427               adventure,family,dereckjoubert,bad,all
15428        comedy,family,fantasy,horror,brucebilson,okay
15429               comedy,family,sci-fi,nealisrael,bad,7+
Name: bagofwords, Length: 15430, dtype: object

## Part 2 - As we now have the "bagofwords" we will move on to find similar movies

### Importing libraries

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Converting bagofwords column to a vector (using count vectorization technique)

In [27]:
count = CountVectorizer()
count_vector =  count.fit_transform(df_new['bagofwords'].values)

### Creating a similiarity matrix using the above generated vector
> In our case we will use the same count vector at both the ends

In [28]:
cosine_sim = cosine_similarity(count_vector, count_vector)

In [29]:
cosine_sim

array([[1.        , 0.53452248, 0.75      , ..., 0.15811388, 0.        ,
        0.28867513],
       [0.53452248, 1.        , 0.53452248, ..., 0.        , 0.        ,
        0.3086067 ],
       [0.75      , 0.53452248, 1.        , ..., 0.15811388, 0.        ,
        0.28867513],
       ...,
       [0.15811388, 0.        , 0.15811388, ..., 1.        , 0.18257419,
        0.36514837],
       [0.        , 0.        , 0.        , ..., 0.18257419, 1.        ,
        0.33333333],
       [0.28867513, 0.3086067 , 0.28867513, ..., 0.36514837, 0.33333333,
        1.        ]])

### Creating a method to return 10 similar movies based on an input movie

In [30]:
def getSimilarMovies(moviename):
    inp_index = df.index[df_new['Title'] == moviename].tolist()[0]
    print(inp_index)
    similar_index = (-cosine_sim[inp_index]).argsort()[:11]
    similar_index = similar_index[similar_index != inp_index]
    print(df.iloc[similar_index, [1]]) 

## Finally we can find similar movies

### Input movie : The Matrix

In [31]:
getSimilarMovies("The Matrix")

1
                                    Title
248                The Matrix Revolutions
82                    The Matrix Reloaded
27                             District 9
759                     Jupiter Ascending
295    Terminator 3: Rise of the Machines
3206      Star Trek II: The Wrath of Khan
13788                        CyberTracker
9274                           Fortress 2
14169                          Retrograde
10928      Exterminators of the Year 3000


### Input movie : The Incredibles

In [33]:
getSimilarMovies("The Incredibles")

14977
                                                   Title
33                                         Incredibles 2
6655                                     The Monkey King
293                                      Boy & the World
14979                                         Big Hero 6
10792                                  The Amazing Zorro
161            Lupin the Third: The Castle of Cagliostro
1490   Walt Disney Animation Studios Short Films Coll...
5                      Spider-Man: Into the Spider-Verse
14973                                        Ratatouille
70                                       Song of the Sea
