In [39]:
import numpy as np
import pandas as pd

## 0. Exploring Dataset and Preparing DataFrame

In [40]:
movies = pd.read_csv("./dataset/tmdb_5000_movies.csv")
credits = pd.read_csv("./dataset/tmdb_5000_credits.csv")

In [41]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [42]:
credits.head(4)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."


In [43]:
print(movies.shape, credits.shape)

(4803, 20) (4803, 4)


In [44]:
movies = movies.merge(credits, left_on="id", right_on="movie_id")

In [45]:
movies.shape

(4803, 24)

In [46]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

#### Keeping required columns only

In [47]:
movies = movies[["id", "title_x", "genres", "keywords", "overview", "cast", "crew"]]
movies.rename(columns={"title_x":"title"}, inplace=True)
movies.head(2)

Unnamed: 0,id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","Captain Barbossa, long believed to be dead, ha...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


# 1. Preprocessing
## 1.1 Fixing missing data and duplicates


In [48]:
movies.isnull().sum()

id          0
title       0
genres      0
keywords    0
overview    3
cast        0
crew        0
dtype: int64

In [49]:
movies.dropna(inplace=True)

In [50]:
movies.duplicated().sum()

np.int64(0)

## 1.2 Filter useful strings in columns: genres, keywords, overview, cast, crew
 

#### Checking the structure of "genres" and "keywords" column and fixing them

In [51]:
from ast import literal_eval # to turn the above string into a python list

In [52]:
movies.iloc[0]["genres"]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [53]:
#helper function
def convert(obj):
    L= []

    for dict in literal_eval(obj):
        L.append(dict["name"])
    
    return L

In [54]:
movies["genres"]= movies["genres"].apply(convert)

#keywords has the same structure as genres column

print(movies.iloc[0]["keywords"])
movies.keywords = movies.keywords.apply(convert)

[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]


In [55]:
movies.head(2)

Unnamed: 0,id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


#### Checking the structure of "cast" column and fixing it

In [56]:
movies.iloc[0]["cast"]

'[{"cast_id": 242, "character": "Jake Sully", "credit_id": "5602a8a7c3a3685532001c9a", "gender": 2, "id": 65731, "name": "Sam Worthington", "order": 0}, {"cast_id": 3, "character": "Neytiri", "credit_id": "52fe48009251416c750ac9cb", "gender": 1, "id": 8691, "name": "Zoe Saldana", "order": 1}, {"cast_id": 25, "character": "Dr. Grace Augustine", "credit_id": "52fe48009251416c750aca39", "gender": 1, "id": 10205, "name": "Sigourney Weaver", "order": 2}, {"cast_id": 4, "character": "Col. Quaritch", "credit_id": "52fe48009251416c750ac9cf", "gender": 2, "id": 32747, "name": "Stephen Lang", "order": 3}, {"cast_id": 5, "character": "Trudy Chacon", "credit_id": "52fe48009251416c750ac9d3", "gender": 1, "id": 17647, "name": "Michelle Rodriguez", "order": 4}, {"cast_id": 8, "character": "Selfridge", "credit_id": "52fe48009251416c750ac9e1", "gender": 2, "id": 1771, "name": "Giovanni Ribisi", "order": 5}, {"cast_id": 7, "character": "Norm Spellman", "credit_id": "52fe48009251416c750ac9dd", "gender": 

In [57]:
cast_count = 4
#helper function
def convert_cast(obj):
    L= []
    count = 1
    for dict in literal_eval(obj):

        if count <= cast_count:
            L.append(dict["name"])
            count+=1
        else:
            break
    
    return L

In [58]:
movies["cast"] = movies["cast"].apply(convert_cast)

In [59]:
movies.head(2)

Unnamed: 0,id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weave...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


#### Checking the structure of "crew" column and fixing it

In [60]:
movies.crew[0]

'[{"credit_id": "52fe48009251416c750aca23", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "539c47ecc3a36810e3001f87", "department": "Art", "gender": 2, "id": 496, "job": "Production Design", "name": "Rick Carter"}, {"credit_id": "54491c89c3a3680fb4001cf7", "department": "Sound", "gender": 0, "id": 900, "job": "Sound Designer", "name": "Christopher Boyes"}, {"credit_id": "54491cb70e0a267480001bd0", "department": "Sound", "gender": 0, "id": 900, "job": "Supervising Sound Editor", "name": "Christopher Boyes"}, {"credit_id": "539c4a4cc3a36810c9002101", "department": "Production", "gender": 1, "id": 1262, "job": "Casting", "name": "Mali Finn"}, {"credit_id": "5544ee3b925141499f0008fc", "department": "Sound", "gender": 2, "id": 1729, "job": "Original Music Composer", "name": "James Horner"}, {"credit_id": "52fe48009251416c750ac9c3", "department": "Directing", "gender": 2, "id": 2710, "job": "Director", "name": "James Cameron"},

In [61]:
def find_director(obj):
    
    for i in literal_eval(obj):
        if i["job"]=="Director":
            return i["name"]

In [62]:
movies["crew"] = movies["crew"].apply(find_director)

In [63]:
movies.head()

Unnamed: 0,id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",James Cameron
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",Gore Verbinski
2,206647,Spectre,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",A cryptic message from Bond’s past sends him o...,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",Sam Mendes
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...",Following the death of District Attorney Harve...,"[Christian Bale, Michael Caine, Gary Oldman, A...",Christopher Nolan
4,49529,John Carter,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","John Carter is a war-weary, former military ca...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...",Andrew Stanton


#### Converting "overview" column to a list of words, to make consistent with other columns

In [64]:
movies["overview"]= movies["overview"].apply(lambda x:x.split())

#### Converting phrases like "Chrish Hemsworth" into a single tag "ChrishHemsworth"

In [65]:
type(movies.genres[0])

list

In [66]:
#in genres, keywords, cast columns
movies["genres"] = movies["genres"].apply(lambda list_of_genre: [genre.replace(" ", "") for genre in list_of_genre])
movies["keywords"] = movies["keywords"].apply(lambda list_of_keywords: [keyword.replace(" ", "") for keyword in list_of_keywords])
movies["cast"] = movies["cast"].apply(lambda x: [i.replace(" ", "") for i in x])

In [67]:
movies.head()

Unnamed: 0,id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",James Cameron
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...","[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...",Gore Verbinski
2,206647,Spectre,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[A, cryptic, message, from, Bond’s, past, send...","[DanielCraig, ChristophWaltz, LéaSeydoux, Ralp...",Sam Mendes
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[Following, the, death, of, District, Attorney...","[ChristianBale, MichaelCaine, GaryOldman, Anne...",Christopher Nolan
4,49529,John Carter,"[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[John, Carter, is, a, war-weary,, former, mili...","[TaylorKitsch, LynnCollins, SamanthaMorton, Wi...",Andrew Stanton


###### Crew column was found to be different. Here I am arbitrarily keeping empty string wherever director is not present. Alternative: make the "crew" column values a single element list (i.e. with director only) so that the same name extraction and tag conversion code structure can be used and no additional steps in concatenation are required

In [68]:
movies["crew"] = movies["crew"].apply(lambda x: x.replace(" ", "") if x is not None else "")

In [69]:
movies.head()

Unnamed: 0,id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",JamesCameron
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...","[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...",GoreVerbinski
2,206647,Spectre,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[A, cryptic, message, from, Bond’s, past, send...","[DanielCraig, ChristophWaltz, LéaSeydoux, Ralp...",SamMendes
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[Following, the, death, of, District, Attorney...","[ChristianBale, MichaelCaine, GaryOldman, Anne...",ChristopherNolan
4,49529,John Carter,"[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[John, Carter, is, a, war-weary,, former, mili...","[TaylorKitsch, LynnCollins, SamanthaMorton, Wi...",AndrewStanton


## 1.3 Making the "tags" column by concatenating all lists in the necessary columns
 

In [70]:
movies["tags"] = movies["genres"] + movies["keywords"] + movies["overview"] + movies["cast"] + movies["crew"].apply(lambda x: [x])

In [71]:
mov_df = movies[["id", "title", "tags"]]

In [72]:
mov_df

Unnamed: 0,id,title,tags
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction, c..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action, ocean, drugabuse,..."
2,206647,Spectre,"[Action, Adventure, Crime, spy, basedonnovel, ..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller, dccomics, cri..."
4,49529,John Carter,"[Action, Adventure, ScienceFiction, basedonnov..."
...,...,...,...
4798,9367,El Mariachi,"[Action, Crime, Thriller, unitedstates–mexicob..."
4799,72766,Newlyweds,"[Comedy, Romance, A, newlywed, couple's, honey..."
4800,231617,"Signed, Sealed, Delivered","[Comedy, Drama, Romance, TVMovie, date, loveat..."
4801,126186,Shanghai Calling,"[When, ambitious, New, York, attorney, Sam, is..."


#### Turning the list of tag words to a string and converting to lcase

In [73]:
mov_df["tags"] = mov_df["tags"].apply(lambda x: " ".join(x)).apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mov_df["tags"] = mov_df["tags"].apply(lambda x: " ".join(x)).apply(lambda x: x.lower())


In [74]:
mov_df["tags"][0]

'action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. samworthington zoesaldana sigourneyweaver stephenlang jamescameron'

## 1.4 We see that the tags contain variations of the same root word, so performing stemming
 

comparing different stemmers

In [78]:
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer, RegexpStemmer

ps = PorterStemmer()
sb=SnowballStemmer("english")
lc=LancasterStemmer()
# rx = RegexpStemmer() requires custom regex pattern, so not using it

In [79]:
def stem(stemmer, tags):
    L=[]

    for word in tags.split():
        root = stemmer.stem(word)
        L.append(root)

    return " ".join(L)


In [80]:
stem_test_string ="action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. samworthington zoesaldana sigourneyweaver stephenlang jamescameron"

In [81]:
print("porter: "+ stem(ps,stem_test_string))
print("snowball: "+ stem(sb,stem_test_string))
print("lancaster: "+ stem(lc,stem_test_string))

porter: action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. samworthington zoesaldana sigourneyweav stephenlang jamescameron
snowball: action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. samworthington zoesaldana sigourneyweav stephenlang jamescameron
lancaster: act adv fantasy sciencefict cultureclash fut spacew spacecolony socy spacetravel fut rom spac aly tri

we go with snowball stemmer

In [82]:
mov_df["tags"] = mov_df["tags"].apply(lambda x: stem(sb,x))
mov_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mov_df["tags"] = mov_df["tags"].apply(lambda x: stem(sb,x))


Unnamed: 0,id,title,tags
0,19995,Avatar,action adventur fantasi sciencefict culturecla...
1,285,Pirates of the Caribbean: At World's End,adventur fantasi action ocean drugabus exotici...
2,206647,Spectre,action adventur crime spi basedonnovel secreta...
3,49026,The Dark Knight Rises,action crime drama thriller dccomic crimefight...
4,49529,John Carter,action adventur sciencefict basedonnovel mar m...
...,...,...,...
4798,9367,El Mariachi,action crime thriller unitedstates–mexicobarri...
4799,72766,Newlyweds,comedi romanc a newlyw coupl honeymoon is upen...
4800,231617,"Signed, Sealed, Delivered",comedi drama romanc tvmovi date loveatfirstsig...
4801,126186,Shanghai Calling,when ambiti new york attorney sam is sent to s...


# 2. Vectorizing

 

In [84]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words="english")


###### vectorizing without finding the stem word would result in differently worded duplicate features: eg: act, acted, acting

In [85]:
vectors = cv.fit_transform(mov_df["tags"]).toarray()

In [86]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [87]:
cv_features = cv.get_feature_names_out()
print(list(cv_features))

['000', '007', '10', '100', '11', '12', '13', '14', '15', '16', '17', '17th', '18', '18th', '18thcenturi', '19', '1930s', '1940s', '1950', '1950s', '1960', '1960s', '1970s', '1971', '1974', '1976', '1980', '1980s', '1985', '1990s', '1999', '19th', '19thcenturi', '20', '200', '2003', '2009', '20th', '24', '25', '30', '300', '3d', '40', '50', '500', '60', '60s', '70', '70s', 'aaron', 'aaroneckhart', 'aarontaylor', 'abandon', 'abduct', 'abigailbreslin', 'abil', 'abl', 'aboard', 'abov', 'abus', 'academ', 'academi', 'accept', 'access', 'accid', 'accident', 'acclaim', 'accompani', 'accomplish', 'account', 'accus', 'ace', 'achiev', 'acquaint', 'act', 'action', 'actionhero', 'activ', 'activist', 'activities', 'actor', 'actress', 'actual', 'adam', 'adambrodi', 'adamsandl', 'adamshankman', 'adapt', 'add', 'addict', 'adjust', 'admir', 'admit', 'adolesc', 'adopt', 'ador', 'adrienbrodi', 'adult', 'adultanim', 'adulteri', 'adulthood', 'advanc', 'adventur', 'adventure', 'adventures', 'advertis', 'adv

# 3. Calculating Cosine Similarity Matrix: similarity of each movie against all movies

In [88]:
from sklearn.metrics.pairwise import cosine_similarity

In [89]:
similarity_matrix = cosine_similarity(vectors)
similarity_matrix

array([[1.        , 0.08585457, 0.08585457, ..., 0.04604093, 0.        ,
        0.        ],
       [0.08585457, 1.        , 0.06060606, ..., 0.02437575, 0.        ,
        0.        ],
       [0.08585457, 0.06060606, 1.        , ..., 0.02437575, 0.        ,
        0.        ],
       ...,
       [0.04604093, 0.02437575, 0.02437575, ..., 1.        , 0.04174829,
        0.0437374 ],
       [0.        , 0.        , 0.        , ..., 0.04174829, 1.        ,
        0.04656202],
       [0.        , 0.        , 0.        , ..., 0.0437374 , 0.04656202,
        1.        ]])

In [90]:
similarity_matrix.shape

(4800, 4800)

In [91]:
# to get the similarity for the movie with index 0 (the first movie):
similarity_matrix[0] #see that it is most similar to itself (score of 1 in 0th element of the returned array)

array([1.        , 0.08585457, 0.08585457, ..., 0.04604093, 0.        ,
       0.        ])

# 4. Get a movie and recommend the 5 most similar movies

finding the most similar n movies:

In [92]:
top_n = 10

###### this way, we only get the top 10 similar movies, but have no idea which movie the similarities are against (i.e. we lose index)


In [93]:
sorted(similarity_matrix[0], reverse=True)[:top_n]

[np.float64(1.0),
 np.float64(0.2926585541394632),
 np.float64(0.2666323910600458),
 np.float64(0.26401000024165),
 np.float64(0.25903973506580724),
 np.float64(0.2507784139231543),
 np.float64(0.2500965064695278),
 np.float64(0.24784079854830487),
 np.float64(0.23995690956687135),
 np.float64(0.23995690956687135)]

###### so we enumerate the similarities for a movie as: (index, similarity_score). 
###### Then sort the enumerated tuple based on the similarity_score i.e. x[1]

In [100]:
similarity_sorted = sorted(list(enumerate(similarity_matrix[0])), reverse=True, key= lambda x: x[1])


In [101]:
def recommend(movie_name):
    #.index returns all the indices. In this case, all the indices in the subsetted mov_df, essentially giving the index for movie_name
    movie_index = mov_df[mov_df["title"] == movie_name].index[0]
    print("for movie: ",movie_index)
    similarity_for_movie_name = similarity_matrix[movie_index]
    top_similar_movies = sorted(list(enumerate(similarity_for_movie_name)), reverse=True, key= lambda x: x[1])[:top_n]
    print(f"the top {top_n} similar ones are: (index,similarity)",top_similar_movies)

    print("the titles")
    for i in top_similar_movies:
        index = i[0]
        print(mov_df.iloc[index].title)

In [102]:
recommend('Avatar')

for movie:  0
the top 10 similar ones are: (index,similarity) [(0, np.float64(1.0)), (1213, np.float64(0.2926585541394632)), (2403, np.float64(0.2666323910600458)), (3723, np.float64(0.26401000024165)), (507, np.float64(0.25903973506580724)), (582, np.float64(0.2507784139231543)), (539, np.float64(0.2500965064695278)), (1201, np.float64(0.24784079854830487)), (61, np.float64(0.23995690956687135)), (1191, np.float64(0.23995690956687135))]
the titles
Avatar
Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Battle: Los Angeles
Titan A.E.
Predators
Jupiter Ascending
Small Soldiers


# 5. For the Website

###### pickle is for serializing Python objects so that they can be stored on the disk

In [97]:
movies_dict = mov_df.to_dict()


In [98]:
import pickle

pickle.dump(mov_df.to_dict(), open("movies.pkl", "wb"))


In [99]:
pickle.dump(similarity_matrix, open("similarity_matrix.pkl", "wb"))