
# Library Imports

In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

        
import numpy as np # linear algebra
import pandas as pd # data processing
import ast # abstract syntax tree - for processing
import json # processing json
import nltk # for stemming words using PorterStemmer
from nltk.stem.porter import PorterStemmer
from pandas import json_normalize # for normalization 
from sklearn.feature_extraction.text import TfidfVectorizer # For vectorizing text
from sklearn.metrics.pairwise import cosine_similarity # to calculate similarity

/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv
/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv


# Load Data

In [2]:
credits = pd.read_csv("../input/tmdb-movie-metadata/tmdb_5000_credits.csv")
movies = pd.read_csv("../input/tmdb-movie-metadata/tmdb_5000_movies.csv")

# merging both the data set on ids
df = pd.merge(movies, credits, left_on="id", right_on="movie_id")

# Understanding Data 

In [3]:
# check for top 3 records in the dataframe
df.head(3)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,spoken_languages,status,tagline,title_x,vote_average,vote_count,movie_id,title_y,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."


In [4]:
# what is the size of the movie before and after merge
movies.shape, credits.shape, df.shape

((4803, 20), (4803, 4), (4803, 24))

In [5]:
# check the datatype
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

# Content Based

## Step 1. Select columns

Columns selected

* `id`
* `original_title`
* `overview`
* `genres`
* `keywords`
* `cast`
* `crew`

In [6]:
# select only few columns from the main dataframe
collab_df = df[[ 'id','original_title', 'overview', 'genres', 'keywords',  'cast', 'crew']]
collab_df.head(3)

Unnamed: 0,id,original_title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."


## Step 2. Preprocessing 

Convert genres, keywords, cast and crew to a list

In [7]:
collab_df.columns

Index(['id', 'original_title', 'overview', 'genres', 'keywords', 'cast',
       'crew'],
      dtype='object')

In [8]:
# sample record
collab_df[collab_df['original_title'] == 'Inception']

Unnamed: 0,id,original_title,overview,genres,keywords,cast,crew
96,27205,Inception,"Cobb, a skilled thief who commits corporate es...","[{""id"": 28, ""name"": ""Action""}, {""id"": 53, ""nam...","[{""id"": 1014, ""name"": ""loss of lover""}, {""id"":...","[{""cast_id"": 1, ""character"": ""Dom Cobb"", ""cred...","[{""credit_id"": ""56e8462cc3a368408400354c"", ""de..."


In [9]:
# # for testing
# collab_df['original_title'][0:2]
# json.loads(collab_df['genres'][0])
# json.loads(collab_df['keywords'][0])
# json.loads(collab_df['crew'][0])[4:10]

# # for testing
# json.loads(collab_df['cast'][0])[0]
# ast.literal_eval(collab_df['crew'][0])[0]
# ast.literal_eval(collab_df['crew'][0])[0]
# ast.literal_eval(collab_df['crew'][0])[0]

## Process all columns

In [10]:
# except for crew
def process_dict(obj):
    lst = []
    for i in json.loads(obj):
        lst.append(i['name'])
    return lst
    
# for crew
def process_crew(obj):
    lst = []
    for i in json.loads(obj):
        if i['job'] == 'Director':
            lst.append(i['name'])
            break
    return lst


In [11]:
collab_df['genres'] = collab_df['genres'].apply(process_dict)
collab_df['keywords'] = collab_df['keywords'].apply(process_dict)
collab_df['cast'] = collab_df['cast'].apply(process_dict)
collab_df['crew'] = collab_df['crew'].apply(process_crew)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [12]:
collab_df.head(3)

Unnamed: 0,id,original_title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]


## Drop col 
Need to drop a column as there are 3 NaNs in overview column.

In [13]:
collab_df.isnull().sum()

id                0
original_title    0
overview          3
genres            0
keywords          0
cast              0
crew              0
dtype: int64

In [14]:
# what are those columns
collab_df[collab_df['overview'].isnull()]

Unnamed: 0,id,original_title,overview,genres,keywords,cast,crew
2656,370980,Chiamatemi Francesco - Il Papa della gente,,[Drama],"[pope, biography]","[Rodrigo de la Serna, Sergio Hernández, Àlex B...",[Daniele Luchetti]
4140,459488,"To Be Frank, Sinatra at 100",,[Documentary],"[music, actors, legendary perfomer, classic ho...",[Tony Oppedisano],[Simon Napier-Bell]
4431,292539,Food Chains,,[Documentary],[],[],[Sanjay Rawal]


In [15]:
# drop those cols
collab_df.dropna(inplace=True)
collab_df.head(3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Unnamed: 0,id,original_title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]


In [16]:
# convert overview column to list
collab_df['overview'] = collab_df['overview'].apply(lambda x: x.split())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [17]:
collab_df.head(3)

Unnamed: 0,id,original_title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]


# Final steps

- Remove spaces on cast and crew names
- Convert to list
- lower case
- stopwords

## Remove Spaces

In [18]:
# There are space like so - we need to remove them to make it unique for all fields
collab_df['cast'][0][0:3]

['Sam Worthington', 'Zoe Saldana', 'Sigourney Weaver']

### Convert to list

In [19]:
collab_df['overview'] = collab_df['overview'].apply(lambda x: [a.replace(" ", "") for a in x])
collab_df['genres'] = collab_df['genres'].apply(lambda x: [a.replace(" ", "") for a in x])
collab_df['keywords'] = collab_df['keywords'].apply(lambda x: [a.replace(" ", "") for a in x])
collab_df['cast'] = collab_df['cast'].apply(lambda x: [a.replace(" ", "") for a in x])
collab_df['crew'] = collab_df['crew'].apply(lambda x: [a.replace(" ", "") for a in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [20]:
collab_df.head(3)

Unnamed: 0,id,original_title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...",[GoreVerbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux, Ralp...",[SamMendes]


## Combine

Convert the words into their root word for overview, genres and keywords
Then combine them with crew and cast


In [21]:
collab_df['combined'] = collab_df['overview'] + collab_df['genres'] + collab_df['keywords']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


## Applying Stemming

In [22]:
ps = PorterStemmer()

def stemming(text):
    lst = []
    for i in text.split():
        lst.append(ps.stem(i))
    return " ".join(lst)

collab_df['combined'] = collab_df['combined'].apply(lambda x:" ".join(x))
collab_df['combined'] = collab_df['combined'].apply(stemming)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == "__main__":
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


## Convert to Strings

In [23]:
collab_df['cast'] = collab_df['cast'].apply(lambda x: " ".join(x))
collab_df['crew'] = collab_df['crew'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [24]:
collab_df.head(3)

Unnamed: 0,id,original_title,overview,genres,keywords,cast,crew,combined
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...",SamWorthington ZoeSaldana SigourneyWeaver Step...,JamesCameron,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...",JohnnyDepp OrlandoBloom KeiraKnightley Stellan...,GoreVerbinski,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...",DanielCraig ChristophWaltz LéaSeydoux RalphFie...,SamMendes,a cryptic messag from bond’ past send him on a...


In [25]:
collab_df['combined'][0]

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d'

## Combine cast, crew with others

In [26]:
collab_df['combined'] =  collab_df['combined'] + ' ' + collab_df['cast'] +' '+ collab_df['crew']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [27]:
collab_df['cast'][0]

'SamWorthington ZoeSaldana SigourneyWeaver StephenLang MichelleRodriguez GiovanniRibisi JoelDavidMoore CCHPounder WesStudi LazAlonso DileepRao MattGerald SeanAnthonyMoran JasonWhyte ScottLawrence KellyKilgour JamesPatrickPitt SeanPatrickMurphy PeterDillon KevinDorman KelsonHenderson DavidVanHorn JacobTomuri MichaelBlain-Rozgay JonCurry LukeHawker WoodySchultz PeterMensah SoniaYee JahnelCurfman IlramChoi KylaWarren LisaRoumain DebraWilson ChrisMala TaylorKibby JodieLandau JulieLamm CullenB.Madden JosephBradyMadden FrankieTorres AustinWilson SaraWilson TamicaWashington-Miller LucyBriant NathanMeister GerryBlair MatthewChamberlain PaulYates WrayWilson JamesGaylyn MelvinLenoClarkIII CarvonFutrell BrandonJelkes MicahMoch HanniyahMuhammad ChristopherNolen ChristaOliver AprilMarieThomas BravitaA.Threatt ColinBleasdale MikeBodnar MattClayton NicoleDionne JamieHarrison AllanHenry AnthonyIngruber AshleyJeffery DeanKnowsley JosephMika-Hunt TerryNotary KaiPantano LoganPithyou StuartPollock Raja Ga

In [28]:
collab_df['combined'][10]

'superman return to discov hi 5-year absenc ha allow lex luthor to walk free, and that those he wa closest too felt abandon and have move on. luthor plot hi ultim reveng that could see million kill and chang the face of the planet forever, as well as rid himself of the man of steel. adventur fantasi action sciencefict savingtheworld dccomic invulner sequel superhero basedoncomicbook kryptonit superpow superhumanstrength lexluthor BrandonRouth KevinSpacey KateBosworth JamesMarsden ParkerPosey FrankLangella SamHuntington EvaMarieSaint MarlonBrando KalPenn TristanLakeLeabu DavidFabrizio IanRoberts VincentStone JackLarson NoelNeill KeeganJoyce JordanaBeatty BryanSinger'

In [29]:
collab_df.head(3)

Unnamed: 0,id,original_title,overview,genres,keywords,cast,crew,combined
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...",SamWorthington ZoeSaldana SigourneyWeaver Step...,JamesCameron,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...",JohnnyDepp OrlandoBloom KeiraKnightley Stellan...,GoreVerbinski,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...",DanielCraig ChristophWaltz LéaSeydoux RalphFie...,SamMendes,a cryptic messag from bond’ past send him on a...


# Collaborative Filtering 

In [30]:
tf = TfidfVectorizer(max_features=5000,analyzer='word',stop_words={'english'})
vectors = tf.fit_transform(collab_df['combined']).toarray()
vectors.shape

(4800, 5000)

## What are the top features

In [31]:
tf.get_feature_names_out()[:100]

array(['000', '007', '10', '100', '11', '12', '13', '14', '15', '16',
       '17', '18', '18th', '19', '1930', '1940', '1950', '1960', '1960s',
       '1970', '1970s', '1980', '1990', '19th', '19thcenturi', '20',
       '20th', '24', '25', '30', '3d', '40', '50', '60', '70', 'aaron',
       'aaroneckhart', 'aarontaylor', 'aasifmandvi', 'abandon', 'abduct',
       'abigailbreslin', 'abil', 'abl', 'about', 'abov', 'abus',
       'academi', 'accept', 'access', 'accid', 'accident', 'accompani',
       'accomplish', 'account', 'accus', 'ace', 'achiev', 'across', 'act',
       'action', 'activ', 'activist', 'actor', 'actress', 'actual',
       'adam', 'adambrody', 'adamgoldberg', 'adamlefevre', 'adamsandler',
       'adamscott', 'adamshankman', 'adapt', 'add', 'addict',
       'adewaleakinnuoye', 'adjust', 'admir', 'admit', 'adolesc', 'adopt',
       'ador', 'adrianmartinez', 'adrienbrody', 'adult', 'adulteri',
       'adulthood', 'advanc', 'adventur', 'adventure', 'advertis',
       'advic'

## Calculating Cosine Similarity

In [32]:
similarity_score = cosine_similarity(vectors)

In [33]:
similarity_score

array([[1.        , 0.03456974, 0.03922371, ..., 0.04889589, 0.01552399,
        0.01255302],
       [0.03456974, 1.        , 0.07161851, ..., 0.03725022, 0.02554626,
        0.01743875],
       [0.03922371, 0.07161851, 1.        , ..., 0.04499789, 0.01988136,
        0.02199699],
       ...,
       [0.04889589, 0.03725022, 0.04499789, ..., 1.        , 0.06382215,
        0.0533685 ],
       [0.01552399, 0.02554626, 0.01988136, ..., 0.06382215, 1.        ,
        0.0710892 ],
       [0.01255302, 0.01743875, 0.02199699, ..., 0.0533685 , 0.0710892 ,
        1.        ]])

In [34]:
print("Shape : \n" , similarity_score.shape)
print("\n Sample data: \n",similarity_score[10])
print("\n Single data shape: \n",similarity_score[10].shape)


Shape : 
 (4800, 4800)

 Sample data: 
 [0.04094582 0.05145652 0.05285622 ... 0.05329347 0.08631546 0.07072732]

 Single data shape: 
 (4800,)


### Sample data
Showing similarity of 10th record with every other movie

In [35]:
similarity_score[10]

array([0.04094582, 0.05145652, 0.05285622, ..., 0.05329347, 0.08631546,
       0.07072732])

## Building Recommendations

We will create a function which will take 

movie as an argument, find index of the movie and sort it fir

1. Take movie name as the argument
2. Find the index of that movie
3. Sort and enum to get the index
4. Pick the top 5 starting from index 1


In [36]:
def recommend_movies(movie):
    index = collab_df[collab_df['original_title'] == movie].index[0]
    top5 = sorted(list(enumerate(similarity_score[index])), key=lambda x:x[1], reverse=True)[1:6]
    
    for i in top5:
        print(collab_df.iloc[i[0]].original_title)

In [37]:
recommend_movies('The X Files')

The X Files: I Want to Believe
E.T. the Extra-Terrestrial
Conspiracy Theory
Predator 2
Angels & Demons


In [38]:
collab_df[collab_df['original_title'].str.contains('The X Files')]

Unnamed: 0,id,original_title,overview,genres,keywords,cast,crew,combined
626,846,The X Files,"[Mulder, and, Scully,, now, taken, off, the, F...","[Mystery, ScienceFiction, Thriller]","[bomb, helicopter, secret, obsession, extrater...",DavidDuchovny GillianAnderson MitchPileggi Wil...,RobBowman,"mulder and scully, now taken off the fbi' x fi..."
1422,8836,The X Files: I Want to Believe,"[Six, years, after, the, events, of, The, X-Fi...","[Drama, Mystery, ScienceFiction, Thriller]","[extraterrestrialtechnology, fbi, alien, fbiag...",DavidDuchovny GillianAnderson AmandaPeet Billy...,ChrisCarter,six year after the event of the x-file seri fi...


In [39]:
recommend_movies('Batman Begins')

The Dark Knight
The Dark Knight Rises
Batman Returns
Batman
Batman v Superman: Dawn of Justice


In [40]:
recommend_movies('Bulletproof Monk')

兔侠传奇
功夫
Kung Pow: Enter the Fist
Kung Fu Panda 2
The Forbidden Kingdom


In [41]:
recommend_movies('Die Another Day')

The World Is Not Enough
Never Say Never Again
Quantum of Solace
Licence to Kill
GoldenEye


In [42]:
recommend_movies('The Last Airbender')

Signs
Lady in the Water
The Ice Pirates
The Village
The Happening


In [43]:
collab_df['original_title'].sample(10)

271                             The Island
1566                         About Schmidt
1174                          Ride Along 2
3827                                Friday
2049                       Dudley Do-Right
605        Legends of Oz: Dorothy's Return
3031                            Wrong Turn
3003    Nick and Norah's Infinite Playlist
383                                Twister
3279                           Prefontaine
Name: original_title, dtype: object

In [44]:
collab_df['original_title'].sample(30)

3453                         The Wood
3288                             Fido
534                           Bandits
4299                         Home Run
2478                    Drowning Mona
2242                  Flash of Genius
4555                    Enter Nowhere
1832                         Chocolat
4210                    दिल जो भी कहे
1809            The Constant Gardener
3359                      In Too Deep
4693                               H.
1549                The Addams Family
2517                The King's Speech
3353                      The Descent
237        The Huntsman: Winter's War
1617                                9
3328                       Persepolis
124                            Frozen
3559            Paranormal Activity 3
2045                I Heart Huckabees
508     The Lost World: Jurassic Park
1831            Bridget Jones's Diary
2129                   The Black Hole
391                         Enchanted
551                       Fool's Gold
2944        

In [45]:
recommend_movies('Spotlight')

Elizabeth
The Mighty Macs
Stolen Summer
Mystic River
The Core


In [46]:
recommend_movies('Michael Clayton')

The Rainmaker
The Firm
Chill Factor
The Judge
My Big Fat Greek Wedding


In [47]:
collab_df[collab_df['original_title'].str.contains("Clayton")]

Unnamed: 0,id,original_title,overview,genres,keywords,cast,crew,combined
2061,4566,Michael Clayton,"[A, law, firm, brings, in, its, 'fixer', to, r...","[Drama, Mystery, Crime]","[killing, restaurant, chambersofabarrister, sc...",GeorgeClooney TomWilkinson TildaSwinton Sydney...,TonyGilroy,a law firm bring in it 'fixer' to remedi the s...
