In [17]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Connecting to MongoDB

In [9]:
import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017") 
movies_db = client['movies_db_new']
movies_coll = movies_db['movies_new_3']
movies_list = list(movies_coll.find())


In [10]:
movies_df = pd.DataFrame(movies_list)
movies_df

Unnamed: 0,_id,title,id,year,runtime,cast,tmdb_id,keywords,original_language,overview,genres,image,popularity
0,635cc1400947ed318548b5c3,Black Adam,tt6443346,2022,125 min,"[Dwayne Johnson, Aldis Hodge, Pierce Brosnan, ...",436270,"[anti hero, superhero, based on comic, dc exte...",en,"Nearly 5,000 years after he was bestowed with ...","[Action, Science Fiction, Fantasy]",https://m.media-amazon.com/images/M/MV5BYzZkOG...,4328.431
1,635cc1400947ed318548b5c4,The School for Good and Evil,tt2935622,2022,147 min,"[Kit Young, Sophia Anne Caruso, Cate Blanchett...",779782,"[school, based on young adult novel]",en,Best friends Sophie and Agatha navigate an enc...,"[Fantasy, Action, Drama]",https://m.media-amazon.com/images/M/MV5BNzM1OD...,1275.654
2,635cc1400947ed318548b5c5,Bullet Train,tt12593682,2022,127 min,"[Brad Pitt, Joey King, Aaron Taylor-Johnson, B...",718930,"[japan, assassin, based on novel or book, miss...",en,Unlucky assassin Ladybug is determined to do h...,"[Action, Comedy, Thriller]",https://m.media-amazon.com/images/M/MV5BMDU2Zm...,1339.013
3,635cc1400947ed318548b5c7,Black Panther: Wakanda Forever,tt9114286,2022,161 min,"[Angela Bassett, Tenoch Huerta, Martin Freeman...",505642,"[hero, sequel, superhero, based on comic, deat...",en,"Queen Ramonda, Shuri, M’Baku, Okoye and the Do...","[Action, Adventure, Science Fiction]",https://m.media-amazon.com/images/M/MV5BNTM4Nj...,4594.775
4,635cc1400947ed318548b5c8,Top Gun: Maverick,tt1745960,2022,130 min,"[Tom Cruise, Jennifer Connelly, Miles Teller, ...",361743,"[fighter pilot, u.s. navy, sequel, nuclear wea...",en,After more than thirty years of service as one...,"[Action, Drama]",https://m.media-amazon.com/images/M/MV5BZWYzOG...,1491.786
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3332,635cf6ae0947ed31854b2eca,Traffik,tt5670152,2018,96 min,"[Priscilla Quintana, Luke Goss, Paula Patton, ...",473149,"[corruption, sheriff, husband wife relationshi...",en,A couple off for a romantic weekend in the mou...,"[Horror, Thriller]",https://m.media-amazon.com/images/M/MV5BMjIzMz...,25.301
3333,635cf6e30947ed31854b3322,The 9th Life of Louis Drax,tt3991412,2016,108 min,"[Jamie Dornan, Aiden Longworth, Sarah Gadon, A...",294795,"[based on novel or book, hitchcockian, sixth s...",en,A psychologist who begins working with a young...,"[Fantasy, Mystery, Thriller, Drama]",https://m.media-amazon.com/images/M/MV5BMjQ0Nj...,27.108
3334,635cf6e50947ed31854b334d,Blackway,tt4061010,2015,90 min,"[Anthony Hopkins, Julia Stiles, Ray Liotta, Al...",359790,"[stalker, revenge, murder, logging]",en,A young woman newly returned to her hometown b...,"[Thriller, Mystery]",https://m.media-amazon.com/images/M/MV5BOTAwND...,30.046
3335,635cf6f50947ed31854b34ad,Beyond the Reach,tt2911668,2014,91 min,"[Michael Douglas, Jeremy Irvine, Martin Palmer...",284289,"[desert, hunting]",en,A high-rolling corporate shark and his impover...,[Thriller],https://m.media-amazon.com/images/M/MV5BMjE5MT...,21.906


In [20]:
movies_df.isnull().sum()

_id                   0
title                 0
id                    0
year                  0
runtime              42
cast                  0
tmdb_id               0
keywords              0
original_language     0
overview              0
genres                0
image                 0
popularity            0
dtype: int64

## Data Cleansing

In [21]:
movies_df['cast'] = movies_df['cast'].apply(lambda x : [i.replace(' ','') for i in x])

movies_df['keywords'] = movies_df['keywords'].apply(lambda x : [i.replace(' ','') for i in x])

movies_df['genres'] = movies_df['genres'].apply(lambda x : [i.replace(' ','') for i in x])

movies_df

Unnamed: 0,_id,title,id,year,runtime,cast,tmdb_id,keywords,original_language,overview,genres,image,popularity
0,635cc1400947ed318548b5c3,Black Adam,tt6443346,2022,125 min,"[DwayneJohnson, AldisHodge, PierceBrosnan, Noa...",436270,"[antihero, superhero, basedoncomic, dcextended...",en,"Nearly 5,000 years after he was bestowed with ...","[Action, ScienceFiction, Fantasy]",https://m.media-amazon.com/images/M/MV5BYzZkOG...,4328.431
1,635cc1400947ed318548b5c4,The School for Good and Evil,tt2935622,2022,147 min,"[KitYoung, SophiaAnneCaruso, CateBlanchett, Li...",779782,"[school, basedonyoungadultnovel]",en,Best friends Sophie and Agatha navigate an enc...,"[Fantasy, Action, Drama]",https://m.media-amazon.com/images/M/MV5BNzM1OD...,1275.654
2,635cc1400947ed318548b5c5,Bullet Train,tt12593682,2022,127 min,"[BradPitt, JoeyKing, AaronTaylor-Johnson, Bria...",718930,"[japan, assassin, basedonnovelorbook, mission,...",en,Unlucky assassin Ladybug is determined to do h...,"[Action, Comedy, Thriller]",https://m.media-amazon.com/images/M/MV5BMDU2Zm...,1339.013
3,635cc1400947ed318548b5c7,Black Panther: Wakanda Forever,tt9114286,2022,161 min,"[AngelaBassett, TenochHuerta, MartinFreeman, L...",505642,"[hero, sequel, superhero, basedoncomic, deatho...",en,"Queen Ramonda, Shuri, M’Baku, Okoye and the Do...","[Action, Adventure, ScienceFiction]",https://m.media-amazon.com/images/M/MV5BNTM4Nj...,4594.775
4,635cc1400947ed318548b5c8,Top Gun: Maverick,tt1745960,2022,130 min,"[TomCruise, JenniferConnelly, MilesTeller, Val...",361743,"[fighterpilot, u.s.navy, sequel, nuclearweapon...",en,After more than thirty years of service as one...,"[Action, Drama]",https://m.media-amazon.com/images/M/MV5BZWYzOG...,1491.786
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3332,635cf6ae0947ed31854b2eca,Traffik,tt5670152,2018,96 min,"[PriscillaQuintana, LukeGoss, PaulaPatton, Wil...",473149,"[corruption, sheriff, husbandwiferelationship,...",en,A couple off for a romantic weekend in the mou...,"[Horror, Thriller]",https://m.media-amazon.com/images/M/MV5BMjIzMz...,25.301
3333,635cf6e30947ed31854b3322,The 9th Life of Louis Drax,tt3991412,2016,108 min,"[JamieDornan, AidenLongworth, SarahGadon, Aaro...",294795,"[basedonnovelorbook, hitchcockian, sixthsense,...",en,A psychologist who begins working with a young...,"[Fantasy, Mystery, Thriller, Drama]",https://m.media-amazon.com/images/M/MV5BMjQ0Nj...,27.108
3334,635cf6e50947ed31854b334d,Blackway,tt4061010,2015,90 min,"[AnthonyHopkins, JuliaStiles, RayLiotta, Alexa...",359790,"[stalker, revenge, murder, logging]",en,A young woman newly returned to her hometown b...,"[Thriller, Mystery]",https://m.media-amazon.com/images/M/MV5BOTAwND...,30.046
3335,635cf6f50947ed31854b34ad,Beyond the Reach,tt2911668,2014,91 min,"[MichaelDouglas, JeremyIrvine, MartinPalmer, H...",284289,"[desert, hunting]",en,A high-rolling corporate shark and his impover...,[Thriller],https://m.media-amazon.com/images/M/MV5BMjE5MT...,21.906


In [22]:
movies_df_new = movies_df[['title','year','original_language','cast','keywords','genres','overview','image']]
movies_df_export=movies_df_new[['title','year','original_language','image','overview']]
movies_df_export

Unnamed: 0,title,year,original_language,image,overview
0,Black Adam,2022,en,https://m.media-amazon.com/images/M/MV5BYzZkOG...,"Nearly 5,000 years after he was bestowed with ..."
1,The School for Good and Evil,2022,en,https://m.media-amazon.com/images/M/MV5BNzM1OD...,Best friends Sophie and Agatha navigate an enc...
2,Bullet Train,2022,en,https://m.media-amazon.com/images/M/MV5BMDU2Zm...,Unlucky assassin Ladybug is determined to do h...
3,Black Panther: Wakanda Forever,2022,en,https://m.media-amazon.com/images/M/MV5BNTM4Nj...,"Queen Ramonda, Shuri, M’Baku, Okoye and the Do..."
4,Top Gun: Maverick,2022,en,https://m.media-amazon.com/images/M/MV5BZWYzOG...,After more than thirty years of service as one...
...,...,...,...,...,...
3332,Traffik,2018,en,https://m.media-amazon.com/images/M/MV5BMjIzMz...,A couple off for a romantic weekend in the mou...
3333,The 9th Life of Louis Drax,2016,en,https://m.media-amazon.com/images/M/MV5BMjQ0Nj...,A psychologist who begins working with a young...
3334,Blackway,2015,en,https://m.media-amazon.com/images/M/MV5BOTAwND...,A young woman newly returned to her hometown b...
3335,Beyond the Reach,2014,en,https://m.media-amazon.com/images/M/MV5BMjE5MT...,A high-rolling corporate shark and his impover...


In [24]:
movies_df_new['cast'] = movies_df_new['cast'].apply(lambda x : " ".join(x))

movies_df_new['keywords'] = movies_df_new['keywords'].apply(lambda x : " ".join(x))

movies_df_new['genres'] = movies_df_new['genres'].apply(lambda x : " ".join(x))

movies_df_new

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df_new['cast'] = movies_df_new['cast'].apply(lambda x : " ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df_new['keywords'] = movies_df_new['keywords'].apply(lambda x : " ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df_new['genres'] = movies_df_new['genr

Unnamed: 0,title,year,original_language,cast,keywords,genres,overview,image
0,Black Adam,2022,en,DwayneJohnson AldisHodge PierceBrosnan NoahCen...,antihero superhero basedoncomic dcextendeduniv...,Action ScienceFiction Fantasy,"Nearly 5,000 years after he was bestowed with ...",https://m.media-amazon.com/images/M/MV5BYzZkOG...
1,The School for Good and Evil,2022,en,KitYoung SophiaAnneCaruso CateBlanchett LiamWoon,school basedonyoungadultnovel,Fantasy Action Drama,Best friends Sophie and Agatha navigate an enc...,https://m.media-amazon.com/images/M/MV5BNzM1OD...
2,Bullet Train,2022,en,BradPitt JoeyKing AaronTaylor-Johnson BrianTyr...,japan assassin basedonnovelorbook mission trai...,Action Comedy Thriller,Unlucky assassin Ladybug is determined to do h...,https://m.media-amazon.com/images/M/MV5BMDU2Zm...
3,Black Panther: Wakanda Forever,2022,en,AngelaBassett TenochHuerta MartinFreeman Lupit...,hero sequel superhero basedoncomic deathofking...,Action Adventure ScienceFiction,"Queen Ramonda, Shuri, M’Baku, Okoye and the Do...",https://m.media-amazon.com/images/M/MV5BNTM4Nj...
4,Top Gun: Maverick,2022,en,TomCruise JenniferConnelly MilesTeller ValKilmer,fighterpilot u.s.navy sequel nuclearweapons mi...,Action Drama,After more than thirty years of service as one...,https://m.media-amazon.com/images/M/MV5BZWYzOG...
...,...,...,...,...,...,...,...,...
3332,Traffik,2018,en,PriscillaQuintana LukeGoss PaulaPatton William...,corruption sheriff husbandwiferelationship sma...,Horror Thriller,A couple off for a romantic weekend in the mou...,https://m.media-amazon.com/images/M/MV5BMjIzMz...
3333,The 9th Life of Louis Drax,2016,en,JamieDornan AidenLongworth SarahGadon AaronPaul,basedonnovelorbook hitchcockian sixthsense acc...,Fantasy Mystery Thriller Drama,A psychologist who begins working with a young...,https://m.media-amazon.com/images/M/MV5BMjQ0Nj...
3334,Blackway,2015,en,AnthonyHopkins JuliaStiles RayLiotta Alexander...,stalker revenge murder logging,Thriller Mystery,A young woman newly returned to her hometown b...,https://m.media-amazon.com/images/M/MV5BOTAwND...
3335,Beyond the Reach,2014,en,MichaelDouglas JeremyIrvine MartinPalmer Hanna...,desert hunting,Thriller,A high-rolling corporate shark and his impover...,https://m.media-amazon.com/images/M/MV5BMjE5MT...


In [25]:
movies_df_new['cast'] = movies_df_new['cast'].apply(lambda x : x.lower())

movies_df_new['keywords'] = movies_df_new['keywords'].apply(lambda x : x.lower())

movies_df_new['genres'] = movies_df_new['genres'].apply(lambda x : x.lower())

movies_df_new['overview'] = movies_df_new['overview'].apply(lambda x : x.lower())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df_new['cast'] = movies_df_new['cast'].apply(lambda x : x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df_new['keywords'] = movies_df_new['keywords'].apply(lambda x : x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df_new['genres'] = movies_df_new['genres']

## Using Tf-idf vectorizer to transform text into numbers

In [26]:
tfidf_keywords = TfidfVectorizer(stop_words='english',dtype=np.float32)
tfidf_genres = TfidfVectorizer(stop_words='english',dtype=np.float32)
tfidf_cast = TfidfVectorizer(stop_words='english',dtype=np.float32)
tfidf_overview = TfidfVectorizer(stop_words='english',dtype=np.float32)


In [27]:
tfidf_cast_matrix = tfidf_cast.fit_transform(movies_df_new['cast'])

In [12]:
tfidf_cast_tokens = tfidf_cast.get_feature_names()

df_tfidf_cast = pd.DataFrame(data = tfidf_cast_matrix.toarray(), columns = tfidf_cast_tokens)

tfidf_cast_matrix



<33928x53861 sparse matrix of type '<class 'numpy.float32'>'
	with 143545 stored elements in Compressed Sparse Row format>

In [28]:
tfidf_keywords_matrix = tfidf_keywords.fit_transform(movies_df_new['keywords'])

In [14]:
tfidf_keywords_tokens = tfidf_keywords.get_feature_names()

df_tfidf_keywords = pd.DataFrame(data = tfidf_keywords_matrix.toarray(), columns = tfidf_keywords_tokens)

df_tfidf_keywords



Unnamed: 0,10thcentury,10thcenturybc,11,11thcentury,12345678,12thcentury,13thcentury,14thcentury,14thcenturybc,15thcentury,...,茶花女,處處藍天,蚁人,起跑线,青春,音乐剧,범죄,살인,스릴러,인스팅트
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33924,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33926,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
tfidf_genres_matrix = tfidf_genres.fit_transform(movies_df_new['genres'])

In [16]:
tfidf_genres_tokens = tfidf_genres.get_feature_names()

df_tfidf_genres = pd.DataFrame(data = tfidf_genres_matrix.toarray(), columns = tfidf_genres_tokens)

df_tfidf_genres



Unnamed: 0,action,adventure,animation,comedy,crime,documentary,drama,family,fantasy,history,horror,music,mystery,romance,sciencefiction,thriller,tvmovie,war,western
0,0.471597,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.652193,0.0,0.0,0.0,0.0,0.0,0.593498,0.000000,0.0,0.0,0.000000
1,0.543769,0.000000,0.0,0.000000,0.0,0.0,0.372568,0.0,0.752003,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
2,0.621074,0.000000,0.0,0.512647,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.592840,0.0,0.0,0.000000
3,0.504970,0.584079,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.635497,0.000000,0.0,0.0,0.000000
4,0.824943,0.000000,0.0,0.000000,0.0,0.0,0.565216,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33923,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,1.000000
33924,0.000000,0.678949,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.734186
33925,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,1.000000
33926,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,1.000000


In [30]:
tfidf_overview_matrix = tfidf_overview.fit_transform(movies_df_new['overview'])

In [18]:
tfidf_overview_tokens = tfidf_overview.get_feature_names()

df_tfidf_overview = pd.DataFrame(data = tfidf_overview_matrix.toarray(), columns = tfidf_overview_tokens)

df_tfidf_overview



Unnamed: 0,00,000,000th,0014,007,01,01776,01984,02,03,...,అన,కర,నక,నమయ,శవ,ดสาคร,ﬁlm,ﬁlming,ﬁnd,ﬂying
0,0.0,0.181997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33923,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33924,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33925,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33926,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Using cosine similarity to find the similarity between two movies

In [31]:
cosine_sim_cast=cosine_similarity(tfidf_cast_matrix,tfidf_cast_matrix)
cosine_sim_cast.shape

(3337, 3337)

In [32]:
cosine_sim_keywords = cosine_similarity(tfidf_keywords_matrix,tfidf_keywords_matrix)
cosine_sim_keywords.shape

(3337, 3337)

In [33]:
cosine_sim_genres = cosine_similarity(tfidf_genres_matrix,tfidf_genres_matrix)
cosine_sim_genres.shape

(3337, 3337)

In [34]:
cosine_sim_overview = cosine_similarity(tfidf_overview_matrix,tfidf_overview_matrix)
cosine_sim_overview.shape

(3337, 3337)

In [35]:
cosine_sim=(cosine_sim_keywords*0.4)+(cosine_sim_genres*0.3)+(cosine_sim_cast*0.15)+(cosine_sim_overview*0.15)
cosine_sim

array([[1.        , 0.20181136, 0.07818849, ..., 0.        , 0.        ,
        0.        ],
       [0.20181136, 0.99999994, 0.0846082 , ..., 0.00313253, 0.00286239,
        0.10609832],
       [0.07818849, 0.0846082 , 1.        , ..., 0.09650654, 0.17847404,
        0.12865898],
       ...,
       [0.        , 0.00313253, 0.09650654, ..., 1.        , 0.16533676,
        0.11694134],
       [0.        , 0.00286239, 0.17847404, ..., 0.16533676, 1.        ,
        0.21626505],
       [0.        , 0.10609832, 0.12865898, ..., 0.11694134, 0.21626505,
        1.        ]], dtype=float32)

In [36]:
indices=pd.Series(movies_df_new.index,index=movies_df_new['title'])
indices

title
Black Adam                           0
The School for Good and Evil         1
Bullet Train                         2
Black Panther: Wakanda Forever       3
Top Gun: Maverick                    4
                                  ... 
Traffik                           3332
The 9th Life of Louis Drax        3333
Blackway                          3334
Beyond the Reach                  3335
Mine 9                            3336
Length: 3337, dtype: int64

## Storing the data in a pickle file

In [37]:
import pickle

pickle.dump(cosine_sim,open('similarity.pkl','wb'))
pickle.dump(indices,open('indices.pkl','wb'))
pickle.dump(movies_df_export.to_dict(),open('movies.pkl','wb'))