In [1]:
!pip install kaggle



In [2]:
import os
import shutil

# Make sure the .kaggle directory exists
os.makedirs(os.path.expanduser('~/.kaggle'), exist_ok=True)

# Move kaggle.json to the .kaggle directory
shutil.copy('kaggle.json', os.path.expanduser('~/.kaggle/kaggle.json'))

# Set proper permissions
os.chmod(os.path.expanduser('~/.kaggle/kaggle.json'), 0o600)


In [3]:
!kaggle datasets download -d tmdb/tmdb-movie-metadata

Dataset URL: https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata
License(s): other
tmdb-movie-metadata.zip: Skipping, found more recently modified local copy (use --force to force download)


In [4]:
import zipfile

with zipfile.ZipFile("tmdb-movie-metadata.zip", "r") as zip_ref:
    zip_ref.extractall("movies_data")  # or any folder name

In [5]:
import numpy as np
import pandas as pd

In [6]:
movies = pd.read_csv('movies_data/tmdb_5000_movies.csv')
credits = pd.read_csv('movies_data/tmdb_5000_credits.csv') 

In [7]:
movies.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [8]:
credits.head(2)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [9]:
movies = movies.merge(credits,on='title')

In [10]:
movies.shape

(4809, 23)

selecting columns form all that will affect the results dropping all the rest

In [11]:
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [12]:
movies.sample(5)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
1201,708,The Living Daylights,James Bond helps a Russian General escape into...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 212, ""name"": ""london england""}, {""id"":...","[{""cast_id"": 18, ""character"": ""James Bond"", ""c...","[{""credit_id"": ""52fe426ec3a36847f801df5f"", ""de..."
2541,134411,Snitch,Construction company owner John Matthews learn...,"[{""id"": 53, ""name"": ""Thriller""}, {""id"": 18, ""n...","[{""id"": 494, ""name"": ""father son relationship""...","[{""cast_id"": 12, ""character"": ""John Matthews"",...","[{""credit_id"": ""52fe4bd0c3a368484e19a431"", ""de..."
925,50646,"Crazy, Stupid, Love.",Cal Weaver is living the American dream. He ha...,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam...","[{""id"": 815, ""name"": ""soulmates""}, {""id"": 1599...","[{""cast_id"": 3, ""character"": ""Cal Weaver"", ""cr...","[{""credit_id"": ""565b6197c3a368507d0034b0"", ""de..."
1247,18937,Quest for Camelot,"During the times of King Arthur, Kayley is a b...","[{""id"": 14, ""name"": ""Fantasy""}, {""id"": 16, ""na...",[],"[{""cast_id"": 1, ""character"": ""Kayley (voice)"",...","[{""credit_id"": ""59139064c3a36842c10005f4"", ""de..."
7,99861,Avengers: Age of Ultron,When Tony Stark tries to jumpstart a dormant p...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 8828, ""name"": ""marvel comic""}, {""id"": ...","[{""cast_id"": 76, ""character"": ""Tony Stark / Ir...","[{""credit_id"": ""55d5f7d4c3a3683e7e0016eb"", ""de..."


In [13]:
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [14]:
#dropping the movies with no overview
movies.dropna(inplace = True)

In [15]:
#checking for duplicates 
movies.duplicated().sum()

0

making the array of strings to a list of keywords using ast

In [16]:
import ast

In [17]:
#making a function to get the genres of the movie 
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [18]:
movies['genres'] = movies['genres'].apply(convert)

In [19]:
#now applying the same thing to keywords
movies['keywords'] = movies['keywords'].apply(convert)


In [20]:
#making a function to get the genres of the movie 
def convert2(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            L.append(i['name'])
            counter += 1
        else:
            break
    return L

In [21]:
movies['cast'] = movies['cast'].apply(convert2)

from the movies column we only need the director 

In [22]:
def fetch_director(obj):
    L=[]
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

In [23]:
movies['crew'] = movies['crew'].apply(fetch_director)

now from the overview column we will make a list 

In [24]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())

now we have all the column as list 

In [25]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


now we want to convert all the words in the columns so that they don't have any space so as to create only one tag for them

In [26]:
#making a function for it
def collapse(L):
    L1 = []
    for i in L:
        L1.append(i.replace(" ",""))
    return L1

In [27]:
movies['cast'] = movies['cast'].apply(collapse)
movies['crew'] = movies['crew'].apply(collapse)
movies['genres'] = movies['genres'].apply(collapse)
movies['keywords'] = movies['keywords'].apply(collapse)

In [28]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton]


now we will combine all the columns into one column to make tags from it 

In [29]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [30]:
new = movies.drop(columns=['overview','genres','keywords','cast','crew'])
#new.head()

now we will convert the list in tags column into a string 

In [31]:
new['tags'] = new['tags'].apply(lambda x: " ".join(x))
new.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [32]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [33]:
vector = cv.fit_transform(new['tags']).toarray()

In [34]:
vector.shape

(4806, 5000)

we are calculating the difference of angles of vectors of different movies

In [35]:
from sklearn.metrics.pairwise import cosine_similarity

In [36]:
similarity = cosine_similarity(vector)

In [37]:
#now we will make a function that will recommend movies to us 

In [38]:
def recommend(movie):
    index = new[new['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:6]:
        print(new.iloc[i[0]].title)

In [46]:
new.sample(20)

Unnamed: 0,movie_id,title,tags
4560,322194,Subconscious,An investigation into a retired WWII sub plung...
1825,6466,Freddy vs. Jason,Evil dream-demon Freddy Krueger devises a plan...
2894,13079,The Life Before Her Eyes,As the 15th anniversary of a fatal high school...
4232,127918,The Gatekeepers,In an unprecedented and candid series of inter...
104,503,Poseidon,A packed cruise ship traveling the Atlantic is...
1534,120467,The Grand Budapest Hotel,The Grand Budapest Hotel tells of a legendary ...
2093,274,The Silence of the Lambs,"FBI trainee, Clarice Starling ventures into a ..."
3759,512,Scoop,An American journalism student in London scoop...
1184,6073,The Mexican,"Jerry Welbach, a reluctant bagman, has been gi..."
4389,39183,Once in a Lifetime: The Extraordinary Story of...,In the 1970s the North American Soccer League ...


In [48]:
recommend('Bogus')

The Boxtrolls
Nicholas Nickleby
Shipwrecked
Duma
Growing Up Smith


In [None]:
import pickle

In [None]:
pickle.dump(new.to_dict(),open('movie_dict.pkl','wb'))

In [None]:
pickle.dump(similarity,open('similarity.pkl','wb'))