In [2]:
import numpy as np
import pandas as pd

In [3]:
movies = pd.read_csv('movies_metadata.csv',low_memory=False)
credits = pd.read_csv('credits.csv')

In [4]:
movies.head(1)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0


In [5]:
credits.head(1)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862


In [6]:
# Convert safely to integers
movies['id'] = pd.to_numeric(movies['id'], errors='coerce').astype('Int64')
credits['id'] = credits['id'].astype('Int64')

# Drop rows with NaN IDs (if any) before merging
movies = movies.dropna(subset=['id'])
credits = credits.dropna(subset=['id'])

# Merge
movies = movies.merge(credits, on='id')

In [7]:
movies.shape

(45538, 26)

In [8]:
credits.shape

(45476, 3)

In [9]:
movies.head(1)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."


In [10]:

#id
#title
#overview
#genres
#cast
#crew
movies = movies[['id', 'title', 'overview', 'genres', 'cast', 'crew']]

In [11]:
movies.head()

Unnamed: 0,id,title,overview,genres,cast,crew
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[{'id': 35, 'name': 'Comedy'}]","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


In [12]:
movies.isnull().sum()

id            0
title         3
overview    954
genres        0
cast          0
crew          0
dtype: int64

In [13]:
movies = movies.dropna(subset=['title', 'overview'])
print(movies.isnull().sum())

id          0
title       0
overview    0
genres      0
cast        0
crew        0
dtype: int64


In [14]:
movies.head(1)

Unnamed: 0,id,title,overview,genres,cast,crew
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."


In [15]:
import ast

def convert_genres(text):
    try:
        # Convert string to Python object (list of dicts)
        genres = ast.literal_eval(text)
        # Extract only the 'name' field
        return [g['name'] for g in genres]
    except:
        return []

# Apply to your column
movies['genres'] = movies['genres'].apply(convert_genres)

In [16]:
movies.head()

Unnamed: 0,id,title,overview,genres,cast,crew
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[Romance, Comedy]","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[Comedy, Drama, Romance]","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,[Comedy],"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


In [17]:
def convert3(text):
    L = []
    counter = 0
    try:
        for i in ast.literal_eval(text):
            if counter < 3:
                L.append(i['name'])
            counter += 1
    except:
        return []
    return L

movies['cast'] = movies['cast'].apply(convert3)

In [18]:
# Keep only top 3 cast members
movies['cast'] = movies['cast'].apply(lambda x: x[:3] if isinstance(x, list) else [])
movies['cast'].head()

0                  [Tom Hanks, Tim Allen, Don Rickles]
1       [Robin Williams, Jonathan Hyde, Kirsten Dunst]
2           [Walter Matthau, Jack Lemmon, Ann-Margret]
3    [Whitney Houston, Angela Bassett, Loretta Devine]
4           [Steve Martin, Diane Keaton, Martin Short]
Name: cast, dtype: object

In [19]:
def fetch_director(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L 

In [20]:
movies['crew'] = movies['crew'].apply(fetch_director)

In [21]:
movies['crew'].head()

0      [John Lasseter]
1       [Joe Johnston]
2      [Howard Deutch]
3    [Forest Whitaker]
4      [Charles Shyer]
Name: crew, dtype: object

In [22]:
movies.head(1)

Unnamed: 0,id,title,overview,genres,cast,crew
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]","[Tom Hanks, Tim Allen, Don Rickles]",[John Lasseter]


In [23]:
def collapse(L):
    L1 = []
    for i in L:
        L1.append(i.replace(" ",""))
    return L1

In [24]:
movies['cast'] = movies['cast'].apply(collapse)
movies['crew'] = movies['crew'].apply(collapse)
movies['genres'] = movies['genres'].apply(collapse)

In [25]:
movies.head()

Unnamed: 0,id,title,overview,genres,cast,crew
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]","[TomHanks, TimAllen, DonRickles]",[JohnLasseter]
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]","[RobinWilliams, JonathanHyde, KirstenDunst]",[JoeJohnston]
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[Romance, Comedy]","[WalterMatthau, JackLemmon, Ann-Margret]",[HowardDeutch]
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[Comedy, Drama, Romance]","[WhitneyHouston, AngelaBassett, LorettaDevine]",[ForestWhitaker]
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,[Comedy],"[SteveMartin, DianeKeaton, MartinShort]",[CharlesShyer]


In [26]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [27]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['cast'] + movies['crew']

In [28]:
new = movies.drop(columns=['overview','genres','cast','crew'])

In [29]:
new['tags'] = new['tags'].apply(lambda x: " ".join(x))
new.head()

Unnamed: 0,id,title,tags
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...


In [30]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle


In [31]:
import os 
print(os.getcwd())

C:\Users\DELL\machine-learning\movie-recommendation-system


In [32]:
cv = CountVectorizer(max_features=500, stop_words='english')
vectors = cv.fit_transform(new['tags']) 
print("Vector shape:", vectors.shape)

Vector shape: (44581, 500)


In [33]:
def recommend(movie_title, movies, vectors):
    try:
        movie_index = movies[movies['title'] == movie_title].index[0]
    except IndexError:
        print("❌ Movie not found in dataset")
        return None  # return None if movie not found
    
    distances = cosine_similarity(vectors[movie_index], vectors).flatten()
    similar_indices = distances.argsort()[-6:-1][::-1]
    
    print(f"\nMovies similar to '{movie_title}':\n")
    for i in similar_indices:
        print(movies.iloc[i].title)
    
    return distances

In [34]:
movie_input = input("Enter a movie name: ")
distances = recommend(movie_input, new, vectors)

Enter a movie name:  Avatar



Movies similar to 'Avatar':

Act One
At Sea
Mr. Untouchable
Possible Loves
Orson Welles: The One-Man Band


In [35]:
if distances is not None:
    pickle.dump(vectors, open('movie_list.pkl', 'wb'))
    pickle.dump(distances, open('similarity.pkl', 'wb'))