# **loadind datasets**

In [1]:

import pandas as pd 
import numpy as np

# ✅ Correct file names 
movies = pd.read_excel('tmdb_5000_movies.xlsx')
credits = pd.read_excel('tmdb.xlsx')

credits = credits[['movie_id', 'title', 'cast', 'crew']]
credits.isnull().sum()
movies.head(1)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


# ✅ Use inner join to prevent mismatch errors

In [2]:


movies = movies.merge(credits, on='title', how='inner')
movies.head(2)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [3]:

# Selecting required columns
movies = movies[['id','genres','keywords','title','overview','cast','crew']]
movies.head()

movies.isnull().sum()
movies.dropna(inplace=True)


# Formatting the data

In [4]:

import ast 

# ✅ Safe converter for 'genres' and 'keywords'
def safe_convert(obj):
    try:
        data = ast.literal_eval(obj)
        return [i['name'] for i in data]
    except:
        return []

movies['genres'] = movies['genres'].apply(safe_convert)
movies['keywords'] = movies['keywords'].apply(safe_convert)


# ✅ Convert top 3 cast members safely

In [5]:


def convert_cast(cast):
    try:
        data = ast.literal_eval(cast)
        return [i['name'] for i in data[:3]]
    except:
        return []

movies['cast'] = movies['cast'].apply(convert_cast)


# ✅  director extraction from crew

In [6]:


def find_director(dic):
    mylist = []
    if pd.isna(dic) or dic is None:
        return mylist
    if isinstance(dic, list):
        crew_list = dic
    elif isinstance(dic, str):
        try:
            crew_list = ast.literal_eval(dic)
        except (ValueError, SyntaxError):
            return mylist
    else:
        return mylist
    for i in crew_list:
        if i.get('job') == 'Director':
            mylist.append(i['name'])
            break
    return mylist

movies['crew'] = movies['crew'].apply(find_director)
movies.head(2)


Unnamed: 0,id,genres,keywords,title,overview,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",Avatar,"In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]


In [7]:

# ✅ Ensure overview is clean text
movies['overview'] = movies['overview'].astype(str)


In [8]:
movies.head(2)

Unnamed: 0,id,genres,keywords,title,overview,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",Avatar,"In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]


In [9]:
# Combine all features into one "tags" column
def combine_features(row):
    return ' '.join(row['genres']) + ' ' + ' '.join(row['keywords']) + ' ' + row['overview'] + ' ' + ' '.join(row['cast']) + ' ' + ' '.join(row['crew'])

movies['tags'] = movies.apply(combine_features, axis=1)

# Show result
movies.head(2)


Unnamed: 0,id,genres,keywords,title,overview,cast,crew,tags
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",Avatar,"In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron],Action Adventure Fantasy Science Fiction cultu...
1,285,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski],Adventure Fantasy Action ocean drug abuse exot...


# keeping only needed columns

In [10]:
movies = movies[['id','title','tags']]
movies.head(2)

Unnamed: 0,id,title,tags
0,19995,Avatar,Action Adventure Fantasy Science Fiction cultu...
1,285,Pirates of the Caribbean: At World's End,Adventure Fantasy Action ocean drug abuse exot...


In [11]:
movies['tags'][0]

'Action Adventure Fantasy Science Fiction culture clash future space war space colony society space travel futuristic romance space alien tribe alien planet cgi marine soldier battle love affair anti war power relations mind and soul 3d In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Sam Worthington Zoe Saldana Sigourney Weaver James Cameron'

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=5000,stop_words='english')

In [13]:
vectors = cv.fit_transform(movies['tags']).toarray()

In [14]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(4802, 5000))

In [15]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zooey', 'zucker', 'zwick'],
      shape=(5000,), dtype=object)

# applying stemming

**['loved','loving','love'] ==> ['love','love','love']**

***ps.stem('activities') ==> 'activ'***

In [16]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [17]:
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return ' '.join(y)

movies['tags'] = movies['tags'].apply(stem)

In [18]:
movies

Unnamed: 0,id,title,tags
0,19995,Avatar,action adventur fantasi scienc fiction cultur ...
1,285,Pirates of the Caribbean: At World's End,adventur fantasi action ocean drug abus exot i...
2,206647,Spectre,action adventur crime spi base on novel secret...
3,49026,The Dark Knight Rises,action crime drama thriller dc comic crime fig...
4,49529,John Carter,action adventur scienc fiction base on novel m...
...,...,...,...
4804,9367,El Mariachi,action crime thriller unit states–mexico barri...
4805,72766,Newlyweds,comedi romanc a newlyw couple' honeymoon is up...
4806,231617,"Signed, Sealed, Delivered",comedi drama romanc tv movi date love at first...
4807,126186,Shanghai Calling,when ambiti new york attorney sam is sent to s...


In [19]:
vectors = cv.fit_transform(movies['tags']).toarray()
for i in cv.get_feature_names_out():
    print(i)

000
007
10
100
11
12
13
14
15
16
16th
17
17th
18
18th
19
1910
1930
1940
1950
1950s
1960
1960s
1970
1970s
1971
1974
1976
1980
1985
1990
1999
19th
20
200
2000
2001
2003
2009
20th
21st
23
24
25
30
300
3d
40
50
500
51
60
70
aaron
abandon
abbi
abduct
abigail
abil
abl
aboard
aborigin
abov
abraham
abroad
abus
academi
accent
accept
access
accid
accident
acclaim
accompani
accomplish
account
accus
ace
achiev
act
action
activ
activist
activities
actor
actress
actual
ad
adam
adapt
add
addict
adjust
admir
admit
adolesc
adolf
adopt
ador
adrian
adrien
adult
adulteri
adulthood
advanc
adventur
adventure
advertis
advic
advis
affair
affect
affleck
afghanistan
africa
african
aftercreditssting
afterlif
aftermath
ag
age
agenc
agency
agenda
agent
agents
aggress
ago
agre
ahead
aid
aidan
ail
aim
air
aircraft
airplan
airport
aka
al
alabama
alan
alaska
alba
albert
alcohol
alec
alejandro
alex
alexand
alexi
alfr
ali
alic
alice
alicia
alien
alik
alison
aliv
alive
allan
allen
alli
allianc
allison
allow
alon
alongsid

**Eucledian distance fails 90% time in higher dimensional data so we have to calculate Co-Sine distance(theta btw two vectors(movies))**

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

all_vector = cosine_similarity(vectors)


In [21]:
all_vector[0]

array([1.        , 0.06737215, 0.06810052, ..., 0.04472136, 0.03113996,
       0.        ], shape=(4802,))

In [22]:
movies = movies.reset_index()
movies = movies.drop(columns=['index'])

**we have to keep the original index and sort the specified array based on similarity**

In [23]:
sorted(list(enumerate(all_vector[0])),reverse=True,key=lambda x:x[1])[:6]

[(0, np.float64(0.9999999999999994)),
 (2402, np.float64(0.49333587107582266)),
 (4332, np.float64(0.41403933560541245)),
 (1530, np.float64(0.39259818304894545)),
 (3157, np.float64(0.39056328877620144)),
 (372, np.float64(0.3829708431025352))]

In [24]:
# movies = movies.drop(columns=['level_0'])

In [25]:
def recomend(movie):
    idx = movies[movies['title']==movie].index[0]
    movie_sim = all_vector[idx]
    movie_list_idx = sorted(list(enumerate(movie_sim)),reverse=True,key=lambda x:x[1])[1:6]
    l = [i[0] for i in movie_list_idx]
    for i in l:
        print(movies.iloc[i]['title'])

recomend('Avatar')

Aliens
Silent Running
Moonraker
Alien
Mission to Mars


In [26]:
movies.iloc[3157]
movie = 'Avatar'
idx = movies[movies['title']==movie].index[0]
movies.iloc[372]

id                                                    2067
title                                      Mission to Mars
tags     scienc fiction mar spacecraft space travel ali...
Name: 372, dtype: object

In [27]:
import pickle
pickle.dump(movies,open('movi_recomender.pkl','wb'))

In [28]:
pickle.dump(all_vector,open('all_vector.pkl','wb'))

In [31]:
for i in movies['title']:
    print(i)

Avatar
Pirates of the Caribbean: At World's End
Spectre
The Dark Knight Rises
John Carter
Spider-Man 3
Tangled
Avengers: Age of Ultron
Harry Potter and the Half-Blood Prince
Batman v Superman: Dawn of Justice
Superman Returns
Quantum of Solace
Pirates of the Caribbean: Dead Man's Chest
The Lone Ranger
Man of Steel
The Chronicles of Narnia: Prince Caspian
The Avengers
Pirates of the Caribbean: On Stranger Tides
Men in Black 3
The Hobbit: The Battle of the Five Armies
The Amazing Spider-Man
Robin Hood
The Hobbit: The Desolation of Smaug
The Golden Compass
King Kong
Titanic
Captain America: Civil War
Battleship
Jurassic World
Skyfall
Spider-Man 2
Iron Man 3
Alice in Wonderland
X-Men: The Last Stand
Monsters University
Transformers: Revenge of the Fallen
Transformers: Age of Extinction
Oz: The Great and Powerful
The Amazing Spider-Man 2
TRON: Legacy
Cars 2
Green Lantern
Toy Story 3
Terminator Salvation
Furious 7
World War Z
X-Men: Days of Future Past
Star Trek Into Darkness
Jack the Giant 