In [2]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [3]:
# Step 2: Load datasets (from TMDB Kaggle)
movies = pd.read_csv("../data/tmdb_5000_movies.csv")
credits = pd.read_csv("../data/tmdb_5000_credits.csv")

In [4]:
print(movies.columns)

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')


In [5]:
# Merge
movies = movies.merge(credits, on='title')

In [6]:
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,206647,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,49026,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,49529,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [7]:
# Keep only useful columns
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

In [8]:
# Drop missing
movies.dropna(subset=['overview'], inplace=True)

In [9]:
# Convert JSON-like strings to Python objects
import ast
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [10]:
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)

In [11]:
# Top 3 cast
def convert_cast(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            L.append(i['name'])
            counter+=1
        else:
            break
    return L

In [12]:
movies['cast'] = movies['cast'].apply(convert_cast)

In [13]:
# Only director
def fetch_director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

In [14]:
movies['crew'] = movies['crew'].apply(fetch_director)

In [15]:
# Convert overview to list
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [16]:
# Remove spaces
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [19]:
# Create tags
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
new_df = movies[['movie_id','title','tags']].copy()


In [20]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes],"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan],"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton],"[John, Carter, is, a, war-weary,, former, mili..."


In [21]:
# Convert to string
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x).lower())

In [22]:
new_df['tags'] = new_df['tags'].fillna('')

In [23]:
new_df = new_df[new_df['tags'] != '']

In [32]:
# Load CSV
poster_df = pd.read_csv("../data/poster.csv", engine='python', encoding='utf-8', on_bad_lines='skip')

# Keep only the columns you need
poster_df = poster_df[['Title','Poster_Url']].copy()

# Rename columns to lowercase for consistency
poster_df.rename(columns={'Title':'title', 'Poster_Url':'poster'}, inplace=True)

# Overwrite original CSV (optional)
poster_df.to_csv("../data/poster.csv", index=False)

# Check
print(poster_df.head())

                     title                                             poster
0  Spider-Man: No Way Home  https://image.tmdb.org/t/p/original/1g0dhYtq4i...
1               The Batman  https://image.tmdb.org/t/p/original/74xTEgt7R3...
2                  No Exit  https://image.tmdb.org/t/p/original/vDHsLnOWKl...
3                  Encanto  https://image.tmdb.org/t/p/original/4j0PNHkMr5...
4           The King's Man  https://image.tmdb.org/t/p/original/aq4Pwv5Xeu...


In [33]:
# Merge posters
new_df = new_df.merge(poster_df[['title','poster']], on='title', how='left')

In [43]:
# Fill missing posters with placeholder
new_df['poster'] = new_df['poster'].fillna("https://via.placeholder.com/200x300?text=No+Image")
new_df.head()

Unnamed: 0,movie_id,title,tags,poster
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di...",https://image.tmdb.org/t/p/original/jRXYjXNq0C...
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha...",https://image.tmdb.org/t/p/original/2YMnBRh8F6...
2,206647,Spectre,a cryptic message from bond’s past sends him o...,https://image.tmdb.org/t/p/original/zj8ongFhtW...
3,49026,The Dark Knight Rises,following the death of district attorney harve...,https://image.tmdb.org/t/p/original/85cWkCVfti...
4,49529,John Carter,"john carter is a war-weary, former military ca...",https://image.tmdb.org/t/p/original/7GSSyUUgUE...


In [44]:
new_df = new_df[new_df['poster'] != placeholder]

# Check the result
new_df.head()

NameError: name 'placeholder' is not defined

In [35]:
# -------------------- VECTORIZE & SIMILARITY --------------------
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(new_df['tags'])  # Keep sparse to avoid overflow
similarity = cosine_similarity(vectors, vectors)

In [42]:
# -------------------- SAVE PICKLES --------------------
pickle.dump(new_df, open('../movies.pkl','wb'))
pickle.dump(similarity, open('../similarity.pkl','wb'))

In [37]:
# -------------------- RECOMMEND FUNCTION --------------------
def recommend(movie):
    if movie not in new_df['title'].values:
        print("Movie not found!")
        return
    
    index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:6]
    
    results = []
    print(f"\nTop 5 recommendations for '{movie}':\n")
    for i in movies_list:
        title = new_df.iloc[i[0]].title
        poster = new_df.iloc[i[0]].poster
        results.append((title, poster))
        print(title, "| Poster:", poster)
    return results

In [39]:
# -------------------- DISPLAY POSTERS INLINE --------------------
def show_recommendations(movie):
    results = recommend(movie)
    if results:
        for title, poster in results:
            display(Image(url=poster, width=200))
            print(title)

In [40]:
# Example
recommend("Avatar")


Top 5 recommendations for 'Avatar':

Titan A.E. | Poster: https://image.tmdb.org/t/p/original/el2iHk3LTJWfEnwrvcRkvWY501G.jpg
Small Soldiers | Poster: https://image.tmdb.org/t/p/original/2nuUjSzHsoYlRvTPmLo7m7gCQry.jpg
Ender's Game | Poster: https://image.tmdb.org/t/p/original/tBgkQqrO2RMgGQR6zod3bSjcPWx.jpg
Independence Day | Poster: https://image.tmdb.org/t/p/original/p0BPQGSPoSa8Ml0DAf2mB2kCU0R.jpg
Aliens vs Predator: Requiem | Poster: https://image.tmdb.org/t/p/original/jCyJN1vj8jqJJ0vNw4hDH2KlySO.jpg


[('Titan A.E.',
  'https://image.tmdb.org/t/p/original/el2iHk3LTJWfEnwrvcRkvWY501G.jpg'),
 ('Small Soldiers',
  'https://image.tmdb.org/t/p/original/2nuUjSzHsoYlRvTPmLo7m7gCQry.jpg'),
 ("Ender's Game",
  'https://image.tmdb.org/t/p/original/tBgkQqrO2RMgGQR6zod3bSjcPWx.jpg'),
 ('Independence Day',
  'https://image.tmdb.org/t/p/original/p0BPQGSPoSa8Ml0DAf2mB2kCU0R.jpg'),
 ('Aliens vs Predator: Requiem',
  'https://image.tmdb.org/t/p/original/jCyJN1vj8jqJJ0vNw4hDH2KlySO.jpg')]