Importing the dependencies

In [1]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

Data Collection and Pre-Processing

In [2]:
# loading the data from the csv file to apandas dataframe
movies_data = pd.read_csv('movie.csv')

In [3]:
# printing the first 5 rows of the dataframe
movies_data.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [4]:
# number of rows and columns in the data frame

movies_data.shape

(4803, 24)

In [5]:
# selecting the relevant features for recommendation

selected_features = ['genres','keywords','tagline','cast','director']
print(selected_features)

['genres', 'keywords', 'tagline', 'cast', 'director']


In [6]:
# replacing the null valuess with null string

selected_features = ['genres', 'keywords', 'tagline', 'cast', 'director']

# keep only columns that are present in your dataset
selected_features = [f for f in selected_features if f in movies_data.columns]

print("Available selected features:", selected_features)

for feature in selected_features:
    movies_data[feature] = movies_data[feature].fillna('')


Available selected features: ['genres', 'keywords', 'tagline', 'cast', 'director']


In [7]:
# combining all the 5 selected features

combined_features = movies_data['genres']+' '+movies_data['tagline']+' '+movies_data['cast']+' '+movies_data['director']


In [8]:
print(combined_features)

0       Action Adventure Fantasy Science Fiction Enter...
1       Adventure Fantasy Action At the end of the wor...
2       Action Adventure Crime A Plan No One Escapes D...
3       Action Crime Drama Thriller The Legend Ends Ch...
4       Action Adventure Science Fiction Lost in our w...
                              ...                        
4798    Action Crime Thriller He didn't come looking f...
4799    Comedy Romance A newlywed couple's honeymoon i...
4800    Comedy Drama Romance TV Movie  Eric Mabius Kri...
4801     A New Yorker in Shanghai Daniel Henney Eliza ...
4802    Documentary  Drew Barrymore Brian Herzlinger C...
Length: 4803, dtype: object


In [9]:
# converting the text data to feature vectors

vectorizer = TfidfVectorizer()

In [10]:
feature_vectors = vectorizer.fit_transform(combined_features)

In [11]:
print(feature_vectors)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 98114 stored elements and shape (4803, 14567)>
  Coords	Values
  (0, 174)	0.0978145094141658
  (0, 235)	0.11225438183777425
  (0, 4404)	0.13871140717064748
  (0, 11491)	0.1291710225032237
  (0, 4544)	0.12901958190159793
  (0, 4145)	0.29856536277378287
  (0, 12808)	0.0900916042156741
  (0, 14270)	0.1833394731378756
  (0, 9549)	0.12871839488454945
  (0, 9806)	0.33815052600962486
  (0, 11280)	0.18666680671144223
  (0, 14278)	0.2938117721987114
  (0, 14539)	0.25099617549102243
  (0, 11254)	0.2705907237581977
  (0, 11886)	0.2559442670352091
  (0, 13982)	0.24800526125618033
  (0, 12310)	0.18860440469037879
  (0, 7434)	0.28220124797845453
  (0, 8751)	0.19955867261420812
  (0, 10995)	0.24137417527914498
  (0, 6515)	0.14037896111119083
  (0, 2065)	0.21464144572702734
  (1, 174)	0.08373967560143826
  (1, 235)	0.19220349979230872
  (1, 4404)	0.1187517916131042
  :	:
  (4801, 6312)	0.11731686444737813
  (4801, 9326)	0.1743707846526448
 

Cosine Similarity

In [12]:
# getting the similarity scores using cosine similarity

similarity = cosine_similarity(feature_vectors)

In [13]:
print(similarity)

[[1.         0.11004561 0.0537834  ... 0.         0.         0.        ]
 [0.11004561 1.         0.02808035 ... 0.         0.         0.        ]
 [0.0537834  0.02808035 1.         ... 0.         0.06089666 0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.03519894]
 [0.         0.         0.06089666 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.03519894 0.         1.        ]]


In [14]:
print(similarity.shape)

(4803, 4803)


Getting the movie name from the user

In [15]:
# creating a list with all the movie names given in the dataset

list_of_all_titles = movies_data['title'].tolist()
print(list_of_all_titles)

['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre', 'The Dark Knight Rises', 'John Carter', 'Spider-Man 3', 'Tangled', 'Avengers: Age of Ultron', 'Harry Potter and the Half-Blood Prince', 'Batman v Superman: Dawn of Justice', 'Superman Returns', 'Quantum of Solace', "Pirates of the Caribbean: Dead Man's Chest", 'The Lone Ranger', 'Man of Steel', 'The Chronicles of Narnia: Prince Caspian', 'The Avengers', 'Pirates of the Caribbean: On Stranger Tides', 'Men in Black 3', 'The Hobbit: The Battle of the Five Armies', 'The Amazing Spider-Man', 'Robin Hood', 'The Hobbit: The Desolation of Smaug', 'The Golden Compass', 'King Kong', 'Titanic', 'Captain America: Civil War', 'Battleship', 'Jurassic World', 'Skyfall', 'Spider-Man 2', 'Iron Man 3', 'Alice in Wonderland', 'X-Men: The Last Stand', 'Monsters University', 'Transformers: Revenge of the Fallen', 'Transformers: Age of Extinction', 'Oz: The Great and Powerful', 'The Amazing Spider-Man 2', 'TRON: Legacy', 'Cars 2', 'Green Lant

In [16]:
# finding the close match for the movie name given by the user

find_close_match = difflib.get_close_matches('title', list_of_all_titles)
print(find_close_match)

['Stitches', 'Kites']


In [17]:
close_match = find_close_match[0]
print(close_match)

Stitches


In [18]:
# finding the index of the movie with title

index_of_the_movie = movies_data[movies_data.title == close_match]['index'].values[0]
print(index_of_the_movie)

4138


In [19]:
# getting a list of similar movies

similarity_score = list(enumerate(similarity[index_of_the_movie]))
print(similarity_score)

[(0, np.float64(0.0)), (1, np.float64(0.0)), (2, np.float64(0.0)), (3, np.float64(0.0)), (4, np.float64(0.0)), (5, np.float64(0.0)), (6, np.float64(0.0)), (7, np.float64(0.0)), (8, np.float64(0.0)), (9, np.float64(0.0)), (10, np.float64(0.0)), (11, np.float64(0.0)), (12, np.float64(0.0)), (13, np.float64(0.0)), (14, np.float64(0.015606622670530127)), (15, np.float64(0.0)), (16, np.float64(0.0)), (17, np.float64(0.04728895065363961)), (18, np.float64(0.05389885409341853)), (19, np.float64(0.0)), (20, np.float64(0.0)), (21, np.float64(0.0)), (22, np.float64(0.0)), (23, np.float64(0.0)), (24, np.float64(0.0)), (25, np.float64(0.0)), (26, np.float64(0.0)), (27, np.float64(0.0)), (28, np.float64(0.0)), (29, np.float64(0.0)), (30, np.float64(0.0)), (31, np.float64(0.0)), (32, np.float64(0.01391484159680491)), (33, np.float64(0.0)), (34, np.float64(0.013876795614094162)), (35, np.float64(0.0)), (36, np.float64(0.0)), (37, np.float64(0.0)), (38, np.float64(0.0)), (39, np.float64(0.0)), (40, np

In [20]:
len(similarity_score)

4803

In [21]:
# sorting the movies based on their similarity score

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)
print(sorted_similar_movies)

[(4138, np.float64(1.0000000000000002)), (2092, np.float64(0.12563697493651096)), (3938, np.float64(0.125213055105494)), (2160, np.float64(0.12400749469487703)), (937, np.float64(0.11537670556212085)), (2202, np.float64(0.1040804800434171)), (1442, np.float64(0.09855008353821758)), (2842, np.float64(0.09446698272419443)), (2961, np.float64(0.09361053752509028)), (2503, np.float64(0.09339494043529321)), (3957, np.float64(0.09201908379693154)), (4053, np.float64(0.0914657332509406)), (3954, np.float64(0.09087716241253317)), (4521, np.float64(0.08974052723990833)), (507, np.float64(0.08914544272778631)), (2445, np.float64(0.08849988563659514)), (2629, np.float64(0.08745299101188592)), (1757, np.float64(0.08404173076131337)), (2576, np.float64(0.08213922686504271)), (215, np.float64(0.08204061701055017)), (4464, np.float64(0.08184076951297148)), (1437, np.float64(0.08155994420914912)), (2795, np.float64(0.08115794819104395)), (1965, np.float64(0.08100280465620147)), (2623, np.float64(0.080

In [22]:
# print the name of similar movies based on the index

print('Movies suggested for you : \n')

i = 1

for movie in sorted_similar_movies:
  index = movie[0]
  title_from_index = movies_data[movies_data.index==index]['title'].values[0]
  if (i<30):
    print(i, '.',title_from_index)
    i+=1

Movies suggested for you : 

1 . Stitches
2 . Wayne's World
3 . DysFunktional Family
4 . Stay Alive
5 . Hansel & Gretel: Witch Hunters
6 . One Night with the King
7 . The Shipping News
8 . Stir of Echoes
9 . Jindabyne
10 . The Homesman
11 . The Eclipse
12 . Friday the 13th: A New Beginning
13 . Chain Letter
14 . Beyond the Mat
15 . Independence Day
16 . The Apparition
17 . You Will Meet a Tall Dark Stranger
18 . Lake Placid
19 . Urban Legends: Final Cut
20 . Fantastic 4: Rise of the Silver Surfer
21 . Dead Snow
22 . Bait
23 . Sorority Row
24 . Footloose
25 . The Three Burials of Melquiades Estrada
26 . Two Evil Eyes
27 . Old School
28 . Carriers
29 . Half Baked


Movie Recommendation Sytem

In [24]:
movie_name = input(' Enter your favourite movie name : ')

list_of_all_titles = movies_data['title'].tolist()

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

close_match = find_close_match[0]

index_of_the_movie = movies_data[movies_data.title == close_match]['index'].values[0]

similarity_score = list(enumerate(similarity[index_of_the_movie]))

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)

print('Movies suggested for you : \n')

i = 1

for movie in sorted_similar_movies:
  index = movie[0]
  title_from_index = movies_data[movies_data.index==index]['title'].values[0]
  if (i<30):
    print(i, '.',title_from_index)
    i+=1

 Enter your favourite movie name :  avatar


Movies suggested for you : 

1 . Avatar
2 . Aliens
3 . Star Trek Beyond
4 . Galaxy Quest
5 . Guardians of the Galaxy
6 . Star Trek Into Darkness
7 . The Helix... Loaded
8 . Snow White: A Tale of Terror
9 . Alien³
10 . Gettysburg
11 . Out of the Furnace
12 . Terminator Salvation
13 . Imaginary Heroes
14 . Colombiana
15 . Alien: Resurrection
16 . Cedar Rapids
17 . Clash of the Titans
18 . Alien
19 . Shadow Conspiracy
20 . The Book of Life
21 . Vantage Point
22 . Resident Evil: Retribution
23 . Machete Kills
24 . Blood Ties
25 . The Losers
26 . Everest
27 . Machete
28 . Wrath of the Titans
29 . Crossroads


In [25]:
#collaborative filtering


In [26]:
ratings_data = pd.read_csv("movies.csv")

ratings_data.head()


Unnamed: 0,movieId,title,keywords
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [27]:
ratings_data = pd.read_csv("ratings.csv")

ratings_data.head()


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [28]:
user_movie_matrix = ratings_data.pivot_table(
    index='userId',
    columns='movieId',
    values='rating'
).fillna(0)


In [29]:
from sklearn.metrics.pairwise import cosine_similarity

user_similarity = cosine_similarity(user_movie_matrix)

user_similarity_df = pd.DataFrame(
    user_similarity,
    index=user_movie_matrix.index,
    columns=user_movie_matrix.index
)

user_similarity_df.head()


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.027283,0.05972,0.194395,0.12908,0.128152,0.158744,0.136968,0.064263,0.016875,...,0.080554,0.164455,0.221486,0.070669,0.153625,0.164191,0.269389,0.291097,0.093572,0.145321
2,0.027283,1.0,0.0,0.003726,0.016614,0.025333,0.027585,0.027257,0.0,0.067445,...,0.202671,0.016866,0.011997,0.0,0.0,0.028429,0.012948,0.046211,0.027565,0.102427
3,0.05972,0.0,1.0,0.002251,0.00502,0.003936,0.0,0.004941,0.0,0.0,...,0.005048,0.004892,0.024992,0.0,0.010694,0.012993,0.019247,0.021128,0.0,0.032119
4,0.194395,0.003726,0.002251,1.0,0.128659,0.088491,0.11512,0.062969,0.011361,0.031163,...,0.085938,0.128273,0.307973,0.052985,0.084584,0.200395,0.131746,0.149858,0.032198,0.107683
5,0.12908,0.016614,0.00502,0.128659,1.0,0.300349,0.108342,0.429075,0.0,0.030611,...,0.068048,0.418747,0.110148,0.258773,0.148758,0.106435,0.152866,0.135535,0.261232,0.060792


In [30]:
def get_similar_users(user_id):

    similar_users = user_similarity_df[user_id].sort_values(ascending=False)

    return similar_users[1:6]   # top 5 similar users

print(get_similar_users(1))


userId
266    0.357408
313    0.351562
368    0.345127
57     0.345034
91     0.334727
Name: 1, dtype: float64


In [31]:
def collaborative_recommend(user_id, top_n=10):

    similar_users = get_similar_users(user_id).index

    recommended_movies = user_movie_matrix.loc[similar_users].mean().sort_values(ascending=False)

    return recommended_movies.head(top_n)

print(collaborative_recommend(1))


movieId
1200    4.8
2571    4.8
1198    4.8
2028    4.7
296     4.5
50      4.5
1197    4.4
1610    4.3
1240    4.3
592     4.2
dtype: float64


In [32]:
def content_recommend(movie_name):

    list_of_titles = movies_data['title'].tolist()

    # fuzzy matching
    find_close_match = difflib.get_close_matches(
        movie_name,
        list_of_titles,
        n=1,
        cutoff=0.4   # lower = more flexible matching
    )

    # ✅ HANDLE NOT FOUND CASE
    if not find_close_match:
        return None

    close_match = find_close_match[0]

    index_of_movie = movies_data[movies_data.title == close_match]['index'].values[0]

    similarity_score = list(enumerate(similarity[index_of_movie]))

    sorted_similar_movies = sorted(similarity_score, key=lambda x: x[1], reverse=True)

    return sorted_similar_movies


In [33]:

def hybrid_recommend(user_id, movie_name, top_n=10):

    # ==============================
    # Get content-based results safely
    # ==============================

    content_results = content_recommend(movie_name)

    # If movie not found or no matches
    if not content_results:
        return ["Movie not found in dataset"]

    # ==============================
    # Collaborative filtering
    # ==============================

    if user_id not in user_movie_matrix.index:
        collab_scores = pd.Series(dtype=float)
    else:
        collab_scores = collaborative_recommend(user_id)

    # ==============================
    # Rank Fusion Setup
    # ==============================

    content_rank = {movie[0]: rank for rank, movie in enumerate(content_results)}
    collab_rank = {}

    if not collab_scores.empty:
        collab_rank = {mid: rank for rank, mid in enumerate(collab_scores.index)}

    hybrid_scores = []

    # ==============================
    # Hybrid scoring loop
    # ==============================

    for movie in content_results:

        index = movie[0]
        title = movies_data.iloc[index]['title']

        # Skip input movie itself
        if movie_name.lower() in title.lower():
            continue

        # Collaborative rank lookup
        if title in title_to_movieId:
            movie_id = title_to_movieId[title]
            colr = collab_rank.get(movie_id, 1000)
        else:
            colr = 1000

        # Content rank
        cr = content_rank.get(index, 1000)

        # Rank fusion scoring
        final_score = (1/(cr+1)) + (1/(colr+1))

        hybrid_scores.append((index, final_score))

    # ==============================
    # Sort recommendations
    # ==============================

    hybrid_scores = sorted(hybrid_scores, key=lambda x: x[1], reverse=True)

    recommendations = []

    for movie in hybrid_scores[:top_n]:
        title = movies_data.iloc[movie[0]]['title']
        recommendations.append(title)

    # Remove duplicates (important)
    recommendations = list(dict.fromkeys(recommendations))

    return recommendations


In [34]:
movielens_movies = pd.read_csv("movies1.csv")


In [35]:
title_to_movieId = dict(zip(movielens_movies['title'], movielens_movies['movieId']))


In [36]:
movies = hybrid_recommend(1,"Batman")

print(movies)


['Planet of the Apes', 'Mars Attacks!', 'Beetlejuice', 'The Sentinel', "Hang 'em High", 'The Postman Always Rings Twice', 'The Land Before Time', "Something's Gotta Give", 'Wolf', 'Reds']


In [46]:
for movie in movies:
    print(movie, get_movie_poster(movie))


Planet of the Apes https://image.tmdb.org/t/p/w500/2r9iKnlSYEk4daQadsXfcjHfIjQ.jpg
Mars Attacks! https://image.tmdb.org/t/p/w500/hll4O5vSAfnZDb6JbnP06GPtz7b.jpg
Beetlejuice https://image.tmdb.org/t/p/w500/nnl6OWkyPpuMm595hmAxNW3rZFn.jpg
The Sentinel https://image.tmdb.org/t/p/w500/zoLuOdAD63prJP0o3Z0zRaAfwXE.jpg
Hang 'em High https://image.tmdb.org/t/p/w500/2J9QEqdJWjL2etOf6Kf7n33PUK4.jpg
The Postman Always Rings Twice https://image.tmdb.org/t/p/w500/k52pPgVOHPwjIyMdvNZZi3Z7zFR.jpg
The Land Before Time https://image.tmdb.org/t/p/w500/7phV1ETZnQrLsEeuk4hNeceEl25.jpg
Something's Gotta Give https://image.tmdb.org/t/p/w500/1cpdqe0SpiHzzbOLq9GDUUlZdSl.jpg
Wolf https://image.tmdb.org/t/p/w500/5TprU55U2zjAMCwr2jQra3zj1zu.jpg
Reds https://image.tmdb.org/t/p/w500/AeiKdVVM93fwfQG1m3N0cgVZgHl.jpg


In [45]:
import requests

TMDB_API_KEY = "7bb05e9adc89c4384540a216524e9644"

def get_movie_poster(movie_name):

    url = f"https://api.themoviedb.org/3/search/movie"

    params = {
        "api_key": TMDB_API_KEY,
        "query": movie_name
    }

    headers = {
        "accept": "application/json",
        "User-Agent": "Mozilla/5.0"
    }

    try:
        response = requests.get(url, params=params, headers=headers, timeout=10)

        data = response.json()

        if "results" in data and len(data["results"]) > 0:
            poster_path = data["results"][0].get("poster_path")

            if poster_path:
                return "https://image.tmdb.org/t/p/w500" + poster_path

    except Exception as e:
        print("TMDB Error:", e)

    return None





In [47]:
print(get_movie_poster("Batman"))


https://image.tmdb.org/t/p/w500/cij4dd21v2Rk2YtUQbV5kW69WB2.jpg


In [48]:
print(get_movie_poster("Avatar"))

https://image.tmdb.org/t/p/w500/gKY6q7SjCkAU6FqvqWybDYgUKIF.jpg


In [44]:
import requests
print(requests.get("https://google.com").status_code)


200
