# Improt libraries and load data

In [10]:
# import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# load dataset from your github raw link
url = "https://raw.githubusercontent.com/siddquy2004/Movie_Recommendation_System_AICTE/main/movie_metadata.csv"
df = pd.read_csv(url)

# strip trailing spaces from movie titles and rename column for convenience
df['movie_title'] = df['movie_title'].str.strip()
df = df.rename(columns={'movie_title': 'title'})

# display first 5 rows to verify
df.head()


Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


# Preprocess data

In [11]:
# drop rows where plot_keywords are missing (needed for similarity)
df = df.dropna(subset=['plot_keywords'])

# reset index (optional but recommended)
df = df.reset_index(drop=True)


# create TF-IDF matrix

In [12]:
# initialize tf-idf vectorizer with stopwords removal
tfidf = TfidfVectorizer(stop_words='english')

# create tf-idf matrix by fitting on 'plot_keywords' column
tfidf_matrix = tfidf.fit_transform(df['plot_keywords'])


# compute cosine similarity matrix

In [13]:
# compute cosine similarity between all movies based on plot_keywords
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


# create helper data structure

In [14]:
# create a series mapping movie titles to dataframe indices for quick lookup
indices = pd.Series(df.index, index=df['title']).drop_duplicates()


# define recommendation function

In [15]:
def get_recommendations(title, cosine_sim=cosine_sim):
    """
    Given a movie title, return top 5 similar movies based on plot keywords.
    """
    # get index of the movie that matches the title
    idx = indices.get(title)
    
    # if title not found, return message
    if idx is None:
        return ["movie not found."]
    
    # get list of similarity scores for this movie with all others
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # sort movies based on similarity scores in descending order
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # get indices of top 5 most similar movies (excluding the first which is itself)
    sim_scores = sim_scores[1:6]
    
    movie_indices = [i[0] for i in sim_scores]
    
    # return the titles of the top 5 recommended movies
    return df['title'].iloc[movie_indices].tolist()


# test the recommender

In [16]:
# example: get recommendations for 'The Godfather'
movie_title = 'The Godfather'

print(f"recommended movies for '{movie_title}':")
print(get_recommendations(movie_title))


recommended movies for 'The Godfather':
['Goodfellas', 'The French Connection', 'The French Connection', 'Out of the Blue', 'Deadline - U.S.A.']
