In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Movie Recommendation System using Content-Based Filtering
This notebook loads movie metadata, preprocesses textual data, builds a TF-IDF model, and returns movie recommendations based on user input.

## 1. Data Loading
Loads the **TMDB movie dataset**

In [2]:
movies = pd.read_csv("tmdb/tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb/tmdb_5000_credits.csv")
data = movies.merge(credits, on='title', how = "inner")

## 2. Data Preprocessing
Preprocesses movie descriptions, genres and keywords.

In [3]:
data.isnull().sum()

budget                     0
genres                     0
homepage                3096
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
movie_id                   0
cast                       0
crew                       0
dtype: int64

In [4]:
data = data[["movie_id", "title", "overview", "genres", "keywords", "vote_average"]]

In [5]:
data.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,vote_average
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",7.2
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",6.9
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",6.3
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",7.6
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",6.1


In [6]:
data['overview'].dropna(inplace = True)

In [7]:
# Convert text to lowercase and strip whitespace.
data['overview'] = data['overview'].apply(lambda text: text.lower().strip() if isinstance(text, str) else '')

### Extract Names from JSON-like Objects

In [8]:
import ast

def extract_names(obj):
    """Convert JSON-like list of dictionaries into a space-separated string."""
    try:
        obj_list = ast.literal_eval(obj)  # Convert string to list
        return " ".join([i['name'].replace(" ", "") for i in obj_list])  # Remove spaces in names
    except (ValueError, SyntaxError):
        return ""

In [9]:
# Process genres and keywords
data['genres'] = data['genres'].apply(extract_names)
data['keywords'] = data['keywords'].apply(extract_names)

In [10]:
data = data[data['vote_average'] >= 5.0]  # Keep only higher-rated movies

In [11]:
# Combine textual features
data['tags'] = data['overview'] + " " + data['genres'] + " " + data['keywords']

In [12]:
data.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,vote_average,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di...",Action Adventure Fantasy ScienceFiction,cultureclash future spacewar spacecolony socie...,7.2,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha...",Adventure Fantasy Action,ocean drugabuse exoticisland eastindiatradingc...,6.9,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...,Action Adventure Crime,spy basedonnovel secretagent sequel mi6 britis...,6.3,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...,Action Crime Drama Thriller,dccomics crimefighter terrorist secretidentity...,7.6,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca...",Action Adventure ScienceFiction,basedonnovel mars medallion spacetravel prince...,6.1,"john carter is a war-weary, former military ca..."


In [13]:
data = data.reset_index(drop=True)

## 3. Build TF-IDF Model
Builds a TF-IDF matrix to transform text into numerical vectors

In [14]:
# Build TF-IDF Model
def build_tfidf_matrix(data, text_column):
    """Generate TF-IDF matrix for item descriptions."""
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(data[text_column])
    return vectorizer, tfidf_matrix

## 4. Get Recommendations
This function takes a user's input and finds the most similar movies based on text similarity. Using TF-IDF vectorization and cosine similarity, it ranks and returns the top N recommendations.

In [15]:
# Get Recommendations
def get_recommendations(user_input, vectorizer, tfidf_matrix, data, top_n=5):
    """Return top N most similar items to user input."""
    user_tfidf = vectorizer.transform([user_input])
    similarity_scores = cosine_similarity(user_tfidf, tfidf_matrix).flatten()
    valid_indices = np.where(similarity_scores > 0)[0]  # Keep only valid indices
    top_indices = valid_indices[np.argsort(similarity_scores[valid_indices])][-top_n:][::-1]
    return data.iloc[top_indices][['title', 'tags']], similarity_scores[top_indices]

## 5. Execute Movie Recommendation
This function orchestrates TF-IDF computation, and generates movie recommendations based on user input.

In [16]:
def recommend_movies(user_input):
    """Run recommendation system with user input."""
    
    # Build TF-IDF matrix
    vectorizer, tfidf_matrix = build_tfidf_matrix(data, 'tags')

    # Get recommendations
    recommendations, scores = get_recommendations(user_input, vectorizer, tfidf_matrix, data)
    
    # Print results
    print("Top Recommendations:")
    for i, (title, desc) in enumerate(zip(recommendations['title'], recommendations['tags'])):
        print(f"{i+1}. {title} (Score: {scores[i]:.4f})")

## 6. Run Recommendation System

In [17]:
if __name__ == "__main__":
    user_input = input("Enter a description of the kind of movies you like: ")
    recommend_movies(user_input)

Enter a description of the kind of movies you like: I love thrilling action movies set in space.
Top Recommendations:
1. Grindhouse (Score: 0.2312)
2. Space Pirate Captain Harlock (Score: 0.1832)
3. Lockout (Score: 0.1549)
4. Blow Out (Score: 0.1532)
5. Dragon Blade (Score: 0.1381)
