In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-top-1000-movies-and-tv-shows/imdb_top_1000.csv


In [3]:


# Read the IMDb Top-1000 CSV from the Kaggle input directory into a DataFrame
df = pd.read_csv(
    "/kaggle/input/imdb-dataset-of-top-1000-movies-and-tv-shows/imdb_top_1000.csv"
)

# Preview the first rows to see which columns are available
df.head()


Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [4]:
# Keep only the movie title and plot summary, under short snake_case names.
# rename() with a mapping leaves absent labels alone, so this cell can be
# re-run after `df` has already been reduced without raising a KeyError
# (the previous select-then-assign-columns form failed on a second run).
df = (
    df.rename(columns={'Series_Title': 'title', 'Overview': 'overview'})
    .loc[:, ['title', 'overview']]
    .dropna()
    # Re-number rows so index labels equal row positions even if dropna()
    # removed something; later cells pair rows with a matrix by position.
    .reset_index(drop=True)
)

# Display dataset sample
df.head()


Unnamed: 0,title,overview
0,The Shawshank Redemption,Two imprisoned men bond over a number of years...
1,The Godfather,An organized crime dynasty's aging patriarch t...
2,The Dark Knight,When the menace known as the Joker wreaks havo...
3,The Godfather: Part II,The early life and career of Vito Corleone in ...
4,12 Angry Men,A jury holdout attempts to prevent a miscarria...


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Set up a TF-IDF vectorizer that ignores common English stop words
tfidf = TfidfVectorizer(stop_words="english")

# Learn the vocabulary from the overviews and encode each one as a TF-IDF row
tfidf_matrix = tfidf.fit_transform(df["overview"])


In [6]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_movies(user_input, top_n=5):
    """
    Recommend movies whose overview is most similar to a free-text query.

    The query is projected with the already-fitted module-level ``tfidf``
    vectorizer and compared against every row of ``tfidf_matrix`` using
    cosine similarity. Rows of ``tfidf_matrix`` are matched to rows of
    ``df`` by position.

    Parameters:
    - user_input (str): User's description of desired movies.
    - top_n (int): Maximum number of recommendations to return. ``0`` gives
      an empty result; values larger than the catalogue return every row.

    Returns:
    - DataFrame with the ``title`` and ``overview`` columns of the best
      matches, ordered from most to least similar. No similarity threshold
      is applied, so rows with zero similarity can appear when the query
      has little overlap with the overviews.

    Raises:
    - ValueError: If ``top_n`` is negative.
    """
    # A negative count would silently turn into an arbitrary tail slice below.
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Convert user input into a TF-IDF vector
    user_vector = tfidf.transform([user_input])

    # Compute cosine similarity between user input and movie overviews
    similarities = cosine_similarity(user_vector, tfidf_matrix).flatten()

    # Reverse to descending order first, then take the head. For positive
    # top_n this equals `argsort()[-top_n:][::-1]`, but for top_n == 0 it
    # selects nothing, whereas `[-0:]` would select every row.
    top_indices = similarities.argsort()[::-1][:top_n]

    # Return top recommended movies
    return df.iloc[top_indices][['title', 'overview']]

# Demo: describe a taste in plain language and fetch the default number of matches
user_query = "I love thrilling action movies set in space, with a comedic twist."
recommendations = recommend_movies(user_input=user_query)
recommendations


Unnamed: 0,title,overview
538,Amarcord,A series of comedic and nostalgic vignettes se...
378,The Incredibles,"A family of undercover superheroes, while tryi..."
692,The Man Who Would Be King,Two British former soldiers decide to set them...
826,Barton Fink,A renowned New York playwright is enticed to C...
106,Aliens,Fifty-seven years after surviving an apocalypt...


In [7]:
# Second probe: a mystery/suspense-themed preference
recommend_movies(
    "I enjoy mystery and suspense with mind-blowing twists.",
    top_n=5,
)


Unnamed: 0,title,overview
547,Charade,Romance and suspense ensue in Paris as a woman...
8,Inception,A thief who steals corporate secrets through t...
506,Harry Potter and the Prisoner of Azkaban,"Harry Potter, Ron and Hermione return to Hogwa..."
497,The Fall,In a hospital on the outskirts of 1920s Los An...
340,Blade Runner 2049,Young Blade Runner K's discovery of a long-bur...


In [8]:
# Third probe: a vague request with few content words
recommend_movies(
    "I want something unique which will keep me on my heels all the time.",
    top_n=5,
)

Unnamed: 0,title,overview
685,Ghostbusters,Three former parapsychology professors set up ...
547,Charade,Romance and suspense ensue in Paris as a woman...
545,The Lion in Winter,1183 A.D.: King Henry II's three sons all want...
424,Butch Cassidy and the Sundance Kid,"Wyoming, early 1900s. Butch Cassidy and The Su..."
801,Happiness,The lives of several individuals intertwine as...
