In [11]:
import re
import unicodedata

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Loading Dataset

In [12]:
# Read the movies CSV and print its schema (columns, non-null counts, dtypes).
df = pd.read_csv("imdb(MOVIES).csv")
df.info()

# Keep only the title and plot-description columns. The trailing space in
# 'movie name ' is intentional: it is how the column is addressed throughout
# this notebook. The explicit .copy() makes `df` an independent frame, so that
# adding columns to it later never depends on pandas' view-vs-copy heuristics
# (which can surface as SettingWithCopyWarning on pandas versions without
# copy-on-write).
df = df[['movie name ', 'DETAIL ABOUT MOVIE']].copy()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ranking of movie    250 non-null    int64  
 1   movie name          250 non-null    object 
 2   Year                250 non-null    object 
 3   certificate         250 non-null    object 
 4   runtime             250 non-null    object 
 5   genre               250 non-null    object 
 6   RATING              250 non-null    float64
 7   DETAIL ABOUT MOVIE  250 non-null    object 
 8   DIRECTOR            250 non-null    object 
 9   ACTOR 1             250 non-null    object 
 10  ACTOR 2             250 non-null    object 
 11  ACTOR 3             250 non-null    object 
 12  ACTOR 4             250 non-null    object 
 13  votes               250 non-null    int64  
 14  metascore           218 non-null    float64
 15  GROSS COLLECTION    214 non-null    object 
dtypes: float

# Preprocess data

In [13]:
def preprocess_text(text):
    """Normalise a free-text description before TF-IDF vectorisation.

    The text is lowercased, accented letters are folded to their base letter
    (e.g. "Amélie" -> "amelie" rather than losing the "é" entirely), and every
    remaining character that is not a lowercase ASCII letter, a digit or
    whitespace is removed. For plain-ASCII input the result is the same as
    lowercasing and stripping punctuation.

    Parameters
    ----------
    text : str, None or float
        Raw description. ``None`` and NaN (a missing cell) are treated as an
        empty description; any other non-string value is converted with
        ``str()`` first.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    # `text != text` is only true for NaN, which is what a missing cell holds.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()  # Convert to lowercase

    # Split accented characters into base letter + combining mark (NFKD), then
    # drop the marks so the base letter survives the ASCII-only filter below.
    decomposed = unicodedata.normalize("NFKD", text)
    text = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    text = re.sub(r"[^a-z0-9\s]", "", text)  # Removes punctuation & special characters
    return text

# Store a cleaned version of every plot description alongside the original.
df["cleaned_description"] = df["DETAIL ABOUT MOVIE"].map(preprocess_text)

In [14]:
# TF-IDF Vectorization
# Fit the vectorizer on the cleaned descriptions (English stop words removed)
# and keep the resulting document-term matrix, one row per movie.
cleaned_descriptions = df["cleaned_description"]
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(cleaned_descriptions)

def find_similar_movies(user_description, top_n=5):
    """Return the movies whose descriptions are most similar to a query.

    The query is cleaned with ``preprocess_text`` (the same cleaning applied to
    the descriptions the vectorizer was fitted on), transformed with the
    fitted ``vectorizer``, and compared against every row of ``tfidf_matrix``
    using cosine similarity.

    Parameters
    ----------
    user_description : str
        Free-text description of the kind of movie wanted.
    top_n : int, default 5
        Maximum number of movies to return. Values of 0 or less return an
        empty list.

    Returns
    -------
    list of tuple
        ``(movie name, similarity)`` pairs ordered from most to least similar.
        Contains at most ``top_n`` entries (fewer if the dataset is smaller).
    """
    # Guard first: with the slice idiom `[-top_n:]`, top_n == 0 would select
    # the whole array and negative values an arbitrary part of it.
    if top_n <= 0:
        return []

    # Clean the query the same way as the corpus so both tokenise identically.
    query = preprocess_text(user_description)
    user_vec = vectorizer.transform([query])
    similarities = cosine_similarity(user_vec, tfidf_matrix).flatten()

    # Sort by descending similarity. A stable sort on the negated scores keeps
    # tied movies in dataset order, so results are deterministic.
    top_indices = np.argsort(-similarities, kind="stable")[:top_n]

    # Return top N movie names and their similarities
    titles = df["movie name "]
    return [(titles.iloc[i], similarities[i]) for i in top_indices]

In [16]:
# Prompt for a description, look up the closest movies, and print one line per
# recommendation with its similarity score shown to four decimal places.
user_input = input("Enter a movie description that you would like a movie recommendation for: ")
recommended_movies = find_similar_movies(user_input)
for recommendation in recommended_movies:
    movie, score = recommendation
    print(f"{movie}: Similarity = {score:.4f}")

Enter a movie description that you would like a movie recommendation for:  I like romance movies with a cute couple


It's a Wonderful Life: Similarity = 0.2078
Tokyo Story: Similarity = 0.1703
Eternal Sunshine of the Spotless Mind: Similarity = 0.1559
The Apartment: Similarity = 0.1548
Gone with the Wind: Similarity = 0.1488
