In [1]:
#importing libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# location of the movie dataset (CSV file)
dataset_path = '/content/dataset.csv'

# read the whole file into a pandas dataframe, then preview its first five rows
df = pd.read_csv(dataset_path)
df.head(5)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,2001,Another Life,British,Philip Goodhew,"Ioan Gruffudd, Natasha Little",drama,https://en.wikipedia.org/wiki/Another_Life_(20...,Chiefly set in London during the First World W...
1,2010,Jalachhayam (ജലച്ചായം),Malayalam,Sathish Kalathil,"Baburaj Puthoor, Dr. B Jayakrishnan, Prasanna ...","experimental film, art film",https://en.wikipedia.org/wiki/Jalachhayam,"Mohan, an art teacher at a city college, disco..."
2,1958,"Run Silent, Run Deep",American,Robert Wise,"Clark Gable, Burt Lancaster, Jack Warden, Brad...",war,"https://en.wikipedia.org/wiki/Run_Silent,_Run_...","A World War II US Navy submarine officer, Comm..."
3,1948,Scott of the Antarctic,British,Charles Frend,"John Mills, James Robertson Justice, Barry Letts",adventure/biopic,https://en.wikipedia.org/wiki/Scott_of_the_Ant...,"Captain Scott is given the men, but not the fu..."
4,1996,Romeo + Juliet,American,Baz Luhrmann,"Leonardo DiCaprio, Claire Danes, Brian Dennehy...",romantic drama,https://en.wikipedia.org/wiki/Romeo_%2B_Juliet,"In Verona Beach, the Capulets and the Montague..."


In [4]:
# (number of rows, number of columns) of the dataframe
df.shape

(10000, 8)

In [5]:
# Sample size and seed are named so they are easy to find and tune.
SAMPLE_SIZE = 500
RANDOM_SEED = 42

# Draw a reproducible random subset of rows. The requested size is capped at the
# number of available rows because DataFrame.sample (without replacement) raises
# ValueError when asked for more rows than the frame contains.
# NOTE: this rebinds `df`, so re-running the cell samples from the already
# sampled frame rather than from the full dataset.
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=RANDOM_SEED)
df.head()


Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
6252,2013,Shadow People,American,Matthew Arnold,"Dallas Roberts, Alison Eastwood, Anne Dudek, M...",horror,https://en.wikipedia.org/wiki/Shadow_People_(f...,The film begins with a series of videos posted...
4684,2000,Platform,Chinese,Jia Zhangke,Zhao Tao\r\nWang Hongwei,drama,https://en.wikipedia.org/wiki/Platform_(2000_f...,The film starts in 1979 in the wake of the Cul...
1731,1931,The Miracle Woman,American,Frank Capra,"Barbara Stanwyck, David Manners, Sam Hardy",drama,https://en.wikipedia.org/wiki/The_Miracle_Woman,Florence Fallon is outraged when church elders...
4742,1995,Muthu Kaalai,Tamil,Gokula Krishnan,"Karthik, Soundarya",unknown,https://en.wikipedia.org/wiki/Muthu_Kaalai,The film begins with Poochi (Vadivelu) announc...
4521,2009,Triangle,South_Korean,Unknown,Ji Yeong-soo,unknown,https://en.wikipedia.org/wiki/Triangle_(2009_S...,Ji-young is a beautiful widow of an extremely ...


In [6]:
# Replace missing 'Plot', 'Title' and 'Genre' values with empty strings, then
# append a "Metadata" column holding "<Title> <Genre> <Plot>" for every movie.
df = (
    df
    .fillna({'Plot': '', 'Title': '', 'Genre': ''})
    .assign(Metadata=lambda frame: frame['Title'] + ' ' + frame['Genre'] + ' ' + frame['Plot'])
)

# preview the columns that feed the combined text
df[['Title', 'Genre', 'Metadata']].head()



Unnamed: 0,Title,Genre,Metadata
6252,Shadow People,horror,Shadow People horror The film begins with a se...
4684,Platform,drama,Platform drama The film starts in 1979 in the ...
1731,The Miracle Woman,drama,The Miracle Woman drama Florence Fallon is out...
4742,Muthu Kaalai,unknown,Muthu Kaalai unknown The film begins with Pooc...
4521,Triangle,unknown,Triangle unknown Ji-young is a beautiful widow...


In [7]:
# initializing the TF-IDF vectorizer with the built-in English stop-word list
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# learn the vocabulary from the metadata and transform it into TF-IDF features
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Metadata'])

# actually verify (not just print) that there is exactly one matrix row per movie:
# recommend_movies maps matrix row positions back to dataframe rows via df.iloc
assert tfidf_matrix.shape[0] == len(df), "TF-IDF matrix rows do not match the number of movies"
print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (500, 16802)


In [8]:
# Recommends movies for a user query by computing the cosine similarity between
# the query and each movie's "Metadata" text.
# Parameters:
#   query (str): The user's description of movie preferences.
#   tfidf_matrix: TF-IDF matrix of all movie metadata.
#   df (DataFrame): The movies dataframe.
#   top_n (int): The number of top recommendations to return.
# Returns a dataframe containing the movie title and similarity score.

def recommend_movies(query, tfidf_matrix, df, top_n=5, vectorizer=None):
    """Return the movies whose metadata is most similar to a free-text query.

    The query is projected into the same TF-IDF space as ``tfidf_matrix`` and
    ranked against every row by cosine similarity.

    Parameters
    ----------
    query : str
        The user's description of movie preferences.
    tfidf_matrix :
        TF-IDF matrix of all movie metadata; row ``i`` must correspond to
        ``df.iloc[i]``.
    df : pandas.DataFrame
        The movies dataframe; must contain a ``'Title'`` column.
    top_n : int, default 5
        Maximum number of recommendations to return. Values <= 0 yield an
        empty result.
    vectorizer : optional
        Fitted vectorizer used to transform ``query``. When omitted, the
        notebook-level ``tfidf_vectorizer`` is used, so existing calls keep
        working unchanged.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Title'`` and ``'Similarity'``, ordered from most to least
        similar and keeping the index labels of ``df``.

    Raises
    ------
    ValueError
        If ``tfidf_matrix`` and ``df`` do not have the same number of rows.
    """
    # Fall back to the notebook-level vectorizer for backward compatibility.
    if vectorizer is None:
        vectorizer = tfidf_vectorizer

    # Matrix rows are mapped back to dataframe rows by position, so a size
    # mismatch (e.g. df re-sampled without re-fitting) would give wrong titles.
    if tfidf_matrix.shape[0] != len(df):
        raise ValueError(
            "tfidf_matrix has %d rows but df has %d rows; re-fit the TF-IDF "
            "matrix on this dataframe" % (tfidf_matrix.shape[0], len(df))
        )

    # A non-positive top_n means "no recommendations". Without this clamp a
    # slice bound of 0 or a negative number would select unintended rows.
    top_n = max(top_n, 0)

    # transforming the user's query into the TF-IDF space
    query_vec = vectorizer.transform([query])

    # computing cosine similarity between the query and all movie metadata
    cosine_sim = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Sort by descending score; the stable sort keeps tied movies in dataframe
    # order so results are deterministic.
    top_indices = (-cosine_sim).argsort(kind="stable")[:top_n]

    # preparing a df with the recommended movies
    recommendations = df.iloc[top_indices][['Title']].copy()
    recommendations['Similarity'] = cosine_sim[top_indices]
    return recommendations


In [10]:
# ask the user to describe what they feel like watching
user_query = input("What kind of movies do you want to watch? ")

# generate the five best matches for that description
recommended_movies = recommend_movies(
    query=user_query,
    tfidf_matrix=tfidf_matrix,
    df=df,
    top_n=5,
)

# print the heading and the result table on separate lines
print("Top Recommendations:", recommended_movies, sep="\n")


What kind of movies do you want to watch? I want to watch a forest adventure movie with comedy elements
Top Recommendations:
                                                  Title  Similarity
3238                      The Jones Family in Hollywood    0.116647
5572                                    Drums of Africa    0.113227
2592  Poketto Monsutā Serebii: The Meeting that Trav...    0.093716
2232                                          The Watch    0.093385
333                                               Aaaah    0.076098
