In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import process


In [3]:
# Read the movie metadata and the user ratings from CSV.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# not run on another machine without editing them.
movies = pd.read_csv('D:/doctor/movies.csv')
ratings = pd.read_csv('D:/doctor/ratings.csv')


In [4]:
# Two independent TF-IDF vectorizers, one per text column, so that genres and
# titles are fitted separately. Both discard English stop words.
tfidf_genres = TfidfVectorizer(
    stop_words='english',
)
tfidf_titles = TfidfVectorizer(
    stop_words='english',
)


In [5]:
# Fit each vectorizer on its own column and encode that column as a matrix of
# TF-IDF features in a single step.
tfidf_matrix_genres = tfidf_genres.fit_transform(
    movies['genres']
)
tfidf_matrix_titles = tfidf_titles.fit_transform(
    movies['title']
)


In [6]:
# Pairwise cosine similarity between every pair of movies, computed separately
# for the genre features and the title features. With a single argument,
# cosine_similarity compares the matrix against itself.
content_similarity_genres = cosine_similarity(tfidf_matrix_genres)
content_similarity_titles = cosine_similarity(tfidf_matrix_titles)


In [7]:
# Display the genre-based similarity matrix (NumPy abbreviates large arrays with "...")
content_similarity_genres

array([[1.        , 0.80403568, 0.15641664, ..., 0.        , 0.26646851,
        0.        ],
       [0.80403568, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.15641664, 0.        , 1.        , ..., 0.        , 0.58699859,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        1.        ],
       [0.26646851, 0.        , 0.58699859, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        1.        ]])

In [8]:
# Display the title-based similarity matrix (NumPy abbreviates large arrays with "...")
content_similarity_titles

array([[1.        , 0.17839679, 0.13191189, ..., 0.        , 0.        ,
        0.        ],
       [0.17839679, 1.        , 0.13882506, ..., 0.        , 0.        ,
        0.        ],
       [0.13191189, 0.13882506, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [9]:
# Blend the two signals with equal weight: element-wise mean of the genre and
# title similarity matrices.
content_similarity = 0.5 * (content_similarity_genres + content_similarity_titles)

In [10]:

# Wrap the combined matrix in a DataFrame labelled by movie title on both axes
# so it can be inspected by name.
content_similarity_df = pd.DataFrame(
    data=content_similarity,
    index=movies['title'],
    columns=movies['title'],
)



In [11]:
# Preview the title-labelled similarity table (pandas truncates the rendering of large frames)
content_similarity_df

title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Body (2015),Sharknado 4: The 4th Awakens (2016),The Last Brickmaker in America (2001),Stranger Things,Rustom (2016),Mohenjo Daro (2016),Shin Godzilla (2016),The Beatles: Eight Days a Week - The Touring Years (2016),The Gay Desperado (1936),"Women of '69, Unboxed"
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story (1995),1.000000,0.491216,0.144164,0.138952,0.207637,0.102220,0.170622,0.394761,0.079724,0.217964,...,0.000000,0.088054,0.000000,0.000000,0.000000,0.141078,0.205127,0.000000,0.133234,0.0
Jumanji (1995),0.491216,1.000000,0.069413,0.072816,0.078302,0.107577,0.097257,0.477066,0.083902,0.254023,...,0.000000,0.109515,0.000000,0.000000,0.000000,0.175463,0.255121,0.000000,0.000000,0.0
Grumpier Old Men (1995),0.144164,0.069413,1.000000,0.499844,0.351398,0.079545,0.571915,0.056575,0.062040,0.069413,...,0.000000,0.000000,0.000000,0.000000,0.292089,0.249180,0.000000,0.000000,0.293499,0.0
Waiting to Exhale (1995),0.138952,0.072816,0.499844,1.000000,0.322539,0.083445,0.521442,0.059348,0.065081,0.072816,...,0.085131,0.000000,0.226014,0.226014,0.260544,0.309356,0.000000,0.000000,0.261802,0.0
Father of the Bride Part II (1995),0.207637,0.078302,0.351398,0.322539,1.000000,0.089732,0.374624,0.063820,0.069985,0.078302,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.500000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Mohenjo Daro (2016),0.141078,0.175463,0.249180,0.309356,0.000000,0.000000,0.249180,0.219025,0.000000,0.215679,...,0.072566,0.219283,0.192657,0.192657,0.330289,1.000000,0.239470,0.061085,0.000000,0.0
Shin Godzilla (2016),0.205127,0.255121,0.000000,0.000000,0.000000,0.105674,0.000000,0.134103,0.188449,0.237934,...,0.000000,0.467961,0.000000,0.000000,0.124188,0.239470,1.000000,0.070111,0.000000,0.0
The Beatles: Eight Days a Week - The Touring Years (2016),0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.053104,0.000000,0.000000,0.080030,0.061085,0.070111,1.000000,0.000000,0.5
The Gay Desperado (1936),0.133234,0.000000,0.293499,0.261802,0.500000,0.000000,0.293499,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.0


In [12]:
def get_content_based_recommendations(movie_title, num_recommendations=15, threshold=80):
    """Recommend movies similar to the title that best fuzzy-matches a query.

    The query is fuzzy-matched against ``movies['title']``. If the best match
    scores below ``threshold`` nothing is recommended; otherwise the matched
    movie's row of ``content_similarity`` is ranked from most to least similar.

    Args:
        movie_title: Free-text title to look up. Surrounding whitespace and
            letter case are ignored.
        num_recommendations: Maximum number of titles to return.
        threshold: Minimum fuzzy-match score (fuzzywuzzy's 0-100 scale) the
            best match must reach. Defaults to 80, the cut-off that used to be
            hard-coded here.

    Returns:
        A ``(recommended_titles, matched_title)`` tuple, or ``([], None)``
        when no title matches well enough (or there are no titles at all).
    """
    query = movie_title.strip().lower()
    titles = movies['title'].tolist()

    # extractOne returns None (not a tuple) when given no choices to compare.
    match = process.extractOne(query, titles)
    if match is None:
        return [], None

    best_match, score = match
    if score < threshold:
        return [], None

    # content_similarity is indexed by row position, so look up the position of
    # the match in the title list rather than relying on the DataFrame's index
    # labels happening to equal positions.
    idx = titles.index(best_match)

    ranked = sorted(
        enumerate(content_similarity[idx]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Exclude the matched movie by index instead of dropping the first entry:
    # sorted() is stable, so another movie that ties the top score and sits
    # earlier in the table would otherwise take position 0 and the matched
    # movie would end up recommending itself.
    limit = max(num_recommendations, 0)
    movie_indices = [position for position, _ in ranked if position != idx][:limit]

    return movies['title'].iloc[movie_indices].tolist(), best_match


In [13]:
# Example: a partial title is enough; the recorded output shows it resolving to 'Matrix, The (1999)'
get_content_based_recommendations("Matrix")

(['Matrix Reloaded, The (2003)',
  'Matrix Revolutions, The (2003)',
  'eXistenZ (1999)',
  'Wing Commander (1999)',
  'Universal Soldier: The Return (1999)',
  'Deep Blue Sea (1999)',
  'Thirteenth Floor, The (1999)',
  'Lawnmower Man 2: Beyond Cyberspace (1996)',
  'Screamers (1995)',
  'Johnny Mnemonic (1995)',
  'Virtuosity (1995)',
  'Timecop (1994)',
  'Blade Runner (1982)',
  'Solo (1996)',
  'Arrival, The (1996)'],
 'Matrix, The (1999)')

In [16]:
# Example of testing with different threshold values
# NOTE(review): `threshold` is only printed below. It is never passed to
# get_content_based_recommendations(), so every iteration performs the same
# lookup and prints the same list (the recorded output confirms this).
# The loop does not actually exercise different thresholds as written.
movie_to_search = "Spider-Man"

for threshold in [60, 70, 80, 90]:
    # Same call on every pass; only the printed label changes.
    recommendations, searched_movie = get_content_based_recommendations(movie_to_search, num_recommendations=10)
    print(f"Threshold: {threshold}")
    print(f"Searched Movie: {searched_movie}")
    print("Content-Based Recommendations:")
    for rec in recommendations:
        print(rec)
    print("\n")

Threshold: 60
Searched Movie: Spider-Man (2002)
Content-Based Recommendations:
Spider-Man 3 (2007)
Spider-Man 2 (2004)
Amazing Spider-Man, The (2012)
The Amazing Spider-Man 2 (2014)
Clockstoppers (2002)
Iron Man (2008)
Ant-Man (2015)
Demolition Man (1993)
Time Machine, The (2002)
Equilibrium (2002)


Threshold: 70
Searched Movie: Spider-Man (2002)
Content-Based Recommendations:
Spider-Man 3 (2007)
Spider-Man 2 (2004)
Amazing Spider-Man, The (2012)
The Amazing Spider-Man 2 (2014)
Clockstoppers (2002)
Iron Man (2008)
Ant-Man (2015)
Demolition Man (1993)
Time Machine, The (2002)
Equilibrium (2002)


Threshold: 80
Searched Movie: Spider-Man (2002)
Content-Based Recommendations:
Spider-Man 3 (2007)
Spider-Man 2 (2004)
Amazing Spider-Man, The (2012)
The Amazing Spider-Man 2 (2014)
Clockstoppers (2002)
Iron Man (2008)
Ant-Man (2015)
Demolition Man (1993)
Time Machine, The (2002)
Equilibrium (2002)


Threshold: 90
Searched Movie: Spider-Man (2002)
Content-Based Recommendations:
Spider-Man 3 (2