In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from IPython.display import display

In [3]:
# Load data
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")

In [7]:
# Clean title function
def clean_title(title):
    title = re.sub("[^a-zA-Z0-9 ]", "", title)
    return title

movies["clean_title"] = movies["title"].apply(clean_title)
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [9]:
# TF-IDF Vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [11]:
# Search function
def search(title):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies.iloc[indices].iloc[::-1]
    return results

# Recommendation function
def find_similar_movies(movie_id):
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    
    return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]
    

In [None]:
# Interactive search without widgets
def interactive_search():
    while True:
        title = input("Enter a movie title (or 'quit' to exit): ")
        if title.lower() == 'quit':
            break
        if len(title) > 3:
            try:
                results = search(title)
                if not results.empty:
                    print("\nSearch results:")
                    display(results)
                    
                    movie_id = results.iloc[0]["movieId"]
                    recommendations = find_similar_movies(movie_id)
                    
                    print("\nRecommendations based on", results.iloc[0]["title"])
                    display(recommendations)
                else:
                    print("No results found. Try a different movie title.")
            except Exception as e:
                print(f"Error: {e}")
        else:
            print("Please enter at least 4 characters.")
        print("\n" + "-"*50 + "\n")

# Run the interactive search
interactive_search()

Enter a movie title (or 'quit' to exit):  time bandits



Search results:


Unnamed: 0,movieId,title,genres,clean_title
4738,4844,Bandits (2001),Comedy|Crime|Romance,Bandits 2001
2876,2968,Time Bandits (1981),Adventure|Comedy|Fantasy|Sci-Fi,Time Bandits 1981
2471,2562,Bandits (1997),Drama,Bandits 1997
45451,170417,The Bandits (1967),Adventure|Western,The Bandits 1967
60746,204058,Cyber Bandits (1995),Action|Sci-Fi|Thriller,Cyber Bandits 1995



Recommendations based on Bandits (2001)


Unnamed: 0,score,title,genres
4738,607.765625,Bandits (2001),Comedy|Crime|Romance
3208,21.991519,"Whole Nine Yards, The (2000)",Comedy|Crime
6585,21.770709,Matchstick Men (2003),Comedy|Crime|Drama
4795,18.337756,Spy Game (2001),Action|Crime|Drama|Thriller
3525,17.057871,Shanghai Noon (2000),Action|Adventure|Comedy|Western
6442,16.681582,Seabiscuit (2003),Drama
3914,15.422624,What Women Want (2000),Comedy|Romance
3915,15.251913,Finding Forrester (2000),Drama
3224,14.350022,Wonder Boys (2000),Comedy|Drama
4768,13.711785,K-PAX (2001),Drama|Fantasy|Mystery|Sci-Fi



--------------------------------------------------



In [69]:
movie_id = 2021

#def find_similar_movies(movie_id):
movie = movies[movies["movieId"] == movie_id]
movie

Unnamed: 0,movieId,title,genres,clean_title
1932,2021,Dune (1984),Adventure|Sci-Fi,Dune 1984


In [71]:
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()

similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

similar_user_recs = similar_user_recs[similar_user_recs > .10]

all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [73]:
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
2021,1.000000,0.010315
260,0.625712,0.219211
2571,0.609880,0.240743
541,0.582647,0.113450
1196,0.578214,0.185494
...,...,...
4262,0.103230,0.035957
1234,0.102597,0.043594
1262,0.101963,0.029189
5349,0.101330,0.032762


In [75]:
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [77]:
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
1932,1.0,0.010315,96.942369,2021,Dune (1984),Adventure|Sci-Fi,Dune 1984
2015,0.181761,0.009087,20.001768,2105,Tron (1982),Action|Adventure|Sci-Fi,Tron 1982
1530,0.103863,0.006807,15.257724,1587,Conan the Barbarian (1982),Action|Adventure|Fantasy,Conan the Barbarian 1982
2103,0.149462,0.01025,14.581516,2193,Willow (1988),Action|Adventure|Fantasy,Willow 1988
2050,0.172261,0.011818,14.576188,2140,"Dark Crystal, The (1982)",Adventure|Fantasy,Dark Crystal The 1982
2780,0.10513,0.008035,13.083279,2872,Excalibur (1981),Adventure|Fantasy,Excalibur 1981
2876,0.138062,0.011021,12.52723,2968,Time Bandits (1981),Adventure|Comedy|Fantasy|Sci-Fi,Time Bandits 1981
3601,0.129829,0.011041,11.759281,3702,Mad Max (1979),Action|Adventure|Sci-Fi,Mad Max 1979
1242,0.169728,0.016509,10.281185,1275,Highlander (1986),Action|Adventure|Fantasy,Highlander 1986
2071,0.131096,0.013582,9.65227,2161,"NeverEnding Story, The (1984)",Adventure|Children|Fantasy,NeverEnding Story The 1984


In [39]:
def find_similar_movies(movie_id):
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [41]:
# Interactive search function (non-widget version)
def interactive_movie_search():
    print("Movie Recommendation System")
    print("Enter a movie title to get recommendations (type 'quit' to exit)")
    
    while True:
        title = input("\nEnter movie title: ")
        
        if title.lower() == 'quit':
            print("Goodbye!")
            break
            
        if len(title) > 3:  # Reduced threshold for better usability
            try:
                # Search for the movie
                results = search(title)
                
                if not results.empty:
                    print(f"\nTop search results for '{title}':")
                    for i, row in results.iterrows():
                        print(f"{i+1}. {row['title']} ({row['genres']})")
                    
                    # Get recommendations based on the first result
                    movie_id = results.iloc[0]["movieId"]
                    recommendations = find_similar_movies(movie_id)
                    
                    print(f"\nRecommendations similar to '{results.iloc[0]['title']}':")
                    display(recommendations)
                else:
                    print("No results found. Please try a different movie title.")
                    
            except Exception as e:
                print(f"An error occurred: {e}")
        else:
            print("Please enter at least 4 characters.")

# Run the interactive search
interactive_movie_search()

Movie Recommendation System
Enter a movie title to get recommendations (type 'quit' to exit)



Enter movie title:  the godfather



Top search results for 'the godfather':
33427. GodFather (1991) (Comedy|Drama|Romance)
53795. The Godfather Legacy (2012) (Documentary)
53804. The Godfather Family: A Look Inside (1990) (Action|Documentary|Drama)
48383. Herschell Gordon Lewis: The Godfather of Gore (2010) (Documentary|Horror)
841. Godfather, The (1972) (Crime|Drama)

Recommendations similar to 'GodFather (1991)':


Unnamed: 0,score,title,genres
42263,47907.0,Yodha (1992),Action|Comedy
33426,47907.0,GodFather (1991),Comedy|Drama|Romance
35135,47907.0,Devasuram (1993),Action|Drama
19461,23953.5,"Great Gatsby, The (1949)",Drama
39923,11976.75,Ghajini (2005),Action|Drama|Mystery|Thriller
22037,11976.75,"Suspect, The (Yong-eui-ja) (2013)",Action|Thriller
33288,9581.4,Bangalore Days (2014),Comedy|Drama|Romance
39594,9581.4,Thani Oruvan (2015),Action|Thriller
35134,7984.5,Manichitrathazhu (1993),Children|Drama|Fantasy|Horror|Mystery|Thriller
27327,7984.5,Love is God (2003),Adventure|Comedy|Drama



Enter movie title:  scarface



Top search results for 'scarface':
8348. Scarface (1932) (Crime|Drama)
4159. Scarface (1983) (Action|Crime|Drama)
24616. Lady Scarface (1941) (Comedy|Crime|Drama|Romance)
20808. Fuck You, Goethe (Fack Ju Göhte) (2013) (Comedy)
20805. Fun Size (2012) (Comedy)

Recommendations similar to 'Scarface (1932)':


Unnamed: 0,score,title,genres
8347,701.638009,Scarface (1932),Crime|Drama
6931,160.022704,"Public Enemy, The (1931)",Action|Crime|Drama
7093,95.489196,"Ox-Bow Incident, The (1943)",Drama|Western
6945,93.551735,Red River (1948),Action|Adventure|Western
7819,89.09689,White Heat (1949),Crime|Drama|Film-Noir
6947,70.605083,Stagecoach (1939),Action|Drama|Romance|Western
8319,69.872665,Sherlock Jr. (1924),Comedy|Fantasy|Romance
7607,43.580001,Mean Streets (1973),Crime|Drama
4224,43.126686,Rio Bravo (1959),Western
7833,42.228214,Freaks (1932),Crime|Drama|Horror



Enter movie title:  usual suspect



Top search results for 'usual suspect':
36342. Suspect (2005) (Drama)
4228. Suspect (1987) (Crime|Drama|Thriller)
17446. Suspect, The (1944) (Drama|Thriller)
50. Usual Suspects, The (1995) (Crime|Mystery|Thriller)
25184. The Suspect (2013) (Thriller)

Recommendations similar to 'Suspect (2005)':


Unnamed: 0,score,title,genres



Enter movie title:  quit


Goodbye!
