In [92]:
import pandas as pd

In [91]:
df = pd.read_csv("top10K-TMDB-movies.csv")

In [93]:
# display first 15 movies
df.head(15)

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811
5,667257,Impossible Things,"Family,Drama",es,"Matilde is a woman who, after the death of her...",14.358,2021-06-17,8.6,255
6,129,Spirited Away,"Animation,Family,Fantasy",ja,"A young girl, Chihiro, becomes trapped in a st...",92.056,2001-07-20,8.5,13093
7,730154,Your Eyes Tell,"Romance,Drama",ja,"A tragic accident lead to Kaori's blindness, b...",51.345,2020-10-23,8.5,339
8,372754,Dou kyu sei – Classmates,"Romance,Animation",ja,"Rihito Sajo, an honor student with a perfect s...",14.285,2016-02-20,8.5,239
9,372058,Your Name.,"Romance,Animation,Drama",ja,High schoolers Mitsuha and Taki are complete s...,158.27,2016-08-26,8.5,8895


In [94]:
# print the columns
print(df.columns)

Index(['id', 'title', 'genre', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count'],
      dtype='object')


In [95]:
# check for missing value in eacch column
df.isnull().sum()

Unnamed: 0,0
id,0
title,0
genre,3
original_language,0
overview,13
popularity,0
release_date,0
vote_average,0
vote_count,0


In [96]:
# remove rows with missing values
df.dropna(subset=['genre', 'overview'], inplace=True)
df.isnull().sum()

Unnamed: 0,0
id,0
title,0
genre,0
original_language,0
overview,0
popularity,0
release_date,0
vote_average,0
vote_count,0


In [97]:
df['combined'] = df['genre'].astype(str) + ' ' + df['overview'].astype(str) # added a new feature
df = df[['id','title','original_language','combined']] # droped unused columns
df.head()

Unnamed: 0,id,title,original_language,combined
0,278,The Shawshank Redemption,en,"Drama,Crime Framed in the 1940s for the double..."
1,19404,Dilwale Dulhania Le Jayenge,hi,"Comedy,Drama,Romance Raj is a rich, carefree, ..."
2,238,The Godfather,en,"Drama,Crime Spanning the years 1945 to 1955, a..."
3,424,Schindler's List,en,"Drama,History,War The true story of how busine..."
4,240,The Godfather: Part II,en,"Drama,Crime In the continuing saga of the Corl..."


In [98]:
from sklearn.feature_extraction.text import TfidfVectorizer # to vectorize the feature
tf_idf_vectorizer = TfidfVectorizer(stop_words = 'english') # initialize the vectorizer

# get tf-idf matrix
tf_matrix = tf_idf_vectorizer.fit_transform(df['combined'])

from sklearn.metrics.pairwise import cosine_similarity # calculating similarities between movies
cosine_simil = cosine_similarity(tf_matrix,tf_matrix)



In [99]:
# function
def get_recom(title, cosine_sim=cosine_simil):

    idx = df.index[df['title'] == title].tolist()[0] # index of the title

    sim_scores = list(enumerate(cosine_sim[idx]))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) # sort the movies based on similarities

    sim_scores = sim_scores[1:11] # get the top 10 movies

    movie_indices = [i[0] for i in sim_scores]

    return df['title'].iloc[movie_indices]


In [100]:
def get_recom_by_lang(title, cosine_sim=cosine_simil):
    idx = df.index[df['title'] == title].tolist()[0]
    input_language = df.iloc[idx]['original_language']
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    movie_indices = []
    for i in sim_scores[1:]:  # skip the input movie itself
        movie_idx = i[0]
        if df.iloc[movie_idx]['original_language'] == input_language:
            movie_indices.append(movie_idx)
        if len(movie_indices) == 10:
            break

    return df['title'].iloc[movie_indices]


In [101]:
recommended_movies = get_recom('3 Idiots')  # Replace with any movie title
print(recommended_movies)

recommended_movies_language = get_recom_by_lang('3 Idiots')
print(recommended_movies_language)

6857                      Bring It On
4019                        Overboard
9471                       Poison Ivy
3115                      Re-Animator
9424        Van Wilder: Freshman Year
4907                        Contagion
6555                          Hackers
2807                     The Bad Seed
6664                    Brad's Status
9839    Van Wilder 2: The Rise of Taj
Name: title, dtype: object
2331                      Rang De Basanti
1             Dilwale Dulhania Le Jayenge
2104                  Munna Bhai M.B.B.S.
467                     Bajrangi Bhaijaan
3258                       Dil Chahta Hai
1657                               Swades
1906    Lagaan: Once Upon a Time in India
1981                       Chak De! India
1334                        Kal Ho Naa Ho
3591                            Padmaavat
Name: title, dtype: object


In [None]:
import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Combobox(
    placeholder='Type or select a movie title',
    options=list(df['title'].unique()),
    description='Select Movie:',
    ensure_option=True,
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='70%')
)

output = widgets.Output()

def on_movie_change(change):
    with output:
        output.clear_output()
        selected = change['new']
        if selected in df['title'].values:
            recommendations = get_recom(selected)
            recommendations_lan = get_recom_by_lang(selected)
            print("Recommended Movies:")
            print(recommendations.to_string(index=False))
            print()
            print("Recommended Movies (by language):")
            print(recommendations_lan.to_string(index=False))
        else:
            print("Please select a valid movie from the list.")

movie_input.observe(on_movie_change, names='value')

display(movie_input, output)
