In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import string
from nltk.corpus import stopwords
import warnings 
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [2]:
df = pd.read_csv('/Users/riyamhatre/Desktop/cleaned_imdb.csv').drop(columns = {'Unnamed: 0'})

In [3]:
def rec_system(media, df):
    df = pd.read_csv('/Users/riyamhatre/Desktop/cleaned_imdb.csv').drop(columns = {'Unnamed: 0'})
    data = df[['type','title', 'imdb_ratings', 'country','rating', 'description', 'genre', 'imdb_votes', 'language', 'binary_awards']]
    data['genre'] = data['genre'].apply(lambda x: x.split(','))
    
    def ratings_encoding(rating):  
        # All Audiences/Children
        all_audiences = ['TV-Y', 'TV-G', 'G', 'TV-Y7', 'TV-Y7-FV']
        # Older Children/Tweens/Teens
        older_kids_teens = ['TV-PG', 'PG', 'TV-14', 'PG-13']
        # Mature Audiences
        mature_audiences = ['R', 'TV-MA', 'NC-17']
        # Unrated/Not Rated
        unrated = ['NR', 'UR']

        if rating in all_audiences:
            return 0
        if rating in older_kids_teens:
            return 1
        if rating in mature_audiences:
            return 2
        if rating in unrated:
            return 3
    data['ratings_encoded'] = data['rating'].apply(lambda x: ratings_encoding(x))

    rating = ratings_encoding(data[data['title'] == media]['rating'].iloc[0])
    data= data[(data['ratings_encoded'] == rating) |(data['ratings_encoded'] == 3)]
    
    #genre filter
    genre_list = data[data['title'] == media]['genre'].iloc[0]
    data = data[data['genre'].apply(lambda g: len(set(g) & set(genre_list)) > 0)]
    data['genre_matches'] = data['genre'].apply(lambda g: len(set(g) & set(genre_list)))  
    
    def clean(df):
        lst = []
        desc = df['description'].apply(lambda x: x.lower().replace('\n','').replace('—', '').replace('–', '').replace('  ',' '))
        punctuation = list(string.punctuation)
        lst = []
        for i in desc:
            s = ''
            for j in i:
                if j not in punctuation:
                    s+= j
            lst.append(s)
        stop_words = set(stopwords.words('english'))
        no_stopword_text = [w for w in lst if not w in stop_words]
        return no_stopword_text
    
    data['cleaned_descr'] = clean(data)
    data = data.reset_index().drop(columns = {'index'})
    
    #convert text into numerical vectors
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(data['cleaned_descr'])
    
    # Calculate cosine similarity between the movies/shows
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    
    def cos_sim(media, cosine_sim, data):
        # Ensure the input title is valid and exists in the dataset
        if media not in data['title'].values:
            return f"Sorry, we couldn't find '{media}' in the database."

        # Find the index of the movie/show the user entered
        idx = data[data['title'] == media].index[0]
        #return cosine_sim[idx]
        data['cos_sim_score'] =  cosine_sim[idx]

        return data.sort_values(by = 'cos_sim_score', ascending = False).iloc[1:]
    
    preproc_data = cos_sim(media, cosine_sim, data)
    preproc_data = preproc_data[preproc_data['cos_sim_score'] >0]

    ## standardized_imdb_votes
    votes_array = np.array(preproc_data['imdb_votes']).reshape(-1, 1)
    scaler1 = StandardScaler()
    standardized_votes = scaler1.fit_transform(votes_array)
    standardized_votes = standardized_votes.flatten()
    preproc_data['standardized_imdb_votes'] = standardized_votes

    ## minmax_imdb_votes
    scaler2 = MinMaxScaler()
    preproc_data['minmax_imdb_votes'] = scaler2.fit_transform(preproc_data[['imdb_votes']])

    ## standardized_imdb_ratings
    votes_array = np.array(preproc_data['imdb_ratings']).reshape(-1, 1)
    scaler3 = StandardScaler()
    standardized_votes = scaler3.fit_transform(votes_array)
    standardized_votes = standardized_votes.flatten()
    preproc_data['standardized_imdb_ratings'] = standardized_votes

    ## minmax_ratings
    scaler4 = MinMaxScaler()
    preproc_data['minmax_ratings'] = scaler4.fit_transform(preproc_data[['imdb_ratings']])

    ## minmax_genre
    scaler5 = MinMaxScaler()
    preproc_data['minmax_genre'] = scaler5.fit_transform(preproc_data[['genre_matches']])

    ## standardized_genre
    scaler6 = StandardScaler()
    preproc_data['standardized_genre'] = scaler6.fit_transform(preproc_data[['genre_matches']])

    def weights(col_indicator, preproc_data):
        if col_indicator == 'minmax':
            cols = ["minmax_imdb_votes", "minmax_ratings", "cos_sim_score", "minmax_genre"]
        elif col_indicator == 'standardized': 
            cols = ["standardized_imdb_votes", "standardized_imdb_ratings", "cos_sim_score", "minmax_genre"]

        weights = [0.1, 0.8,1,0.75]
        preproc_data["final_score"] = preproc_data[cols] @ weights

        return preproc_data.sort_values(by = "final_score", ascending = False)
    return weights('minmax', preproc_data)[['type','title','imdb_ratings','country','rating','description','genre', 'imdb_votes']]


In [4]:
import ipywidgets as widgets
from IPython.display import display

# Sample DataFrame
media = "Merlin"
# Create a dropdown with autocomplete
dropdown = widgets.Combobox(
    placeholder='Type a movie name',
    options=df['title'].tolist(),
    description='Movie:',
    ensure_option=True,
    continuous_update=False
)

display(dropdown)

def on_submit(change):
    media = change['new']
    if media not in df['title'].values:
        print("Sorry, that movie isn't in this database! Pick another one!")
    else:
        recommended = rec_system(media, df)
        print("\nRecommended Movies/Shows for You:")
        display(recommended.head(5))

dropdown.observe(on_submit, names='value')


Combobox(value='', continuous_update=False, description='Movie:', ensure_option=True, options=('Dick Johnson I…


Recommended Movies/Shows for You:


Unnamed: 0,type,title,imdb_ratings,country,rating,description,genre,imdb_votes
1055,TV Show,Twin Peaks,8.7,United States,TV-14,"""Who killed Laura Palmer?"" is the question on ...","[Crime, Drama, Mystery]",228907
654,TV Show,Criminal Minds,8.1,"United States, Canada",TV-14,This intense police procedural follows a group...,"[Crime, Drama, Mystery]",226600
1056,Movie,Ugly,7.9,India,NR,When a 10-year-old goes missing while her fath...,"[Crime, Drama, Mystery]",23912
373,Movie,The Body,7.6,India,TV-14,Mind games and mystery abound as a detective i...,"[Crime, Drama, Mystery]",74989
286,Movie,Drishyam,8.2,India,TV-14,An unexceptional man becomes a murder suspect ...,"[Crime, Drama, Mystery]",99233
