<h1 style="font-family:verdana;"> <center> Interactive TV Show Recommendation System with Plotly </center> </h1>


***

In [None]:
#Basic libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy import spatial

import matplotlib.pyplot as plt
import plotly.graph_objects as go

import operator
from sklearn.feature_extraction.text import TfidfVectorizer



import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the Netflix titles catalogue from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/netflix-shows/netflix_titles.csv')

<h1 id="feature" style="font-family:verdana;"> 
    <center>1. Feature Engineering
        <a class="anchor-link" href="https://www.kaggle.com/miljan/interactive-tv-show-recommendation-system/#feature">¶</a>
    </center>
</h1>

In [None]:
# Preview the first rows of the raw catalogue.
df.head()

In [None]:
# (rows, columns) of the raw catalogue.
df.shape

In [None]:
# Count how many titles fall under each value of the 'type' column.
df['type'].value_counts()

In [None]:
# Work on an independent copy of the rows whose type is exactly 'TV Show'.
tv_shows = df[df['type'] == 'TV Show'].copy()

In [None]:
# (rows, columns) after keeping only TV shows.
tv_shows.shape

In [None]:
# Load the trending-TV-shows table from the Kaggle input directory.
ratings = pd.read_csv('/kaggle/input/trending-tv-shows-on-netflix/TV Shows - Netflix.csv')

In [None]:
# Preview the first rows of the trending-shows table.
ratings.head()

In [None]:
# (rows, columns) of the trending-shows table.
ratings.shape

In [None]:
# Keep only the rows of `ratings` whose 'Titles' value equals a TV-show
# 'title', attaching that show's catalogue columns (merge defaults to an
# inner join).
# NOTE(review): this overwrites `ratings`, so re-running the cell merges an
# already-merged frame; restart the kernel before running it again.
ratings = ratings.merge(tv_shows, left_on='Titles', right_on='title')

In [None]:
# Show the 'Titles' column of the merged frame.
ratings['Titles']

> <h2 id="genre" style="font-family:verdana;">
>          1.1 Genre
>         <a class="anchor-link" href="https://www.kaggle.com/miljan/interactive-movie-recommendation-system/#genre">¶</a>
> </h2>

In [None]:
# Renumber the rows 0..n-1. `drop=True` discards the old row labels instead of
# storing them in an extra 'index' column, which keeps this cell safe to
# re-run (the in-place variant adds another column on every execution).
tv_shows = tv_shows.reset_index(drop=True)

In [None]:
def tag_extractor(feature):
    """Collect the distinct comma-separated tags found in a ``tv_shows`` column.

    Every cell of ``tv_shows[feature]`` is split on ``,`` and leading
    whitespace is removed from each piece (trailing whitespace is kept).

    Parameters
    ----------
    feature : str
        Name of a string column of the module-level ``tv_shows`` frame.

    Returns
    -------
    list of str
        Unique tags in order of first appearance.
    """
    # A dict preserves insertion order and gives O(1) membership checks; a
    # list-based ``not in`` test is quadratic in the number of distinct tags.
    seen = {}
    for pieces in tv_shows[feature].str.split(','):
        # ``.str.split`` yields NaN (not a list) for missing cells; skip them
        # instead of failing while trying to iterate a float.
        if not isinstance(pieces, list):
            continue
        for piece in pieces:
            seen.setdefault(piece.lstrip(), None)
    return list(seen)

In [None]:
# Distinct genre tags found in 'listed_in', in order of first appearance.
genreList = tag_extractor('listed_in')

In [None]:
def binary(x, featureList):
    """One-hot encode a comma-separated tag string against ``featureList``.

    Parameters
    ----------
    x : str
        Comma-separated tags; leading whitespace of each tag is ignored.
    featureList : list of str
        Reference tags that define the positions of the output vector.

    Returns
    -------
    list of int
        One entry per element of ``featureList``: 1 if that tag occurs in
        ``x``, otherwise 0.
    """
    # Split the input once (it does not depend on the feature being checked)
    # and keep the tags in a set so each lookup is O(1).
    present = {piece.lstrip() for piece in x.split(',')}
    return [1 if word in present else 0 for word in featureList]

In [None]:
# One-hot genre vector per show, positions following genreList.
tv_shows['genres_bin'] = tv_shows['listed_in'].apply(binary, featureList=genreList)

> <h2 id="duration" style="font-family:verdana;">
>          1.2 Duration
>         <a class="anchor-link" href="https://www.kaggle.com/miljan/interactive-movie-recommendation-system/#duration">¶</a>
> </h2>

In [None]:
import re
# Replace each duration string with the list of digit runs it contains; the
# following cells reduce that list to a single int.
# NOTE(review): the column is overwritten, so re-running this cell after the
# values have become ints fails because re.findall expects a string.
tv_shows['duration'] = tv_shows['duration'].apply(lambda x: re.findall(r'\d+', x))

In [None]:
def numb_ex(numb_list):
    """Return the first entry of ``numb_list`` converted to ``int``.

    Returns ``None`` when ``numb_list`` has no entries.
    """
    entries = iter(numb_list)
    try:
        leading = next(entries)
    except StopIteration:
        return None
    return int(leading)

In [None]:
# Collapse each list of digit strings to its first number as an int.
tv_shows['duration'] = tv_shows['duration'].apply(numb_ex)

> <h2 id="rating" style="font-family:verdana;">
>          1.3 Rating
>         <a class="anchor-link" href="https://www.kaggle.com/miljan/interactive-movie-recommendation-system/#rating">¶</a>
> </h2>

In [None]:
# Frequency of each rating label, with missing values counted as well.
tv_shows['rating'].value_counts(dropna=False)

In [None]:
# Translate each content-rating label into a numeric age-like score.
# A dict key such as the string 'NaN' never matches a genuinely missing value,
# so missing or unlisted labels are given the fallback score of 10 explicitly
# with fillna right after the mapping.
RATING_TO_AGE = {
    'TV-MA': 17, 'TV-14': 14, 'TV-PG': 12, 'TV-G': 10,
    'G': 10, 'TV-Y': 2, 'TV-Y7': 7, 'TV-Y7-FV': 7,
    'NR': 10, 'R': 12, 'PG': 12,
}
tv_shows['rating'] = tv_shows['rating'].map(RATING_TO_AGE).fillna(10)

> <h2 id="country" style="font-family:verdana;">
>          1.4 Country
>         <a class="anchor-link" href="https://www.kaggle.com/miljan/interactive-movie-recommendation-system/#country">¶</a>
> </h2>

In [None]:
# Cast to str so the column can be split on commas.
# NOTE(review): any missing value becomes the literal string 'nan', which is
# then treated like a regular country tag by tag_extractor/binary.
tv_shows['country'] = tv_shows['country'].astype(str)

In [None]:
# Distinct country tags, in order of first appearance.
countryList = tag_extractor('country')

In [None]:
# One-hot country vector per show, positions following countryList.
tv_shows['country_bin'] = tv_shows['country'].apply(binary, featureList=countryList)

> <h2 id="casting" style="font-family:verdana;">
>          1.5 Casting
>         <a class="anchor-link" href="https://www.kaggle.com/miljan/interactive-movie-recommendation-system/#casting">¶</a>
> </h2>

In [None]:
# Cast to str so the column can be split on commas.
# NOTE(review): any missing value becomes the literal string 'nan', which is
# then treated like a regular cast-member tag by tag_extractor/binary.
tv_shows['cast'] = tv_shows['cast'].astype(str)

In [None]:
# Distinct cast-member tags, in order of first appearance.
castList = tag_extractor('cast')

In [None]:
# One-hot cast vector per show, positions following castList.
tv_shows['cast_bin'] = tv_shows['cast'].apply(binary, featureList=castList)

***

<h1 id="distance" style="font-family:verdana;"> 
    <center>2. Customized distance function
        <a class="anchor-link" href="https://www.kaggle.com/miljan/interactive-tv-show-recommendation-system/#distance">¶</a>
    </center>
</h1>

In [None]:
# Report how many rows repeat an earlier title, then keep only the first
# occurrence of each title.
duplicate_count = tv_shows.duplicated('title').sum()
print('Duplicate entries: {}'.format(duplicate_count))
tv_shows = tv_shows.drop_duplicates(subset='title')

In [None]:
# Give shows without a usable rating the fallback score of 10.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on a column selection may act on a temporary copy (pandas Copy-on-Write),
# leaving the frame unchanged.
tv_shows['rating'] = tv_shows['rating'].fillna(10)

In [None]:
# Cast descriptions to str before vectorising (a missing value becomes 'nan').
tv_shows['description'] = tv_shows['description'].astype(str)

In [None]:
# Word-level TF-IDF model; no document-frequency cut-offs and no accent
# stripping are applied.
vectorizer = TfidfVectorizer(
    encoding='utf-8',
    strip_accents=None,
    analyzer='word',
    max_df=1.0,
    min_df=0.0,
)

In [None]:
# Fit the vectorizer on every show description; the returned document-term
# matrix is only displayed as the cell output, not stored.
vectorizer.fit_transform(tv_shows['description'].astype(str))

In [None]:
def cosine_sim(text1, text2):
    """Return the TF-IDF similarity between two texts.

    Both texts are transformed with the already-fitted module-level
    ``vectorizer`` and the dot product of the two resulting rows is returned.
    Assuming the vectorizer keeps its default L2 row normalisation, that dot
    product is the cosine similarity (0 when the texts share no known term).

    Parameters
    ----------
    text1, text2 : str
        Texts to compare.

    Returns
    -------
    float
        Dot product of the two TF-IDF rows.
    """
    tfidf = vectorizer.transform([text1, text2])
    # `@` is matrix multiplication for both sparse matrices and sparse arrays,
    # and `.toarray()` is the documented dense conversion; the `.A` shorthand
    # is deprecated and unavailable on recent SciPy releases.
    return (tfidf @ tfidf.T).toarray()[0, 1]

In [None]:
def _overlap_distance(flags_a, flags_b):
    """Turn two equal-length 0/1 vectors into a distance.

    Returns ``len(flags_a) / shared`` where ``shared`` counts the positions set
    to 1 in both vectors, or ``len(flags_a)`` when no position is shared.
    """
    shared = sum(1 for left, right in zip(flags_a, flags_b) if left == 1 and right == 1)
    return len(flags_a) if shared == 0 else len(flags_a) / shared


def _show_row(title):
    """Return the first ``tv_shows`` row whose title equals ``title``."""
    matches = tv_shows.loc[tv_shows.title == title]
    if matches.empty:
        # Same exception type the positional lookup would raise, but with a
        # message that names the missing title.
        raise IndexError('No TV show titled {!r} in tv_shows'.format(title))
    return matches.iloc[0]


def distance(show1, show2):
    """Compute the weighted dissimilarity between two shows of ``tv_shows``.

    The score is the sum of six components:

    * description: ``0.05 / cosine_sim(descriptions)`` (infinite when the
      similarity is zero),
    * genre: overlap distance of ``genres_bin`` (weight 1),
    * cast: overlap distance of ``cast_bin`` (weight 0.001),
    * country: overlap distance of ``country_bin`` (weight 0.05),
    * rating: absolute difference of ``rating`` (weight 0.1),
    * duration: absolute difference of ``duration`` (weight 0.05).

    Parameters
    ----------
    show1, show2 : str
        Titles to compare; each must be present in ``tv_shows.title``.

    Returns
    -------
    float
        Smaller values mean more similar shows.

    Raises
    ------
    IndexError
        If either title is not present in ``tv_shows``.
    """
    a = _show_row(show1)
    b = _show_row(show2)

    # Compare the description texts themselves rather than the printed form
    # of a one-element array.
    similarity = cosine_sim(str(a['description']), str(b['description']))
    # No shared vocabulary means "infinitely far"; spelling it out avoids a
    # division by zero.
    descriptionDistance = 1 / similarity if similarity > 0 else float('inf')

    genreDistance = _overlap_distance(a['genres_bin'], b['genres_bin'])
    castDistance = _overlap_distance(a['cast_bin'], b['cast_bin'])
    countryDistance = _overlap_distance(a['country_bin'], b['country_bin'])

    # Scalars are read straight from the row, so int() never receives a
    # one-element array.
    ratingDistance = abs(int(a['rating']) - int(b['rating']))
    durationDistance = abs(int(a['duration']) - int(b['duration']))

    return (0.05*descriptionDistance + genreDistance + 0.001*castDistance + 0.05*countryDistance + 0.1*ratingDistance + 0.05*durationDistance)

In [None]:
def get_recommendations(show, n=5):
    """Return the ``n`` shows of ``tv_shows`` closest to ``show``.

    Parameters
    ----------
    show : str
        Title of the reference show.
    n : int, optional
        Maximum number of neighbours to return (default 5). Fewer are
        returned when ``tv_shows`` holds fewer than ``n`` other titles.

    Returns
    -------
    list of (str, float)
        ``(title, distance)`` pairs sorted by increasing distance.
    """
    # Only the title is needed, so iterate over that column directly instead
    # of materialising every row.
    distances = [
        (title, distance(title, show))
        for title in tv_shows['title']
        if title != show
    ]
    # Slicing copes with short candidate lists; indexing the first five
    # entries one by one would raise IndexError.
    return sorted(distances, key=operator.itemgetter(1))[:n]

In [None]:
def similar_shows_df(title):
    """Build a table with a show and its nearest neighbours.

    Parameters
    ----------
    title : object
        Title of the reference show; converted with ``str``.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``listed_in``, ``description`` and ``distance``.
        The first row is the reference show (distance 0); the following rows
        are the neighbours returned by ``get_recommendations`` in the same
        order, with their distances.
    """
    title = str(title)
    neighbors = get_recommendations(title)
    columns = ['title', 'listed_in', 'description']

    # Gather all pieces first and concatenate once; DataFrame.append is not
    # available in pandas 2.x.
    frames = [tv_shows.loc[tv_shows.title == title, columns]]
    frames.extend(tv_shows.loc[tv_shows.title == name, columns] for name, _ in neighbors)
    result = pd.concat(frames, ignore_index=True)

    result['distance'] = [0] + [dist for _, dist in neighbors]
    return result
    

***

<h1 id="plot" style="font-family:verdana;"> 
    <center>3. Interactive TV Show Recommendations
        <a class="anchor-link" href="https://www.kaggle.com/miljan/interactive-tv-show-recommendation-system/#plot">¶</a>
    </center>
</h1>

In [None]:
# Show the 'Titles' column of the merged frame again for reference.
ratings['Titles']

In [None]:
# Build one recommendation table per show; each call scans every show in
# tv_shows to compute distances.
breaking_bad_df = similar_shows_df('Breaking Bad')
dark_df = similar_shows_df('Dark')
stranger_things_df = similar_shows_df('Stranger Things')
sherlock_df = similar_shows_df('Sherlock')
friends_df =similar_shows_df('Friends')
better_call_saul_df = similar_shows_df('Better Call Saul')
supernatural_df = similar_shows_df('Supernatural')
black_mirror_df = similar_shows_df('Black Mirror')
aot_df = similar_shows_df('Attack on Titan')


In [None]:
# Recommendation table for 'Peaky Blinders'.
peaky_b_df = similar_shows_df('Peaky Blinders')

In [None]:

# Recommendation tables for the remaining shows offered in the dropdown.
tua_df = similar_shows_df('The Umbrella Academy')
narcos_df = similar_shows_df('Narcos')
daredevil_df = similar_shows_df("Marvel's Daredevil")
twd_df = similar_shows_df('The Walking Dead')
par_df = similar_shows_df('Parks and Recreation')
suits_df = similar_shows_df('Suits')
dexter_df = similar_shows_df('Dexter')
man_men_df = similar_shows_df('Mad Men')
fma_df = similar_shows_df('Fullmetal Alchemist: Brotherhood')
ozark_df = similar_shows_df('Ozark')
witcher_df = similar_shows_df('The Witcher')
lucifer_df = similar_shows_df('Lucifer')

In [None]:
# All recommendation tables, in the order their traces are added to the figure.
# NOTE(review): the dropdown cell below labels buttons from ratings['Titles'],
# so this order presumably has to match that column - confirm.
dfs = [breaking_bad_df, dark_df, stranger_things_df, sherlock_df, friends_df, better_call_saul_df, supernatural_df, black_mirror_df,
      aot_df, peaky_b_df, tua_df, narcos_df, daredevil_df, twd_df, par_df, suits_df,  dexter_df, man_men_df, fma_df, ozark_df, witcher_df, lucifer_df]

In [None]:
# One plotly Table trace per recommendation table, in the order of `dfs`.
# Only the first trace starts visible: all tables share the same plotting
# area, so leaving every trace visible would draw them on top of each other
# until a dropdown entry is picked.
data = [
    go.Table(
        header=dict(
            values=list(show_df.columns),
            font=dict(size=10),
            align="left"
        ),
        cells=dict(
            values=[show_df[column].tolist() for column in show_df.columns],
            align="left"
        ),
        visible=(position == 0),
    )
    for position, show_df in enumerate(dfs)
]

In [None]:
# Bundle every table trace into a single figure.
fig = go.Figure(data=data)

In [None]:
# One dropdown button per table trace. Each button shows exactly one trace and
# sets the figure title to that table's reference show (the first row of the
# table). Deriving both the label and the mask from `dfs` keeps every button
# aligned with the trace it reveals, and the mask has exactly one entry per
# trace (the previous expression produced one entry too many).
update_list = [
    dict(
        label=show_df['title'].iloc[0],
        method="update",
        args=[
            {"visible": [slot == position for slot in range(len(dfs))]},
            {"title": show_df['title'].iloc[0]},
        ],
    )
    for position, show_df in enumerate(dfs)
]

In [None]:
# Add dropdown 
fig.update_layout( 
    updatemenus=[ 
        dict( 
            active=0, 
            buttons=update_list 
        ) 
    ]) 
  

<center style="font-family:cursive; font-size:18px; color:#159364;">Making the best of Plotly on Kaggle — it would be so much better with Dash, though.</center>