# Show Content Recommender
> Author: Sharnique Beck

This notebook is where I build the content-based recommender system.

In [44]:
# import Libraries
import pandas as pd
import re

from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

from nltk.tokenize import RegexpTokenizer


In [59]:
# Per-title content strings: one row per show, with a 'content' column of
# space-separated tokens (see the preview below)
data = pd.read_csv('../data/merged_content.csv')
# Show metadata: title, url, container id, rating and number of ratings
shows = pd.read_csv('../data/k_titles.csv')
data.head()

Unnamed: 0,title,content
0,Thirty But Seventeen,KoreanDrama Drama Romance Comedy Crime&Mystery...
1,Fates and Furies,Romance Melodrama KoreanDrama JooSangWook LeeM...
2,The Last Empress,KoreanDrama Drama Romance ShinSungRok JangNara...
3,Encounter,Romance KoreanDrama ParkBoGum SongHyeKyo JangS...
4,My Strange Hero,Romance Melodrama Drama KoreanDrama YooSeungHo...


In [74]:
# Preview the show metadata table
shows.head()

Unnamed: 0,title,url,container,rating,# ratings
0,Thirty But Seventeen,https://www.viki.com/tv/36109c-thirty-but-seve...,36109c,9.58,7368
1,Fates and Furies,https://www.viki.com/tv/36240c-fates-and-furies,36240c,9.14,1401
2,The Last Empress,https://www.viki.com/tv/36241c-the-last-empress,36241c,9.45,3858
3,Encounter,https://www.viki.com/tv/36239c-encounter,36239c,9.59,5436
4,My Strange Hero,https://www.viki.com/tv/36330c-my-strange-hero,36330c,9.56,2239


In [35]:
# Tokens are delimited by runs of whitespace only, so entries such as
# "Crime&Mystery" are kept as a single token
REGEX = re.compile(r"\s+")
def tokenize(text):
    """Split `text` on whitespace runs and return the pieces as a list.

    Leading or trailing whitespace in `text` produces empty-string tokens
    at the corresponding end of the list.
    """
    pieces = REGEX.split(text)
    return list(map(str.strip, pieces))

In [60]:
# Bag-of-tokens counts per show. Case is preserved, the custom tokenizer splits
# on whitespace only, and the empty string and '[]' are discarded as stop words.
vect = CountVectorizer(lowercase=False,tokenizer=tokenize, stop_words=['','[]'])
cont = vect.fit_transform(data['content'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back only on older releases that lack it.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()

# Dense show-by-token count table, indexed by show title
content = pd.DataFrame(cont.toarray(), columns=feature_names)
content.index = data['title']
content.head()

Unnamed: 0_level_0,"""Jimin(AOAs)""","""KimTaehyung(BTSsV)""","""KwonMina(AoAsMinA)""",(G)I-DLE,10cm,A-Tom,AOA,AbigailAlderete,Action&Adventure,AgathaChristie,...,YunMinSu,YunSangHo,YunSeongSik,Yuna,Yura,ZAZI,gugudan,iKON,kk,nikitaklæstrup
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Thirty But Seventeen,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Fates and Furies,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
The Last Empress,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Encounter,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
My Strange Hero,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
# Cosine distance between the token-count rows of every pair of shows
# (smaller value = more similar); rows and columns follow the order of `content`
recommender = pairwise_distances(content, metric='cosine')
recommender[:5,:5]

array([[0.        , 0.8232233 , 0.82850141, 0.88888889, 0.82850141],
       [0.8232233 , 0.        , 0.87873219, 0.88214887, 0.81809828],
       [0.82850141, 0.87873219, 0.        , 0.88566761, 0.82352941],
       [0.88888889, 0.88214887, 0.88566761, 0.        , 0.88566761],
       [0.82850141, 0.81809828, 0.82352941, 0.88566761, 0.        ]])

In [62]:
# Convert the distance matrix into a DataFrame labelled by show title on both
# axes, so that distances can be looked up by name
recommender_df = pd.DataFrame(recommender, index = content.index, columns=content.index)
recommender_df.head()

title,Thirty But Seventeen,Fates and Furies,The Last Empress,Encounter,My Strange Hero,What’s Wrong With Secretary Kim,Devilish Joy,I Am Not a Robot,Suspicious Partner,Weightlifting Fairy Kim Bok Joo,...,I Came Alone,Assembly,2017 SBS Entertainment Awards,Ricky Traveling Alone,2016 DMC Festival,All Broadcasts of the World,2017 MBC Entertainment Awards,Join Us Korea,LC9's Life of Research,The Seg 101 Express
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Thirty But Seventeen,0.0,0.823223,0.828501,0.888889,0.828501,0.899496,0.833333,0.789181,0.777778,0.828501,...,0.916667,0.878284,0.951887,1.0,1.0,0.937006,0.951887,1.0,1.0,1.0
Fates and Furies,0.823223,0.0,0.878732,0.882149,0.818098,0.8934,0.823223,0.888197,0.882149,0.878732,...,1.0,0.806351,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
The Last Empress,0.828501,0.878732,0.0,0.885668,0.823529,0.896582,0.742752,0.837302,0.828501,0.823529,...,1.0,0.874755,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Encounter,0.888889,0.882149,0.885668,0.0,0.885668,0.899496,0.833333,0.894591,0.833333,0.885668,...,1.0,0.939142,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
My Strange Hero,0.828501,0.818098,0.823529,0.885668,0.0,0.896582,0.828501,0.728837,0.828501,0.823529,...,1.0,0.812133,0.900985,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [63]:
# Ten closest shows to 'Healer' (smallest cosine distance first).
# The query title is removed by label rather than by slicing off the first
# sorted row: a show with an identical token profile is also at distance 0,
# so 'Healer' is not guaranteed to sort first.
recommender_df['Healer'].drop('Healer').nsmallest(10)

title
The Great Doctor        0.636197
Queen for Seven Days    0.655876
The K2                  0.677251
Old Goodbye             0.693814
EXIT                    0.698489
Again, My Love          0.711325
Star’s Lover            0.711325
Traces of the Hand      0.716527
Hotelier                0.716527
Tamra the Island        0.720492
Name: Healer, dtype: float64

In [None]:
# Search for shows whose title contains the given text and, for each match,
# print its rating summary followed by its 10 closest shows.
search = "LeeMinHo"

# regex=False matches the search text literally; na=False skips missing titles
# instead of producing a NaN mask that cannot be used for filtering.
title_matches = shows[shows['title'].str.contains(search, regex=False, na=False)]

for t in title_matches['title'].values:
    # Ratings live in `shows` ('rating' and '# ratings'); `data` only holds
    # 'title' and 'content', so it cannot be used for these lookups.
    show_rows = shows[shows['title'] == t]
    print(t)
    print('Average Rating', show_rows['rating'].mean())
    print('Number of Ratings', show_rows['# ratings'].sum())
    print(' ')
    print('Recommendations:')
    # Exclude the show itself by label; it is not guaranteed to sort first
    # when another show is also at distance 0.
    print(recommender_df[t].drop(t).nsmallest(10))
    print(' ')
    print('***********************************************************')
    

In [64]:
# Cosine similarity between the token-count rows of every pair of shows
# (larger value = more similar); this is the matrix recommendations() ranks by
cosine_sim = cosine_similarity(content, content)
cosine_sim[:5,:5]

array([[1.        , 0.1767767 , 0.17149859, 0.11111111, 0.17149859],
       [0.1767767 , 1.        , 0.12126781, 0.11785113, 0.18190172],
       [0.17149859, 0.12126781, 1.        , 0.11433239, 0.17647059],
       [0.11111111, 0.11785113, 0.11433239, 1.        , 0.11433239],
       [0.17149859, 0.18190172, 0.17647059, 0.11433239, 1.        ]])

In [71]:
# Lookup table from row position to show title, in the same order as the rows
# of `content` (and therefore of `cosine_sim`)
indices = pd.Series(content.index)
indices[:5]

0    Thirty But Seventeen
1        Fates and Furies
2        The Last Empress
3               Encounter
4         My Strange Hero
Name: title, dtype: object

In [72]:
# function that takes in a show title as input and returns the top recommended shows
def recommendations(title, cosine_sim = cosine_sim, top_n = 10):
    """Return the titles of the shows most similar to `title`.

    Parameters
    ----------
    title : str
        Exact show title, as stored in `indices`.
    cosine_sim : array-like, optional
        Square similarity matrix whose rows follow the order of `indices`.
        Defaults to the notebook-level `cosine_sim`.
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    list
        Up to `top_n` show titles, most similar first.

    Raises
    ------
    IndexError
        If `title` is not present in `indices`. The exception type matches
        what the bare `.index[0]` lookup raised before, now with a readable
        message.
    """
    # getting the row position of the show that matches the title
    matching_rows = indices[indices == title].index
    if len(matching_rows) == 0:
        raise IndexError(f"title {title!r} not found in the show catalogue")
    idx = matching_rows[0]

    # similarity of the query show to every show, keyed by row position
    score_series = pd.Series(cosine_sim[idx])

    # Remove the query show by position rather than assuming it sorts first:
    # tied scores (e.g. a show with an identical token profile) or an all-zero
    # row would otherwise let the query show appear in its own recommendations.
    top_scores = score_series.drop(idx).nlargest(top_n)

    # map row positions back to titles (index object is reused, not rebuilt per item)
    return [content.index[i] for i in top_scores.index]

In [73]:
recommendations('Healer')

['The Great Doctor',
 'Queen for Seven Days',
 'The K2',
 'Old Goodbye',
 'EXIT',
 'Star’s Lover',
 'Again, My Love',
 'Traces of the Hand',
 'Hotelier',
 'Tamra the Island']