In [2]:
import pandas as pd
import requests
import re
import string
from operator import itemgetter
from nltk.stem import PorterStemmer
from __future__ import print_function
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [3]:
def cleanhtml(raw_html):
    clean = re.compile('<.*?>')
    cleantext = re.sub(clean, '', raw_html)
    return cleantext

In [11]:
def tokenize(text):
    """Returns a list of words that make up the text.    
    Params: {text: String}
    Returns: List
    """
    return list(filter(str.strip, list(map(lambda x: x, re.findall(r'[a-zA-Z]*', text)))))

In [12]:
def stem(text):
    stemmer=PorterStemmer()
    stems = [stemmer.stem(w) for w in tokenize(text)]
    return " ".join(stems)

In [13]:
def preprocess_text(text):
    text = str(text)
    text = cleanhtml(text)
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip()
    return text

In [14]:
def preprocess(data):
    for (index,value) in data['Summary'].items():
        value = preprocess_text(value)
        value = stem(value)
        data.loc[index,'Summary'] = value
    return data

In [16]:
n_feats = 5000
doc_by_vocab = np.empty([len(data), n_feats])

In [18]:
def build_vectorizer(max_features, stop_words, max_df=0.8, min_df=10, norm='l2'):
    """Returns a TfidfVectorizer object with the above preprocessing properties.
    
    Params: {max_features: Integer,
             max_df: Float,
             min_df: Float,
             norm: String,
             stop_words: String}
    Returns: TfidfVectorizer
    """
    
    result = TfidfVectorizer(max_features = max_features, stop_words = stop_words, max_df = max_df, min_df = min_df, norm = norm)
    return result

data = preprocess(data)
tfidf_vec = build_vectorizer(n_feats, "english")
doc_by_vocab = tfidf_vec.fit_transform([value for _,value in data['Summary'].items()]).toarray()
index_to_vocab = {i:v for i, v in enumerate(tfidf_vec.get_feature_names())}
movie_index_to_name = data['Title'].to_dict()
movie_name_to_index = {v: k for k, v in movie_index_to_name.items()}
num_movies = len(data)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [19]:
def get_sim(mov1, mov2, input_doc_mat, movie_name_to_index):
    """Returns a float giving the cosine similarity of 
       the two movie transcripts.
    
    Params: {mov1: String,
             mov2: String,
             input_doc_mat: Numpy Array,
             movie_name_to_index: Dict}
    Returns: Float (Cosine similarity of the two movie transcripts.)
    """
    idx1 = movie_name_to_index[mov1]
    idx2 = movie_name_to_index[mov2]
    movie1 = input_doc_mat[idx1,]
    movie2 = input_doc_mat[idx2,]
    dot_product = np.dot(movie1, movie2)
    return dot_product

In [2]:
def build_movie_sims_cos(n_mov, movie_index_to_name, input_doc_mat, movie_name_to_index, input_get_sim_method):
    """Returns a movie_sims matrix of size (num_movies,num_movies) where for (i,j):
        [i,j] should be the cosine similarity between the movie with index i and the movie with index j
    
    Params: {n_mov: Integer,
             movie_index_to_name: Dict,
             input_doc_mat: Numpy Array,
             movie_name_to_index: Dict,
             input_get_sim_method: Function}
    Returns: Numpy Array
    """
    result = np.zeros((n_mov, n_mov))
    for i in range(n_mov):
        for j in range(n_mov):
            if i == j:
                result[i,j] = 0
            else:
                mov1 = movie_index_to_name[i]
                mov2 = movie_index_to_name[j]
                result[i,j] = input_get_sim_method(mov1, mov2, input_doc_mat, movie_name_to_index)
    
            
    return result


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [111]:
movie_sims_cos = build_movie_sims_cos(num_movies, movie_index_to_name, doc_by_vocab, movie_name_to_index, get_sim)
print(movie_sims_cos)

                     Drama_Title  Summary_Similarity  Genre_Similarity  \
667                     Stranger            0.432997               1.0   
415          Sweet Savage Family            0.276500               1.0   
722          The Queen of Office            0.237680               1.0   
824           The Wedding Scheme            0.218926               1.0   
242        Bad Thief, Good Thief            0.212300               1.0   
312                  Be Positive            0.212232               1.0   
428             You Will Love Me            0.210892               1.0   
498                    The Lover            0.208423               1.0   
333  Cinderella and Four Knights            0.207837               1.0   
32              The Best Chicken            0.205444               1.0   

     Network_Similarity     Total  
667                   0  1.432997  
415                   0  1.276500  
722                   0  1.237680  
824                   0  1.218926  
242  

In [193]:
def best_match(n_mov, movie_sims_cos, data, movie_index_to_name, movie_name_to_index, dramas_enjoyed, dramas_disliked, preferred_genres, preferred_network, num_results):
    feature_list = ['Summary_Similarity', 'Genre_Similarity', 'Network_Similarity', 'Total']
    result = pd.DataFrame(0, index=np.arange(n_mov), columns=feature_list)
    genres = set()
    preferred_genres = [preprocess_text(value) for value in preferred_genres]
    genres.update(preferred_genres)
    for drama in dramas_enjoyed:
        if drama in movie_name_to_index.keys():
            index = movie_name_to_index[drama]
            sim = movie_sims_cos[index,:]
            result['Summary_Similarity']+= pd.Series(sim)
            
    for drama in dramas_disliked:
        if drama in movie_name_to_index.keys():
            index = movie_name_to_index[drama]
            sim = movie_sims_cos[index,:]
            result['Summary_Similarity']-= pd.Series(sim)
            
    for index, value in data.iterrows():
        gen = str(value['Genre'])
        gen = preprocess_text(gen)
        drama_genres = set()
        drama_genres.update(gen.split())
        result.loc[index,'Genre_Similarity'] = len(genres.intersection(drama_genres))/len(genres.union(drama_genres))
        if preferred_network == data.iloc[index]['Network']:
            result['Network_Similarity']+=1
    result['Total'] = result.sum(axis = 1)
    result = result.sort_values(by='Total', ascending=False)
    result = result[:num_results]
    indices =  result.index.tolist()
    best_dramas = pd.Series([movie_index_to_name[index] for index in indices],index = result.index)
    result.insert(loc=0, column='Drama_Title', value=best_dramas)
    result.reset_index()
    return result


best_match(num_movies, movie_sims_cos, data, movie_index_to_name, movie_name_to_index, ['Legend of the Blue Sea'], [], ["drama", 'Medical'], [], 10)      


Unnamed: 0,Drama_Title,Summary_Similarity,Genre_Similarity,Network_Similarity,Total
1111,General Hospital 2,0,1.0,0,1.0
1431,Baekmansongi Jangmi / One Million Roses,0,0.5,0,0.5
833,Immortal Classic,0,0.5,0,0.5
1171,Before and After: Plastic Surgery Clinic,0,0.5,0,0.5
966,"Gwaenchanha, Appa Ttal / Itâ€™s Okay, Daddyâ€™...",0,0.5,0,0.5
1288,The Invisible Man,0,0.5,0,0.5
1301,Dr. Gang,0,0.5,0,0.5
1430,Yo-jo-suk-nyeo / My Fair Lady,0,0.5,0,0.5
1328,My Rosy Life,0,0.5,0,0.5
774,The 3rd Hospital,0,0.5,0,0.5


In [228]:
def display (n_mov, movie_sims_cos, data, movie_index_to_name, movie_name_to_index, dramas_enjoyed, dramas_disliked, preferred_genres, preferred_network, num_results):
    dramas_enjoyed = dramas_enjoyed.split(', ')
    print(dramas_enjoyed)
    dramas_disliked = dramas_disliked.split(', ')
    preferred_genres = preferred_genres.split(', ')
    print(preferred_genres)
    preferred_network = preferred_network.split(', ')
    print(preferred_network)
    best = best_match(n_mov, movie_sims_cos, data, movie_index_to_name, movie_name_to_index, dramas_enjoyed, dramas_disliked, preferred_genres, preferred_network, num_results)
    title = list(zip(best['Drama_Title'], best["Total"]))
    final = {}
    for x in title:
        title_name = x[0]
        final.update({x[0]: ''})
        final[title_name] += data['Summary'][list(data['Title']).index(title_name)]
    return ['Drama Title: {},  Summary: {},  Score: {}'.format(x[0], final[x[0]], x[1]) for x in title]

In [229]:
display(num_movies, movie_sims_cos, data, movie_index_to_name, movie_name_to_index, "Doctors, Doctor Stranger", 'black', "comedy, medical", '', 10)
# print(best_match(num_movies, movie_sims_cos, data, movie_index_to_name, movie_name_to_index, ["Doctors", "Doctor Stranger"], ["black"], ["comedy", 'Medical'], [], 10))     


['Doctors', 'Doctor Stranger']
['comedy', 'medical']
['']


['Drama Title: Golden Time,  Summary: thi drama is set within the backdrop of an emerg room at a hospitalth one hour you have to save a person s life after a traumat injuri is what emerg physician call golden time lee min woo lee sun gyun learn thi all too quickli as he start work as an emerg physician at an urban hospit kang jae in hwang jung eum is a first year resid who discov that she is the heiress to a foundat that own the hospit can they learn to becom great doctor from dr choi in hyuk lee sung min a renown trauma surgeon who put hi patient befor everyth els in hi life,  Score: 0.966399020676921',
 'Drama Title: Emergency Man and Woman,  Summary: a romant comedi about a coupl who marri dure medic school but due to their clash person divorc soon afterward they are reunit year later as intern at a hospit will love strike again for the twojin hee song ji hyo had a good life as a dietitian when she met medic school student chang min choi jin hyuk chang min come from a wealthi famili

In [212]:
data['Summary'][list(data['Title']).index("My First First Love")]

'due to variou person reason a group of yun tae oh ji soo s friend move into hi hous where they experi love friendship and everyth in between'