In [2]:
from __future__ import print_function
import re
import string
from operator import itemgetter
import os
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import math
import json
from nltk.stem import PorterStemmer

In [3]:
path =  "cleaned_comprehensive_data.csv"
data = pd.read_csv(path)
num_dramas = len(data)
path2 = 'cosine_matrix.npy'
drama_sims_cos = np.load(path2)
path3 = 'korean_data.csv'
non_processed_data = pd.read_csv(path3)
path4 = 'genre_inclusion_matrix.npy'
genre_inclusion_matrix  = np.load(path4)
path5 = 'actors_inclusion_matrix.npy'
actors_inclusion_matrix  = np.load(path5)
path6 = 'years_inclusion_matrix.npy'
years_inclusion_matrix  = np.load(path6)
non_processed_data = pd.read_csv(path3)
drama_index_to_name = non_processed_data['Title'].to_dict()
process_dict = data['Title'].to_dict()
drama_name_to_index = {v: k for k, v in process_dict.items()}
drama_name_to_index_unprocess = {v: k for k, v in drama_index_to_name.items()}

with open('genre_name_to_index.json') as fp:
    genre_name_to_index = json.load(fp)
with open('actors_name_to_index.json') as fp2:
    actors_name_to_index = json.load(fp2)
with open('years_name_to_index.json') as fp3:
    years_name_to_index = json.load(fp3)
with open('genres_dict.json') as fp:
    genre_dict = json.load(fp)
with open('actors_dict.json') as fp2:
    actors_dict = json.load(fp2)
with open('years_dict.json') as fp3:
    years_dict = json.load(fp3)

def cleanhtml(raw_html):
    clean = re.compile('<.*?>')
    cleantext = re.sub(clean, '', raw_html)
    return cleantext

def tokenize(text):
    """Returns a list of words that make up the text.
    Params: {text: String}
    Returns: List
    """
    return list(filter(str.strip, list(map(lambda x: x, re.findall(r'[a-zA-Z]*', text)))))

def stem(text):
    """Removes stems from a string.
    Params: {text: String}
    Returns: String
    """
    stemmer=PorterStemmer()
    stems = [stemmer.stem(w) for w in tokenize(text)]
    return " ".join(stems)

def preprocess_text(text):
    text = str(text)
    text = cleanhtml(text)
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip()
    text = stem(text)
    return text

def map_network(network,x):
    if x == network:
        return 1
    else:
        return 0

def best_match(actors_dict, genre_inclusion_matrix, actors_inclusion_matrix, years_inclusion_matrix, genre_name_to_index, actors_name_to_index, years_name_to_index, drama_sims_cos, data, drama_index_to_name, drama_name_to_index, dramas_enjoyed, dramas_disliked, preferred_genres, preferred_network, preferred_actors, preferred_time_frame, num_results):

    feature_list = ['Summary_Similarity', 'Actor_Similarity', 'Genre_Similarity', 'Network_Similarity','Year_Similarity', 'Total']
    result = pd.DataFrame(0, index=np.arange(1466), columns=feature_list)
    genres = set()
    preferred_genres = [preprocess_text(value) for value in preferred_genres]
    genres.update(preferred_genres)
    years = preferred_time_frame
    start_year = int(years[0])
    end_year = int(years[1])
    preferred_actors_set = set()
    preferred_actors_set.update(preferred_actors)
    d = {k:len(v) for k, v in actors_dict.items()}
    actors_len_df = pd.DataFrame.from_dict(d, orient='index')
    actors_len_df.columns = ['Length']
    for drama in dramas_enjoyed:
        drama = drama.lower()
        drama = drama.strip()
        if drama in drama_name_to_index.keys():
            index = drama_name_to_index[drama]
            sim = drama_sims_cos[index,:1466]
            result['Summary_Similarity']+= pd.Series(sim)
    for drama in dramas_disliked:
        drama = drama.lower()
        drama = drama.strip()
        if drama in drama_name_to_index.keys():
            index = drama_name_to_index[drama]
            sim = drama_sims_cos[index,:1466]
            result['Summary_Similarity']-= pd.Series(sim)
    for genre in preferred_genres:
        if genre in genre_name_to_index.keys():
            index = genre_name_to_index[genre]
            result['Genre_Similarity']= genre_inclusion_matrix[:,index]
    for actor in preferred_actors:
        if actor in actors_name_to_index.keys():
            index = actors_name_to_index[actor]
            result['Actor_Similarity']+= actors_inclusion_matrix[:,index]
    actors_len_df['Length'] =  actors_len_df['Length'] + len(preferred_actors)
    actors_len_df['Length'] = actors_len_df['Length'].subtract(result['Actor_Similarity'], fill_value = 0)
    actor_sim2=result['Actor_Similarity']
    for idx in range(1466):
        result['Actor_Similarity'] = actor_sim2.iloc[idx]/actors_len_df['Length'].iloc[idx]
    if str(start_year) in years_name_to_index.keys():
        index = years_name_to_index[str(start_year)]
        result['Year_Similarity'] = years_inclusion_matrix[:,index]
    if str(end_year) in years_name_to_index.keys():
        index = years_name_to_index[str(end_year)]
        result['Year_Similarity'] = pd.concat([pd.Series(years_inclusion_matrix[:,index]), result['Year_Similarity']], axis=1).min(axis=1)
    result['Network_Similarity'] = data['Network'].apply(lambda x: map_network(x, preferred_network))
    result['Year_Similarity'] = 1 - result['Year_Similarity']/(result['Year_Similarity'].max()+1)
    result['Total'] = round(result['Summary_Similarity']*.6 + result['Actor_Similarity']*.1 + result['Year_Similarity']*.05 + result['Genre_Similarity']*.2 + result['Network_Similarity']*.05,5)
    result = result.sort_values(by='Total', ascending=False)
    result = result[:num_results]
    indices =  result.index.tolist()
    best_dramas = pd.Series([drama_index_to_name[index] for index in indices],index = result.index)
    result.insert(loc=0, column='Drama_Title', value=best_dramas)
    result.reset_index()
    return result

def display (dramas_enjoyed, dramas_disliked, preferred_genres, preferred_network, preferred_actors, preferred_time_frame, num_results):
    dramas_enjoyed = dramas_enjoyed.split(', ')
    dramas_disliked = dramas_disliked.split(', ')
    preferred_genres = preferred_genres.split(', ')
    preferred_actors =  preferred_actors.split(', ')
    preferred_time_frame = preferred_time_frame.split('-')
    best = best_match(actors_dict, genre_inclusion_matrix, actors_inclusion_matrix, years_inclusion_matrix, genre_name_to_index, actors_name_to_index, years_name_to_index,drama_sims_cos, data, drama_index_to_name, drama_name_to_index,  dramas_enjoyed, dramas_disliked, preferred_genres, preferred_network, preferred_actors, preferred_time_frame, num_results)
    result = list(zip(best['Drama_Title'], best["Total"]))
    titles = {}
    summaries = {}
    genres = {}
    ratings = {}
    runtimes = {}
    networks = {}
    actors = {}
    votes = {}
    years = {}
    for title, score in result:
        idx = drama_name_to_index_unprocess[title]
        summary = str(non_processed_data['Summary'].loc[idx])
        if summary != "nan":
            summaries[title] = summary
        else:
            summaries[title] = ""
        genre = str(non_processed_data['Genre'].loc[idx])
        if genre != "nan":
            genres[title] = genre
        else:
            genres[title] = ""
        rating = str(non_processed_data['Rating'].loc[idx])
        if rating != "nan":
            ratings[title] = rating
        else:
            ratings[title] = ""
        runtime = str(non_processed_data['Runtime'].loc[idx])
        if runtime != "nan":
            runtimes[title] = rating
        else:
            runtimes[title] = ""
        network = str(non_processed_data['Network'].loc[idx])
        if network != "nan":
            networks[title] = network
        else:
            networks[title] = ""
        actor = str(non_processed_data['Actors'].loc[idx])
        if actor != "nan":
            actors[title] = actor
        else:
            actors[title] = ""
        vote = str(non_processed_data['Votes'].loc[idx])
        if vote != "nan":
            votes[title] = vote
        else:
            votes[title] = ""
        year = str(data['Year'].loc[idx])
        if year != "nan":
            years[title] = year
        else:
            years[title] = ""
    return ['Drama Title: {},  Summary: {},  Genre: {}, Rating: {}, Runtime: {}, Network: {}, Actors: {}, Votes: {}, Years: {}, Total Similarity Score: {}'.format(title, summaries[title], genres[title], ratings[title], runtimes[title], networks[title], actors[title], votes[title], years[title], score) for title, score in result]

display("black", "", "medical, romance, comedy", "", "Park Shin-Hye","2010-2015", 10)


["Drama Title: Arang and the Magistrate,  Summary: The drama is based on famous folklore.A nobleman named Kim Eun Oh (Lee Joon Ki) comes to town searching for his mother after hearing a rumor that she is staying at the village of Miryang. He has the special ability to see spirits, but pretends he doesn’t because he gets annoyed when spirits come up to him asking for a favor.Arang (Shin Min Ah) has lost all her memories when she became a ghost and is unable to rest in peace until she finds out how she ended up dead. However, after appearing to three magistrates, none of them survive the fright of seeing her. When she finds out that Eun oh is able to see her, she begs for his help.At first, Eun Oh rejects her request. However he changes his mind after seeing that Arang is somehow in possession of the hairpin that he gave his mother at their last meeting. He believes that if he helps her, Arang will regain her memories and give him information about his mother. He exasperatedly (then affe

In [11]:
genre = pd.read_pickle("Genres.pkl")


In [5]:
g = pd.read_pickle('Networks.pkl')
print(g)
g = list(g) 
print(len(g))
g.remove(g[0])
print(len(g))
print(g)

[nan, 'Channel A &amp; NAVER tvcast', 'Mnet &amp; tvN &amp; KM &amp; Onstyle', 'SBS &amp; Netflix', 'KBS1', 'MBC every1', 'SBS Plus', 'Channel A', 'DramaX &amp; MBN', 'Netflix', 'Oksusu (2016-Sep-30) &amp; DramaX (2016-Oct-05)', 'MBC Dramanet', 'Mnet &amp; tvN', 'MBC every1, MBC Drama Net, MBC Music, MBC QueeN', 'UMAX &amp; O’live', 'CGV', 'TBS and MBC', 'MBN &amp; DramaX', 'KBS2', 'MBC', 'Sohu TV &amp; SBS', 'Tooniverse', 'DRAMAcube', 'KBSN', 'E-Channel &amp;  DRAMAcube', 'SBS/Fuji TV', 'OCN', 'MBC &amp; MBC every1', 'MBC &amp; NAVER tvcast', 'Mnet', 'SBS', 'TV Mnet, SBS', 'NAVER tvcast &amp; MBC every1', '(Aired at Yunsae University Main Auditorium)', 'EBS', 'KBS-W', 'MBN, DramaX', 'tvN', 'KBS-N', 'DramaH', 'MBN', 'NAVER tvcast', 'Onstyle', 'tvN ', 'CSTV', 'jTBC', 'O’live', 'DramaX, UMAX', 'NAVER tvcast &amp; Sohu TV', 'jTBC, NAVER tvcast', 'Viki', 'Oksusu', 'MBC DramaNet']
53
52
['Channel A &amp; NAVER tvcast', 'Mnet &amp; tvN &amp; KM &amp; Onstyle', 'SBS &amp; Netflix', 'KBS1', 'M

In [12]:
print(genre)

['drama', 'horror', 'food', 'friendship', 'business', 'melodrama', 'mature', 'family', 'political', 'rural', 'investigation', 'historical', 'fantasy', 'comedy', 'school', 'mystery', 'travel', 'sport', 'documentary', 'romantic', 'sci-fi', 'thriller', 'action', 'war', 'crime', 'NaN', 'children', 'music', 'web-drama', 'fusion', 'human', 'time-travel', 'omnibus', 'law', 'medical', 'revenge']


In [17]:
titles = list(data["Title"])
titlesCopy = titles.copy()
for c in titlesCopy:
    print(c)
    if "www.koreandrama.org" in c:
        titles.remove(c)
    

voice 
save me 
the secret life of my secretary
abyss
different dreams
mung bean flower
my first first love
her private life
i hate going to work
special labor inspector jo
beautiful world
my fellow citizens
the banker
eulachacha waikiki 
mother of mine
confession
kill it
doctor prisoner
he is psychometric
big issue
possessed
love in sadness
the fiery priest
i hate you, juliet!
haechi
item
dazzling
trap
legal high
rude miss young ae 
touch your heart
babel
romance is a bonus book
kingdom
spring turns to spring
blessing of the sea
what’s wrong, poong sang
neighborhood lawyer jo deul ho : crime and punishment
the crowned clown
left-handed wife
the best chicken
www.koreandrama.org/2018-kbs-drama-awards-winners-list/" rel="bookmark" title="permanent link to 2018 kbs drama awards "> 2018 kbs drama awards 
www.koreandrama.org/2018-mbc-drama-awards-winners-list/" rel="bookmark" title="permanent link to 2018 mbc drama awards "> 2018 mbc drama awards 
my strange hero
dance sports girls
bad dete

the gifted
outcast
the royals
madam secretary
empire
reign
wayward pines
the last ship
legends
hello ladies
agents of s.h.i.e.l.d.
top of the lake
the following
perception
being human
outsourced
pretty little liars
the glades
the middle
harper's island
invasion
desperate housewives
las vegas
tru calling
the o.c.
real time with bill maher
everwood
smallville
grounded for life
star trek: enterprise
roswell
ed, edd n eddy
he-man and the masters of the universe
the pretender
early edition
nypd blue
beavis and butt-head
rugrats
the ren & stimpy show
doug
dinosaurs
miami vice
magnum, p.i.
three's company
happy days
batman
i dream of jeannie
the flintstones
manifest
the resident
the tick
riverdale
star trek: discovery
wynonna earp
wet hot american summer: first day of camp
colony
the tonight show starring jimmy fallon
luke cage
the last man on earth
faking it
star-crossed
the strain
resurrection
sleepy hollow
crossing lines
dracula
the mindy project
baby daddy
last resort
young & hungry
the s

In [16]:
print(list(data["Title"]))

