# Show Content Scraper
> Author: Sharnique Beck
<hr>
Scrape show content (title, director, writer, cast, average user rating, network, genre, number of episodes) from viki.com and asianwiki.com.

In [168]:
# Import libraries
import pandas as pd
import numpy as np
import requests
import time
import re


from selenium import webdriver
from selenium.webdriver.common.keys import Keys

from bs4 import BeautifulSoup

In [2]:
# Custom User-agent header passed to requests.get in the Viki API cells
headers = {
    'User-agent': 'SB 2.0',
}

In [3]:
# Load the list of shows to scrape; later cells read its 'title', 'url'
# and 'container' columns row by row.
shows = pd.read_csv('./data/k_titles.csv')
shows.head()

Unnamed: 0,title,url,container,rating,# ratings
0,Thirty But Seventeen,https://www.viki.com/tv/36109c-thirty-but-seve...,36109c,9.58,7368
1,Fates and Furies,https://www.viki.com/tv/36240c-fates-and-furies,36240c,9.14,1401
2,The Last Empress,https://www.viki.com/tv/36241c-the-last-empress,36241c,9.45,3858
3,Encounter,https://www.viki.com/tv/36239c-encounter,36239c,9.59,5436
4,My Strange Hero,https://www.viki.com/tv/36330c-my-strange-hero,36330c,9.56,2239


In [14]:
# Load the saved genre lookup table and discard its 'Unnamed: 0' column
genre = pd.read_csv('./data/genres.csv').drop(columns=['Unnamed: 0'])

In [15]:
# Preview the first rows of the genre lookup table
genre.head()

Unnamed: 0,id,name,type
0,1g,Action & Adventure,film and tv
1,2g,Anime,film and tv
2,1057g,Beauty,creators
3,3g,Biography,film and tv
4,1017g,C-Pop,music


### Save Genre Codes

In [None]:
# Download the genre list from the Viki API and save each genre's id,
# type and English name to ./data/genres.csv.
url='https://api.viki.io/v4/containers/genres.json?&app=100000a'
res = requests.get(url, headers= headers)
# Stop on an HTTP error instead of treating an error payload as genre data
res.raise_for_status()
soup = res.json()  # parsed JSON, iterated below as genre records (not a BeautifulSoup object)

# Built with a comprehension so no loop variable named `genre` is left behind:
# the original loop rebound `genre`, replacing the genre DataFrame loaded
# earlier in the notebook with a plain dict.
genres = [
    {'id': g['id'], 'type': g['genre_type'], 'name': g['name']['en']}
    for g in soup
]

k_genres = pd.DataFrame(genres)
# The index is written on purpose: the cell that reloads this file drops it
# again as the 'Unnamed: 0' column.
k_genres.to_csv('./data/genres.csv')

### Get Cast Info

In [22]:
# Accumulator for the per-show {'title', 'cast'} records built by the cast loop
cast_cont = list()

In [57]:
def _actor_names(actor_divs):
    """Return the text of the itemprop="name" span found in each actor div."""
    return [div.find('span', {'itemprop': 'name'}).text for div in actor_divs]


# Scrape the cast list of every show from its Viki page.
# Exactly one {'title', 'cast'} record is appended to cast_cont per show,
# even when scraping fails, so cast_cont stays row-aligned with `shows`.
for row in range(len(shows)):
    s_cast = {'title': shows['title'][row], 'cast': []}
    show_url = shows['url'][row] + '#modal-casts'

    # Selenium drives a real browser so the cast modal markup can be read
    # from the rendered page.
    driver = webdriver.Chrome()
    try:
        driver.implicitly_wait(30)
        driver.get(show_url)

        # Pause before taking the snapshot: sleeping after page_source has
        # been read (as the original did) cannot change what gets parsed.
        time.sleep(1)
        soup_1 = BeautifulSoup(driver.page_source, 'lxml')

        modal = soup_1.find_all('div', {'id': 'modal-casts'})
        if modal:
            actors = modal[0].find_all('div', {'itemprop': 'actor'})
        else:
            # No cast modal on the page: fall back to every actor div.
            # The first one is skipped, exactly as the original fallback did
            # NOTE(review): presumably a duplicate/header entry - confirm.
            actors = soup_1.find_all('div', {'itemprop': 'actor'})[1:]

        s_cast['cast'] = _actor_names(actors)
    except Exception:
        # Best effort: keep the show with an empty cast rather than abort the
        # whole scrape (or silently lose the row, as the bare except did).
        s_cast['cast'] = []
    finally:
        # Always end the Selenium browser session, even after a failure
        driver.quit()

    cast_cont.append(s_cast)

In [72]:
# Turn the scraped records into a table with 'title' and 'cast' columns
c_content = pd.DataFrame(cast_cont)

In [114]:
# Preview the first rows of the cast table
c_content.head()

Unnamed: 0,cast,title
0,"[Shin Hye Sun, Yang Se Jong, Ahn Hyo Seop, Ye ...",Thirty But Seventeen
1,"[Joo Sang Wook, Lee Min Jung, Lee Ki Woo, So Y...",Fates and Furies
2,"[Shin Sung Rok, Jang Nara, Choi Jin Hyuk, Shin...",The Last Empress
3,"[Park Bo Gum, Song Hye Kyo, Jang Seung Jo, P.O...",Encounter
4,"[Yoo Seung Ho, Jo Bo Ah, Kwak Dong Yeon, Yoo S...",My Strange Hero


In [74]:
# Save the cast table to disk without writing the row index
c_content.to_csv('./data/cast.csv', index=False)

### Get other show info

In [None]:
# Accumulator for the per-show info records built by the Viki API loop
s_cont = list()

In [None]:
# Collect genre codes, content rating, episode count, network and average
# user rating for every show from the Viki container API.
for row in range(len(shows['container'])):
    url='https://api.viki.io/v4/containers/%s.json?app=100000a' %shows['container'][row]
    # A timeout keeps one stalled request from hanging the whole run
    res = requests.get(url, headers= headers, timeout=30)
    # Surface HTTP errors directly instead of as a confusing KeyError below
    res.raise_for_status()
    json_pg = res.json()

    # Some shows may have no distributor listed; record None instead of
    # crashing on an empty list.
    distributors = json_pg.get('distributors') or []
    network = distributors[0]['name'] if distributors else None

    info = {
        'title': shows['title'][row],
        'genre': json_pg['genres'],
        's_rating': json_pg['rating'],
        '# episodes': json_pg['episodes']['count'],
        'network': network,
        'rating': json_pg['review_stats']['average_rating'],
    }
    s_cont.append(info)
    # Pause between API calls
    time.sleep(1)

s_contents = pd.DataFrame(s_cont)

In [None]:
# Preview the first rows of the show info table
s_contents.head()

In [None]:
# Save the show info table to disk without writing the row index
s_contents.to_csv('./data/s_contents.csv', index = False)

### Get Production info

In [133]:
# Accumulator for the per-show production records (title, director, screenwriter)
p_cont = list()

In [135]:
# Look up director and screenwriter for every show on AsianWiki.
# One record is appended to p_cont per show; when the page cannot be fetched
# or parsed, both fields stay as empty lists.
for row in range(len(shows)):
    # Empty lists are the "not found" marker: a later cell reports rows whose
    # director is not a str so they can be patched by hand.
    content = {'title': shows['title'][row], 'director': [], 'screenwriter': []}

    # Page slug: the title with spaces replaced by underscores. It is derived
    # here when the 's_title' column has not been added yet, so this cell does
    # not depend on a cell that appears further down the notebook.
    if 's_title' in shows.columns:
        s_title = shows['s_title'][row]
    else:
        s_title = shows['title'][row].replace(' ', '_')
    s_url = 'http://asianwiki.com/%s' % s_title

    try:
        res = requests.get(s_url, headers = {'User-agent': 'SB 2.0'}, timeout=30)
        soup = BeautifulSoup(res.content, 'lxml')

        info = soup.find('div',{'id':'mw-content-text'})
        table = info.find_all('ul')
        details = table[1].text

        # Both values are extracted before either is stored, so a page with
        # only one of them still yields the all-placeholder record.
        director = re.findall(r'Director:\s(.+)', details)[0]
        screenwriter = re.findall(r'Writer:\s(.+)', details)[0]
        content['director'] = director
        content['screenwriter'] = screenwriter
    except (requests.RequestException, AttributeError, IndexError):
        # Fetch failed, page layout differs, or the fields are missing:
        # keep the placeholders. The original re-downloaded the same page
        # here and then discarded it.
        pass

    p_cont.append(content)
    # Pause between page requests
    time.sleep(1)


In [136]:
# Inspect the collected production records
p_cont

[{'title': 'Thirty But Seventeen',
  'director': 'Jo Soo-Won',
  'screenwriter': 'Jo Sung-Hee'},
 {'title': 'Fates and Furies', 'director': [], 'screenwriter': []},
 {'title': 'The Last Empress', 'director': [], 'screenwriter': []},
 {'title': 'Encounter', 'director': [], 'screenwriter': []},
 {'title': 'My Strange Hero',
  'director': 'Ham Joon-Ho',
  'screenwriter': 'Kim Yoon-Young'},
 {'title': 'What’s Wrong With Secretary Kim',
  'director': [],
  'screenwriter': []},
 {'title': 'Devilish Joy',
  'director': 'Kim Ga-Ram',
  'screenwriter': 'Choi Ji-Yeon'},
 {'title': 'I Am Not a Robot',
  'director': 'Jung Dae-Yoon',
  'screenwriter': 'Kim Seon-Mi'},
 {'title': 'Suspicious Partner',
  'director': 'Park Sun-Ho',
  'screenwriter': 'Kwon Ki-Young'},
 {'title': 'Weightlifting Fairy Kim Bok Joo',
  'director': [],
  'screenwriter': []},
 {'title': 'My Only One',
  'director': 'Hong Seok-Ku',
  'screenwriter': 'Kim Sa-Kyung'},
 {'title': 'W', 'director': [], 'screenwriter': []},
 {'title

In [139]:
# Turn the production records into a table (title, director, screenwriter)
p_content = pd.DataFrame(p_cont)

In [457]:
# Preview the first rows of the production table
p_content.head()

Unnamed: 0,director,screenwriter,title
0,Jo Soo-Won,Jo Sung-Hee,Thirty But Seventeen
1,Jung Dong-Yoon,Kang Cheol-Woong,Fates and Furies
2,Joo Dong-Min,Kim Sun-Ok,The Last Empress
3,Park Shin-Woo,Yoo Young-A,Encounter
4,Ham Joon-Ho,Kim Yoon-Young,My Strange Hero


In [464]:
# Print title and row label for every row from 400 onward whose director is
# not a str (the scraping loop stores an empty list when nothing was found)
for show in range(400, len(p_content)):
    director = p_content['director'][show]
    if type(director) is not str:
        print(p_content['title'][show], show)

Love in Memory 400
Special Laws of Romance 401
2018 Idol Star Athletics Champio... 402
2018 Soribada Best K-Music Awards 403
Photo People in Tokyo 407
Song Ji Hyo's Beautiful Life 409
2018 SBS Gayo Daejeon_Music Fest... 413
KBS Drama Special: White Christmas 417
Music Bank K-Chart 421
Traces of the Hand 423
Sister's Slam Dunk Season 2 426
2018 MBC Drama Awards 427
Real Men 428
Show! Music Core 431
The Partner 433
Secret 437
Midnight's Girl 438
2018 KBS Drama Awards 439
Romance Blue 440
109 Strange Things 442
2016 Idol Star Athletics Champio... 443
God’s Workplace 444
I Am 446
2017 Idol Star Athletics Champio... 447
Swan 448
When It's at Night 449
Pops in Seoul 451
Idol King of Cooking 454
E-news Exclusive 455
Where On Earth?? 456
9 End 2 Outs 458
School Attack 2018 459
Section TV Entertainment News 465
Cain and Abel 466
Tamra the Island 468
H.I.T. 469
Truth 470
Matchmaker’s Lover 471
2014 Idol Star Athletics Champio... 472
Who Are You? 473
Bad Boy 474
Spark 475
Heaven's Garden 476
Phot

In [469]:
# Manual patch for a row the scraper could not resolve.
# `table` must already hold the <ul> elements of the AsianWiki page for the
# show being patched, e.g. produced by running:
# s_url= "http://asianwiki.com/Secret_(2000-South_Korea-MBC)"
# res = requests.get(s_url, headers = {'User-agent': 'SB 2.0'})
# soup = BeautifulSoup(res.content, 'lxml')

# info = soup.find('div',{'id':'mw-content-text'})
# table = info.find_all('ul')

# NOTE(review): the original wrote the director to row 440 but the
# screenwriter to row 4, overwriting a row that was already filled in.
# Both fields now target the same row; confirm that 440 is the row that
# matches the page currently held in `table`.
patch_row = 440
patch_text = table[1].text
p_content.loc[patch_row,'director'] = re.findall(r'Director:\s(.+)', patch_text)[0]
p_content.loc[patch_row,'screenwriter'] = re.findall(r'Writer:\s(.+)', patch_text)[0]


### Concat data

In [98]:
# Check the dimensions of the show info table before combining
# NOTE(review): s_cont must be a DataFrame here (it is first created as a list) - confirm cell order
s_cont.shape

(624, 6)

In [99]:
# Check the dimensions of the cast table before combining
c_content.shape

(624, 2)

In [112]:
# Append the cast lists to the show info as one more column; rows are matched
# on the index, not on the title.
show_info = pd.concat(objs=[s_cont, c_content['cast']], axis='columns')

In [320]:
# Save the production table to disk without writing the row index
p_content.to_csv('./data/p_content.csv', index = False)

In [381]:
# Spot-check a slice from the middle of the combined table
show_info[312:333]

Unnamed: 0,# episodes,genre,network,rating,s_rating,title,cast
312,16,"['23g', '18g', '9g']",MBC,9.15,PG-13,Woman With a Suitcase,"[Choi Ji Woo, Joo Jin Mo, Lee Joon, Jeon Hye B..."
313,16,"['6g', '23g', '18g', '9g']",tvN,9.17,PG-13,Twenty Again,"[Choi Ji Woo, Lee Sang Yoon, Kim Min Jae (1996..."
314,12,"['26g', '7g', '9g', '23g']",JTBC,9.58,PG-13,Solomon’s Perjury,"[Kim Hyun Soo, Jang Dong Yoon, Seo Ji Hoon, Se..."
315,355,"['10g', '14g', '17g', '1044g']",Arirang TV,8.69,PG-13,After School Club,"[Park Ji Min, Han Hee Jun, Seungmin, Yook Sung..."
316,26,"['23g', '9g', '18g', '1041g']",SBS,8.75,PG-13,My Heart Twinkle Twinkle,"[Oh Chang Suk, Nam Bo Ra, Bae Soo Bin, Jang Sh..."
317,16,"['9g', '23g', '1041g']",KBS,8.79,PG-13,"I’m Sorry, I Love You","[So Ji Sub, Im Soo Jung, Jung Kyung Ho, Choi Y..."
318,30,"['17g', '23g', '9g', '18g', '1041g']",SBS,9.0,PG-13,Five Fingers,"[Joo Ji Hoon, Jin Se Yeon, Ji Chang Wook, Chae..."
319,12,"['23g', '26g', '7g']",tvN,9.4,PG-13,Liar Game,"[Jo Jae Yoon, Cha Soo Yun, Lee Sang Yoon, Kim ..."
320,20,"['23g', '18g', '9g']",SBS,9.13,PG-13,Second to Last Love,"[Ji Jin Hee, Kim Hee Ae, Kwak Si Yang, Kim Seu..."
321,23,"['1040g', '23g', '9g']",MBC,8.67,PG-13,Golden Time,"[Lee Sun Gyun, Hwang Jung Eum, Lee Sung Min, S..."


In [246]:
# Reload the saved show info from disk, leaving out its 'cast' column,
# and show the first 86 rows
s_cont = pd.read_csv('./data/s_contents.csv').drop(columns=['cast'])
s_cont.head(n=86)

Unnamed: 0,# episodes,genre,network,rating,s_rating,title
0,32,"['23g', '9g', '18g', '6g', '7g']",SBS,9.58,PG-13,Thirty But Seventeen
1,36,"['18g', '1041g', '23g']",SBS,8.73,PG-13,Fates and Furies
2,42,"['23g', '9g', '18g']",SBS,9.45,PG-13,The Last Empress
3,16,"['18g', '23g']",tvN,9.56,PG-13,Encounter
4,32,"['18g', '1041g', '9g', '23g']",SBS,9.55,PG-13,My Strange Hero
5,16,"['23g', '1038g', '18g']",tvN,9.72,PG-13,What’s Wrong With Secretary Kim
6,16,"['18g', '23g']","MBN, Dramax",9.15,PG-13,Devilish Joy
7,33,"['9g', '23g', '6g', '18g']",MBC,9.67,PG-13,I Am Not a Robot
8,40,"['23g', '9g', '18g', '6g', '26g']",SBS,9.63,PG-13,Suspicious Partner
9,16,"['18g', '20g', '9g', '23g']",MBC,9.75,PG-13,Weightlifting Fairy Kim Bok Joo


In [None]:
# Count how many shows fall under each 's_rating' value
s_contents['s_rating'].value_counts()

In [118]:
# Page slug used in the asianwiki.com URLs: the title with every space
# replaced by an underscore
shows['s_title'] = [title.replace(' ', '_') for title in shows['title']]

In [119]:
# Preview the show list with the new 's_title' column
shows.head()

Unnamed: 0,title,url,container,rating,# ratings,s_title
0,Thirty But Seventeen,https://www.viki.com/tv/36109c-thirty-but-seve...,36109c,9.58,7368,Thirty_But_Seventeen
1,Fates and Furies,https://www.viki.com/tv/36240c-fates-and-furies,36240c,9.14,1401,Fates_and_Furies
2,The Last Empress,https://www.viki.com/tv/36241c-the-last-empress,36241c,9.45,3858,The_Last_Empress
3,Encounter,https://www.viki.com/tv/36239c-encounter,36239c,9.59,5436,Encounter
4,My Strange Hero,https://www.viki.com/tv/36330c-my-strange-hero,36330c,9.56,2239,My_Strange_Hero


In [None]:
# Empty list reserved for show info records
s_info = list()

In [122]:
# Fetch a single AsianWiki page by hand and pull out the <ul> lists of its
# main content div, to see where the production details live
url3 = 'http://asianwiki.com/Thirty_But_Seventeen'  # %shows['s_title'][21]
res = requests.get(url3, headers={'User-agent': 'SB 2.0'})
soup = BeautifulSoup(markup=res.content, features='lxml')

info = soup.find('div', attrs={'id': 'mw-content-text'})
table = info.find_all(name='ul')


In [125]:
# The second <ul> holds the details list (Director, Writer, Network, ...)
# for this page, as the output below shows
table[1]

<ul><li> <b>Drama:</b> Still 17 (English title) / Thirty But Seventeen (literal title)
</li><li> <b>Revised romanization:</b> Seoreunijiman Yeolilgobibmida
</li><li> <b>Hangul:</b> 서른이지만 열일곱입니다
</li><li> <b>Director:</b> <a href="/Jo_Soo-Won_(director)" title="Jo Soo-Won (director)">Jo Soo-Won</a>
</li><li> <b>Writer:</b> <a href="/Jo_Sung-Hee_(screenwriter)" title="Jo Sung-Hee (screenwriter)">Jo Sung-Hee</a>
</li><li> <b>Network:</b> <a href="/SBS" title="SBS">SBS</a>
</li><li> <b>Episodes:</b> 32
</li><li> <b>Release Date:</b> July 23 - September 18, 2018
</li><li> <b>Runtime:</b> Monday &amp; Tuesday 22:00 (35 minutes each / 2 episodes per day)
</li><li> <b>Language:</b> Korean
</li><li> <b>Country:</b> South Korea
</li></ul>