# Show Scraper
> Author: Sharnique Beck

In this notebook I perform my two main scrapes: the first collects show titles (and their URLs), and the second collects user ratings for each show.

In [57]:
# Import libraries
import pandas as pd
import requests
import time
import re

from bs4 import BeautifulSoup

# Custom User-agent header passed to every requests.get call in this notebook
headers = {'User-agent': 'SB 2.0'}

## Scrape the Viki.com Korean pages

Note: This step collects show titles and show URLs.

In [None]:
# Walk listing pages 1-35 of the Korean "explore" view and record each show's
# title and URL as one dict per show.
shows = []

for page in range(1, 36):
    listing_url = 'https://www.viki.com/explore?country=korea&page=%s' % page
    response = requests.get(listing_url, headers=headers)
    page_soup = BeautifulSoup(response.content, 'lxml')

    # Every matching anchor carries the show title as text; its href is
    # prefixed with the site root to form the stored URL.
    anchors = page_soup.find_all('a', {'class': 'thumb-title strong'})
    shows.extend(
        {
            'title': anchor.text.strip(),
            'url': 'https://www.viki.com' + anchor.attrs['href'],
        }
        for anchor in anchors
    )

    time.sleep(1)  # pause between page requests
k_titles = pd.DataFrame(shows)

In [None]:
# Pull out container ids: the text between the first "t/" or "v/" in the URL
# and the next hyphen (e.g. ".../tv/36109c-thirty-but-seventeen" -> "36109c").
# The raw-string pattern matches exactly what the old '[tv]\/(.+?)\-' matched,
# but without invalid escape sequences. str.extract returns NaN for a URL
# with no match instead of raising IndexError the way findall(...)[0] did.
k_titles['container'] = k_titles['url'].str.extract(r'[tv]/(.+?)-', expand=False)

In [None]:
# Create empty-string placeholder columns for the show rating and # of ratings
for placeholder_col in ('rating', '# ratings'):
    k_titles[placeholder_col] = ''

In [None]:
# Display the last five rows of the show table
k_titles.tail(n=5)

## Clean Data 

In [None]:
# Keep only tv-series: discard rows whose URL contains "movies", then rows
# whose URL contains "creators", and renumber the index from 0.
not_movie = ~k_titles.url.str.contains("movies")
k_titles = k_titles[not_movie]
not_creator = ~k_titles.url.str.contains("creators")
k_titles = k_titles[not_creator].reset_index(drop=True)

In [None]:
# Find show ratings, # of ratings and other show-level metadata.
# One API request is made per container id. The fields are collected as one
# record per show and written to k_titles as whole columns afterwards, so
# list-valued fields are stored as single cell values.
show_details = []

for container_id in k_titles['container']:
    url = 'https://api.viki.io/v4/containers/%s.json?app=100000a' % container_id
    res = requests.get(url, headers=headers)
    json_pg = res.json()

    review_stats = json_pg['review_stats']
    show_details.append({
        'rating': review_stats['average_rating'],
        '# ratings': review_stats['count'],
        'genre': json_pg['genres'],
        # The original read these from an undefined name `json` (NameError);
        # the parsed response is `json_pg`.
        's_rating': json_pg['ratings'],
        '# episodes': json_pg['episodes'],
        # NOTE(review): assumes 'distributors' is a mapping with a 'name' key;
        # confirm against an actual API response.
        'network': json_pg['distributors']['name'],
    })
    time.sleep(1)  # pause between API requests

# Align the records with k_titles' own index so each value lands on its show.
show_details = pd.DataFrame(show_details, index=k_titles.index)
for detail_col in show_details.columns:
    k_titles[detail_col] = show_details[detail_col]

In [None]:
# Display the last five rows to check the newly filled columns
k_titles.tail(n=5)

In [None]:
# Write the show table to disk, leaving the index out of the file
k_titles.to_csv(path_or_buf='./data/k_titles.csv', index=False)

## Scrape show pages for user ratings

In [3]:
# Reload the saved show table from disk
k_titles = pd.read_csv(filepath_or_buffer='./data/k_titles.csv')

In [4]:
# Display the first five rows of the reloaded show table
k_titles.head(n=5)

Unnamed: 0,title,url,container,rating,# ratings
0,Thirty But Seventeen,https://www.viki.com/tv/36109c-thirty-but-seve...,36109c,9.58,7368
1,Fates and Furies,https://www.viki.com/tv/36240c-fates-and-furies,36240c,9.14,1401
2,The Last Empress,https://www.viki.com/tv/36241c-the-last-empress,36241c,9.45,3858
3,Encounter,https://www.viki.com/tv/36239c-encounter,36239c,9.59,5436
4,My Strange Hero,https://www.viki.com/tv/36330c-my-strange-hero,36330c,9.56,2239


### Access the Viki API

In [None]:
# Collect one record (show title, username, rating) per user review by paging
# through the reviews endpoint for every show in k_titles.
ratings = []
shows_comp = 0  # count of the number of shows completed

reviews_url = ('https://api.viki.io/v4/containers/%s/reviews.json'
               '?sort=review_rank&direction=desc&per_page=10&page=%s&app=100000a')

# Loop through each show
for container_id, show_title in zip(k_titles['container'], k_titles['title']):

    pg_nums = 1

    # Loop through each review page
    while True:
        res = requests.get(reviews_url % (container_id, pg_nums), headers=headers)
        json_pg = res.json()

        reviews = json_pg['response']
        next_pg = json_pg['more']    # indicates if there is another page of reviews

        if not reviews:
            # An empty FIRST page means the show has no reviews at all. An empty
            # later page only means the pages ran out, so nothing is printed.
            if pg_nums == 1:
                print("%s has no reviews" % show_title)
            break

        # Loop through each user review on review pg
        for review in reviews:
            ratings.append({
                'title': show_title,
                'user': review['user']['username'],
                'rating': review['user_content_rating'],
            })

        time.sleep(1)  # pause between page requests

        # Stop after the last page. The original test also required an empty
        # response here, which could never be true at this point, so every
        # show cost an extra request and printed the "no reviews" message.
        if not next_pg:
            break

        pg_nums += 1

    shows_comp += 1
    time.sleep(2)  # pause between shows

    # save data to csv after every 10 shows
    if shows_comp % 10 == 0:
        show_rates = pd.DataFrame(ratings)
        show_rates.to_csv('./data/ratings.csv', index=False)

# save entire data set to csv after scraping all shows is complete
show_rates = pd.DataFrame(ratings)
show_rates.to_csv('./data/k_ratings_total.csv', index=False)

In [None]:
# Rebuild the ratings frame, then report the total number of ratings and the
# number of distinct users who left them.
show_rates = pd.DataFrame(ratings)
print(len(show_rates.index))
unique_users = show_rates['user'].nunique()
print(unique_users)

In [None]:
# Display the last five collected ratings
show_rates.tail(n=5)

In [None]:
# Write the complete ratings table to disk, leaving the index out of the file
show_rates.to_csv(path_or_buf='./data/k_ratings_total.csv', index=False)