In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

from multiprocessing import Process
import multiprocessing
import os
import numpy as np
import time

from multiprocessing import Pool
from concurrent import futures
import concurrent

from tqdm.notebook import tqdm

import itertools
import json

import re

from selenium.common.exceptions import NoSuchElementException


import pickle
from fake_useragent import UserAgent

from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options

In [3]:
def perf_clock(func):
    
    """
    calculate total time taken
    
    params : func
    
    return : wrapper
    """
    
    def wrapper(*args, **kwargs):
        
        #start
        st = time.perf_counter()
        
        result = func(*args, **kwargs)
        
        et = time.perf_counter() - st
        
        # name of function
        name = func.__name__

        print('site name : %s \ntotal time : [%0.5fs]' % (name, et))

        return result

    return wrapper

In [4]:
def get_total_number_tag():

    """
    Crawls total number of free streaming movies specified at vudu site using selenium.
    Needs to divide total number by 50 because json file provides 50 movies every iteration and saves it in crawl_turn_number
    
    outputs : crawl_turn_number(int)
    """
    
    global chrome_driver_path
    
    options = Options()
    options.add_argument('--headless')
    
    with webdriver.Chrome(chrome_driver_path,chrome_options=options) as driver:

        driver.implicitly_wait(3)

        driver.get('https://www.vudu.com/content/movies/uxrow/Movies/88')

        time.sleep(5)

        # get total number tag
        total_number_tag = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, '//*[@id="reactApp"]/div/div/div/div[2]/div/span/h1'))).text

        # preprocess
        total_number = int(total_number_tag.split(' ')[-1].replace("(", "").replace(")", ""))

        # get the number of total crawling number
        # receives 50 movies at each iteration, so divide by 50.
        crawl_turn_num = total_number // 50

    return crawl_turn_num + 1

In [5]:
def crawling(crawl_list):
    
    """
    Crawls vudu_id and title from json file. Creates url and crawls title, poster url using selenium. Checks if crawled title and title from json file matches. 
    Crawls overview, genre, runtime, release date, country and director Using another api.
    Append all cralwed data to rows and return rows.
    
    if title does not match, it prints title and url.
    
    if url does not open, it prints url.
    
    input : crawl_list(range)
    
    output : rows(list)
    
    """
    
    global chrome_driver_path

    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')

    # row list to save 
    rows = []

    print('crawling starts')
        
    for i in tqdm(crawl_list):

        offset_num = i * 50

        url = 'https://apicache.vudu.com/api2/?_type=uxElementSearch&contentEncoding=gzip&count=50&format=application%2Fjson&offset={}&sortBy=streamScore&uxRowId=88'.format(offset_num)
        
        session = requests.Session()

        session.trust_env = False  # Don't read proxy settings from OS

        req = session.get(url).text

        session.trust_env = True

        session.close()

        json_list = (json.loads(req.replace('/*-secure-', '').replace('*/', '')))['uxElement']
        
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        
        driver = webdriver.Chrome(chrome_driver_path, options=options)
        
        for j, movie in enumerate(json_list):

            vudu_id, title, release, genre, director, actor, country, runtime, production, overview, url, poster_url = [None for _ in range(12)]

            # vudu_id
            try:
                vudu_id = movie['assetId'][0]
            except Exception as e:
                # print(e)
                vudu_id = None

            # title
            try:
                title = movie['label'][0]
            except Exception as e:
                # print(e)
                title = None


            # url
            try:
                characters_to_remove = "!()@:,.?"
                pattern = "[" + characters_to_remove + "]"
                url_title = re.sub(pattern, "", title).replace(" ", "-")

                url_save = 'https://www.vudu.com/content/movies/details/' + url_title + '/' + vudu_id

            except Exception as e:
                url_save = None
            
            # check whether url is valid
            driver.get(url_save)
            
            try:
                title_crawled = WebDriverWait(driver, 480).until(EC.presence_of_element_located((By.XPATH, '//*[@id="reactApp"]/div/div/div/div[2]/div[1]/div[1]/div[2]/div/div/div[2]/div[1]/div/div/div'))).text
                
                # if title_Crawled from movie page and title got from json data are different, 
                # url is wrong
                if title_crawled != title:
                    print('title_crawled :', title_crawled)
                    print('title : ', title)
                    raise Exception
                
                # crawl poster url
                try:
                    poster_url = WebDriverWait(driver, 300).until(EC.presence_of_element_located((By.XPATH, '//*[@id="reactApp"]/div/div/div/div[2]/div[1]/div[1]/div[2]/div/div/div[1]/span/div[1]/span/div[2]/img'))).get_attribute('src')

                except Exception as e:
                    # print(e)
                    poster_url = None
                
                if vudu_id:
                    
                    session = requests.Session()
                    
                    session.trust_env = False  # Don't read proxy settings from OS
                    
                    url = 'https://apicache.vudu.com/api2/?_type=contentSearch&contentEncoding=gzip&contentId={}&dimensionality=any&followup=ultraVioletability&followup=longCredits&followup=usefulTvPreviousAndNext&followup=superType&followup=episodeNumberInSeason&followup=advertContentDefinitions&followup=tag&followup=hasBonusWithTagExtras&followup=subtitleTrack&followup=ratingsSummaries&followup=geneGenres&followup=seasonNumber&followup=trailerEditionId&followup=genres&followup=usefulStreamableOffers&followup=walmartOffers&followup=preOrderOffers&followup=editions&followup=merchandiseContentMaps&followup=ageLimit&followup=parentalGuide&followup=hasClearplay&followup=promoTags&followup=advertEnabled&followup=uxPromoTags&format=application%2Fjson'.format(vudu_id)
                    
                    req = session.get(url).text

                    session.trust_env = True

                    session.close()

                    json_req1 = json.loads(req.replace('/*-secure-', '').replace('*/', ''))
                    
                    # get overview
                    try:
                        overview = json_req1['content'][0]['description'][0] 
                    except Exception as e:
                        # print(e)
                        overview = None

                    # get genre
                    try:
                        genres = json_req1['content'][0]['genres'][0]['genre']

                        genres_ = ""

                        for i, genre in enumerate(genres):
                            if i == 0:
                                genres_ += genre['name'][0]
                            else:
                                genres_ += "," + genre['name'][0]
                    except Exception as e:
                        # print(e)
                        genres_ = None

                    # get runtime
                    try:
                        runtime = int(json_req1['content'][0]['lengthSeconds'][0]) // 60
                    except Exception as e:
                        # print(e)
                        runtime = None

                    # get release
                    try:
                        release = json_req1['content'][0]['releaseTime'][0].replace('-', '.')
                    except Exception as e:
                        # print(e)
                        release = None

                    # get country
                    try:
                        country = json_req1['content'][0]['country'][0]
                    except Exception as e:
                        # print(e)
                        country = None

                    # get director
                    try:
                        credits_list = json_req1['content'][0]['credits'][0]['credit']

                        for credits in credits_list:

                            role = credits['role']
                            if 'Director' in role:
                                director = credits['firstName'][0] + " " + credits['lastName'][0]
                    except Exception as e:
                        # print(e)
                        director = None

                    rows.append([vudu_id, title, release, genres_, director, None, country, runtime, None, overview, url_save, poster_url])
                    
            except Exception as e:
                # print(e)
                print('url does not open : ', title, url_save)
                continue  
            
        driver.close()
        
    return rows

In [6]:
@perf_clock
def crawl_vudu(output_path):
    
    global chrome_driver_path
    
    """
    Get crawl_turn_number from get_total_number_tag() and split it into cpu_count. Each process will crawl splited number of movies using crawling method.
    
    Creates excel file and saves at output_path.
    
    input : output_path(str) -> excel path.
    """
    
    
    # get total number of movies
    total_number_tags = get_total_number_tag()
    # total_number_tags = multiprocessing.cpu_count()
    # split total number of tags into equal number 
    
    cpu_count = multiprocessing.cpu_count()
    print("cpu_count : %d" % (cpu_count))
    
    # split total number of movies into fractions
    total_number_tags_list = [list(array) for array in np.array_split(range(total_number_tags), cpu_count)]
    
    # use multiprocessing
    with concurrent.futures.ProcessPoolExecutor(max_workers= cpu_count) as executor:
        
        tests = executor.map(crawling, total_number_tags_list)
    
    # create dataframe
    vudu_df = pd.DataFrame(columns = ['vudu_id', 'title', 'release', 'genre','director', 'actor', 'country', 'runtime', 'production', 'overview','url', 'image_url'])
    
    print('saving dataframe...')
    
    tests_list = list(tests)
    
    count = 0

    for i in tqdm(tests_list):
        for j in i:
            vudu_df.loc[count] = j
            count += 1
    
    return vudu_df

In [12]:
def google_search(movie):
    
    title = movie["title"]
    director = movie["director"]
    
    # userAgent 만들기.
    ua = UserAgent(verify_ssl=False)
    userAgent = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
    options = Options()
    options.add_argument(f'user-agent={userAgent}')

    driver = webdriver.Chrome('/Users/mycelebs-it01/Desktop/chromedriver' , chrome_options=options)

    driver.get("http://www.google.com")
    
    # cookies 불러오기
#     cookies = pickle.load(open("cookies.pkl", "rb"))
#     for cookie in cookies:
#         driver.add_cookie(cookie)
    
    
    search_list = []
    for i in range(len(title)):
        try:
            search_list.append(title[i]+" "+director[i])
        except:
            search_list.append(title[i])

    release = []; release_list = []

    time.sleep(5)

    for i in tqdm(range((len(movie)))):
        url = 'https://www.google.com/search?q='+search_list[i]
        driver.get(url)
        time.sleep(7)
        try:
            soup = BeautifulSoup(driver.page_source, 'lxml')
            if soup.find("span",class_ = "w8qArf").text.strip() == "개봉일:":
                year = soup.find("span",class_ = "LrzXr kno-fv").text
                release.append(year)
                print(year)
                time.sleep(3)
            else:
                release.append(None)
                print(None)
        except Exception as error:
            release.append(None)
            print(None)

    driver.close()
    
    for release_one in release:
        print(release_one)
        try:
            if "일" in release_one:
                tmp = release_one.split(" ")
                tmp = (tmp[0]+tmp[1]+tmp[2]).replace("년",".").replace("월",".").replace("일","")
                release_list.append(tmp)
                print("final: ",tmp)
            elif "월" in release_one:
                release_list.append(None)
                print("final: ",None)
            elif "년" in release_one:
                release_list.append(None)
                print("final: ",None)
            else:
                release_list.append(None)
                print("final: ",None)
        except:
            release_list.append(None)
            print("final: ",None)
    
    print(len(release_list))
    
    movie['release'] = release_list
    
    with pd.ExcelWriter(output_path, engine='xlsxwriter', options={'strings_to_urls' : False}) as writer:
        movie.to_excel(writer)

In [8]:
chrome_driver_path = '/Users/mycelebs-it01/Desktop/chromedriver'
output_path = './crawl_vudu.xlsx'

vudu_df = crawl_vudu(output_path)

google_search(vudu_df)

  from ipykernel import kernelapp as app


cpu_count : 8
crawling starts
crawling starts
crawling starts
crawling starts
crawling starts
crawling starts
crawling starts
crawling starts


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))

title_crawled : Meshuggah: Alive 
title :  Meshuggah: Alive 
url does not open :  Meshuggah: Alive  https://www.vudu.com/content/movies/details/Meshuggah-Alive /277635
title_crawled : Toto: 35th Anniversary Tour - Live in Poland
title :  Toto: 35th Anniversary Tour - Live in Poland
url does not open :  Toto: 35th Anniversary Tour - Live in Poland https://www.vudu.com/content/movies/details/Toto-35th Anniversary-Tour---Live-in-Poland/1050436
title_crawled : Classic Albums: Motörhead's Ace of Spades
title :  Classic Albums: MotÃ¶rhead's Ace of Spades
url does not open :  Classic Albums: MotÃ¶rhead's Ace of Spades https://www.vudu.com/content/movies/details/Classic-Albums-MotÃ¶rhead's-Ace-of-Spades/1049949
title_crawled : Yes: Songs From Tsongas - 35th Anniversary Concert
title :  Yes: Songs From Tsongas - 35thÂ Anniversary Concert
url does not open :  Yes: Songs From Tsongas - 35thÂ Anniversary Concert https://www.vudu.com/content/movies/details/Yes-Songs-From-Tsongas---35thÂ Anniversary

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))


site name : crawl_vudu 
total time : [4576.97785s]
