In [27]:
from bs4 import BeautifulSoup
import requests
from datetime import datetime
import csv
import pandas as pd
import re
import time
import random

In [28]:
def crawl(id_list):
    """Scrape every IMDb title in ``id_list`` and append one CSV row per movie.

    For each id the title page is downloaded (after a 3 second pause),
    parsed with BeautifulSoup and turned into a row by ``get_movie_data``.
    The row is appended to the file named by the module-level ``file_name``
    using the column order in the module-level ``col_name``; no header row is
    written.

    Parameters
    ----------
    id_list : sequence
        IMDb title ids; each one is converted with ``str()`` to build the URL.

    Notes
    -----
    A title whose page cannot be downloaded is reported on stdout and
    skipped, so one network failure does not abort the remaining crawl.
    """
    total = len(id_list)
    print('Start crawling...' + ' for ' + str(total) + ' movies')
    for position, movie in enumerate(id_list, start=1):
        url = 'http://www.imdb.com/title/' + str(movie) + '/'
        time.sleep(3)
        try:
            # Without a timeout a stalled connection blocks the crawl forever.
            response = requests.get(url, headers=get_headers(), timeout=30)
        except requests.RequestException as err:
            print('error: can not fetch ' + url + ' (' + str(err) + ')')
            continue
        soup = BeautifulSoup(response.content, "html.parser")
        row = get_movie_data(soup, movie)
        print(str(position) + ' / ' + str(total))
        print(row[1])
        df = pd.DataFrame([row], columns=col_name)
        # Hand pandas the path so it opens the file itself with the right
        # newline handling; a plain text handle yields blank rows on Windows.
        df.to_csv(file_name, mode='a', header=False, index=False, encoding='utf-8')

In [29]:
def get_movie_data(soup, mid):
    """Build the flat 28-value CSV row for one movie.

    Layout: ``mid``, name, date, country, rating, genre, budget, runtime,
    then five values for the director and five values for each of three stars
    (name, oscars, wins, nominations, ratings).

    Parameters
    ----------
    soup : parsed title page handed to every ``get_movie_*`` helper.
    mid : the movie id, stored unchanged as the first value.

    Returns
    -------
    list
        Always 28 values. Missing star slots (fewer than three actors found)
        are filled with 'NA' instead of raising IndexError.
    """
    person_width = 5
    star_slots = 3

    def _person(fields):
        # Force a person record to exactly ``person_width`` values.
        fields = list(fields)[:person_width]
        return fields + ['NA'] * (person_width - len(fields))

    # The helpers are called in the same order as the columns so that any
    # diagnostics they print keep their original sequence.
    row = [
        mid,
        get_movie_name(soup),
        get_movie_date(soup),
        get_movie_country(soup),
        get_movie_rating(soup),
        get_movie_genre(soup),
        get_movie_budget(soup),
        get_movie_runtime(soup),
    ]
    row += _person(get_movie_director(soup))

    stars = list(get_movie_stars(soup))[:star_slots]
    for slot in range(star_slots):
        row += _person(stars[slot] if slot < len(stars) else [])
    return row

In [30]:
def get_movie_name(soup):
    """Return the movie title taken from the first ``itemprop="name"`` element.

    Non-breaking spaces are replaced by ordinary spaces and everything from
    the first ``' ('`` onwards is dropped. A title without such a suffix is
    returned whole.

    Returns 'NA' (after printing a message) when the element cannot be read.
    """
    try:
        name = soup.find(itemprop="name").get_text()
        name = name.replace(u'\xa0', u' ')
        # partition keeps the full text when ' (' is absent; slicing with the
        # -1 that str.find returns would silently cut off the last character.
        return name.partition(' (')[0]
    except Exception:
        print('error: can not find name')
        return 'NA'
def get_movie_rating(soup):
    """Return the ``itemprop="ratingValue"`` text as a float.

    Returns 'NA' when the element is missing or its text is not a number.
    Only ordinary exceptions are handled, so an interrupt still stops the run.
    """
    try:
        return float(soup.find(itemprop="ratingValue").get_text())
    except Exception:
        return 'NA'
def get_movie_genre(soup):
    """Return the movie's genres as one comma-separated string.

    The genres are the texts of all ``<span itemprop="genre">`` elements, in
    document order. When none are found (or the lookup fails) a message is
    printed and 'NA' is returned, matching the other ``get_movie_*`` helpers.
    """
    try:
        genres = [tag.get_text() for tag in soup.find_all('span', itemprop="genre")]
    except Exception:
        genres = []
    if not genres:
        print('error: can not find genre')
        return 'NA'
    return ','.join(genres)
def get_movie_budget(soup):
    """Return the dollar budget shown after the ``Budget:`` heading as an int.

    The text node following the ``<h4>Budget:</h4>`` heading is searched for a
    ``$`` followed by digits and thousands separators. Amounts without a
    ``$`` sign are treated as not found.

    Returns 'NA' (after printing a message) when no dollar amount is found.
    """
    try:
        text = soup.find('h4', text='Budget:').find_next_sibling(text=True)
        # Match the amount itself instead of slicing up to ' (': when that
        # marker is missing, the -1 from str.find would drop the last digit.
        amount = re.search(r'\$\s*([\d,]+)', text)
        if amount is None:
            print('error: can not find budget')
            return 'NA'
        return int(amount.group(1).replace(',', ''))
    except Exception:
        print('error: can not find budget')
        return 'NA'
def get_movie_date(soup):
    """Return the release date after the ``Release Date:`` heading as ``YYYY/MM/DD``.

    The text node following the heading is expected to look like
    ``"16 October 2015 (USA)"``. Any ``" (…)"`` suffix and any ``": "`` prefix
    are removed, whitespace is stripped out, and the remainder is parsed with
    the ``%d%B%Y`` format.

    Returns 'NA' (after printing a message) when the heading is missing or the
    text is not a full day-month-year date.
    """
    try:
        text = soup.find('h4', text='Release Date:').find_next_sibling(text=True)
        # partition keeps the whole text when there is no " (" suffix; slicing
        # with the -1 from str.find would cut the last digit of the year.
        head = text.partition(' (')[0]
        if ': ' in head:
            head = head.split(': ', 1)[1]
        compact = ''.join(head.split())
        return datetime.strptime(compact, '%d%B%Y').strftime('%Y/%m/%d')
    except Exception:
        print('error: can not find date')
        return 'NA'
def get_movie_country(soup):
    """Return the movie's countries as one comma-separated string.

    The countries are the texts of every ``<a>`` inside the ``<div>`` that
    encloses the ``<h4>Country:</h4>`` heading. When the heading is missing or
    the div holds no links, a message is printed and 'NA' is returned.
    """
    try:
        section = soup.find('h4', text='Country:').find_parent('div')
        countries = [link.get_text() for link in section.find_all('a')]
    except Exception:
        countries = []
    if not countries:
        print('error: can not find country')
        return 'NA'
    return ','.join(countries)
def get_movie_runtime(soup):
    """Return the runtime in minutes from the ``<time itemprop="duration">`` text.

    Understands texts such as ``"2h 3min"``, ``"45min"`` and ``"1h"``.

    Returns 'NA' (after printing a message) when the element is missing or its
    text contains neither an hour nor a minute figure; such text is no longer
    reported as a runtime of 0.
    """
    try:
        text = soup.find('time', itemprop="duration").get_text()
        hours = re.search(r'(\d+)\s*h', text)
        minutes = re.search(r'(\d+)\s*min', text)
        if hours is None and minutes is None:
            print('error: can not find runtime')
            return 'NA'
        total = 60 * int(hours.group(1)) if hours else 0
        if minutes:
            total += int(minutes.group(1))
        return total
    except Exception:
        print('error: can not find runtime')
        return 'NA'

In [42]:
def get_movie_director(soup):
    """Return ``[name, oscars, wins, nominations, ratings]`` for the director.

    The first ``itemprop="director"`` element supplies the name and the link
    to the person page. That page is downloaded after a 3 second pause and
    passed to ``get_person_awards``.

    Returns five 'NA' values (after printing a message) when any step fails.
    Only ordinary exceptions are handled, so an interrupt during the pause or
    the download still stops the crawl.
    """
    try:
        tag = soup.find(itemprop="director")
        # get name
        director_name = tag.find_next(itemprop="name").get_text()
        # get personal data
        director_url = target_site + tag.find_next('a').get('href')
        time.sleep(3)
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(director_url, headers=get_headers(), timeout=30)
        director_soup = BeautifulSoup(response.content, "html.parser")
        oscars, wins, nominations, ratings = get_person_awards(director_soup)
        return [director_name, oscars, wins, nominations, ratings]
    except Exception:
        print('error: can not find directors')
        return ['NA', 'NA', 'NA', 'NA', 'NA']
def get_movie_stars(soup):
    """Return one ``[name, oscars, wins, nominations, ratings]`` row per star.

    Every ``itemprop="actors"`` element supplies a name and a link to the
    person page. Each page is downloaded after a 3 second pause and passed to
    ``get_person_awards``.

    Returns
    -------
    list of list
        At least three rows. When fewer than three actors are listed, the
        missing rows are filled with 'NA' so callers can always read three
        stars. When no actor is found, or any step fails, a message is
        printed and three all-'NA' rows are returned.
    """
    min_rows = 3

    def _na_row():
        return ['NA', 'NA', 'NA', 'NA', 'NA']

    try:
        stars = []
        for tag in soup.find_all(itemprop="actors"):
            # get name
            actor_name = tag.find_next(itemprop="name").get_text()
            # get personal data
            actor_url = target_site + tag.find_next('a').get('href')
            time.sleep(3)
            # A timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(actor_url, headers=get_headers(), timeout=30)
            actor_soup = BeautifulSoup(response.content, "html.parser")
            oscars, wins, nominations, ratings = get_person_awards(actor_soup)
            stars.append([actor_name, oscars, wins, nominations, ratings])
    except Exception:
        # A failure for any actor discards the partial result.
        stars = []

    if not stars:
        print('error: can not find stars')
        return [_na_row() for _ in range(min_rows)]
    while len(stars) < min_rows:
        stars.append(_na_row())
    return stars

In [43]:
def get_person_awards(actor_soup):
    """Return ``(oscars, wins, nominations, ratings)`` for one person page.

    Award counts are read from every ``<span itemprop="awards">`` by removing
    the spaces and matching ``<digits>Oscar``, ``<digits>win`` and
    ``<digits>nomination``; a count that never matches stays 0.

    ``ratings`` is the mean ``get_movie_rating`` over the titles linked from
    the ``div.knownfor-title`` elements. Each title page is downloaded after a
    3 second pause, and titles rated 'NA' are left out. When no title has a
    rating, ``ratings`` is 'NA' and the award counts are still returned.

    Returns four 'NA' values (after printing a message) when any step fails.
    """
    try:
        oscars, wins, nominations = 0, 0, 0
        # get all award data
        for award in actor_soup.find_all('span', itemprop='awards'):
            text = award.get_text().replace(' ', '')
            match_oscar = re.search(r"(\d+)Oscar", text)
            match_win = re.search(r"(\d+)win", text)
            match_nomination = re.search(r"(\d+)nomination", text)
            if match_oscar:
                oscars = int(match_oscar.group(1))
            if match_win:
                wins = int(match_win.group(1))
            if match_nomination:
                nominations = int(match_nomination.group(1))

        # calculate known-for movie avg rating
        rating_sum, rating_count = 0, 0
        for movie in actor_soup.find_all('div', {'class': 'knownfor-title'}):
            movie_url = target_site + movie.find('a').get('href')
            time.sleep(3)
            # A timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(movie_url, headers=get_headers(), timeout=30)
            movie_soup = BeautifulSoup(response.content, "html.parser")
            movie_rating = get_movie_rating(movie_soup)
            if movie_rating != 'NA':
                rating_sum += movie_rating
                rating_count += 1

        # Dividing by a zero count used to raise and throw away the award
        # counts that had already been parsed.
        ratings = rating_sum / rating_count if rating_count else 'NA'
        return oscars, wins, nominations, ratings
    except Exception:
        print('error: can not find awards')
        return 'NA', 'NA', 'NA', 'NA'

In [44]:
def get_headers():
    """Return request headers carrying one randomly chosen User-Agent string.

    The result is a dict with the single key ``'User-Agent'``; its value is
    picked with ``random.choice`` from the fixed pool of browser identifiers
    below.
    """
    agent_pool = (
        'Mozilla/5.0 (Windows NT 6.1; rv,2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows NT 6.1; rv,2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.9.2.1000 Chrome/39.0.2146.0 Safari/537.36',
        'Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/532.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/532.3',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
    )
    return {'User-Agent': random.choice(agent_pool)}

In [45]:
# Crawl parameters.
# Base URL that the person/title scrapers prepend to the relative hrefs they find.
target_site = 'http://www.imdb.com'
# Table of movies to crawl, read from the working directory; it must have an
# IMDB_ID column.
movie_file = pd.read_csv('MOVIES.csv')
# Single-title list for a quick manual check:
# id_list = ['tt5540084']
# Only the ids selected by the 61:200 slice are crawled in this run.
id_list = list(movie_file['IMDB_ID'][61:200])

In [46]:
# Start the crawl.
# Output CSV; crawl() appends to it and never writes a header row.
file_name = 'imdb.csv'
# Column labels for the 28 values that get_movie_data() returns, in the same
# order: 8 movie fields, 5 director fields, then 5 fields for each of 3 stars.
col_name = ['id', 'title', 'date', 'country', 'rating', 'genre', 'budget', 'runtime', 
            'director', 'director_oscars', 'director_wins', 'director_nominations', 'director_ratings',
            'star_1', 'star_1_oscars', 'star_1_wins', 'star_1_nominations', 'star_1_ratings',
            'star_2', 'star_2_oscars','star_2_wins', 'star_2_nominations', 'star_2_ratings',
            'star_3', 'star_3_oscars','star_3_wins', 'star_3_nominations', 'star_3_ratings']
# Runs the whole scrape for id_list; progress and errors are printed below.
crawl(id_list)

Start crawling... for 139 movies
error: can not find budget
error: can not find stars
1 / 139
Wansei Back Home
2 / 139
Crimson Peak
error: can not find budget
3 / 139
Sheng zhe wei wang
4 / 139
Goosebumps
5 / 139
Dark Places
6 / 139
Vacation
7 / 139
Daddy's Home
8 / 139
Pan
9 / 139
The SpongeBob Movie: Sponge Out of Water
10 / 139
Scouts Guide to the Zombie Apocalypse
11 / 139
Southpaw
12 / 139
The Walk
13 / 139
The Pyramid
14 / 139
Whiplash
15 / 139
The Ghouls
16 / 139
Home
error: can not find budget
17 / 139
Lion Dancing
18 / 139
Magic Mike XXL
error: can not find budget
19 / 139
234 Shuo ai ni
20 / 139
Chek dou
error: can not find budget
21 / 139
Shi yi
22 / 139
The Age of Adaline
error: can not find budget
23 / 139
Chong fan 20 sui
24 / 139
Blackhat
error: can not find date
error: can not find budget
error: can not find stars
25 / 139
Fly, Kite Fly
26 / 139
Secret in Their Eyes
27 / 139
Self/less
28 / 139
Mortdecai
29 / 139
Project Almanac
error: can not find budget
30 / 139
Bakemo