In [1]:
# import relevant libraries

# basic libraries
import pandas as pd
import numpy as np

# visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# machine learning libraries
import sklearn.tree
import sklearn.metrics
import sklearn.model_selection
import sklearn.linear_model

# data collection libraries
import requests
from selenium import webdriver
import bs4
import imdbpie
import json

%matplotlib inline

# Identify: Problem Statement & Goals

**Using data from IMDB, can we build a model that predicts movie ratings based on certain inputs?**

- Predict movie ratings using information about the movie (e.g., genre, runtime, etc.)
- What are the factors that affect movie rating?

# Acquire: Query IMDB, write results, scrape IMDB

## Using IMDBpie

In [23]:
# Client object used for every imdbpie query below.
imdb = imdbpie.Imdb()

In [132]:
# Serialise the top-250 result to JSON text and let pandas parse that text into a DataFrame.
imdb_df = pd.read_json(json.dumps(imdb.top_250()))

In [42]:
# Preview the first rows of the top-250 table.
imdb_df.head()

Unnamed: 0,can_rate,image,num_votes,rating,tconst,title,type,year
0,True,{u'url': u'https://images-na.ssl-images-amazon...,1720313,9.3,tt0111161,The Shawshank Redemption,feature,1994
1,True,{u'url': u'https://images-na.ssl-images-amazon...,1175776,9.2,tt0068646,The Godfather,feature,1972
2,True,{u'url': u'https://images-na.ssl-images-amazon...,805875,9.0,tt0071562,The Godfather: Part II,feature,1974
3,True,{u'url': u'https://images-na.ssl-images-amazon...,1706334,9.0,tt0468569,The Dark Knight,feature,2008
4,True,{u'url': u'https://images-na.ssl-images-amazon...,458263,8.9,tt0050083,12 Angry Men,feature,1957


In [135]:
# Accumulator: one list of fields per movie, filled by the loop below.
imdb_details = []

In [134]:
#id, title, rating, num_votes, genre, stars, director, plots, length, certification

In [290]:
# Look up every top-250 title by its tconst id and record one row of fields
# per movie, in the order listed in the comment cell above.
for tconst in imdb_df['tconst']:
    movie = imdb.get_title_by_id(tconst)
    imdb_details.append([
        tconst,
        movie.title,
        movie.rating,
        movie.votes,
        movie.genres,
        movie.cast_summary,
        movie.directors_summary,
        movie.plots,
        movie.runtime,
        movie.certification,
    ])

In [294]:
# Build the table with named columns (same order as the fields collected per
# movie) instead of anonymous positional labels 0..9, so the saved CSV is
# self-describing.
imdb_details = pd.DataFrame(
    imdb_details,
    columns=['id', 'title', 'rating', 'num_votes', 'genre', 'stars',
             'director', 'plots', 'length', 'certification']
)

In [295]:
# Persist the imdbpie results so the queries above need not be repeated.
imdb_details.to_csv('imdbpie_scraped.csv', encoding='utf-8')

## Bonus: Scrape with BeautifulSoup

In [252]:
# Location of the PhantomJS executable; an absolute path, so adjust it for
# the machine the notebook runs on.
phan_path = '//Applications/phantomjs'


def create_driver():
    """Start a new PhantomJS WebDriver using the binary at ``phan_path``.

    Returns the freshly created driver; the caller owns it.
    """
    phantom = webdriver.PhantomJS(executable_path=phan_path)
    return phantom

We want the scraped sample to contain as even a mix of movies as possible, so when choosing which titles to collect we:

- Consider all genres
- Consider highest and lowest ratings, and if possible, some in the middle
- Consider highest and lowest number of votes, and if possible, some in the middle

### Define a function to get genres

In [253]:
def get_genres():
    """Scrape the list of genre names from the IMDb genre index page.

    A link counts as a genre link when its ``href`` contains ``'genre/'`` but
    not ``'genre/?'`` and the link wraps a ``<span>``. The first
    whitespace-separated word of the link text is taken as the genre name.

    Returns
    -------
    list
        Genre names in page order (may be empty if no link matches).
    """
    driver = create_driver()
    try:
        driver.get('http://www.imdb.com/genre/')
        soup = bs4.BeautifulSoup(driver.page_source, 'lxml')
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # PhantomJS process running.
        driver.quit()

    genre_list = []
    for link in soup.findAll('a'):
        # Anchors without an href are skipped explicitly rather than through
        # a catch-all except.
        href = link.get('href', '')
        if 'genre/' in href and 'genre/?' not in href and link.find('span'):
            words = link.text.split()
            if words:
                genre_list.append(words[0])
    return genre_list

### Define a function to get the IMDB ID of a movie

Each movie has a unique IMDB ID which can be used to search for it in the database.

In [221]:
# url templates for IMDb's title search, restricted to feature films.
# Each template has two str.format slots: the genre, then the page value.
# They differ only in the sort key: user rating or number of votes,
# ascending or descending.
low_ratings = 'http://www.imdb.com/search/title?genres={}&title_type=feature&sort=user_rating,asc&page={}&ref_=adv_nxt'
high_ratings = 'http://www.imdb.com/search/title?genres={}&title_type=feature&sort=user_rating,desc&page={}&ref_=adv_nxt'
high_num = 'http://www.imdb.com/search/title?genres={}&title_type=feature&sort=num_votes,desc&page={}&ref_=adv_nxt'
low_num = 'http://www.imdb.com/search/title?genres={}&title_type=feature&sort=num_votes,asc&page={}&ref_=adv_nxt'

In [256]:
def get_id(genre, target, page):
    """Collect the IMDb title ids linked from one search-result page.

    Parameters
    ----------
    genre : str
        Genre name substituted into the first slot of ``target``.
    target : str
        URL template with two ``str.format`` slots (genre, page).
    page : int
        Value substituted into the second slot of ``target``.

    Returns
    -------
    list
        Every path segment that contains ``'tt'`` and no underscore, taken
        from relative links containing ``'title/tt'``. Duplicates are kept:
        an id appears once per matching link on the page.
    """
    driver = create_driver()
    try:
        driver.get(target.format(genre, page))
        soup = bs4.BeautifulSoup(driver.page_source, 'lxml')
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # PhantomJS process running.
        driver.quit()

    ids = []
    for link in soup.findAll('a'):
        # Anchors without an href are skipped explicitly rather than through
        # a catch-all except.
        href = link.get('href', '')
        if 'title/tt' in href and 'http' not in href:
            for part in href.split('/'):
                if 'tt' in part and '_' not in part:
                    ids.append(part.strip())
    return ids

In [258]:
# Inputs for the crawl: every genre name, an accumulator for the ids found,
# and the four sort orders (by rating and by vote count, each ascending and descending).
all_genres = get_genres()
all_ids = []
url_list = [low_ratings, high_ratings, high_num, low_num]

In [260]:
# For every genre and every sort order, fetch result pages 1 and 15 and pool
# the ids found. Progress is logged as "<template index> <page> <ids found>".
for genre in all_genres:
    print(genre)
    for template_index, template in enumerate(url_list):
        for page in (1, 15):
            page_ids = get_id(genre, template, page)
            print('{} {} {}'.format(template_index, page, len(page_ids)))
            all_ids.extend(page_ids)

Action
0 1 121
0 15 116
1 1 111
1 15 105
2 1 102
2 15 101
3 1 114
3 15 115
Adventure
0 1 119
0 15 119
1 1 114
1 15 114
2 1 101
2 15 105
3 1 123
3 15 111
Animation
0 1 119
0 15 113
1 1 111
1 15 120
2 1 100
2 15 119
3 1 113
3 15 113
Biography
0 1 117
0 15 114
1 1 110
1 15 111
2 1 100
2 15 120
3 1 114
3 15 115
Comedy
0 1 106
0 15 108
1 1 118
1 15 114
2 1 100
2 15 102
3 1 119
3 15 106
Crime
0 1 118
0 15 112
1 1 110
1 15 113
2 1 100
2 15 105
3 1 116
3 15 122
Documentary
0 1 0
0 15 0
1 1 0
1 15 0
2 1 0
2 15 0
3 1 0
3 15 0
Drama
0 1 114
0 15 114
1 1 115
1 15 117
2 1 100
2 15 100
3 1 116
3 15 115
Family
0 1 114
0 15 111
1 1 116
1 15 115
2 1 100
2 15 114
3 1 121
3 15 112
Fantasy
0 1 115
0 15 111
1 1 113
1 15 119
2 1 101
2 15 104
3 1 121
3 15 116
Film-Noir
0 1 114
0 15 56
1 1 101
1 15 64
2 1 100
2 15 73
3 1 127
3 15 56
History
0 1 117
0 15 123
1 1 112
1 15 123
2 1 100
2 15 117
3 1 116
3 15 114
Horror
0 1 111
0 15 116
1 1 121
1 15 116
2 1 100
2 15 103
3 1 119
3 15 113
Music
0 1 118
0 15 121
1 1 1

In [261]:
# Number of ids collected so far, duplicates included.
len(all_ids)

18761

In [262]:
# Drop duplicate ids (the same title can be found through several links and pages).
all_ids = set(all_ids)

In [263]:
# Number of distinct ids left after de-duplication.
len(all_ids)

6664

In [268]:
# Turn the set into a Series so ids can be looked up by position and saved.
# NOTE(review): the order of the ids follows set iteration order.
all_ids = pd.Series(list(all_ids))

In [269]:
# Save the id list so the crawl above does not have to be repeated.
all_ids.to_csv('movie_id.csv')

### Define a function to get the details of a movie

In [282]:
# Errors raised while navigating a parsed page whose expected element,
# attribute or word is absent.
_MISSING_ELEMENT_ERRORS = (AttributeError, TypeError, KeyError, IndexError)


def _load_soup(url):
    """Fetch ``url`` with a fresh PhantomJS driver and return the parsed page.

    The driver is always quit, so no PhantomJS process outlives the call.
    """
    driver = create_driver()
    try:
        driver.get(url)
        return bs4.BeautifulSoup(driver.page_source, 'lxml')
    finally:
        driver.quit()


def _credit_names(soup, label):
    """Return the names under the credit heading whose text contains ``label``.

    The words of each name are joined with underscores and the names are
    joined with spaces; an empty string is returned when nothing matches.
    """
    names = []
    for item in soup.findAll('div', class_='credit_summary_item'):
        heading = item.find('h4')
        if heading is not None and label in heading.text:
            for name in item.findAll('span', itemprop='name'):
                names.append('_'.join(name.text.split()))
    return ' '.join(names)


def get_details(id_num):
    """Scrape one movie's details from its IMDb title, plot and keyword pages.

    Parameters
    ----------
    id_num : str
        IMDb title id, substituted into the three page URLs.

    Returns
    -------
    list
        ``[id_num, title, length, genres, directors, writers, stars, plot,
        keywords, rating, num_rating]``. Every scraped field is a string and
        is ``''`` when the corresponding element is not found on the page.
    """
    main_url = 'http://www.imdb.com/title/{}/'
    summary_url = 'http://www.imdb.com/title/{}/plotsummary?ref_=tt_stry_pl'
    keywords_url = 'http://www.imdb.com/title/{}/keywords?ref_=tt_stry_kw'

    soup = _load_soup(main_url.format(id_num))

    # Get one-liner for the movie. A page without it no longer aborts the
    # whole scrape; the field is simply left out, like the other fields.
    plot = []
    try:
        plot.append(soup.find('div', class_='summary_text').text.strip())
    except AttributeError:
        pass

    # The "subtext" bar holds both the genres and the runtime.
    subtext = soup.find('div', class_='subtext')

    # Get genre(s) of the movie, separated by spaces
    genres = []
    if subtext is not None:
        for tag in subtext.findAll('span', itemprop='genre'):
            genres.append(tag.text)
    genres = ' '.join(genres)

    # Get title of the movie (text of the page's <title> tag)
    try:
        title = soup.find('title').text
    except AttributeError:
        title = ''

    # The rating and the number of user ratings are both read from the title
    # attribute of the <strong> tag: first word and third-from-last word.
    try:
        rating_words = soup.find('div', class_='ratingValue').find('strong')['title'].split()
    except _MISSING_ELEMENT_ERRORS:
        rating_words = []
    rating = rating_words[0] if rating_words else ''
    num_rating = rating_words[-3] if len(rating_words) >= 3 else ''

    # Get length of movie
    try:
        length = subtext.find('time').text.strip()
    except AttributeError:
        length = ''

    # Get certification
    # NOTE(review): certification is parsed but is not part of the returned
    # row; adding it would change the row shape callers rely on.
    try:
        certification = soup.find('span', itemprop='contentRating').text
    except AttributeError:
        certification = ''

    # Get directors, writers and stars; names are linked by underscores
    directors = _credit_names(soup, 'Direct')
    writers = _credit_names(soup, 'Writ')
    stars = _credit_names(soup, 'Star')

    # Append every plot summary from the plot-summary page to the one-liner
    soup = _load_soup(summary_url.format(id_num))
    for paragraph in soup.findAll('p', class_='plotSummary'):
        plot.append(paragraph.text.strip())
    plot = ' '.join(plot)

    # Get keywords; multi-word keywords are linked by underscores
    soup = _load_soup(keywords_url.format(id_num))
    keywords = []
    for entry in soup.findAll('div', class_='sodatext'):
        keywords.append('_'.join(entry.text.strip().split()))
    keywords = ' '.join(keywords)

    return [id_num, title, length, genres, directors, writers, stars, plot, keywords, rating, num_rating]




In [283]:
# Accumulator: one row (list of fields) per scraped movie.
all_details = []

In [284]:
# Scrape every collected id in turn; the printed countdown shows how many remain.
# NOTE(review): an exception from get_details (or a manual interrupt) stops the
# loop, leaving only the rows appended so far in all_details.
for i in range(len(all_ids)):
    print len(all_ids)-i,
    all_details.append(get_details(all_ids[i]))

6664 6663 6662 6661 6660 6659 6658 6657 6656 6655 6654 6653 6652 6651 6650 6649 6648 6647 6646 6645 6644 6643 6642 6641 6640 6639 6638 6637 6636 6635 6634 6633 6632 6631 6630 6629 6628 6627 6626 6625 6624 6623 6622 6621 6620 6619 6618 6617 6616 6615 6614 6613 6612 6611 6610 6609 6608 6607 6606 6605 6604 6603 6602 6601 6600 6599 6598 6597 6596 6595 6594 6593 6592 6591 6590 6589 6588 6587 6586 6585 6584 6583 6582 6581 6580 6579 6578 6577 6576 6575 6574 6573 6572 6571 6570 6569 6568 6567 6566 6565 6564 6563 6562 6561 6560 6559 6558 6557 6556 6555 6554 6553 6552 6551 6550 6549 6548 6547 6546 6545 6544 6543 6542 6541 6540 6539 6538 6537 6536 6535 6534 6533 6532 6531 6530 6529 6528 6527 6526 6525 6524 6523 6522 6521 6520 6519 6518 6517 6516 6515 6514 6513 6512 6511 6510 6509 6508 6507 6506 6505 6504 6503 6502 6501 6500 6499 6498 6497 6496 6495 6494 6493 6492 6491 6490 6489 6488 6487 6486 6485 6484 6483 6482 6481 6480 6479 6478 6477 6476 6475 6474 6473 6472 6471 6470 6469 6468 6467 6466 6465 

KeyboardInterrupt: 

In [286]:
# Tabulate the scraped rows; column names follow the order of the list returned by get_details.
test = pd.DataFrame(all_details, columns=['id_num', 'title', 'length', 'genres', 'directors', 'writers', 'stars', 'plot', 'keywords', 'rating', 'num_rating'])

In [287]:
# Preview the first scraped rows.
test.head()

Unnamed: 0,id_num,title,length,genres,directors,writers,stars,plot,keywords,rating,num_rating
0,tt4061854,Guess How Much I Love You: Autumn's Here (2014...,1h 12min,Animation,Steve_Moltzen,,Ky_Baldwin Dylan_Elchaar Kate_Fitzpatrick,Autumn is here and it's time for a change. In ...,,8.7,7
1,tt0030726,Secrets of a Nurse (1938) - IMDb,1h 15min,Drama Mystery Sport,Arthur_Lubin,Lester_Cole Thomas_Lennon,Edmund_Lowe Helen_Mack Dick_Foran,When battered prize-fighter Lee Burke is taken...,,7.2,5
2,tt0050212,The Bridge on the River Kwai (1957) - IMDb,2h 41min,Adventure Drama War,David_Lean,Pierre_Boulle Carl_Foreman,William_Holden Alec_Guinness Jack_Hawkins,After settling his differences with a Japanese...,bridge jungle allies construction sabotage swe...,8.2,152257
3,tt0036409,Swingtime Johnny (1943) - IMDb,1h 1min,Comedy Music,Edward_F._Cline,Warren_Wilson Clyde_Bruckman,Patty_Andrews Maxene_Andrews Laverne_Andrews,The Andrews Sisters take a hiatus from show bu...,dancing singing artillery_shell california pop...,7.2,18
4,tt2818724,The Tragedy of Macbeth (2012) - IMDb,,Animation,Dan_Gallagher,William_Shakespeare,Mirai_Booth-Ong James_Curcione Dan_Gallagher,Add a Plot »,sci_fi_fan shakespearean teaching learning ble...,6.2,5


In [289]:
# Save the scraped rows; this file is reloaded in the "Mine" section.
test.to_csv('imdb_scraped.csv', encoding='utf-8')

In [202]:
# Fetch and parse the genre index page for manual inspection.
# No module-level `driver` is defined by the cells above, so create one here
# (otherwise this cell raises NameError on a fresh kernel) and shut it down
# once the page source has been parsed.
driver = create_driver()
try:
    driver.get('http://www.imdb.com/genre/')
    soup = bs4.BeautifulSoup(driver.page_source, 'lxml')
finally:
    driver.quit()

In [203]:
# Dump the indented page markup to inspect its structure.
print(soup.prettify())

<!DOCTYPE html>
<html class=" scriptsOn" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengraphprotocol.org/schema/">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <script async="" id="tinygpt" src="http://pubads.g.doubleclick.net/gampad/ads?gdfp_req=1&amp;correlator=1259198095556608&amp;output=json_html&amp;callback=tinygpt.callback&amp;impl=fifs&amp;json_a=1&amp;iu_parts=4215%2Cimdb2.consumer.main%2Cgenre&amp;enc_prev_ius=%2F0%2F1%2F2%2C%2F0%2F1%2F2&amp;prev_iu_szs=300x250%7C300x600%7C11x1%2C728x90%7C1008x150%7C1008x200%7C1008x30%7C970x250%7C9x1&amp;prev_scp=p%3Dtr%7Cp%3Dtop%2Ct&amp;cust_params=ab%3Df%26c%3D1%26bpx%3D1%26u%3D672656290504%26oe%3Dutf-8" type="text/javascript">
  </script>
  <script type="text/javascript">
   var IMDbTimer={starttime: new Date().getTime()};
  </script>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <script>
   var addCl

In [210]:
# Print the genre name (first word of the link text) for every genre link:
# anchors whose href contains 'genre/' but not 'genre/?' and that wrap a <span>.
for link in soup.findAll('a'):
    # Anchors without an href are skipped explicitly rather than through a
    # catch-all except.
    href = link.get('href', '')
    if 'genre/' in href and 'genre/?' not in href and link.find('span'):
        words = link.text.split()
        if words:
            print(words[0])

Action
Adventure
Animation
Biography
Comedy
Crime
Documentary
Drama
Family
Fantasy
Film-Noir
History
Horror
Music
Musical
Mystery
Romance
Sci-Fi
Sport
Thriller
War
Western


# Mine: Rename & describe data

In [9]:
# Reload the scraped data. .iloc[:, 1:] drops the first CSV column
# (presumably the row index that to_csv wrote alongside the data).
df = pd.read_csv('imdb_scraped.csv').iloc[:,1:]

In [10]:
# Preview the reloaded data.
df.head()

Unnamed: 0,id_num,title,length,genres,directors,writers,stars,plot,keywords,rating,num_rating
0,tt4061854,Guess How Much I Love You: Autumn's Here (2014...,1h 12min,Animation,Steve_Moltzen,,Ky_Baldwin Dylan_Elchaar Kate_Fitzpatrick,Autumn is here and it's time for a change. In ...,,8.7,7
1,tt0030726,Secrets of a Nurse (1938) - IMDb,1h 15min,Drama Mystery Sport,Arthur_Lubin,Lester_Cole Thomas_Lennon,Edmund_Lowe Helen_Mack Dick_Foran,When battered prize-fighter Lee Burke is taken...,,7.2,5
2,tt0050212,The Bridge on the River Kwai (1957) - IMDb,2h 41min,Adventure Drama War,David_Lean,Pierre_Boulle Carl_Foreman,William_Holden Alec_Guinness Jack_Hawkins,After settling his differences with a Japanese...,bridge jungle allies construction sabotage swe...,8.2,152257
3,tt0036409,Swingtime Johnny (1943) - IMDb,1h 1min,Comedy Music,Edward_F._Cline,Warren_Wilson Clyde_Bruckman,Patty_Andrews Maxene_Andrews Laverne_Andrews,The Andrews Sisters take a hiatus from show bu...,dancing singing artillery_shell california pop...,7.2,18
4,tt2818724,The Tragedy of Macbeth (2012) - IMDb,,Animation,Dan_Gallagher,William_Shakespeare,Mirai_Booth-Ong James_Curcione Dan_Gallagher,Add a Plot »,sci_fi_fan shakespearean teaching learning ble...,6.2,5


In [11]:
# Genre names used to build one indicator column per genre.
genres = [
    'Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'History',
    'Horror', 'Music', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Sport',
    'Thriller', 'War', 'Western',
]

In [12]:
# Add one 0/1 indicator column per genre.
# The space-separated genre string is split into whole tokens first, so that
# 'Music' is not matched inside 'Musical' (a plain substring test would set
# genre_Music for every musical), and a missing genre string (NaN) yields
# all zeros instead of raising TypeError.
genre_tokens = df['genres'].fillna('').str.split()
for genre in genres:
    df['genre_' + genre] = genre_tokens.map(lambda tokens: 1 if genre in tokens else 0)

In [20]:
def convert_length(length):
    """Convert a runtime string such as ``'1h 12min'`` to a number of minutes.

    Parameters
    ----------
    length : str or any
        Whitespace-separated tokens; a token ending in ``h`` counts hours and
        a token ending in ``min`` counts minutes. Other tokens are ignored.
        A value without a ``split`` method (for example the NaN that marks a
        missing runtime) is returned unchanged.

    Returns
    -------
    float
        Total minutes; ``0`` when no token carries an ``h``/``min`` suffix;
        NaN when a suffixed token has a non-numeric value. Non-string input
        is returned as-is.
    """
    try:
        tokens = length.split()
    except AttributeError:
        # Not a string (missing runtime): pass the value through untouched.
        return length

    minutes = 0
    for token in tokens:
        try:
            # Slice the suffix off explicitly; str.strip('min') would strip
            # any of the characters m/i/n rather than the suffix.
            if token.endswith('min'):
                minutes += float(token[:-3])
            elif token.endswith('h'):
                minutes += float(token[:-1]) * 60
        except ValueError:
            # Malformed token: report the runtime as unknown instead of
            # returning the raw string, which would mix text into an
            # otherwise numeric column.
            return float('nan')
    return minutes

In [21]:
# Runtime in minutes, derived from the text column 'length'.
df['length_m'] = df['length'].map(convert_length)

In [25]:
# Preview the data with the new genre indicator and length_m columns.
df.head()

Unnamed: 0,id_num,title,length,genres,directors,writers,stars,plot,keywords,rating,...,genre_Music,genre_Musical,genre_Mystery,genre_Romance,genre_Sci-Fi,genre_Sport,genre_Thriller,genre_War,genre_Western,length_m
0,tt4061854,Guess How Much I Love You: Autumn's Here (2014...,1h 12min,Animation,Steve_Moltzen,,Ky_Baldwin Dylan_Elchaar Kate_Fitzpatrick,Autumn is here and it's time for a change. In ...,,8.7,...,0,0,0,0,0,0,0,0,0,72.0
1,tt0030726,Secrets of a Nurse (1938) - IMDb,1h 15min,Drama Mystery Sport,Arthur_Lubin,Lester_Cole Thomas_Lennon,Edmund_Lowe Helen_Mack Dick_Foran,When battered prize-fighter Lee Burke is taken...,,7.2,...,0,0,1,0,0,1,0,0,0,75.0
2,tt0050212,The Bridge on the River Kwai (1957) - IMDb,2h 41min,Adventure Drama War,David_Lean,Pierre_Boulle Carl_Foreman,William_Holden Alec_Guinness Jack_Hawkins,After settling his differences with a Japanese...,bridge jungle allies construction sabotage swe...,8.2,...,0,0,0,0,0,0,0,1,0,161.0
3,tt0036409,Swingtime Johnny (1943) - IMDb,1h 1min,Comedy Music,Edward_F._Cline,Warren_Wilson Clyde_Bruckman,Patty_Andrews Maxene_Andrews Laverne_Andrews,The Andrews Sisters take a hiatus from show bu...,dancing singing artillery_shell california pop...,7.2,...,1,0,0,0,0,0,0,0,0,61.0
4,tt2818724,The Tragedy of Macbeth (2012) - IMDb,,Animation,Dan_Gallagher,William_Shakespeare,Mirai_Booth-Ong James_Curcione Dan_Gallagher,Add a Plot »,sci_fi_fan shakespearean teaching learning ble...,6.2,...,0,0,0,0,0,0,0,0,0,


# Refine: Visualize data
Bonus: Use multiple data viz tools

# Model: Define training set

# Model: Fit and evaluate model
Bonus: How is model at risk of overfitting?

# Present: Describe findings in blog post: summary, model, recommendations