In [1]:
# import relevant libraries

# basic libraries
import pandas as pd
import numpy as np

# visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# machine learning libraries
import sklearn.tree
import sklearn.metrics
import sklearn.model_selection
import sklearn.linear_model

# data collection libraries
import requests
from selenium import webdriver
import bs4
import imdbpie
import json

%matplotlib inline

# Identify: Problem Statement & Goals

**Using data from IMDB, can we build a model that predicts movie ratings based on certain inputs?**

- Predict movie ratings using information about the movie (e.g. genre, length of movie etc.)
- What are the factors that affect movie rating?

# Acquire: Query IMDB, write results, scrape IMDB

## Using IMDBpie

In [23]:
# Create the IMDbPie client used by the following cells to query IMDb.
imdb = imdbpie.Imdb()

In [132]:
# Fetch the Top 250 listing and load it into a DataFrame. The result is
# serialised to JSON and parsed back by pandas rather than being handed to
# the DataFrame constructor directly.
imdb_df = pd.read_json(json.dumps(imdb.top_250()))

In [42]:
# Preview the first rows of the Top 250 table.
imdb_df.head()

Unnamed: 0,can_rate,image,num_votes,rating,tconst,title,type,year
0,True,{u'url': u'https://images-na.ssl-images-amazon...,1720313,9.3,tt0111161,The Shawshank Redemption,feature,1994
1,True,{u'url': u'https://images-na.ssl-images-amazon...,1175776,9.2,tt0068646,The Godfather,feature,1972
2,True,{u'url': u'https://images-na.ssl-images-amazon...,805875,9.0,tt0071562,The Godfather: Part II,feature,1974
3,True,{u'url': u'https://images-na.ssl-images-amazon...,1706334,9.0,tt0468569,The Dark Knight,feature,2008
4,True,{u'url': u'https://images-na.ssl-images-amazon...,458263,8.9,tt0050083,12 Angry Men,feature,1957


In [135]:
# Accumulator for per-title detail rows; the loop in a later cell appends to it.
imdb_details = []

In [134]:
# Fields to collect for each title, in order:
# id, title, rating, num_votes, genre, stars, director, plots, length, certification

In [137]:
def _collect_details(tconst):
    """Look up one title by its IMDb id and return its detail row.

    Row layout: id, title, rating, votes, genres, cast summary,
    directors summary, plots, runtime, certification.
    """
    movie = imdb.get_title_by_id(tconst)
    return [tconst, movie.title, movie.rating, movie.votes,
            movie.genres, movie.cast_summary, movie.directors_summary,
            movie.plots, movie.runtime, movie.certification]


# Only the first five Top 250 ids are fetched. Rows are appended to the shared
# imdb_details list, so re-running this cell appends the same rows again.
for tconst in imdb_df['tconst'][:5]:
    imdb_details.append(_collect_details(tconst))

In [138]:
# Tabulate the collected rows; no column names are given, so the columns are
# labelled with their integer positions.
pd.DataFrame(imdb_details)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,tt0111161,The Shawshank Redemption,9.3,1720336,"[Crime, Drama]","[<Person: Tim Robbins (nm0000209)>, <Person: M...",[<Person: Frank Darabont (nm0001104)>],[Chronicles the experiences of a formerly succ...,8520,R
1,tt0068646,The Godfather,9.2,1175796,"[Crime, Drama]","[<Person: Marlon Brando (nm0000008)>, <Person:...",[<Person: Francis Ford Coppola (nm0000338)>],[When the aging head of a famous crime family ...,10500,R
2,tt0071562,The Godfather: Part II,9.0,805891,"[Crime, Drama]","[<Person: Al Pacino (nm0000199)>, <Person: Rob...",[<Person: Francis Ford Coppola (nm0000338)>],[The continuing saga of the Corleone crime fam...,12120,R
3,tt0468569,The Dark Knight,9.0,1706354,"[Action, Crime, Drama, Thriller]","[<Person: Christian Bale (nm0000288)>, <Person...",[<Person: Christopher Nolan (nm0634240)>],[Set within a year after the events of Batman ...,9120,PG-13
4,tt0050083,12 Angry Men,8.9,458267,"[Crime, Drama]","[<Person: Henry Fonda (nm0000020)>, <Person: L...",[<Person: Sidney Lumet (nm0001486)>],[The defense and the prosecution have rested a...,5760,Approved


## Bonus: Scrape with BeautifulSoup

We want the scraped sample to contain as even a mix of movies as possible.

- Consider all genres
- Consider highest and lowest ratings, and if possible, some in the middle
- Consider highest and lowest number of votes, and if possible, some in the middle

Note: Each movie has a unique IMDB ID which can be used to search for it in the database.

### Defining our functions and variables

In [2]:
# Path to the PhantomJS executable; passed to webdriver.PhantomJS as
# executable_path by create_soup.
# NOTE(review): this is a machine-specific absolute path -- adjust it to
# wherever PhantomJS is installed locally.
phan_path = '//Applications/phantomjs'

In [3]:
# url templates
# The four search listings differ only in sort key and direction, so they are
# generated from a single pattern. The doubled braces survive the first
# format() call as literal `{}` placeholders, which are filled in later with
# (genre, page).
_search_pattern = ('http://www.imdb.com/search/title?genres={{}}&title_type=feature'
                   '&sort={},{}&page={{}}&ref_=adv_nxt')
low_ratings = _search_pattern.format('user_rating', 'asc')
high_ratings = _search_pattern.format('user_rating', 'desc')
high_num = _search_pattern.format('num_votes', 'desc')
low_num = _search_pattern.format('num_votes', 'asc')

# Per-title pages; the single `{}` placeholder takes the IMDb title id.
_title_base = 'http://www.imdb.com/title/{}/'
main_url = _title_base
summary_url = _title_base + 'plotsummary?ref_=tt_stry_pl'
keywords_url = _title_base + 'keywords?ref_=tt_stry_kw'

In [4]:
# variables
all_genres = []  # genre names; reassigned later from get_genre_list()
all_ids = []  # title ids accumulated by the scraping loop (may contain duplicates)
url_list = [low_ratings, high_ratings, high_num, low_num]  # search templates tried for every genre

In [4]:
# function to create a soup variable
def create_soup(url):
    """Load `url` in a headless PhantomJS browser and parse the rendered page.

    Parameters
    ----------
    url : str
        Address of the page to load.

    Returns
    -------
    bs4.BeautifulSoup
        The page source parsed with the 'lxml' parser.

    A new browser is started for every call. It is always shut down again,
    even when loading or parsing fails, so repeated calls do not leave
    PhantomJS processes behind.
    """
    driver = webdriver.PhantomJS(executable_path=phan_path)
    try:
        driver.get(url)
        # page_source must be read while the browser session is still alive.
        return bs4.BeautifulSoup(driver.page_source, 'lxml')
    finally:
        driver.quit()

In [5]:
# function to get list of genres from IMDB
def get_genre_list():
    """Scrape the genre names from IMDb's genre index page.

    A link counts as a genre link when its href contains 'genre/' but not
    'genre/?', and the link has a <span> inside it. The first
    whitespace-separated word of the link text is used as the genre name.

    Returns
    -------
    list of str
        Genre names in page order.
    """
    soup = create_soup('http://www.imdb.com/genre/')
    genre_list = []
    for link in soup.findAll('a'):
        # Anchors without an href are skipped explicitly instead of relying
        # on a blanket `except` to hide the KeyError.
        href = link.get('href') or ''
        if 'genre/' not in href or 'genre/?' in href or not link.find('span'):
            continue
        words = link.text.split()
        if words:  # a link with no visible text has no name to record
            genre_list.append(words[0])
    return genre_list

In [6]:
# function to get a list of IDs from each genre
def get_id(genre, target_url, page):
    """Collect the title ids linked from one page of a search listing.

    Parameters
    ----------
    genre : str
        Genre name substituted into the first placeholder of `target_url`.
    target_url : str
        URL template with two `{}` placeholders (genre, page).
    page : int
        Page number substituted into the second placeholder.

    Returns
    -------
    list of str
        Ids in page order. An id appears once per matching link, so the list
        can contain duplicates.
    """
    soup = create_soup(target_url.format(genre, page))
    ids = []
    for link in soup.findAll('a'):
        # Anchors without an href are skipped explicitly rather than through
        # a blanket `except`.
        href = link.get('href') or ''
        # Only relative links to title pages are of interest.
        if 'title/tt' not in href or 'http' in href:
            continue
        # The id is the path segment containing 'tt'; segments containing an
        # underscore (e.g. the trailing '?ref_=...' query) are excluded.
        ids.extend(part.strip() for part in href.split('/')
                   if 'tt' in part and '_' not in part)
    return ids

In [7]:
def get_title(id_num):
    """Return the text of the <title> element of a movie's main page.

    Parameters
    ----------
    id_num : str
        IMDb title id substituted into `main_url`.

    Returns
    -------
    str
        The <title> text, or '' when the page has no <title> element.
    """
    soup = create_soup(main_url.format(id_num))

    # An explicit check replaces the former blanket `except`, so only the
    # "element missing" case is turned into an empty string.
    title_tag = soup.find('title')
    if title_tag is None:
        return ''
    return title_tag.text

In [8]:
def get_genres(id_num):
    """Return the genres listed on a movie's main page.

    Parameters
    ----------
    id_num : str
        IMDb title id substituted into `main_url`.

    Returns
    -------
    str
        Genre names joined by single spaces, or '' when the page has no
        'subtext' block or the block lists no genres.
    """
    soup = create_soup(main_url.format(id_num))
    subtext = soup.find('div', class_='subtext')
    if subtext is None:
        # Matches the other getters, which return '' for missing data
        # instead of raising.
        return ''
    return ' '.join(span.text for span in subtext.findAll('span', itemprop='genre'))

In [9]:
def get_rating(id_num):
    """Return the rating shown on a movie's main page.

    The value is the first whitespace-separated token of the `title`
    attribute of the <strong> element inside the 'ratingValue' block.

    Parameters
    ----------
    id_num : str
        IMDb title id substituted into `main_url`.

    Returns
    -------
    str
        The rating as text, or '' when any part of that lookup is missing.
    """
    soup = create_soup(main_url.format(id_num))
    try:
        return soup.find('div', class_='ratingValue').find('strong')['title'].split()[0]
    except (AttributeError, TypeError, KeyError, IndexError):
        # Missing block / missing <strong> / missing attribute / empty
        # attribute. Anything else is a real error and is allowed to surface.
        return ''

In [10]:
def get_num_rating(id_num):
    """Return the number of user ratings shown on a movie's main page.

    The value is the third-from-last whitespace-separated token of the
    `title` attribute of the <strong> element inside the 'ratingValue' block.
    NOTE(review): this presumably relies on text of the form
    '<rating> based on <count> user ratings' -- confirm against the live page.

    Parameters
    ----------
    id_num : str
        IMDb title id substituted into `main_url`.

    Returns
    -------
    str
        The count as text (not converted to a number), or '' when any part
        of that lookup is missing.
    """
    soup = create_soup(main_url.format(id_num))
    try:
        return soup.find('div', class_='ratingValue').find('strong')['title'].split()[-3]
    except (AttributeError, TypeError, KeyError, IndexError):
        # Missing block / missing <strong> / missing attribute / fewer than
        # three tokens. Anything else is a real error and is allowed to surface.
        return ''

In [11]:
def get_length(id_num):
    """Return the running time shown on a movie's main page.

    Parameters
    ----------
    id_num : str
        IMDb title id substituted into `main_url`.

    Returns
    -------
    str
        Stripped text of the <time> element inside the 'subtext' block, or ''
        when either element is missing.
    """
    soup = create_soup(main_url.format(id_num))
    try:
        return soup.find('div', class_='subtext').find('time').text.strip()
    except AttributeError:
        # find() returned None for the block or for <time>. Other errors are
        # not expected here and are allowed to surface.
        return ''

In [12]:
def get_cert(id_num):
    """Return the certification (content rating) shown on a movie's main page.

    Parameters
    ----------
    id_num : str
        IMDb title id substituted into `main_url`.

    Returns
    -------
    str
        Text of the <span itemprop="contentRating"> element, unstripped, or
        '' when the page has no such element.
    """
    soup = create_soup(main_url.format(id_num))

    # An explicit check replaces the former blanket `except`, so only the
    # "element missing" case is turned into an empty string.
    rating_tag = soup.find('span', itemprop='contentRating')
    if rating_tag is None:
        return ''
    return rating_tag.text

In [13]:
def get_directors(id_num):
    """Return the directors credited on a movie's main page.

    Parameters
    ----------
    id_num : str
        IMDb title id substituted into `main_url`.

    Returns
    -------
    str
        Space-separated names; the words of each name are joined by
        underscores (e.g. 'Frank_Darabont'). '' when no director credit
        is found.
    """
    soup = create_soup(main_url.format(id_num))

    directors = []
    for credit in soup.findAll('div', class_='credit_summary_item'):
        heading = credit.find('h4')
        # A credit block without an <h4> heading cannot be classified, so it
        # is skipped instead of raising AttributeError.
        if heading is None or 'Direct' not in heading.text:
            continue
        directors.extend('_'.join(person.text.split())
                         for person in credit.findAll('span', itemprop='name'))
    return ' '.join(directors)

In [14]:
def get_writers(id_num):
    """Return the writers credited on a movie's main page.

    Parameters
    ----------
    id_num : str
        IMDb title id substituted into `main_url`.

    Returns
    -------
    str
        Space-separated names; the words of each name are joined by
        underscores (e.g. 'Stephen_King'). '' when no writer credit is found.
    """
    soup = create_soup(main_url.format(id_num))

    writers = []
    for credit in soup.findAll('div', class_='credit_summary_item'):
        heading = credit.find('h4')
        # A credit block without an <h4> heading cannot be classified, so it
        # is skipped instead of raising AttributeError.
        if heading is None or 'Writ' not in heading.text:
            continue
        writers.extend('_'.join(person.text.split())
                       for person in credit.findAll('span', itemprop='name'))
    return ' '.join(writers)

In [15]:
def get_stars(id_num):
    """Return the stars credited on a movie's main page.

    Parameters
    ----------
    id_num : str
        IMDb title id substituted into `main_url`.

    Returns
    -------
    str
        Space-separated names; the words of each name are joined by
        underscores (e.g. 'Tim_Robbins'). '' when no star credit is found.
    """
    soup = create_soup(main_url.format(id_num))

    stars = []
    for credit in soup.findAll('div', class_='credit_summary_item'):
        heading = credit.find('h4')
        # A credit block without an <h4> heading cannot be classified, so it
        # is skipped instead of raising AttributeError.
        if heading is None or 'Star' not in heading.text:
            continue
        stars.extend('_'.join(person.text.split())
                     for person in credit.findAll('span', itemprop='name'))
    return ' '.join(stars)

In [39]:
def get_plot(id_num):
    """Return a movie's one-line summary followed by its plot summaries.

    Two pages are loaded: the main page, for the 'summary_text' block, and
    the plot-summary page, for every <p class="plotSummary"> paragraph.

    Parameters
    ----------
    id_num : str
        IMDb title id substituted into `main_url` and `summary_url`.

    Returns
    -------
    str
        All pieces, stripped and joined by single spaces; '' when neither
        page yields any text.
    """
    plot = []

    # Get one-liner for the movie. A page without the block simply
    # contributes nothing; this replaces the former blanket `except: pass`.
    one_liner = create_soup(main_url.format(id_num)).find('div', class_='summary_text')
    if one_liner is not None:
        plot.append(one_liner.text.strip())

    summary_soup = create_soup(summary_url.format(id_num))
    plot.extend(paragraph.text.strip()
                for paragraph in summary_soup.findAll('p', class_='plotSummary'))
    return ' '.join(plot)

In [17]:
def get_keywords(id_num):
    """Return the plot keywords listed on a movie's keywords page.

    Parameters
    ----------
    id_num : str
        IMDb title id substituted into `keywords_url`.

    Returns
    -------
    str
        Space-separated keywords; the words of a multi-word keyword are
        joined by underscores. '' when the page lists none.
    """
    page = create_soup(keywords_url.format(id_num))
    # Each keyword is the text of one 'sodatext' block.
    tokens = ['_'.join(entry.text.strip().split())
              for entry in page.findAll('div', class_='sodatext')]
    return ' '.join(tokens)

In [19]:
# Combine content functions
def get_content(id_num):
    """Scrape every supported field for one movie.

    Parameters
    ----------
    id_num : str
        IMDb title id handed to each getter.

    Returns
    -------
    list
        [title, length, cert, directors, writers, stars, plot, keywords,
        rating, num_rating]

    Notes
    -----
    Each getter calls create_soup itself, so one call here loads eleven
    pages (get_plot loads two). get_genres is not part of the result.
    """
    # Getters run in this order, which differs from the order of the result.
    scrapers = [
        ('title', get_title),
        ('length', get_length),
        ('rating', get_rating),
        ('num_rating', get_num_rating),
        ('cert', get_cert),
        ('directors', get_directors),
        ('writers', get_writers),
        ('stars', get_stars),
        ('plot', get_plot),
        ('keywords', get_keywords),
    ]
    fields = {}
    for field, scraper in scrapers:
        fields[field] = scraper(id_num)

    layout = ('title', 'length', 'cert', 'directors', 'writers',
              'stars', 'plot', 'keywords', 'rating', 'num_rating')
    return [fields[field] for field in layout]

In [None]:
# Replace the empty placeholder with the genre names scraped from IMDb's
# genre index page.
all_genres = get_genre_list()

In [23]:
for i in all_genres:
    print i, 
    for j in range(len(url_list)):
        for k in [1,15]:
            id_l = get_id(i,url_list[j],k)
            print len(id_l),
            all_ids.extend(id_l)

Action 121 116 111 105 102 101 114 115 Adventure 119 119 114 114 101 105 123 111 Animation 119 113 111 120 100 119 113 113 Biography 117 114 110 111 100 120 114 115 Comedy 106 108 118 114 100 102 119 106 Crime 118 112 110 113 100 105 116 122 Documentary 0 0 0 0 0 0 0 0 Drama 114 114 115 117 100 100 116 115 Family 114 111 116 115 100 114 121 112 Fantasy 115 111 113 119 101 104 121 116 Film-Noir 114 56 101 64 100 73 127 56 History 117 123 112 123 100 117 116 114 Horror 111 116 121 116 100 103 120 113 Music 118 121 114 123 100 121 123 118 Musical 108 117 110 128 101 118 116 106 Mystery 115 120 116 118 100 108 116 114 Romance 117 112 119 117 101 103 119 120 Sci-Fi 112 123 110 109 102 105 123 111 Sport 116 114 114 110 100 116 117 122 Thriller 119 117 113 108 101 100 115 110 War 115 114 115 113 100 117 111 116 Western 122 127 113 116 101 109 126 127


In [24]:
# Collapse duplicate ids. NOTE: all_ids changes from a list to a set here, so
# the scraping loop (which calls all_ids.extend) cannot be re-run after this
# cell without first resetting all_ids to a list.
all_ids = set(all_ids)

In [27]:
# One row per unique title id, in a single column named 'id'.
# The column is built from an explicit list, so this does not depend on how
# pandas unpacks a set nested inside a list, and it works whether all_ids is
# still a list or has already been turned into a set.
df = pd.DataFrame({'id': list(all_ids)})

In [28]:
# Preview the id table.
df.head()

Unnamed: 0,id
0,tt4061854
1,tt0030726
2,tt0050212
3,tt0036409
4,tt2818724


In [30]:
# (rows, columns): the row count is the number of unique ids collected.
df.shape

(6664, 1)

In [58]:
# Assign every id to one of 100 groups (0-99) in a single vectorised draw
# instead of one Python-level randint call per row.
# A dedicated, seeded RandomState makes the grouping reproducible across
# kernel restarts without touching NumPy's global random state.
df['random'] = np.random.RandomState(42).randint(0, 100, size=df.shape[0])

In [59]:
# Check how evenly the ids are spread over the random groups.
df['random'].value_counts()

93    86
44    86
55    83
16    82
73    80
79    80
78    79
56    78
82    78
85    77
75    76
42    76
20    76
31    75
14    75
98    75
71    74
6     74
51    73
15    73
34    73
35    72
26    72
54    72
94    72
86    71
29    71
46    70
5     70
60    70
      ..
36    62
65    62
92    62
12    62
88    62
17    62
3     61
45    61
23    60
47    60
48    60
13    60
83    60
81    60
10    60
49    59
68    58
33    58
74    56
4     56
24    56
18    55
77    55
40    55
27    55
76    54
9     54
70    54
96    52
58    52
Name: random, dtype: int64

In [60]:
# Persist the ids and their group labels. The index is written as well
# (to_csv's default), which produces an unnamed leading column in the file.
df.to_csv('id_with_groups.csv')

In [27]:
# Reload the saved ids; .iloc[:, 1:] drops the first column of the file,
# i.e. the index that to_csv wrote alongside the data.
df2 = pd.read_csv('id_with_groups.csv').iloc[:,1:]

In [31]:
# Overwrite the group labels loaded from disk with a fresh draw of 0-99, made
# in a single vectorised call instead of one randint call per row.
# A dedicated, seeded RandomState makes the resulting sample reproducible
# across kernel restarts without touching NumPy's global random state.
df2['random'] = np.random.RandomState(2017).randint(0, 100, size=df2.shape[0])

In [34]:
# Keep only the ids that were drawn into group 0. The copy makes df3
# independent of df2.
df3 = df2[df2['random'] == 0].copy()

In [36]:
# Size of the group-0 sample that will be scraped.
df3.shape

(69, 2)

In [41]:
# Scrape every sampled title and keep the result. Previously the mapped
# Series was only displayed, so the scraped rows were not bound to any name.
# Column names follow the order of the list returned by get_content.
content_columns = ['title', 'length', 'cert', 'directors', 'writers',
                   'stars', 'plot', 'keywords', 'rating', 'num_rating']
content_df = pd.DataFrame(df3['id'].map(get_content).tolist(),
                          columns=content_columns, index=df3.index)
content_df.head()

KeyboardInterrupt: 

# Mine: Rename & describe data

# Refine: Visualize data
Bonus: Use multiple data viz tools

# Model: Define training set

# Model: Fit and evaluate model
Bonus: How is the model at risk of overfitting?

# Present: Describe findings in blog post: summary, model, recommendations