In [7]:
# import relevant libraries

# basic libraries
import pandas as pd
import numpy as np

# visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# machine learning libraries
import sklearn.tree
import sklearn.metrics
import sklearn.model_selection
import sklearn.linear_model

# data collection libraries
import requests
from selenium import webdriver
import bs4
import imdbpie
import json

%matplotlib inline

# Identify: Problem Statement & Goals

**Using data from IMDB, can we build a model that predicts movie ratings based on certain inputs?**

- Predict movie ratings using information about the movie (e.g. genre, length of the movie)
- What are the factors that affect movie rating?

# Acquire: Query IMDB, write results, scrape IMDB

## Using IMDBpie

In [43]:
# Client object used by the cells below to query IMDB through imdbpie.
imdb = imdbpie.Imdb()

In [44]:
# Serialise the top-250 listing to JSON text, then let pandas parse that
# text into a DataFrame.
top_250_json = json.dumps(imdb.top_250())
imdb_df = pd.read_json(top_250_json)

In [42]:
# Preview the first rows of the top-250 table.
imdb_df.head()

Unnamed: 0,can_rate,image,num_votes,rating,tconst,title,type,year
0,True,{u'url': u'https://images-na.ssl-images-amazon...,1720313,9.3,tt0111161,The Shawshank Redemption,feature,1994
1,True,{u'url': u'https://images-na.ssl-images-amazon...,1175776,9.2,tt0068646,The Godfather,feature,1972
2,True,{u'url': u'https://images-na.ssl-images-amazon...,805875,9.0,tt0071562,The Godfather: Part II,feature,1974
3,True,{u'url': u'https://images-na.ssl-images-amazon...,1706334,9.0,tt0468569,The Dark Knight,feature,2008
4,True,{u'url': u'https://images-na.ssl-images-amazon...,458263,8.9,tt0050083,12 Angry Men,feature,1957


In [135]:
# Accumulator for per-movie detail rows; filled by the loop in a later cell.
imdb_details = []

In [134]:
# Fields collected per movie, in column order:
# id, title, rating, num_votes, genre, stars, director, plots, length, certification

In [137]:
# Start from an empty list so that re-running this cell does not append the
# same movies a second time.
imdb_details = []

# Only the first 5 titles of the top-250 table are queried here.
for movie_id in imdb_df['tconst'][:5]:
    movie = imdb.get_title_by_id(movie_id)
    # One row per movie: id, title, rating, num_votes, genre, stars,
    # director, plots, length, certification.
    imdb_details.append([
        movie_id, movie.title, movie.rating, movie.votes,
        movie.genres, movie.cast_summary, movie.directors_summary,
        movie.plots, movie.runtime, movie.certification,
    ])

In [138]:
# Display the collected rows as a table. No column names are passed, so the
# columns are labelled by position (0-9).
pd.DataFrame(imdb_details)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,tt0111161,The Shawshank Redemption,9.3,1720336,"[Crime, Drama]","[<Person: Tim Robbins (nm0000209)>, <Person: M...",[<Person: Frank Darabont (nm0001104)>],[Chronicles the experiences of a formerly succ...,8520,R
1,tt0068646,The Godfather,9.2,1175796,"[Crime, Drama]","[<Person: Marlon Brando (nm0000008)>, <Person:...",[<Person: Francis Ford Coppola (nm0000338)>],[When the aging head of a famous crime family ...,10500,R
2,tt0071562,The Godfather: Part II,9.0,805891,"[Crime, Drama]","[<Person: Al Pacino (nm0000199)>, <Person: Rob...",[<Person: Francis Ford Coppola (nm0000338)>],[The continuing saga of the Corleone crime fam...,12120,R
3,tt0468569,The Dark Knight,9.0,1706354,"[Action, Crime, Drama, Thriller]","[<Person: Christian Bale (nm0000288)>, <Person...",[<Person: Christopher Nolan (nm0634240)>],[Set within a year after the events of Batman ...,9120,PG-13
4,tt0050083,12 Angry Men,8.9,458267,"[Crime, Drama]","[<Person: Henry Fonda (nm0000020)>, <Person: L...",[<Person: Sidney Lumet (nm0001486)>],[The defense and the prosecution have rested a...,5760,Approved


## Bonus: Scrape with BeautifulSoup

We want the sample of movies to be as evenly mixed as possible:

- Consider all genres
- Consider highest and lowest ratings, and if possible, some in the middle
- Consider highest and lowest number of votes, and if possible, some in the middle

Note: Each movie has a unique IMDB ID which can be used to search for it in the database.

### Defining our functions and variables

In [8]:
# path to the PhantomJS executable, passed to selenium when a driver is created
# NOTE(review): hard-coded absolute path (with a doubled leading slash) —
# adjust for the local machine before running.
phan_path = '//Applications/phantomjs'

In [9]:
# url templates
# The four IMDB search templates take two positional fields: the genre and the
# results page number. They differ only in the sort key and direction.
# feature films sorted by user rating, ascending
low_ratings = 'http://www.imdb.com/search/title?genres={}&title_type=feature&sort=user_rating,asc&page={}&ref_=adv_nxt'
# feature films sorted by user rating, descending
high_ratings = 'http://www.imdb.com/search/title?genres={}&title_type=feature&sort=user_rating,desc&page={}&ref_=adv_nxt'
# feature films sorted by number of votes, descending
high_num = 'http://www.imdb.com/search/title?genres={}&title_type=feature&sort=num_votes,desc&page={}&ref_=adv_nxt'
# feature films sorted by number of votes, ascending
low_num = 'http://www.imdb.com/search/title?genres={}&title_type=feature&sort=num_votes,asc&page={}&ref_=adv_nxt'
# OMDb lookup by title ID (single field), requesting the full plot as JSON
# NOTE(review): the service may require an API key parameter — confirm.
api_url = 'http://www.omdbapi.com/?i={}&plot=full&r=json'

In [10]:
# variables
# genre names; replaced later by the result of get_genre_list()
all_genres = []
# title IDs accumulated across all genres and listings
all_ids = []
# the four search-URL templates that are scraped for every genre
url_list = [low_ratings, high_ratings, high_num, low_num]

In [11]:
# function to create a soup variable
def create_soup(url):
    """Load ``url`` in a PhantomJS browser and parse the rendered page.

    Parameters
    ----------
    url : str
        Address of the page to load.

    Returns
    -------
    bs4.BeautifulSoup
        The driver's page source parsed with the ``lxml`` parser.
    """
    driver = webdriver.PhantomJS(executable_path=phan_path)
    try:
        driver.get(url)
        return bs4.BeautifulSoup(driver.page_source, 'lxml')
    finally:
        # Always shut the browser down, even if loading or parsing fails;
        # otherwise every call leaves a PhantomJS process behind.
        driver.quit()

In [12]:
# function to get list of genres from IMDB
def get_genre_list():
    """Scrape the genre names listed on the IMDB genre index page.

    An anchor counts as a genre link when its ``href`` contains ``genre/``
    but not ``genre/?`` and the anchor contains a ``<span>``. The first
    whitespace-separated word of the anchor text is used as the genre name.

    Returns
    -------
    list
        Genre names in the order they appear on the page.
    """
    soup = create_soup('http://www.imdb.com/genre/')
    genre_list = []
    for link in soup.findAll('a'):
        # Anchors without an href are skipped explicitly rather than being
        # hidden behind a bare ``except`` that would also mask real errors.
        href = link.get('href')
        if not href or 'genre/' not in href or 'genre/?' in href:
            continue
        if not link.find('span'):
            continue
        words = link.text.split()
        if words:  # ignore links that have no visible text
            genre_list.append(words[0])
    return genre_list

In [13]:
# function to get a list of IDs from each genre
def get_id(genre, target_url, page, limit=5):
    """Collect IMDB title IDs from one page of search results.

    Parameters
    ----------
    genre : str
        Genre name substituted into the first ``{}`` of ``target_url``.
    target_url : str
        URL template with two ``{}`` fields (genre, then page).
    page : int
        Results page number substituted into the second field.
    limit : int, optional
        Maximum number of matching title links to process. Defaults to 5,
        the value that was previously hard-coded.

    Returns
    -------
    list
        Title IDs in page order.
        NOTE(review): the limit counts links, not distinct titles, so the
        list may contain repeated IDs if a title is linked more than once —
        confirm against the page markup.
    """
    soup = create_soup(target_url.format(genre, page))
    ids = []
    remaining = limit
    for link in soup.findAll('a'):
        if remaining <= 0:
            # Nothing more can be collected; stop scanning the page.
            break
        # Anchors without an href are skipped explicitly instead of relying
        # on a bare ``except`` to swallow the lookup error.
        href = link.get('href')
        if not href or 'title/tt' not in href or 'http' in href:
            continue
        # Keep the path segment that holds the ID: it contains 'tt' and,
        # unlike the trailing query segment, no underscore.
        for part in href.split('/'):
            if 'tt' in part and '_' not in part:
                ids.append(part.strip())
        remaining -= 1
    return ids

In [29]:
def get_content(id_num, timeout=10):
    """Fetch the record for one title from the API described by ``api_url``.

    Parameters
    ----------
    id_num : str
        Title ID substituted into the ``api_url`` template.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout a stalled connection would block the notebook
        indefinitely.

    Returns
    -------
    object
        The decoded JSON body of the response.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not respond within ``timeout`` seconds.
    ValueError
        If the response body is not valid JSON.
    """
    response = requests.get(api_url.format(id_num), timeout=timeout)
    return json.loads(response.text)

In [34]:
# Scrape the current genre names from IMDB (starts a browser session).
all_genres = get_genre_list()

In [35]:
for i in all_genres:
    print i, 
    for j in range(len(url_list)):
        for k in [1,15]:
            id_l = get_id(i,url_list[j],k)
            print len(id_l),
            all_ids.extend(id_l)

Action 5 5 5 5 5 5 5 5 Adventure 5 5 5 5 5 5 5 5 Animation 5 5 5 5 5 5 5 5 Biography 5 5 5 5 5 5 5 5 Comedy 5 5 5 5 5 5 5 5 Crime 5 5 5 5 5 5 5 5 Documentary 0 0 0 0 0 0 0 0 Drama 5 5 5 5 5 5 5 5 Family 5 5 5 5 5 5 5 5 Fantasy 5 5 5 5 5 5 5 5 Film-Noir 5 5 5 5 5 5 5 5 History 5 5 5 5 5 5 5 5 Horror 5 5 5 5 5 5 5 5 Music 5 5 5 5 5 5 5 5 Musical 5 5 5 5 5 5 5 5 Mystery 5 5 5 5 5 5 5 5 Romance 5 5 5 5 5 5 5 5 Sci-Fi 5 5 5 5 5 5 5 5 Sport 5 5 5 5 5 5 5 5 Thriller 5 5 5 5 5 5 5 5 War 5 5 5 5 5 5 5 5 Western 5 5 5 5 5 5 5 5


In [93]:
# Drop duplicate IDs. This rebinds ``all_ids`` from a list to a set, so
# list-only methods such as ``extend`` are no longer available on it.
all_ids = set(all_ids)

In [94]:
# Issue one API request per unique ID, then build the frame from the
# resulting records in a single step.
movie_records = [get_content(movie_id) for movie_id in all_ids]
df = pd.DataFrame(movie_records)

In [95]:
# Preview the first rows of the scraped data.
df.head()

Unnamed: 0,id
0,tt2582802
1,tt1620487
2,tt4061854
3,tt2245137
4,tt0427637


In [96]:
# (rows, columns) of the scraped data.
df.shape

(618, 1)

In [58]:
# Add a column of random integers drawn from [0, 100), one per row, using a
# single vectorised call instead of one Python-level randint call per row.
# NOTE(review): no seed is set, so the values differ between runs — consider
# seeding if reproducibility matters.
df['random'] = np.random.randint(0, 100, size=df.shape[0])

In [59]:
# Frequency of each value in the random column, most common first.
df['random'].value_counts()

93    86
44    86
55    83
16    82
73    80
79    80
78    79
56    78
82    78
85    77
75    76
42    76
20    76
31    75
14    75
98    75
71    74
6     74
51    73
15    73
34    73
35    72
26    72
54    72
94    72
86    71
29    71
46    70
5     70
60    70
      ..
36    62
65    62
92    62
12    62
88    62
17    62
3     61
45    61
23    60
47    60
48    60
13    60
83    60
81    60
10    60
49    59
68    58
33    58
74    56
4     56
24    56
18    55
77    55
40    55
27    55
76    54
9     54
70    54
96    52
58    52
Name: random, dtype: int64

# Mine: Rename & describe data

# Refine: Visualize data
Bonus: Use multiple data viz tools

# Model: Define training set

# Model: Fit and evaluate model
Bonus: How is the model at risk of overfitting?

# Present: Describe findings in blog post: summary, model, recommendations