# IMDB API + Random Forests

This file includes all the initial code that gathers and cleans data into a usable form.

In [1]:
# Imports at the top
import json
import urllib
import pandas as pd
import numpy as np
import requests
import re
from imdbpie import Imdb

import nltk
from nltk.tokenize import RegexpTokenizer
import collections
from collections import Counter

from sklearn.preprocessing import MinMaxScaler
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.grid_search import GridSearchCV
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
from sklearn.cross_validation import cross_val_score
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO

from IPython.display import Image
import pydot
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
imdb = Imdb()
imdb = Imdb(anonymize=True)

In [3]:
#So this function gets a list of all 100 movies IDs.
#However, IMDB.com doesn't like people getting all of their data very easily, so we'll just use this page to get the IDs
#So this function doesn't iterate through pages, as all movies in the Bottom 100 are on a single page.
#It takes their unique IDs that are encoded in the HTML, and puts them in a list, called 'entries

'''
def get_entries(site):
    response = requests.get(site)
    html = response.text
    entries = re.findall("<a href.*?/title/(.*?)/", html) #Wrong regex
    return list(set(entries))
'''

'\ndef get_entries(site):\n    response = requests.get(site)\n    html = response.text\n    entries = re.findall("<a href.*?/title/(.*?)/", html) #Wrong regex\n    return list(set(entries))\n'

In [4]:
'''
bottomEntries = get_entries('http://www.imdb.com/chart/bottom')
topEntries = get_entries('http://www.imdb.com/chart/top')
'''

"\nbottomEntries = get_entries('http://www.imdb.com/chart/bottom')\ntopEntries = get_entries('http://www.imdb.com/chart/top')\n"

In [5]:
#Now that we have the 250 IDs, we need a way to search omdapi (which has gathered all data for each IMDB movie in a 
#nice little JSON tree). 

#So we need to scrape each movie's JSON tree with Beautiful soup
#Just like with indeed.com, it's going to use omdabpi's search engine 250 times, once for each id in the entries list
#from above. After it searches a movie id in the lsit above, it will scrape its JSON tree.
'''
def get_entry(entry):
    res = requests.get('http://www.omdbapi.com/?i='+entry)
    if res.status_code != 200:
        print entry, res.status_code
    else:
        print '.',
    try:
        j = json.loads(res.text)
    except ValueError:
        j = None
    return j
'''

"\ndef get_entry(entry):\n    res = requests.get('http://www.omdbapi.com/?i='+entry)\n    if res.status_code != 200:\n        print entry, res.status_code\n    else:\n        print '.',\n    try:\n        j = json.loads(res.text)\n    except ValueError:\n        j = None\n    return j\n"

In [6]:
#So you're going to repreat the function above for every item(movie id) in the 'entries' list
#It returns a dictionary that can then be turned into a dataframe

'''
bottomEntriesDictList = [get_entry(e) for e in bottomEntries]
'''

'\nbottomEntriesDictList = [get_entry(e) for e in bottomEntries]\n'

In [7]:
#Here we turn the JSON file for each of th 100 movies into a dataframe
'''
bottom100 = pd.DataFrame(bottomEntriesDictList)
'''

'\nbottom100 = pd.DataFrame(bottomEntriesDictList)\n'

In [8]:
#So you're going to repreat the function above for every item(movie id) in the 'entries' list
#It returns a dictionary that can then be turned into a dataframe
'''
topEntriesDictList = [get_entry(e) for e in topEntries]
'''

'\ntopEntriesDictList = [get_entry(e) for e in topEntries]\n'

In [9]:
#Here we turn the JSON file for each of th 100 movies into a dataframe

'''
top250 = pd.DataFrame(topEntriesDictList)
'''

'\ntop250 = pd.DataFrame(topEntriesDictList)\n'

In [10]:
#There is still some information we would want, but OMDb API does not provide
#So, we have to go back to imdb.com to scrape the gross revenue for each movie
#This function will ultimately search for each movie by their id in the entries list, and scrape the gross revenue into
#a new list called 'grosses

'''
def get_gross(entry): #define the function
    response = requests.get('http://www.imdb.com/title/'+entry) #This will generate a request from the page for an entry
    html = response.text
    try:
        gross_list = re.findall("Gross:</h4>[ ]*\$([^ ]*)", html) #This will create a list with the value after the word 'Gross'
        gross = int(gross_list[0].replace(',', '')) #This creates a new value by convertinf the above to an integer and eliminating commas
        return gross
    except Exception as ex:
        return None
'''

'\ndef get_gross(entry): #define the function\n    response = requests.get(\'http://www.imdb.com/title/\'+entry) #This will generate a request from the page for an entry\n    html = response.text\n    try:\n        gross_list = re.findall("Gross:</h4>[ ]*\\$([^ ]*)", html) #This will create a list with the value after the word \'Gross\'\n        gross = int(gross_list[0].replace(\',\', \'\')) #This creates a new value by convertinf the above to an integer and eliminating commas\n        return gross\n    except Exception as ex:\n        return None\n'

In [11]:
'''
bottomGrosses = [(e, get_gross(e)) for e in bottomEntries]#Repeat the function above for each id in the entries list
topGrosses = [(e, get_gross(e)) for e in topEntries]
bottomGrosses = pd.DataFrame(bottomGrosses, columns=['imdbID', 'gross'])
topGrosses = pd.DataFrame(topGrosses, columns=['imdbID', 'gross'])
bottomGrosses["gross"] = bottomGrosses["gross"].fillna(bottomGrosses["gross"].mean())
topGrosses["gross"] = topGrosses["gross"].fillna(topGrosses["gross"].mean())
'''

'\nbottomGrosses = [(e, get_gross(e)) for e in bottomEntries]#Repeat the function above for each id in the entries list\ntopGrosses = [(e, get_gross(e)) for e in topEntries]\nbottomGrosses = pd.DataFrame(bottomGrosses, columns=[\'imdbID\', \'gross\'])\ntopGrosses = pd.DataFrame(topGrosses, columns=[\'imdbID\', \'gross\'])\nbottomGrosses["gross"] = bottomGrosses["gross"].fillna(bottomGrosses["gross"].mean())\ntopGrosses["gross"] = topGrosses["gross"].fillna(topGrosses["gross"].mean())\n'

In [12]:
'''
def cleanData(df, df1):
    movieType = []
    df.drop(['Actors','Awards','Country','Director','Genre','Language','Metascore',
             'Plot','Poster','Rated','Released','Response','Type','Writer'], axis =1 ,inplace=True)
    df = df[df.Runtime != 'N/A']
    for row in ['Runtime']:
        df['Runtime'] = df['Runtime'].str.rstrip('min').astype(float)
    for row in ['Year']:
        df['Year'] = df['Year'].astype(int)
    for row in ['imdbRating']:
        df['imdbRating'] = df['imdbRating'].astype(float)
    for row in ['imdbVotes']:
        df['imdbVotes'] = df['imdbVotes'].replace(',','',regex=True).astype(float)
    for row in df['imdbRating']:
        if row <= 3:
            movieType.append(0)
        else:
            movieType.append(1)
    df['movieType'] = movieType
    df = df.rename(columns = {'imdbID'      :'imdbID',
                                  'Title'       :'title',
                                  'Year'        :'year',
                                  'Runtime'     :'runtime',
                                  'imdbVotes'   :'imdbVotes',
                                  'imdbRating'  :'imdbRating',
                                  'movieType'   :'movieType'})
    df = df[['imdbID', 'movieType', 'title', 'year', 'runtime', 'imdbVotes', 'imdbRating']]
    df = pd.merge(df, df1)
    return df

top250 = cleanData(top250, topGrosses)
bottom100 = cleanData(bottom100, bottomGrosses)
'''

"\ndef cleanData(df, df1):\n    movieType = []\n    df.drop(['Actors','Awards','Country','Director','Genre','Language','Metascore',\n             'Plot','Poster','Rated','Released','Response','Type','Writer'], axis =1 ,inplace=True)\n    df = df[df.Runtime != 'N/A']\n    for row in ['Runtime']:\n        df['Runtime'] = df['Runtime'].str.rstrip('min').astype(float)\n    for row in ['Year']:\n        df['Year'] = df['Year'].astype(int)\n    for row in ['imdbRating']:\n        df['imdbRating'] = df['imdbRating'].astype(float)\n    for row in ['imdbVotes']:\n        df['imdbVotes'] = df['imdbVotes'].replace(',','',regex=True).astype(float)\n    for row in df['imdbRating']:\n        if row <= 3:\n            movieType.append(0)\n        else:\n            movieType.append(1)\n    df['movieType'] = movieType\n    df = df.rename(columns = {'imdbID'      :'imdbID',\n                                  'Title'       :'title',\n                                  'Year'        :'year',\n            

In [13]:
'''
top250.to_csv('../assets/06-project6-assets/data/top250.csv', encoding='utf8', index=False)
bottom100.to_csv('../assets/06-project6-assets/data/bottom100.csv', encoding='utf8', index=False)
'''

"\ntop250.to_csv('../assets/06-project6-assets/data/top250.csv', encoding='utf8', index=False)\nbottom100.to_csv('../assets/06-project6-assets/data/bottom100.csv', encoding='utf8', index=False)\n"

In [14]:
#Now we need to scrape the reviews for each of our movie collections, but put them in a different dataframe
#So first, we put the imdbIDs in their respective lists so we can iterate through them when scraping reviews
#We need the ID again so we can use it as the common key with which we can join tablesl ater

'''top250MovieIDs = top250.imdbID.values.tolist()
bottom100MovieIDs = bottom100.imdbID.values.tolist()
'''

'top250MovieIDs = top250.imdbID.values.tolist()\nbottom100MovieIDs = bottom100.imdbID.values.tolist()\n'

In [15]:
'''
top250Reviews = []
top250IDs = []
for x in top250MovieIDs: #For every ID in the ID list
    review = imdb.get_title_reviews(x, max_results=15) #We take a list of 15 reviews
    for i in review: #For every review in the list of reviews
        top250IDs.append(x) #We add that reviews id to one list 
        top250Reviews.append(i.text) #and the review to another, so they all correspond
'''

'\ntop250Reviews = []\ntop250IDs = []\nfor x in top250MovieIDs: #For every ID in the ID list\n    review = imdb.get_title_reviews(x, max_results=15) #We take a list of 15 reviews\n    for i in review: #For every review in the list of reviews\n        top250IDs.append(x) #We add that reviews id to one list \n        top250Reviews.append(i.text) #and the review to another, so they all correspond\n'

In [16]:
#Turn those two lists into a dataframe with the ID and 15 reviews for each ID

'''
top250ReviewData = pd.DataFrame({"imdbID": top250IDs, "reviews": top250Reviews})
'''

'\ntop250ReviewData = pd.DataFrame({"imdbID": top250IDs, "reviews": top250Reviews})\n'

In [17]:
#We repeat the process above, except with the bottom 100
#We don't want to combine these dataframes yet, because we want the top 50 adjectives used to describe
#the worst and best movies, and see to which extent there is overlap or exclusivity in the ways
#people describe good and bad movies

'''
bottom100Reviews = []
bottom100IDs = []
for x in bottom100MovieIDs:
    review = imdb.get_title_reviews(x, max_results=15)
    for i in review:
        bottom100IDs.append(x)
        bottom100Reviews.append(i.text)
'''

'\nbottom100Reviews = []\nbottom100IDs = []\nfor x in bottom100MovieIDs:\n    review = imdb.get_title_reviews(x, max_results=15)\n    for i in review:\n        bottom100IDs.append(x)\n        bottom100Reviews.append(i.text)\n'

In [18]:
#Now we need to isolate each word and tag it with its respective part of speech
'''
bottom100ReviewData = pd.DataFrame({"imdbID": bottom100IDs, "reviews": bottom100Reviews})
tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
top250Tokens = [tokenizer.tokenize(review) for review in top250Reviews]
top250PosTokens = [nltk.tag.pos_tag(token) for token in top250Tokens]
bottom100Tokens = [nltk.word_tokenize(review) for review in bottom100Reviews]
bottom100PosTokens = [nltk.tag.pos_tag(token) for token in bottom100Tokens]
'''

'\nbottom100ReviewData = pd.DataFrame({"imdbID": bottom100IDs, "reviews": bottom100Reviews})\ntokenizer = RegexpTokenizer(\'\\w+|\\$[\\d\\.]+|\\S+\')\ntop250Tokens = [tokenizer.tokenize(review) for review in top250Reviews]\ntop250PosTokens = [nltk.tag.pos_tag(token) for token in top250Tokens]\nbottom100Tokens = [nltk.word_tokenize(review) for review in bottom100Reviews]\nbottom100PosTokens = [nltk.tag.pos_tag(token) for token in bottom100Tokens]\n'

In [19]:
#Now we need to go through the part of speech list, and isolate the adjectives, since they're the descriptive words

'''
top250AdjList = []
for x in top250PosTokens:
    # each x is a list of (word, POS tag) tuples
    for word, pos in x:
        if pos in ['JJ', 'JJS', 'JJR']: # feel free to add any other tags you may be looking for
            top250AdjList.append(word)
'''

"\ntop250AdjList = []\nfor x in top250PosTokens:\n    # each x is a list of (word, POS tag) tuples\n    for word, pos in x:\n        if pos in ['JJ', 'JJS', 'JJR']: # feel free to add any other tags you may be looking for\n            top250AdjList.append(word)\n"

In [20]:
'''
top250CommonAdj= [a for a, b in Counter(top250AdjList).most_common(50)]
'''

'\ntop250CommonAdj= [a for a, b in Counter(top250AdjList).most_common(50)]\n'

In [21]:
#Repeat process for the bottom 100 movies
'''
bottom100AdjList = []
for x in bottom100PosTokens:
    # each x is either a list of (word, POS tag) tuples
    for word, pos in x:
        if pos in ['JJ', 'JJS', 'JJR']: # feel free to add any other tags you may be looking for
            bottom100AdjList.append(word)
'''

"\nbottom100AdjList = []\nfor x in bottom100PosTokens:\n    # each x is either a list of (word, POS tag) tuples\n    for word, pos in x:\n        if pos in ['JJ', 'JJS', 'JJR']: # feel free to add any other tags you may be looking for\n            bottom100AdjList.append(word)\n"

In [22]:
'''
bottom100CommonAdj= [a for a, b in Counter(bottom100AdjList).most_common(50)]
'''

'\nbottom100CommonAdj= [a for a, b in Counter(bottom100AdjList).most_common(50)]\n'

In [23]:
#Combine these 2 lists and remove the awkward variable
'''
movieAdj = bottom100CommonAdj + top250CommonAdj
movieAdj.remove("'t")
'''

'\nmovieAdj = bottom100CommonAdj + top250CommonAdj\nmovieAdj.remove("\'t")\n'

In [24]:
#Create a dataframe that places each descriptor as an index
#We see that a top 50 adjective is the letter 't.' Not sure what the pos tagger is doing there, so we'll drop the col
'''
movieDescriptors = pd.DataFrame(columns=movieAdj)
'''

'\nmovieDescriptors = pd.DataFrame(columns=movieAdj)\n'

In [25]:
#append THAT dataframe to each reviews dataframe
#Now we want a dataframe that joins the Review data (movieID, 15reviews each), from above with these descriptors
'''
top250Movies = pd.DataFrame(top250ReviewData)
top250Movies = top250Movies.join(movieDescriptors)
'''

'\ntop250Movies = pd.DataFrame(top250ReviewData)\ntop250Movies = top250Movies.join(movieDescriptors)\n'

In [26]:
'''
bottom100Movies = pd.DataFrame(bottom100ReviewData)
bottom100Movies = bottom100Movies.join(movieDescriptors)
'''
#We see right now it's just filled with NaN values, so we'll populate the cells in the loop below

'\nbottom100Movies = pd.DataFrame(bottom100ReviewData)\nbottom100Movies = bottom100Movies.join(movieDescriptors)\n'

In [27]:
#create function to loop through each word in each review
'''
def getDescriptors(df):
    for c, col in enumerate(df.columns[2:]):
        for r, row in enumerate(df.index):
            reviewLower = df.loc[row,"reviews"].lower()
            if (col in reviewLower):
                df.loc[row,col] = 1
            else:
                df.loc[row,col] = 0

getDescriptors(top250Movies)
getDescriptors(bottom100Movies)
'''

'\ndef getDescriptors(df):\n    for c, col in enumerate(df.columns[2:]):\n        for r, row in enumerate(df.index):\n            reviewLower = df.loc[row,"reviews"].lower()\n            if (col in reviewLower):\n                df.loc[row,col] = 1\n            else:\n                df.loc[row,col] = 0\n\ngetDescriptors(top250Movies)\ngetDescriptors(bottom100Movies)\n'

In [28]:
#We need to join both of these dataframes to the original dataframes that have more info about each movie ID
#First, though, we need to drop the 'Reviews' column because we don't need all of that text

'''
top250Movies.drop(['reviews'], axis =1 ,inplace=True)
bottom100Movies.drop(['reviews'], axis =1 ,inplace=True)
'''

"\ntop250Movies.drop(['reviews'], axis =1 ,inplace=True)\nbottom100Movies.drop(['reviews'], axis =1 ,inplace=True)\n"

In [29]:
#Then we need to groupby imdbID so we can actually join the tables.
#Currently, the review data has 15 rows for each id, while the original movie data only has 1

'''
top250MoviesCopy = top250Movies.groupby(["imdbID"], group_keys=False, as_index=False).apply(lambda x: x.iloc[:,1:].max())
bottom100MoviesCopy = bottom100Movies.groupby(["imdbID"], group_keys=False, as_index=False).apply(lambda x: x.iloc[:,1:].max())
'''

'\ntop250MoviesCopy = top250Movies.groupby(["imdbID"], group_keys=False, as_index=False).apply(lambda x: x.iloc[:,1:].max())\nbottom100MoviesCopy = bottom100Movies.groupby(["imdbID"], group_keys=False, as_index=False).apply(lambda x: x.iloc[:,1:].max())\n'

In [30]:
#Before we go any further, let's save these sentiment tables to their own .csv files

'''
top250Movies.to_csv('../assets/06-project6-assets/data/top250Descriptors.csv', encoding='utf8', index=False)
bottom100Movies.to_csv('../assets/06-project6-assets/data/bottom100Descriptors.csv', encoding='utf8', index=False)
'''

"\ntop250Movies.to_csv('../assets/06-project6-assets/data/top250Descriptors.csv', encoding='utf8', index=False)\nbottom100Movies.to_csv('../assets/06-project6-assets/data/bottom100Descriptors.csv', encoding='utf8', index=False)\n"

In [31]:
#combine them to the tables with the rest of thir info

'''
goodMovies = top250.join(top250MoviesCopy)
badMovies = bottom100.join(bottom100MoviesCopy)
'''

'\ngoodMovies = top250.join(top250MoviesCopy)\nbadMovies = bottom100.join(bottom100MoviesCopy)\n'

In [32]:
# Now we can save these to .csv files to avoid all of that scraping above

'''
goodMovies.to_csv('../assets/06-project6-assets/data/top250MoviesDescriptors.csv', encoding='utf8', index=False)
badMovies.to_csv('../assets/06-project6-assets/data/bottom100MoviesDescriptors.csv', encoding='utf8', index=False)
'''

"\ngoodMovies.to_csv('../assets/06-project6-assets/data/top250MoviesDescriptors.csv', encoding='utf8', index=False)\nbadMovies.to_csv('../assets/06-project6-assets/data/bottom100MoviesDescriptors.csv', encoding='utf8', index=False)\n"

In [33]:
#combine them into a master using the copy - should have 348, 107 shape
'''
movies = pd.concat([goodMovies, badMovies])
movies.reset_index(inplace=True, drop=True)
'''

'\nmovies = pd.concat([goodMovies, badMovies])\nmovies.reset_index(inplace=True, drop=True)\n'

In [34]:
#save that master

#movies.to_csv('../assets/06-project6-assets/data/movies.csv', encoding='utf8', index=False)