# Webscraping and API

Webscraping and API to get all the information about a movie from one given year


# Import all necessary libraries


In [1]:
import pandas as pd
import numpy as np
import requests
import wikipedia
import re
from bs4 import BeautifulSoup
from urllib.request import urlopen
from scrapy import selector

# The Wikipedia API
The following function accepts a title, year or category and obtains the HTML document for the relevant wikipedia page


In [2]:
## WIKIPEDIA API Function

# Define function fr obtaining movie data using Wikipedia API
def wikiapi(title = "", year = ""):
    
    '''
    Takes in a title or a year and returns the wikipedia page
    ''' 

    if len(title)>0 and len(year) > 0:
        title = title + f'({str(year)} film)'
        try:
            return wikipedia.page(title)  
        except wikipedia.exceptions.PageError:
            print(f"The page for the title {title} could not be found ")
            return ""

            
            
    if len(title)==0 and len(year) > 0:
        year = year + 'Academy Awards'
        try:
            return wikipedia.page(year)
        except wikipedia.exceptions.PageError:
            print(f"The page for the year {year} could not be found ")
            return ""
    else:
        return ""



# Oscar Categories
The names of categories underwent many changes over the years. New categories were added and older ones were dropped. "Best Motion Picture" was changed to "Best Picture". The next pice of the code identifies these different categories and puts them in one of the 13 predecided categories
    

In [79]:
# The Wikipedia pages for Academy Awards are listed in terms of their edition: 1st, 2nd, 3rd, etc.
def get_edition(year):
    
    '''Takes in a year and returns the edition of the given year's oscars
    eg: 1930 --> 2nd'''
    
    editions = ['th','st', 'nd', 'rd','th', 'th', 'th', 'th', 'th', 'th', 'th']
    editions_dict = {}
    nth = year - 1928
    if nth>10 and nth<20:
        year_th = str(nth)+'th'
    else:
        last = nth%10
        year_th = str(nth) + editions[last]
    return year_th


# Prediceded list of award categories
main_categories = ['picture','director','supporting actor','supporting actress','actor', 
                        'actress','screenplay','music','cinematography',
                        'editing','special effects','sound','costume']

                    
# print(main_categories)
def get_main_category(category):
    category = category.lower()

    if category.find('story')>=0:
        return 'other'
    if category.find('best picture')>=0 or category.find('best motion picture')>=0:
        return 'best picture'
    if category.find('actor')>=0:
        if category.find('supporting')>=0:
            return 'supporting actor'
        else:
            return 'actor'   
    if category.find('actress')>=0:
        if category.find('supporting')>=0:
            return 'supporting actress'
        else:
            return 'actress'    
    if category.find('best director')>=0:
        return 'director'
    if category.find('screenplay')>=0:
        return 'screenplay'
    if category.find('music')>=0 or (category.find('score')>=0):
        return 'music'
    if category.find('costume')>=0:
        return 'costume'
    if category.find('editing')>=0:
        return 'editing'
    if category.find('effects')>=0:
        return 'effects'
    if category.find('cinematography')>=0:
        return 'cinematography'
    if category.find('sound')>=0:
        return 'sound'
    if category.find('art direction')>=0:
        return 'art direction'
    else:
        # print(f'Warning:{category} did not get matched!')
        return 'other'

# Web scraping Oscar Information for Movies

We used the Wikipedia API to scrape the HTML code from every wikipedia page for the Academy Awards. Then we used BeautifulSoup to extract the information for each category of award and save the winners and nominees. 


In [80]:
# Awards and Nominations

def awards_and_nominations(year):
    
    # Get the html file from the Wikipedia page using the wikipedia API.
    # Parse it with BeautifulSoup
    page = wikiapi(year=get_edition(year))
    soup = BeautifulSoup(page.html(),'html5lib')
    
    # initialize empty DataFrame and the corresponding Screening Number
    oscars_wn = pd.DataFrame()
    missing_categories = []
    
    # Get the table-body (tbody) from the wikipedia page where Oscars information are stored
    tbody = soup.body.find('table', class_="wikitable").find('tbody')
       
    # Make sure number of cells and header match
    if len(tbody.find_all('td')) !=len(tbody.find_all(['div', 'th'])):
    
        # The wikipedia tables needs to be fixed! Some category header may be missing.
        print('Warning: Number of cell <td> element  and header <th> does not add up for year:', year)
        print('Returned Empty dataFrame')
        return oscars_wn, missing_categories
        
        
    # Get winners and nominees
    try:
        for td,th in zip(tbody.find_all('td'),tbody.find_all(['div', 'th'])):
            cat = th.text.strip()
            category = get_main_category(cat)
            # print('\nCategory:', category)
            if category=='other':
                missing_categories.append(cat)
                continue

            # Go into the list and get the winners in bold
            for tli in td.find_all('li'):

                # Get the winners in bold
                for tb in tli.find_all('b'):
                    winner = tb.find('i').text.strip()
                    winner = re.sub(r'–',"", winner).strip()
                    if winner != None:
                        oscars_wn.loc[winner,'year']= int(year)
                        oscars_wn.loc[winner,category]='w'

                # Get the nominees from the next generation within the list
                for tli2 in tli.find_all('li'):
                    nominee = tli2.find('i').text.strip()
                    nominee = re.sub(r'–',"", nominee).strip()
                    if nominee != None:
                        oscars_wn.loc[nominee,'year']= int(year)
                        oscars_wn.loc[nominee,category]='n'
        
    except AttributeError:
        print(f'Error: in Category {category} for Year: {year}')
            

    oscars_wn['film'] = oscars_wn.index
    return oscars_wn.fillna('o'), missing_categories


In [81]:
# Get oscars Data from year_first to year_last

year_first = 1950
year_last = 2017
df_oscars = pd.DataFrame()
missing_categories = dict()
for year in range(year_first,year_last+1,1):
    print('In year:', year)
    df1, missing = awards_and_nominations(year)
    df_oscars = df_oscars.append(df1)
    missing_categories[year] = missing

# Get the set of all oscar movies
oscar_movies = set(df_oscars.film)


In year: 1950
In year: 1951
In year: 1952


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In year: 1953
In year: 1954
In year: 1955
Error: in Category effects for Year: 1955
In year: 1956
Error: in Category effects for Year: 1956
In year: 1957
In year: 1958
In year: 1959
In year: 1960
In year: 1961
In year: 1962
In year: 1963
In year: 1964
In year: 1965
In year: 1966
In year: 1967
Error: in Category sound for Year: 1967
In year: 1968
In year: 1969
In year: 1970
In year: 1971
Error: in Category editing for Year: 1971
In year: 1972
In year: 1973
In year: 1974
Error: in Category screenplay for Year: 1974
In year: 1975
In year: 1976
In year: 1977
In year: 1978
In year: 1979
In year: 1980
In year: 1981
In year: 1982
In year: 1983
In year: 1984
In year: 1985
Error: in Category actor for Year: 1985
In year: 1986
In year: 1987
In year: 1988
In year: 1989
In year: 1990
In year: 1991
In year: 1992
In year: 1993
In year: 1994
In year: 1995
In year: 1996
In year: 1997
In year: 1998
In year: 1999
In year: 2000
In year: 2001
In year: 2002
In year: 2003
In year: 2004
In year: 2005
Error: 

In [145]:
# df_oscars = df_oscars[['year', 'film', 'best picture','director',
#                       'actor', 'actress','supporting actor', 'supporting actress', 
#                       'cinematography','screenplay','editing', 
#                       'costume','effects', 'music', 'sound']]
df_oscars.to_excel("oscars_1950_2017.xlsx", sheet_name='Sheet_name_1')                  

In [148]:
print(set(df_oscars.film))



In [143]:
# Calculate points
def oscar_score(df_oscars, year, category):
    
    #select year and category and create series
    series = df_oscars[df_oscars['year'] == year][category]  
    #drop unnominated films
    series = series[series!='o']                             
    series.dropna()
    print(series)
    #computer number of nominees
    n = len(series)   
    #initialize
    df_points = pd.Series(index = df.index)
    for idx in series.index:
        if series[idx] == 'w':
            df_points[idx] = float(n + 1/n)
        elif series[idx] == 'n':
            df_points[idx] = float(1/n)
    return df_points.fillna(0)
    

In [144]:
print(oscar_score(df_oscars, 1988, 'director'))

The Last Emperor    w
Fatal Attraction    n
Hope and Glory      n
Moonstruck          n
My Life as a Dog    n
Name: director, dtype: object
One Flew Over the Cuckoo's Nest    0.0
Barry Lyndon                       0.0
Dog Day Afternoon                  0.0
Nashville                          0.0
Amarcord                           0.0
The Last Emperor                   5.2
Fatal Attraction                   0.2
Hope and Glory                     0.2
Moonstruck                         0.2
My Life as a Dog                   0.2
dtype: float64


# Movie information using OMDB API

For a specified movie, get all its information using the OMDB-API and Wikpedia, including Box-office and Budget information

In [3]:
# Define function fr obtaining movie data using omdb API
def omdbapi(title):
    if not isinstance(title, str):
        return {}
    
    url_base = 'http://www.omdbapi.com/?i=tt3896198&apikey=5db77b44&'
    url = url_base + 't=' + str(title)
    r = requests.get(url)
    json_data = r.json()
    return json_data

# OMDB API returns the following keys: dict_keys(['Title', 'Year', 'Rated', 'Released', 
# 'Runtime', 'Genre', 'Director', 'Writer', 'Actors', 'Plot', 'Language', 'Country', 
# 'Awards', 'Poster', 'Ratings', 'Metascore', 'imdbRating', 'imdbVotes', 'imdbID', 'Type', 
# 'DVD', 'BoxOffice', 'Production', 'Website', 'Response'])

In [57]:
def get_budget_boxoffice(movie, year):
    
    budget = ""
    boxoffice = ""
    page = wikiapi(title = movie, year = year)
    if len(str(page))==0:
        print(f'Warning: Budget and Boxoffice info for {movie} could not be obtained from wikipedia ')
        return"",""
    try:
        soup = BeautifulSoup(page.html(),'html5lib')
    except AttributeError:
        print(f'Warning: Budget and BX info for {movie} could not be obtained from wikipedia ')
    if soup.body.find('table', class_="infobox vevent")==None:
        return "", ""

    try:
        trows = soup.body.find('table', class_="infobox vevent").find('tbody').find_all('tr')
    except AttributeError:
        print(f"Something wrong {movie} infobox. Budget/Boxoffice could not be fetched!")
        return budget, boxoffice
    for row in trows:
        if row.find('th') == None:
            continue
        if row.find('th').text.lower() == 'budget':
            budget = row.find('td').text
        if row.find('th').text.lower() == 'box office':
            boxoffice = row.find('td').text
    return budget, boxoffice


def currency_to_numeric(money):
    ''' Accepts $12 million and returns 12000000
        Accepts $13,678,654 and reurns  13678654
    '''
    # Check to see dollar, otherwise return nan
    money = money.lower()
    if money.find('$')<0:
        return np.nan
    money = re.sub(r'\[.*\]', '', money)
    
    if money.find('illion')>0: # when currency expressed in million/billion
        
        # Billion: $12.4 billion
        reg = r"[\$-]([0-9.]+)\sbillion"
        num = re.findall(reg, money) # find number like $12.4 billion
        if len(num)>0:
            # num = re.sub(r'\D', "", num) # drop any non-numeric characters like comma, dash etc
            return float(num[0])*1e9
        
        # Million: $6.8 million
        reg = r"[\$-]([0-9.]+)\smillion"
        num = re.findall(reg, money) # find number like $6.8 million
        if len(num)>0:
            # num = re.sub(r'\D', "", num) # drop any non-numeric characters like comma, dash etc
            return float(num[0])*1e6
    else: # When currency not expressed in millions  
        reg = r"\D*"
        num = re.sub(reg, "", money)
        return float(num)


In [58]:
examples = ['$13.4 million[2]', '$12.4 billion', '$30.2 million', '$12 million[2]', '$32,000', 
            '$22,939,866', '$45,677,899[3]', '$6000000', '$14-28 million', '14-28 billion']
examples
for example in examples:
    print(example,'-',currency_to_numeric(example) )
    
print(10e6)

$13.4 million[2] - 13400000.0
$12.4 billion - 12400000000.0
$30.2 million - 30200000.0
$12 million[2] - 12000000.0
$32,000 - 32000.0
$22,939,866 - 22939866.0
$45,677,899[3] - 45677899.0
$6000000 - 6000000.0
$14-28 million - 28000000.0
14-28 billion - nan
10000000.0


In [59]:
# df_imdb = pd.read_excel('Movies/statcrunch_IMDB.xlsx')

In [60]:

def get_ratings(ratings):
    
    imdb, rotten_tomatoes, metacritic = np.nan, np.nan, np.nan

    for rating in ratings:
        if (rating['Source']=='Internet Movie Database') & (len(rating['Value'])>0):
            rate = rating['Value'].strip('/')
            imdb = float(rate[0])
        if (rating['Source']=='Rotten Tomatoes') & (len(rating['Value'])>0):
            rate = rating['Value']
            rotten_tomatoes = float(rate[:-1])/10
        if (rating['Source']=='Metacritic') & (len(rating['Value'])>0):
            rate = rating['Value'].strip('/')
            metacritic = float(rate[:-4])/10
    
    return imdb, rotten_tomatoes, metacritic

        

def get_movie_info(movie):
    
    df_info = pd.DataFrame()
    
    json = omdbapi(movie)
    if 'Error' in json:
        print('OMDB API could not fetch info for', movie)
        return df_info

    # Get imdb Index
    if 'imdbID' in json.keys():
        idx = json['imdbID']
    else:
        print(f'Title {movie} not found')
        return df_info
    
    if 'Title' in json.keys():
        df_info.loc[idx,'film']=json['Title']
    if 'Year' in json.keys():
        df_info.loc[idx,'year']=json['Year']
    if 'Runtime' in json.keys():
        df_info.loc[idx,'runtime']=json['Runtime']
    if 'Released' in json.keys():
        df_info.loc[idx,'release_date']=json['Released']
    if 'imdbRating' in json.keys():
        df_info.loc[idx,'imdb_ratings']=json['imdbRating']
    if 'imdbVotes' in json.keys():  
        df_info.loc[idx,'imdb_num_votes']=json['imdbVotes']

    # Get Ratings as float
    imdb, rotten_tomatoes, metacritic=get_ratings(json['Ratings'])
    df_info.loc[idx,'imdb']=imdb
    df_info.loc[idx,'rotten_tomatoes']=rotten_tomatoes
    df_info.loc[idx,'metacritic']= metacritic
        
    # Get budget and box-office from wikipedia
    budget, box_office = get_budget_boxoffice(movie, json['Year']) 
    
    if len(budget)==0:
        budget = np.nan
    else:
        #budget = budget[0]
        df_info.loc[idx,'budget']= currency_to_numeric(budget)

    
    if len(box_office)==0:
        box_office = np.nan
    else:
        #box_office = box_office[0]
        df_info.loc[idx,'box_office']= currency_to_numeric(box_office)    
    
    return df_info


In [67]:
test_movies = ['Three Little Words', 'Catch Me If You Can', 'Howards End', 'Bang the Drum Slowly', 
               'The Journey of Natty Gann', 'Hustle & Flow', 'Kagemusha', 'My Geisha', "Mon Oncle D'Amerique", 
               'Ida', 'Blade Runner', 'Beauty and the Beast', 'Independence Day', 'The Silence of the Lambs', 
               'Saturday Night Fever', 'Come to the Stable', 'Stalag 17', 'BUtterfield 8', 
               'The Diving Bell and the Butterfly', 'The Whisperers', 'The Enemy Below', 
               'A Patch of Blue', 'Heaven Can Wait', 'Born Yesterday', 'Return of the Jedi', 
               'The Rose Tattoo', 'Lenny', "Murphy's Romance", 'Animal Kingdom', 'All About Eve', 
               'Raintree County', 'Skyfall', 'Road to Perdition', 'Daddy Long Legs', 
               'An Unmarried Woman', 'The Brothers Karamazov', 'Wild in the Streets', 
               'Seven Brides for Seven Brothers', 'South Pacific', 'Imitation of Life', 
               'Monster', 'The World According to Garp', 'The Greatest Story Ever Told', 
               'Flight', 'The Happiest Millionaire', 'My Life as a Dog', 'Hollow Man', 
               'Désirée', 'Unstrung Heroes', 'Lars and the Real Girl', 
               'A Funny Thing Happened on the Way to the Forum', 'Innerspace', 
               'The Color Purple', "Logan's Run", 'An Affair to Remember', "Sophie's Choice", 
               'Willow', 'Julie & Julia', 'The Reivers', 'The Competition', 'Calamity Jane', 
               'The Ides of March', 'Tribute', 'The Fabulous Baker Boys', 'Last Tango in Paris', 
               '45 Years', 'Two for the Seesaw', 'Inception', 'The Deep', 'Save the Tiger', 
               'Sunday Bloody Sunday', "The Man Who Wasn't There", 'The Lobster', 
               'The Bridges at Toko-Ri', 'You Can Count On Me', 'Rachel, Rachel', 
               'The Prince of Egypt', 'The Hospital', 'The Bad and the Beautiful', 
               'A Little Romance', 'Paris Blues', 'Rear Window', 'Once More, My Darling', 
               'In the Valley of Elah', 'Transformers: Revenge of the Fallen', 
               'Face/Off', 'Volver', 'Beasts of the Southern Wild', 'Transformers: Dark of the Moon', 
               'Say One for Me', 'Restoration', 'Lonelyhearts', "What's Eating Gilbert Grape"]
df_info = pd.DataFrame()
for movie in test_movies:
    print(movie)
    df = get_movie_info(movie)
    # print(df)
    df_info = df_info.append(df)
print(df_info)

Three Little Words
Catch Me If You Can
Howards End
Bang the Drum Slowly
The Journey of Natty Gann
Hustle & Flow
The page for the title Hustle & Flow(2004–2012 film) could not be found 
Kagemusha
My Geisha
Mon Oncle D'Amerique
Ida
Blade Runner
Beauty and the Beast
Independence Day
The Silence of the Lambs
Saturday Night Fever
Come to the Stable
Stalag 17
BUtterfield 8
The Diving Bell and the Butterfly
The Whisperers
The Enemy Below
A Patch of Blue
Heaven Can Wait
Born Yesterday
Return of the Jedi
The Rose Tattoo
Lenny
Murphy's Romance
Animal Kingdom
The page for the title Animal Kingdom(2010 film) could not be found 
All About Eve
Raintree County
Skyfall
Road to Perdition
Daddy Long Legs
An Unmarried Woman
The Brothers Karamazov
Wild in the Streets
Seven Brides for Seven Brothers
South Pacific
Imitation of Life
Monster
The World According to Garp
The Greatest Story Ever Told
Flight
The Happiest Millionaire
My Life as a Dog
Hollow Man
Désirée
Unstrung Heroes
Lars and the Real Girl
A Funn

In [68]:
df_info

Unnamed: 0,box_office,budget,film,imdb,imdb_num_votes,imdb_ratings,metacritic,release_date,rotten_tomatoes,runtime,year
tt0043044,4.526000e+06,1470000.0,Three Little Words,7.0,1416,7.0,,12 Jul 1950,,102 min,1950
tt0264464,3.521000e+08,52000000.0,Catch Me If You Can,8.0,745174,8.1,7.5,25 Dec 2002,9.5,141 min,2002
tt0104454,2.610000e+07,8000000.0,Howards End,7.0,26523,7.4,8.9,26 Feb 1993,9.4,142 min,1992
tt0069765,,1000000.0,Bang the Drum Slowly,6.0,5039,6.9,8.0,09 Mar 1978,9.2,96 min,1973
tt0089385,9.700000e+06,,The Journey of Natty Gann,7.0,4715,7.0,6.6,27 Sep 1985,10.0,101 min,1985
tt0379632,,,Hustle,8.0,20669,8.1,,14 Jan 2006,,60 min,2004–2012
tt0080979,3.057990e+24,7500000.0,Kagemusha,8.0,29286,8.0,8.4,10 Oct 1980,8.8,162 min,1980
tt0056267,,2000000.0,My Geisha,6.0,1157,6.5,,01 Feb 1962,,119 min,1962
tt5911642,,,Mon oncle d'Amérique 2,,,,,,,,2017
tt2718492,1.530000e+07,2600000.0,Ida,7.0,47708,7.4,9.1,25 Oct 2013,9.6,82 min,2013


In [43]:
movie = 'The Silence of the Lambs'
json = omdbapi(movie)
json
bd, bx = get_budget_boxoffice(movie, json['Year']) 
# ratings = json['Ratings']
# rating = ratings[2]
# rating
# rate = rating['Value']
# rate[:-4]
# # # metacritic = float(rate[0])/10
# # metacritic
print(bd,bx)
currency_to_numeric(bd),currency_to_numeric(bx)

$272.7 million[2] --- 17
$19 million[2] $272.7 million[2]
$19 million[2]  being converted!
$272.7 million[2]  being converted!


(19000000.0, 272700000.0)

# Evaluation of films 3 Criterion

Once the information of movies have been obtained, movies are evaluated under 3 broad categories. <br>
1. Critical Accolades based on awards and nominations <br>
2. Critical Feedback based on IMDB, Metacrritic and Rotten tomatoes <br>
3. Commercial success based on Budget and Box office returns<br>
<br>

## Accolades
Each movie was scored in each category for both nominations and wins. The following heuristic was used. <br>
A movie scores higher if it competed with and beat more films nomiated in that category. 
If in a given year, more number of movies are nominated, each nominated film earns a lower score.


## Critical Feedback
Two kinds of ratings were considered, both separately as well as their average, on a scale of 10. 

## Box Office
The commercial success was measured as a form of %age which made it independent of changes inflation rate over the years. A second measure, total earnings, was also calculated accounting for inflation rate.