In [1]:
# Import all necessary libraries
import pandas as pd
import numpy as np
import requests
import wikipedia
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re


# 1.0 Data Acquisition

Data will be acquired using two different approaches. First, movie datasets will be harvested from websites like Kaggle, Dataworld and statscrunch. Second, we will use the wikipedia API to extract basic movie information from the wikipedia page to fill in missing information about budget, running time and box office revenues. 

## 1.1. Reading directly from files
We will create pandas tables from structured datasets containing information about Academy Award nominations and wins, IMDB ratings and budget/box office 

In [2]:
## Reading from cvs and excel files
df_actor = pd.read_csv('Movies/Oscar_data/actors.csv')
df_actress = pd.read_csv('Movies/Oscar_data/actresses.csv')
df_oscar = pd.read_csv('Movies/oscar_database.csv')
df_budget = pd.read_excel('Movies/statcrunch_budgetboxoffice.xlsx')
df_imdb = pd.read_excel('Movies/statcrunch_IMDB.xlsx')

# 1.2 Webscraping using wikipedia API

We will use the wikipedia API to obtain information about the cast running time, budget and box office revenues

In [3]:
# Define function fr obtaining movie data using omdb API
def omdbapi(title):
    if not isinstance(title, str):
        return {}
    
    url_base = 'http://www.omdbapi.com/?i=tt3896198&apikey=5db77b44&'
    url = url_base + 't=' + str(title)
    r = requests.get(url)
    json_data = r.json()
    return json_data

# OMDB API returns the following keys: dict_keys(['Title', 'Year', 'Rated', 'Released', 
# 'Runtime', 'Genre', 'Director', 'Writer', 'Actors', 'Plot', 'Language', 'Country', 
# 'Awards', 'Poster', 'Ratings', 'Metascore', 'imdbRating', 'imdbVotes', 'imdbID', 'Type', 
# 'DVD', 'BoxOffice', 'Production', 'Website', 'Response'])


    

# WIkipedia API
## Function that calls the wikipedia API
## Function that accepst the infobox as a string and returns budget and box office

In [29]:
## WIKIPEDIA API Function

# Define function fr obtaining movie data using Wikipedia API
def wikiapi(title):
    title = title + '(film)'
    return wikipedia.page(title)

# Get budget and box office info from the wikipedia infobox
def budget_box_office(mystring):

    pattern1 = re.compile(r'\$')
    pattern2 = re.compile(r'million')
    matches1 = list(pattern1.finditer(mystring))
    matches2 = list(pattern2.finditer(mystring))
    if len(matches1) !=len(matches2):
        print('Error: lengths dont match up!')
    


    money_dict = {}
    for match1, match2 in zip(matches1, matches2):
        xstart1,xstop1 = match1.span()
        xstart2,xstop2 = match2.span()
        money = mystring[xstop1:(xstart2-1)]
        if 'budget' in mystring[xstart1-30:xstart1]:
            money_dict['budget']=float(money)
        elif 'box' in mystring[xstart1-30:xstart1]:
            money_dict['box office']=float(money)
        return money_dict

In [30]:
# Non-API: This function gets wikipedia info without the Wikipedia API
def wikiscrape(title):
    url = 'https://en.wikipedia.org/wiki/' + title
    my_client = urlopen(url)
    page_html = my_client.read()
    my_client.close()
    return page_html

In [31]:
movie = 'Lawrence of Arabia (film)'
# x = omdbapi(movie)


In [32]:
# Using the wikipedia API to get the movie
# Just like the search field, we do not have to provide exact name
# We then parse the page using BeautifulSoup 
page = wikiapi(movie)
soup = BeautifulSoup(page.html(),'html.parser')


# Now find the infobox table within the page and extrcat it. 
# Ref: https://stackoverflow.com/questions/52913838/how-to-automate-scraping-wikipedia-info-box-specifically-and-print-the-data-usin?noredirect=1&lq=1
table = soup.find('table', attrs={'class': 'infobox vevent'})
infobox = table.find_all('tr')
infobox = str(infobox).lower()
# print(type(infobox))
print(infobox)
print(len(infobox))
finances = budget_box_office(infobox)
print(finances)




[<tr><th class="summary" colspan="2" style="text-align:center;font-size:125%;font-weight:bold;font-size:110%;font-style:italic;">lawrence of arabia</th></tr>, <tr><td colspan="2" style="text-align:center"><a class="image" href="/wiki/file:lawrence_of_arabia_ver3_xxlg.jpg"><img alt="lawrence of arabia ver3 xxlg.jpg" class="thumbborder" data-file-height="2924" data-file-width="1920" decoding="async" height="335" src="//upload.wikimedia.org/wikipedia/commons/thumb/c/c5/lawrence_of_arabia_ver3_xxlg.jpg/220px-lawrence_of_arabia_ver3_xxlg.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/c/c5/lawrence_of_arabia_ver3_xxlg.jpg/330px-lawrence_of_arabia_ver3_xxlg.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/c/c5/lawrence_of_arabia_ver3_xxlg.jpg/440px-lawrence_of_arabia_ver3_xxlg.jpg 2x" width="220"/></a><div style="font-size:95%;padding:0.35em 0.35em 0.25em;line-height:1.25em;">theatrical release poster by <a href="/wiki/howard_terpning" title="howard terpning">howard terpn

In [6]:
print(infobox[-20:])

/a></sup></td></tr>]


In [21]:
## Extracting all the tags
pattern = re.compile(r'<')
matches = list(pattern.finditer(infobox))
tags = list()
for match in matches:
    x,y = match.span()
    if infobox[y] == '/':
        continue
    else:
        tags.append(infobox[y:(y+2)].strip())
print(set(tags))



{'li', 'ul', 'a', 'sp', 'im', 'su', 'tr', 'td', 'br', 'i>', 'th', 'di'}
