In [1]:
# Import all necessary libraries
import pandas as pd
import numpy as np
import requests
import wikipedia
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re


# 1.0 Data Acquisition

Data will be acquired using two different approaches. First, movie datasets will be harvested from websites like Kaggle, Dataworld and StatCrunch. Second, we will use the Wikipedia API to extract basic movie information from each film's Wikipedia page to fill in missing information about budget, running time and box office revenues.

## 1.1. Reading directly from files
We will create pandas tables from structured datasets containing information about Academy Award nominations and wins, IMDB ratings, and budget/box office figures.

In [2]:
## Load the structured datasets from CSV and Excel files into DataFrames

# Academy Award tables stored as CSV
df_oscar = pd.read_csv('Movies/oscar_database.csv')
df_actor = pd.read_csv('Movies/Oscar_data/actors.csv')
df_actress = pd.read_csv('Movies/Oscar_data/actresses.csv')

# Spreadsheets stored as Excel workbooks
df_imdb = pd.read_excel('Movies/statcrunch_IMDB.xlsx')
df_budget = pd.read_excel('Movies/statcrunch_budgetboxoffice.xlsx')

# 1.2 Webscraping using wikipedia API

We will use the Wikipedia API to obtain information about the cast, running time, budget and box office revenues.

In [3]:
# Define function for obtaining movie data using the OMDb API
def omdbapi(title):
    """Look up a movie by title on the OMDb API and return the decoded JSON.

    Parameters
    ----------
    title : str
        Movie title to search for. Any non-string value short-circuits the
        lookup.

    Returns
    -------
    dict
        The parsed JSON body of the response, or an empty dict when ``title``
        is not a string.
    """
    import os

    if not isinstance(title, str):
        return {}

    # Passing the query through `params` lets requests percent-encode the
    # title (spaces, apostrophes, '&', ...) instead of splicing raw text into
    # the URL. The fixed `i=tt3896198` IMDb id that used to sit in the base URL
    # is dropped: this is a title search, so only `t` is sent.
    params = {
        't': title,
        # The key can be overridden through the environment; the previous
        # literal is kept as the fallback so existing runs behave the same.
        'apikey': os.environ.get('OMDB_API_KEY', '5db77b44'),
    }
    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get('http://www.omdbapi.com/', params=params, timeout=10)
    return r.json()

# OMDB API returns the following keys: dict_keys(['Title', 'Year', 'Rated', 'Released', 
# 'Runtime', 'Genre', 'Director', 'Writer', 'Actors', 'Plot', 'Language', 'Country', 
# 'Awards', 'Poster', 'Ratings', 'Metascore', 'imdbRating', 'imdbVotes', 'imdbID', 'Type', 
# 'DVD', 'BoxOffice', 'Production', 'Website', 'Response'])


    

# Wikipedia API
## Function that calls the Wikipedia API
## Function that accepts the infobox as a string and returns budget and box office

In [10]:
## WIKIPEDIA API Function

# Define function for obtaining movie data using the Wikipedia API
def wikiapi(title):
    """Return the result of ``wikipedia.page`` for a movie title.

    The literal suffix '(film)' is appended to ``title`` before the lookup.
    """
    film_query = title + '(film)'
    film_page = wikipedia.page(film_query)
    return film_page

# Get budget and box office info from the wikipedia infobox
def budget_box_office(my_string):
    """Extract budget and box-office dollar amounts from infobox text.

    Parameters
    ----------
    my_string : str
        Infobox text to search. Matching is case-sensitive, so the text must
        already be lower-cased for the 'budget' / 'box office' labels to hit.

    Returns
    -------
    list of tuple
        One ``(label, amount)`` pair per match, where ``label`` is 'budget' or
        'box office' and ``amount`` runs from the first '$' after the label up
        to and including the word 'million' or 'billion'.
    """
    # The amount may end in either unit. Matching only "million" made a
    # "billion" figure run on until the next "million" found on the same line.
    regex = r"(budget|box office).+?(\$.+?(?:million|billion))"
    return re.findall(regex, my_string)

In [11]:
# Non-API: This function gets wikipedia info without the Wikipedia API
def wikiscrape(title):
    """Download the raw HTML of an English Wikipedia article.

    Parameters
    ----------
    title : str
        Article title, appended to 'https://en.wikipedia.org/wiki/'.

    Returns
    -------
    bytes
        The response body as returned by ``read()``.
    """
    from urllib.parse import quote

    # Percent-encode characters that are not valid in a URL path (notably
    # spaces). '%' is left untouched so a title that is already
    # percent-encoded is not encoded a second time.
    url = 'https://en.wikipedia.org/wiki/' + quote(title, safe='/%')
    # The context manager closes the connection even if read() raises.
    with urlopen(url) as my_client:
        return my_client.read()

In [25]:
# Sample title used by the lookup cells that follow.
movie = "Schindler's List"
# x = omdbapi(movie)


In [26]:
# Using the wikipedia API to get the movie
# Just like the search field, we do not have to provide exact name
# We then parse the page using BeautifulSoup
page = wikiapi(movie)
soup = BeautifulSoup(page.html(), 'html.parser')

# Now find the infobox table within the page and extract it.
# Ref: https://stackoverflow.com/questions/52913838/how-to-automate-scraping-wikipedia-info-box-specifically-and-print-the-data-usin?noredirect=1&lq=1
# Match on the single 'infobox' class rather than the exact string
# 'infobox vevent': an exact-string match breaks as soon as the class
# attribute gains, loses or reorders a class.
table = soup.find('table', class_='infobox')
if table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError(f'No infobox table found on the Wikipedia page for {movie!r}')

# Flatten the infobox rows to one lower-case string, because the
# budget/box-office regex matches lower-case labels only.
infobox = table.find_all('tr')
infobox = str(infobox).lower()
info = budget_box_office(infobox)
print(f'Information for Movie {movie} is:', info)





Information for Movie Schindler's List is: [('budget', '$22 million'), ('box office', '$322.1 million')]


In [30]:
# Fetch the OMDb record for the current `movie` and show which fields came back.
# NOTE(review): the name `json` would shadow the standard-library module of the
# same name if that module were ever imported here; consider renaming it
# together with the later cells that read it.
json = omdbapi(movie)
# A bare expression that is not the last line of a cell is not displayed.
json
print(movie)
json.keys()

Schindler's List


dict_keys(['Title', 'Year', 'Rated', 'Released', 'Runtime', 'Genre', 'Director', 'Writer', 'Actors', 'Plot', 'Language', 'Country', 'Awards', 'Poster', 'Ratings', 'Metascore', 'imdbRating', 'imdbVotes', 'imdbID', 'Type', 'DVD', 'BoxOffice', 'Production', 'Website', 'Response'])

In [38]:
# Collect the distinct titles of the IMDB table and report how many there are
movie_list = set(df_imdb['title'])
len(movie_list)

56005

In [39]:

# Display the OMDb response currently held in `json` for inspection.
json

{'Title': "Schindler's List",
 'Year': '1993',
 'Rated': 'R',
 'Released': '04 Feb 1994',
 'Runtime': '195 min',
 'Genre': 'Biography, Drama, History',
 'Director': 'Steven Spielberg',
 'Writer': 'Thomas Keneally (book), Steven Zaillian (screenplay)',
 'Actors': 'Liam Neeson, Ben Kingsley, Ralph Fiennes, Caroline Goodall',
 'Plot': 'In German-occupied Poland during World War II, industrialist Oskar Schindler gradually becomes concerned for his Jewish workforce after witnessing their persecution by the Nazis.',
 'Language': 'English, Hebrew, German, Polish',
 'Country': 'USA',
 'Awards': 'Won 7 Oscars. Another 82 wins & 49 nominations.',
 'Poster': 'https://m.media-amazon.com/images/M/MV5BNDE4OTMxMTctNmRhYy00NWE2LTg3YzItYTk3M2UwOTU5Njg4XkEyXkFqcGdeQXVyNjU0OTQ0OTY@._V1_SX300.jpg',
 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '8.9/10'},
  {'Source': 'Rotten Tomatoes', 'Value': '97%'},
  {'Source': 'Metacritic', 'Value': '94/100'}],
 'Metascore': '94',
 'imdbRating': '8.9',


# Setting up the DataFrames

# Movie Info Dataframe
This DataFrame will be indexed by the movie title/IMDB ID of the film and will contain all the information about the movie. The following columns will contain all the movie attributes:
1. Year <br>
2. Director(s) <br>
3. Cast <br>
4. Producer(s) <br>
5. Screenplay <br>
6. Writer <br>
7. Based On <br>
8. Music <br>
9. Cinematography <br>
10. Release Date <br>
11. Running time <br>
12. Budget <br>

# Genre DataFrame
This table will contain the information for each movie about the genre it belongs to. A movie can belong to multiple genres
1. Biography <br>
2. Drama <br>
3. History <br>
etc...




# Award Info DataFrame
This DataFrame will capture the nominations and awards obtained in the various categories. The following categories will be considered:
1. Picture/Film (film) <br>
2. Director (director) <br>
3. Actor (actor) <br>
4. Supporting Actor (sup_actor) <br>
5. Actress (actress) <br>
6. Supporting Actress (sup_actress) <br>
7. Cinematography (cinematography) <br>
8. Screenplay (screenplay) <br>


# Outcomes Info Dataframe
1. IMDB Ratings (imdb_ratings) <br>
2. IMDB Total Number of Votes (total_votes) <br>
3. Oscar Nominations (oscar_noms) <br>
4. Oscar Wins (oscar_wins) <br>
5. Box Office Revenues (box_office) <br>

In [58]:
# Start each of the four tables as its own, independent empty DataFrame
DF_movie_info, DF_genre_info, DF_award_info, DF_outcome_info = (
    pd.DataFrame() for _ in range(4)
)



In [99]:
# Setting up the movie info

movie = "Schindler's List"
movie = movie.lower()
json = omdbapi(movie)

# A response without 'imdbID' cannot be used as a row label; stop with a
# readable message instead of a bare KeyError inside the loop below.
if 'imdbID' not in json:
    raise ValueError(
        f"OMDb lookup failed for {movie!r}: "
        f"{json.get('Error', 'no imdbID in response')}"
    )

# Movie Info Dataframe: one row per film, labelled by its IMDb id
imdb_id = json['imdbID']
all_columns = 'Title Year Rated Released Runtime Genre Director Writer Actors Budget'.split()
for col in all_columns:
    if col not in json:
        # Field absent from the response: store an empty string
        DF_movie_info.loc[imdb_id, col] = ""
        continue

    if isinstance(json[col], str):
        json[col] = json[col].lower()  # make all strings lower case to keep things consistent

    DF_movie_info.loc[imdb_id, col] = json[col]

DF_movie_info




Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Budget
tt0108052,schindler's list,1993,r,04 feb 1994,195 min,"biography, drama, history",steven spielberg,"thomas keneally (book), steven zaillian (scree...","liam neeson, ben kingsley, ralph fiennes, caro...",


In [96]:
# Distinct values found in the 'Award' column of the Oscar table
set(df_oscar['Award'])

{'Actor',
 'Actor in a Leading Role',
 'Actor in a Supporting Role',
 'Actress',
 'Actress in a Leading Role',
 'Actress in a Supporting Role',
 'Animated Feature Film',
 'Art Direction',
 'Art Direction (Black and White)',
 'Art Direction (Color)',
 'Assistant Director',
 'Award of Commendation',
 'Best Motion Picture',
 'Best Picture',
 'Cinematography',
 'Cinematography (Black and White)',
 'Cinematography (Color)',
 'Costume Design',
 'Costume Design (Black and White)',
 'Costume Design (Color)',
 'Dance Direction',
 'Directing',
 'Directing (Comedy Picture)',
 'Directing (Dramatic Picture)',
 'Documentary',
 'Documentary (Feature)',
 'Documentary (Short Subject)',
 'Engineering Effects',
 'Film Editing',
 'Foreign Language Film',
 'Gordon E. Sawyer Award',
 'Honorary Award',
 'Honorary Foreign Language Film Award',
 'Irving G. Thalberg Memorial Award',
 'Jean Hersholt Humanitarian Award',
 'John A. Bonner Medal of Commendation',
 'Makeup',
 'Makeup and Hairstyling',
 'Medal of Com

In [103]:
# Column labels for the award table. The literal is comma-separated with uneven
# spacing, so each piece is stripped to avoid labels such as ' director'.
# NOTE(review): 'constume' looks like a typo for 'costume'; left unchanged in
# case other cells refer to the label as spelled here. Please confirm.
col_names = [
    name.strip()
    for name in 'picture, director, actor ,actor_sup, actress, actress_sup, cinematography, constume, music, visual_effects, writing'.split(',')
]

# `rename(columns=...)` expects a mapping or function and returns a new frame,
# so passing a list and discarding the result never applied these labels.
# `reindex` adds the columns (keeping any existing rows) and is assigned back.
DF_award_info = DF_award_info.reindex(columns=col_names)
DF_award_info


In [151]:
# Discard every row that has at least one missing value, then preview the
# last 50 rows that remain
df_oscar = df_oscar.dropna(axis=0, how='any')
df_oscar.tail(n=50)

Unnamed: 0,Year,Ceremony,Award,Winner,Name,Film
9692,2013,86,Visual Effects,1.0,Gravity,"tim webber, chris lawrence, david shirk and ne..."
9700,2013,86,Writing (Adapted Screenplay),1.0,12 Years a Slave,screenplay by john ridley
9705,2013,86,Writing (Original Screenplay),1.0,Her,written by spike jonze
9717,2014,87,Actor in a Leading Role,1.0,Eddie Redmayne,the theory of everything
9722,2014,87,Actor in a Supporting Role,1.0,J.K. Simmons,whiplash
9725,2014,87,Actress in a Leading Role,1.0,Julianne Moore,still alice
9728,2014,87,Actress in a Supporting Role,1.0,Patricia Arquette,boyhood
9733,2014,87,Animated Feature Film,1.0,Big Hero 6,"don hall, chris williams and roy conli"
9738,2014,87,Cinematography,1.0,Birdman or (The Unexpected Virtue of Ignorance),emmanuel lubezki
9743,2014,87,Costume Design,1.0,The Grand Budapest Hotel,milena canonero
