# Walt Disney Pictures Films Data Set Creation
**Wikipedia:** [List of Walt Disney Pictures films](https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films)


#### Import necessary packages
- BeautifulSoup for pulling data out of HTML and XML files

In [None]:
from bs4 import BeautifulSoup as bs
import requests

### 1. Experimenting with one movie: retrieve all the information from its Wikipedia info box

In [None]:
# Load the web page. The timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() stops here on an HTTP error instead of
# letting an error page be parsed further down.
p = requests.get('https://en.wikipedia.org/wiki/The_Lion_King_(2019_film)', timeout=30)
p.raise_for_status()

In [None]:
# Convert into a BeautifulSoup object. The parser is named explicitly so the
# parse tree does not depend on which optional parsers happen to be installed.
soup = bs(p.content, 'html.parser')

# Pretty-printed HTML, kept for manual inspection
contents = soup.prettify()
#print(contents)

In [None]:
# Find the info box table on the wiki page. The CSS selector matches an element
# carrying both classes whatever their order and whatever extra classes it has,
# whereas find(class_='infobox vevent') only matches that exact class string.
info_box = soup.select_one('.infobox.vevent')
info_rows = info_box.find_all('tr')

#for row in info_rows:
    #print(row.prettify())

In [None]:
# Get the content of one info box cell, as a list when it holds <li> items
def get_content_list(row_data):
    """Return the text of *row_data*, an info box cell.

    When the cell contains ``<li>`` elements the result is a list with one
    string per item; otherwise it is the whole text of the cell as a single
    string. Non-breaking spaces are replaced by regular spaces in both cases.
    """
    def text_of(node):
        return node.get_text(' ', strip=True).replace('\xa0', ' ')

    bullet_items = row_data.find_all('li')
    if not bullet_items:
        return text_of(row_data)
    return [text_of(item) for item in bullet_items]


In [None]:
# Build a dict from the info box rows: the first row gives the title, every
# later row maps its <th> text to the content of its <td>.
movie_info = {}

for index, row in enumerate(info_rows):
    if index == 1:
        # The second row is skipped (presumably the poster image row - TODO confirm)
        continue
    header_text = row.find('th').get_text(' ', strip=True)
    if index == 0:
        movie_info['title'] = header_text
    else:
        movie_info[header_text] = get_content_list(row.find('td'))
movie_info

### 2. Getting all the movies url and title

##### Before scraping, check the site's crawling rules: the Robots exclusion standard is described [here](https://en.wikipedia.org/wiki/Robots_exclusion_standard)


In [None]:
# Load the movie list page from the wiki. The timeout avoids hanging on a
# stalled connection and raise_for_status() stops on an HTTP error.
m = requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films', timeout=30)
m.raise_for_status()

# Name the parser explicitly so the parse tree does not depend on which
# optional parsers are installed.
movies_soup = bs(m.content, 'html.parser')

# Pretty-printed HTML, kept for manual inspection
movies_contents = movies_soup.prettify()
#print(movies_contents)

In [None]:
# Select every <i> element inside the sortable wiki tables
# (presumably the italicised movie titles - confirm against the page markup)
movies = movies_soup.select('.wikitable.sortable i')
# Peek at the link target and link title of the first hit, then at the first five hits
print(movies[0].a['href'])
print(movies[0].a['title'])
movies[0:5]

In [None]:
# Get the content of one info box cell
def get_content_list(row_data):
    """Return the text content of *row_data*, an info box cell.

    * A cell containing ``<li>`` elements yields a list, one string per item.
    * A cell containing ``<br>`` yields a list, one string per text fragment.
    * Any other cell yields its whole text as a single string.

    Non-breaking spaces are replaced by regular spaces in every branch, so
    values look the same whichever markup the cell used.
    """
    def normalize(text):
        return text.replace('\xa0', ' ')

    list_items = row_data.find_all('li')
    if list_items:
        return [normalize(li.get_text(' ', strip=True)) for li in list_items]
    if row_data.find('br'):
        return [normalize(text) for text in row_data.stripped_strings]
    return normalize(row_data.get_text(' ', strip=True))
    
# Strip reference markers such as [1] from the parsed page
def clean_tags(soup):
    """Remove every ``<sup>`` and ``<span>`` element from *soup*, in place.

    The elements are collected first and then destroyed one by one with
    ``decompose()``; nothing is returned.
    """
    unwanted = soup.find_all(['sup', 'span'])
    for node in unwanted:
        node.decompose()

In [None]:
# Extract the data from the info box table
def get_info_box(url):
    """Download the page at *url* and return its info box as a dict.

    The first info box row supplies the ``'title'`` entry. Every later row
    that has both a ``<th>`` and a ``<td>`` adds one entry mapping the header
    text to the value produced by ``get_content_list``. Rows missing either
    cell are skipped.

    Raises ``requests`` exceptions on network or HTTP errors and
    ``ValueError`` when the page has no info box or the info box has no title.
    """
    # Timeout: do not hang forever on a stalled connection
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Explicit parser: same parse tree whatever optional parsers are installed
    page_soup = bs(response.content, 'html.parser')

    # CSS selector: matches both classes regardless of order or extra classes
    info_box = page_soup.select_one('.infobox.vevent')
    if info_box is None:
        raise ValueError(f'No info box found at {url}')

    # Drop reference markers before any text is read
    clean_tags(page_soup)

    movie_info = dict()

    for index, row in enumerate(info_box.find_all('tr')):
        header = row.find('th')
        if index == 0:
            if header is None:
                raise ValueError(f'Info box at {url} has no title row')
            movie_info['title'] = header.get_text(' ', strip=True)
            continue

        cell = row.find('td')
        if header is None or cell is None:
            # Not a key/value row (e.g. an image or a spanning header)
            continue
        movie_info[header.get_text(' ', strip=True)] = get_content_list(cell)
    return movie_info

In [None]:
# Try the scraper on a single movie page (performs a live HTTP request)
get_info_box('https://en.wikipedia.org/wiki/One_Little_Indian_(film)')

In [None]:
# Scrape the info box of every movie linked from the list page
m = requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films', timeout=30)
m.raise_for_status()

# Explicit parser: same parse tree whatever optional parsers are installed
movies_soup = bs(m.content, 'html.parser')
movies = movies_soup.select('.wikitable.sortable i a')
base_path = 'https://en.wikipedia.org'
movie_info_list = []

for movie in movies:
    try:
        # The hrefs are site-relative, so prefix them with the host
        path = base_path + movie['href']
        movie_info_list.append(get_info_box(path))
    except Exception as e:
        # Best effort: report the movie that failed and carry on with the rest
        print(movie.get_text())
        print(e)

movie_info_list

In [None]:
# Number of movies whose info box was scraped successfully
len(movie_info_list)

### 3. Save and Load the movie dataset as JSON

In [None]:
import json

def save_data(title, data):
    """Write *data* as JSON to the file named *title*.

    The file is opened in text mode as UTF-8 (an existing file is replaced).
    Non-ASCII characters are written as-is and the output is indented by two
    spaces.
    """
    dump_options = {'ensure_ascii': False, 'indent': 2}
    with open(title, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, **dump_options)

In [None]:
def load_data(title):
    """Read the UTF-8 JSON file named *title* and return the decoded object."""
    with open(title, mode='r', encoding='utf-8') as in_file:
        parsed = json.load(in_file)
    return parsed

In [None]:
# Load the previously saved data from disk.
# NOTE(review): this rebinds movie_info_list, so running it right after
# scraping discards the in-memory results unless they were saved first.
movie_info_list = load_data('data/disney_movie_data.json')

In [None]:
# Write the current movie list to disk as JSON (replaces an existing file)
save_data('data/disney_movie_data.json', movie_info_list)

### 4. Cleaning the data

##### Convert Running time string into int

In [None]:
# Check all raw running time values ('N/A' when a movie has none).
# The key is 'Running time'; 'Running time(int)' is never created, so looking
# it up returned 'N/A' for every movie.
[movie.get('Running time', 'N/A') for movie in movie_info_list]

In [None]:
# Convert the running time string to int
def convert_minutes_to_int(running_time):
    """Return the first integer found in *running_time*, or ``None``.

    *running_time* is either a string such as ``'85 minutes'`` or a list of
    such strings, in which case only the first entry is used. ``None`` is
    returned for the ``'N/A'`` placeholder, an empty list, a non-string value
    and text that contains no digits.
    """
    import re  # local import: the notebook imports `re` only in a later cell

    if isinstance(running_time, list):
        running_time = running_time[0] if running_time else None
    if not isinstance(running_time, str) or running_time == 'N/A':
        return None

    # Searching for digits (rather than splitting on ' ') also copes with
    # non-breaking spaces and with prefixes such as "approx."
    match = re.search(r'\d+', running_time)
    return int(match.group()) if match else None

# Add a new entry `Running time (int)` to every movie: whatever
# convert_minutes_to_int returns for its 'Running time' value
# ('N/A' is passed when the movie has no such entry)
for movie in movie_info_list:
    movie['Running time (int)'] = convert_minutes_to_int(movie.get('Running time', 'N/A'))

In [None]:
# Inspect the first record (per the original note, a movie without a running
# time) to check its new 'Running time (int)' entry
movie_info_list[0]

##### Convert Budget and Box office from string to float, e.g. `$2.6 million` → `2600000.0`

In [None]:
# List every raw Budget value ('N/A' when a movie has none)
[movie.get('Budget', 'N/A') for movie in movie_info_list]

In [None]:
import re

# Scale words that may follow a dollar amount, e.g. "$2.6 million"
amounts = r'thousand|million|billion'
# Digits with optional thousands separators and at most one decimal part.
# Allowing several dots would let text such as "1..5" through, which float()
# rejects.
number = r'\d+(?:,\d{3})*(?:\.\d+)?'

# "$<number> <scale>", optionally written as a range: "$3–4 million",
# "$3 to 4 million", "$150–$175 million". Only the lower bound is parsed.
word_re = rf'\${number}(?:\s*[-–—]\s*|\s+to\s+)?(?:\$?{number})?\s*(?:{amounts})'
# Plain "$<number>" without a scale word, e.g. "$1,234,567"
value_re = rf'\${number}'

# Multiplier for each scale word
_SCALE_VALUES = {"thousand": 1000, "million": 1000000, "billion": 1000000000}


def word_to_value(word):
    """Return the multiplier for the lowercase scale word *word*.

    Raises ``KeyError`` for anything other than thousand/million/billion.
    """
    return _SCALE_VALUES[word]


def parse_word_syntax(string):
    """Parse text such as ``'$2.6 million'`` into a float (``2600000.0``).

    The first number in *string* is multiplied by the value of the first
    scale word; *string* must contain both.
    """
    value = parse_value_syntax(string)
    word = re.search(amounts, string, flags=re.I).group().lower()
    return value * word_to_value(word)


def parse_value_syntax(string):
    """Parse the first number in *string* (e.g. ``'$1,234,567'``) into a float."""
    value_string = re.search(number, string).group()
    return float(value_string.replace(',', ''))


def convert_string_money_to_decimal(money):
    """Convert a scraped money value to a float number of dollars.

    *money* is a string or a list of strings (only the first entry is used).
    Returns ``None`` for the ``'N/A'`` placeholder, an empty list, a
    non-string value, or text containing no ``$`` amount.
    """
    if isinstance(money, list):
        money = money[0] if money else None
    if not isinstance(money, str) or money == 'N/A':
        return None

    # Try the "$X million" form first: the plain "$X" pattern would also
    # match it but would lose the scale word.
    word_syntax = re.search(word_re, money, flags=re.I)
    if word_syntax:
        return parse_word_syntax(word_syntax.group())

    value_syntax = re.search(value_re, money)
    if value_syntax:
        return parse_value_syntax(value_syntax.group())

    return None


In [None]:
# Add new entries `Budget (float)` and `Box office (float)` to every movie:
# the parsed dollar amount, or None when it is missing or cannot be parsed
for movie in movie_info_list:
    movie['Budget (float)'] = convert_string_money_to_decimal(movie.get('Budget', 'N/A'))
    movie['Box office (float)'] = convert_string_money_to_decimal(movie.get('Box office', 'N/A'))

In [None]:
# Inspect one record to check the new 'Budget (float)' / 'Box office (float)' entries
movie_info_list[40]

##### Convert Date into datetimes

In [None]:
# List every raw Release date value ('N/A' when a movie has none)
[movie.get('Release date', 'N/A') for movie in movie_info_list]

In [None]:
from datetime import datetime

# Raw release dates collected for inspection (the functions below do not read this list)
dates = [movie.get('Release date', 'N/A') for movie in movie_info_list]

def clean_date(date):
    """Return the part of *date* before the first ``'('``, stripped of whitespace."""
    return date.split('(')[0].strip()


def convert_string_date_to_datetime(date):
    """Convert a scraped release date to a ``datetime``.

    *date* is a string or a list of strings (only the first entry is used).
    Any parenthesised suffix such as ``'(US)'`` is dropped before parsing.
    Returns ``None`` for the ``'N/A'`` placeholder, an empty list, a
    non-string value, or text matching none of the supported formats.
    """
    if isinstance(date, list):
        date = date[0] if date else None
    if not isinstance(date, str) or date == 'N/A':
        return None

    date_string = clean_date(date)

    # Month-first with and without a comma, plus the day-first form
    frmts = ['%B %d, %Y', '%B %d %Y', '%d %B %Y']
    for frmt in frmts:
        try:
            return datetime.strptime(date_string, frmt)
        except ValueError:
            # Not this format; try the next one. Only ValueError is caught so
            # that unrelated errors are not silently swallowed.
            continue

    return None

In [None]:
# Add a new entry `Release date (datetime)` to every movie: the parsed
# datetime, or None when the date is missing or in an unsupported format
for movie in movie_info_list:
    movie['Release date (datetime)'] = convert_string_date_to_datetime(movie.get('Release date', 'N/A'))

##### Saving as `pickle`

In [None]:
# Inspect one record to check its new 'Release date (datetime)' entry
movie_info_list[-40]

In [None]:
import pickle
def save_data_pickle(name, data):
    """Pickle *data* into the binary file *name* (an existing file is replaced)."""
    with open(name, mode='wb') as pickle_file:
        pickle.Pickler(pickle_file).dump(data)
        
def load_data_pickle(name):
    """Return the object unpickled from the binary file *name*.

    Unpickling can execute arbitrary code, so only load files you created
    yourself.
    """
    with open(name, mode='rb') as pickle_file:
        restored = pickle.Unpickler(pickle_file).load()
    return restored

In [None]:
# Save the cleaned list with pickle (the records now hold datetime objects,
# which pickle stores as-is)
save_data_pickle('data/disney_movie_data_cleaned.pickle', movie_info_list)

In [None]:
# Round-trip check: the reloaded data should compare equal to the in-memory list
a = load_data_pickle('data/disney_movie_data_cleaned.pickle')
a == movie_info_list

### Append IMDB/Rotten Tomatoes/Metascore 

In [None]:
# Start this section from the cleaned data saved on disk
movie_info_list = load_data_pickle('data/disney_movie_data_cleaned.pickle')

In [None]:
import requests
import urllib

def get_omdb_info(title, apikey=None):
    """Look up *title* in the OMDb API and return the decoded JSON response.

    *title* is sent as the ``t`` query parameter. *apikey* is the OMDb API
    key; when omitted, the ``OMDB_API_KEY`` environment variable is used if
    set, otherwise the key that was originally hard-coded here.

    Raises ``requests`` exceptions on network errors or timeouts.
    """
    import os  # local import: `os` is not imported at notebook level
    # Imported explicitly: a bare `import urllib` does not guarantee that the
    # `urllib.parse` submodule is loaded.
    from urllib.parse import urlencode

    # Request shape: http://www.omdbapi.com/?t=<title>&apikey=<yourkey>
    base_url = 'http://www.omdbapi.com/?'

    # NOTE(review): the fallback key belongs to the original author and is on
    # the free tier (limited to 1000 API calls a day, per the author's note).
    # Pass your own key or set OMDB_API_KEY.
    apikey = apikey or os.environ.get('OMDB_API_KEY') or '50d7e542'
    parameters = {'t': title, 'apikey': apikey}
    full_url = base_url + urlencode(parameters)
    #print(full_url)
    # Timeout: do not hang forever on a stalled connection
    return requests.get(full_url, timeout=30).json()

def get_rotten_tomatos_score(omdb_info):
    """Return the Rotten Tomatoes value from an OMDb response dict.

    The ``'Ratings'`` list of *omdb_info* is scanned in order and the
    ``'Value'`` of the first entry whose ``'Source'`` is ``'Rotten Tomatoes'``
    is returned. ``None`` is returned when there is no such entry or no
    ``'Ratings'`` key.
    """
    return next(
        (
            entry['Value']
            for entry in omdb_info.get('Ratings', [])
            if entry['Source'] == 'Rotten Tomatoes'
        ),
        None,
    )

# Quick manual check against the live API (performs an HTTP request)
info = get_omdb_info('Beauty and the Beast')
get_rotten_tomatos_score(info)

In [None]:
# Attach the OMDb ratings to every movie record (one API call per movie)
for movie in movie_info_list:
    title = movie['title']
    omdb_info = get_omdb_info(title)
    movie['imdb_rating'] = omdb_info.get('imdbRating', None)
    movie['imdb_votes'] = omdb_info.get('imdbVotes', None)
    movie['imdb_id'] = omdb_info.get('imdbID', None)
    # OMDb reports the Metacritic score under 'Metascore'; reading
    # 'imdbRating' here would just duplicate the IMDb rating.
    movie['metascore'] = omdb_info.get('Metascore', None)
    movie['rotten_tomatoes'] = get_rotten_tomatos_score(omdb_info)

In [None]:
# Check that it worked: inspect one record for the new rating entries
movie_info_list[152]

In [None]:
# Save the list, now including the rating entries, with pickle
save_data_pickle('data/disney_movie_data_with_ratings.pickle', movie_info_list)

##### Making a copy of movie infos to save as `json`

In [None]:
# Shallow-copy each record so that replacing entries in the copies below
# leaves the dicts in movie_info_list untouched
movie_info_copy = [movie.copy() for movie in movie_info_list]

In [None]:
# Inspect one copied record before its date is converted to text
movie_info_copy[50]

In [None]:
# Replace each release datetime in the copies by its 'Month DD, YYYY' text
# (None when there is no date) ahead of saving the copies as JSON
for movie in movie_info_copy:
    current_date = movie['Release date (datetime)']
    movie['Release date (datetime)'] = current_date.strftime('%B %d, %Y') if current_date else None

In [None]:
# Inspect the same record again to check that the date is now a string
movie_info_copy[50]

In [None]:
# Save the copies (dates as strings) as JSON
save_data('data/disney_movie_data_with_ratings.json', movie_info_copy)

##### Saving as `csv`

In [None]:
import pandas as pd

# Build a DataFrame from the list of movie dicts and preview the first rows
df = pd.DataFrame(movie_info_list)
df.head()

In [None]:
# Export to CSV (by default pandas also writes the row index as the first column)
df.to_csv('data/disney_movie_data_with_ratings.csv')