# Walt Disney Pictures Films Data Set Creation
**Wikipedia:** [List of Walt Disney Pictures films](https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films)


#### Import necessary packages
- BeautifulSoup for pulling data out of HTML and XML files

In [1]:
from bs4 import BeautifulSoup as bs
import requests

### 1. Messing around with one movie and retrieving all the information from Wikipedia (movie info box)

In [32]:
# Load the web page.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error response (4xx/5xx) into an exception
# instead of letting an error page be parsed as if it were the film article.
p = requests.get('https://en.wikipedia.org/wiki/The_Lion_King_(2019_film)', timeout=30)
p.raise_for_status()

In [34]:
# Convert into a BeautifulSoup object.
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed (and warns), so the parsed tree could differ
# from one machine to another.
soup = bs(p.content, 'html.parser')

# Display the content
contents = soup.prettify()
#print(contents)

In [39]:
# Finding the Info box table from the wiki page.
# A space-separated string passed to class_ is matched against the exact value
# of the class attribute ("infobox vevent"), per the Beautiful Soup docs on
# searching by CSS class. find() returns None when nothing matches, in which
# case the next line raises AttributeError.
info_box = soup.find(class_='infobox vevent')
# Every <tr> found inside the info box, in document order.
info_rows = info_box.find_all('tr')

# Uncomment to inspect the raw HTML of each row:
#for row in info_rows:
    #print(row.prettify())

In [40]:
# Turn one info box cell into plain text (or a list of texts for bulleted cells)
def get_content_list(row_data):
    """Return the text held by an info box cell.

    Args:
        row_data: A parsed element offering ``find``, ``find_all`` and
            ``get_text`` (a BeautifulSoup tag in this notebook).

    Returns:
        A list with one string per ``<li>`` when the cell contains list
        items, otherwise a single string with the cell's whole text. In both
        cases text fragments are joined with a space, stripped, and
        non-breaking spaces are replaced by regular spaces.
    """
    def _clean_text(node):
        # Same extraction for a list item and for the whole cell.
        return node.get_text(' ', strip=True).replace('\xa0', ' ')

    if not row_data.find('li'):
        return _clean_text(row_data)
    return [_clean_text(item) for item in row_data.find_all('li')]


In [41]:
# Build a dict from the info box rows: row 0 is the film title, row 1 is
# skipped, and every later row maps its <th> label to its <td> content.
movie_info = {}

for position, info_row in enumerate(info_rows):
    if position == 1:
        continue
    heading = info_row.find('th').get_text(' ', strip=True)
    if position == 0:
        movie_info['title'] = heading
    else:
        movie_info[heading] = get_content_list(info_row.find('td'))
movie_info

{'title': 'The Lion King',
 'Directed by': 'Jon Favreau',
 'Produced by': ['Jon Favreau', 'Jeffrey Silver', 'Karen Gilchrist'],
 'Screenplay by': 'Jeff Nathanson',
 'Based on': ["Disney 's The Lion King by Irene Mecchi Jonathan Roberts Linda Woolverton [1]"],
 'Starring': ['Donald Glover',
  'Seth Rogen',
  'Chiwetel Ejiofor',
  'Alfre Woodard',
  'Billy Eichner',
  'John Kani',
  'John Oliver',
  'Beyoncé Knowles-Carter',
  'James Earl Jones'],
 'Music by': 'Hans Zimmer',
 'Cinematography': 'Caleb Deschanel',
 'Edited by': ['Mark Livolsi', 'Adam Gerstel'],
 'Production company': ['Walt Disney Pictures', 'Fairview Entertainment'],
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Release date': ['July 9, 2019 ( 2019-07-09 ) ( Hollywood )',
  'July 19, 2019 ( 2019-07-19 ) (United States)'],
 'Running time': '118 minutes [2]',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$250–260 million [3] [4]',
 'Box office': '$1.657 billion [5]'}

### 2. Getting all the movies url and title

##### Background on the robots exclusion standard (robots.txt) can be found [here](https://en.wikipedia.org/wiki/Robots_exclusion_standard); Wikipedia's own rules are published at https://en.wikipedia.org/robots.txt


In [43]:
# Load the movie list page from the wiki.
# The timeout avoids hanging on a stalled connection; raise_for_status() makes
# an HTTP error response fail here rather than being parsed as the list page.
m = requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films', timeout=30)
m.raise_for_status()

# Name the parser explicitly so the parsed tree does not depend on which
# optional parsers are installed.
movies_soup = bs(m.content, 'html.parser')

# Display the content
movies_contents = movies_soup.prettify()
#print(movies_contents)

In [56]:
# Select every <i> element nested inside an element that carries both the
# "wikitable" and "sortable" classes.
movies = movies_soup.select('.wikitable.sortable i')
# .a is the first <a> inside the <i>; its href is a site-relative path and its
# title attribute holds the article title (see the printed output below).
print(movies[0].a['href'])
print(movies[0].a['title'])
# Preview the first five matches
movies[0:5]

/wiki/Academy_Award_Review_of_Walt_Disney_Cartoons
Academy Award Review of Walt Disney Cartoons


[<i><a href="/wiki/Academy_Award_Review_of_Walt_Disney_Cartoons" title="Academy Award Review of Walt Disney Cartoons">Academy Award Review of Walt Disney Cartoons</a></i>,
 <i><a href="/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)" title="Snow White and the Seven Dwarfs (1937 film)">Snow White and the Seven Dwarfs</a></i>,
 <i><a href="/wiki/Pinocchio_(1940_film)" title="Pinocchio (1940 film)">Pinocchio</a></i>,
 <i><a href="/wiki/Fantasia_(1940_film)" title="Fantasia (1940 film)">Fantasia</a></i>,
 <i><a href="/wiki/The_Reluctant_Dragon_(1941_film)" title="The Reluctant Dragon (1941 film)">The Reluctant Dragon</a></i>]

In [14]:
# Get the list of content from the info box table
def get_content_list(row_data):
    """Return the text held by an info box cell.

    Args:
        row_data: A parsed element offering ``find``, ``find_all``,
            ``get_text`` and ``stripped_strings`` (a BeautifulSoup tag in
            this notebook).

    Returns:
        * a list with one string per ``<li>`` when the cell contains list
          items;
        * otherwise, a list with one string per text fragment when the cell
          contains a ``<br>``;
        * otherwise a single string with the cell's whole text.

        Non-breaking spaces are replaced by regular spaces in every branch,
        so values have the same shape however the cell was marked up.
    """
    if row_data.find('li'):
        return [li.get_text(' ', strip=True).replace('\xa0', ' ') for li in row_data.find_all('li')]
    elif row_data.find('br'):
        # stripped_strings only trims the ends of each fragment, so
        # non-breaking spaces inside a fragment are normalised here, as in
        # the other two branches.
        return [text.replace('\xa0', ' ') for text in row_data.stripped_strings]
    else:
        return row_data.get_text(' ', strip=True).replace('\xa0', ' ')
    
# Remove reference markers (e.g. [1]) and other inline wrappers from the page
def clean_tags(soup):
    """Delete every ``<sup>`` and ``<span>`` element from ``soup`` in place.

    Args:
        soup: A parsed document or element offering ``find_all``; each
            matched element is removed through its ``decompose`` method.

    Returns:
        None. The tree passed in is modified.
    """
    unwanted = soup.find_all(['sup', 'span'])
    for element in unwanted:
        element.decompose()

In [15]:
# Extract the data from the info box table
def get_info_box(url):
    """Download a film article and return its info box as a dict.

    Args:
        url: Absolute URL of the film's Wikipedia page.

    Returns:
        dict: ``'title'`` taken from the first info box row, plus one entry
        per later row that has both a ``<th>`` label and a ``<td>`` value.
        Values are whatever ``get_content_list`` returns for the ``<td>``.

    Raises:
        requests.RequestException: The request failed, timed out, or the
            server answered with an HTTP error status.
        ValueError: The page has no ``infobox vevent`` element, or the first
            info box row has no ``<th>`` to take the title from.
    """
    # Timeout: never hang the whole scrape on one stalled connection.
    m = requests.get(url, timeout=30)
    # Fail on 4xx/5xx instead of parsing an error page.
    m.raise_for_status()

    # Explicit parser so the tree does not depend on installed extras.
    movies_soup = bs(m.content, 'html.parser')

    info_box = movies_soup.find(class_='infobox vevent')
    if info_box is None:
        raise ValueError('No film info box found at {}'.format(url))
    info_rows = info_box.find_all('tr')

    # The rows are part of movies_soup, so removing <sup>/<span> from the
    # whole document also strips them from the rows collected above.
    clean_tags(movies_soup)

    movie_info = dict()

    for index, row in enumerate(info_rows):
        header = row.find('th')
        if index == 0:
            if header is None:
                raise ValueError('Info box at {} has no title row'.format(url))
            movie_info['title'] = header.get_text(' ', strip=True)
            continue

        cell = row.find('td')
        # Rows without a label (e.g. the poster row) or without a value carry
        # no key/value pair; skip them rather than failing the whole movie.
        if header is None or cell is None:
            continue

        info_key = header.get_text(' ', strip=True)
        movie_info[info_key] = get_content_list(cell)
    return movie_info

In [57]:
# Test Movie: run get_info_box on a single film page and display the result
get_info_box('https://en.wikipedia.org/wiki/One_Little_Indian_(film)')

{'title': 'One Little Indian',
 'Directed by': 'Bernard McEveety',
 'Produced by': 'Winston Hibler',
 'Written by': 'Harry Spalding',
 'Starring': ['James Garner',
  'Vera Miles',
  'Pat Hingle',
  'Morgan Woodward',
  'Jodie Foster'],
 'Music by': 'Jerry Goldsmith',
 'Cinematography': 'Charles F. Wheeler',
 'Edited by': 'Robert Stafford',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['June 20, 1973'],
 'Running time': '90 Minutes',
 'Country': 'United States',
 'Language': 'English',
 'Box office': '$2 million'}

In [None]:
# Scrape the info box of every film linked from the list page.
# Timeout and status check: fail fast on a stalled or failed request.
m = requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films', timeout=30)
m.raise_for_status()

# Explicit parser so the result does not depend on installed extras.
movies_soup = bs(m.content, 'html.parser')
# Every link written in italics inside a sortable wikitable.
movies = movies_soup.select('.wikitable.sortable i a')
base_path = 'https://en.wikipedia.org'
movie_info_list = list()

for index, movie in enumerate(movies):
    # Progress indicator: one line every ten links instead of a blank line
    # per link.
    if index % 10 == 0:
        print(index)

    try:
        relative_path = movie['href']
        path = base_path + relative_path
        # NOTE(review): title is not used below, but the lookup raises
        # KeyError for anchors lacking a title attribute, which sends them to
        # the except branch. Kept so the set of scraped links is unchanged.
        title = movie['title']

        movie_info_list.append(get_info_box(path))

    except Exception as e:
        # Best effort: report the link that failed and carry on with the rest.
        print(movie.get_text())
        print(e)

movie_info_list

In [58]:
# Number of movies collected in movie_info_list
len(movie_info_list)

437

### 3. Save and Load the movie dataset as JSON

In [59]:
import json

def save_data(title, data):
    """Write ``data`` to the file ``title`` as indented UTF-8 JSON.

    Args:
        title: Path of the output file. Missing parent directories are
            created, so saving into a fresh ``data/`` folder works.
        data: Any JSON-serialisable object.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) rather
    than as ``\\uXXXX`` escapes.
    """
    import os

    parent = os.path.dirname(title)
    if parent:
        # open() does not create directories; do it here so the first save
        # does not fail with FileNotFoundError.
        os.makedirs(parent, exist_ok=True)

    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [60]:
def load_data(title):
    """Read the UTF-8 JSON file at ``title`` and return the decoded object.

    Args:
        title: Path of the JSON file to read.

    Returns:
        The Python object produced by decoding the file's JSON content.
    """
    with open(title, encoding='utf-8') as source:
        raw_text = source.read()
    return json.loads(raw_text)

In [61]:
# Save the scraped movie list as JSON
save_data('data/disney_movie_data.json', movie_info_list)

In [62]:
# Load the data back from the JSON file, replacing the in-memory list
movie_info_list = load_data('data/disney_movie_data.json')

### 4. Cleaning the data

##### Convert Running time string into int

In [None]:
# List every movie's raw 'Running time' value ('N/A' where the key is missing)
[film.get('Running time', 'N/A') for film in movie_info_list]

In [63]:
# Convert the running time string to int
def convert_minutes_to_int(running_time):
    """Extract the running time in minutes as an int.

    Args:
        running_time: The scraped 'Running time' value: a string such as
            ``'118 minutes'``, a list of such strings (only the first entry
            is used), ``'N/A'``, or ``None``.

    Returns:
        The first whole number found in the text, e.g. ``118`` for
        ``'118 minutes'`` and ``83`` for ``'83.5 minutes'``. Returns ``None``
        for ``'N/A'``, ``None``, an empty list, or text without any digits.
    """
    import re

    if running_time is None or running_time == 'N/A':
        return None
    if isinstance(running_time, list):
        if not running_time:
            return None
        running_time = running_time[0]

    # Search for the first run of digits instead of converting the first
    # space-separated token, which fails on values like '83.5 minutes' or
    # 'approx. 90 minutes'.
    match = re.search(r'\d+', running_time)
    if match is None:
        return None
    return int(match.group())

# Store the converted running time on every movie under the key
# `Running time (int)`; movies without a 'Running time' entry are passed to
# the converter as 'N/A'.
for film in movie_info_list:
    raw_running_time = film.get('Running time', 'N/A')
    film['Running time (int)'] = convert_minutes_to_int(raw_running_time)

In [64]:
# Inspect the last movie, which has no 'Running time' entry, to confirm that
# its 'Running time (int)' value is None
movie_info_list[-1]

{'title': 'The Beatles: Get Back',
 'Directed by': 'Peter Jackson',
 'Produced by': ['Peter Jackson', 'Clare Olssen', 'Jonathan Clyde'],
 'Starring': 'The Beatles',
 'Music by': 'The Beatles',
 'Edited by': 'Jabez Olssen',
 'Production companies': ['Walt Disney Pictures',
  'Apple Corps',
  'WingNut Films'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release date': ['27 August 2021'],
 'Country': ['United Kingdom', 'New Zealand', 'United States'],
 'Language': 'English',
 'Running time (int)': None}

##### Convert Budget and Box office from string to a number (e.g. `$2.6 million` → `2600000`)

In [None]:
# List every movie's raw 'Budget' value ('N/A' where the key is missing)
[film.get('Budget', 'N/A') for film in movie_info_list]