### Disney Dataset Creation (w/ Python BeautifulSoup)
Scrape & clean a list of disney wikipedia pages to create a dataset to further analyze

### Task #1: Get Info Box (store in Python dictionary)

#### Import Necessary Libraries




In [1]:
from bs4 import BeautifulSoup as bs
import requests

#### Load the webpage

In [2]:
r = requests.get("https://en.wikipedia.org/wiki/Toy_Story_3")

# Convert to a beautiful soup object
soup = bs(r.content)

# Print out the HTML
contents = soup.prettify()
print(contents)

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Toy Story 3 - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"8134b609-c11f-4d53-9620-e13ce33fe62f","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Toy_Story_3","wgTitle":"Toy Story 3","wgCurRevisionId":979102428,"wgRevisionId":979102428,"wgArticleId":1213838,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Wikipedia indefinitely semi-protected pages","Articles with short description","Short description matches Wikidata","Good articles","Use American English fro

In [4]:
info_box = soup.find(class_="infobox vevent")
info_rows = info_box.find_all("tr")
for row in info_rows:
    print(row.prettify())

<tr>
 <th class="summary" colspan="2" style="text-align:center;font-size:125%;font-weight:bold;font-size:110%;font-style:italic;">
  Toy Story 3
 </th>
</tr>

<tr>
 <td colspan="2" style="text-align:center">
  <a class="image" href="/wiki/File:Toy_Story_3_poster.jpg" title="All of the toys packed close together, holding up a large numeral '3', with Buzz, who is putting a friendly arm around Woody's shoulder, and Woody holding the top of the 3.">
   <img alt="All of the toys packed close together, holding up a large numeral '3', with Buzz, who is putting a friendly arm around Woody's shoulder, and Woody holding the top of the 3." class="thumbborder" data-file-height="326" data-file-width="220" decoding="async" height="326" src="//upload.wikimedia.org/wikipedia/en/6/69/Toy_Story_3_poster.jpg" width="220"/>
  </a>
  <div style="font-size:95%;padding:0.35em 0.35em 0.25em;line-height:1.25em;">
   Theatrical release poster
  </div>
 </td>
</tr>

<tr>
 <th scope="row" style="white-space:nowra

In [5]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")

movie_info = {}
for index, row in enumerate(info_rows):
    if index == 0:
        movie_info['title'] = row.find("th").get_text(" ", strip=True)
    elif index == 1:
        continue
    else:
        content_key = row.find("th").get_text(" ", strip=True)
        content_value = get_content_value(row.find("td"))
        movie_info[content_key] = content_value
    
movie_info

{'title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Produced by': 'Darla K. Anderson',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Blake Clark',
  'Jeff Pidgeon',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Music by': 'Randy Newman',
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Production company': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Release date': ['June 12, 2010 ( 2010-06-12 ) ( Taormina Film Fest )',
  'June 18, 2010 ( 2010-06-18 ) (United States)'],
 'Running time': '103 minutes [1]',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200 million [1]',
 'Box office': '$1.067 billion [1]'}

### Task #2: Get info box for all movies

In [6]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")

# Convert to a beautiful soup object
soup = bs(r.content)

# Print out the HTML
contents = soup.prettify()
print(contents)

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of Walt Disney Pictures films - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"96cb4718-b42e-4c1f-a2fc-a87df30b684b","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_Walt_Disney_Pictures_films","wgTitle":"List of Walt Disney Pictures films","wgCurRevisionId":979108754,"wgRevisionId":979108754,"wgArticleId":1970335,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Webarchive template wayback links","CS1 maint: archived copy as title","Articles with sh

In [7]:
movies = soup.select(".wikitable.sortable i")
movies[0:10]

[<i><a href="/wiki/Academy_Award_Review_of_Walt_Disney_Cartoons" title="Academy Award Review of Walt Disney Cartoons">Academy Award Review of Walt Disney Cartoons</a></i>,
 <i><a href="/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)" title="Snow White and the Seven Dwarfs (1937 film)">Snow White and the Seven Dwarfs</a></i>,
 <i><a href="/wiki/Pinocchio_(1940_film)" title="Pinocchio (1940 film)">Pinocchio</a></i>,
 <i><a href="/wiki/Fantasia_(1940_film)" title="Fantasia (1940 film)">Fantasia</a></i>,
 <i><a href="/wiki/The_Reluctant_Dragon_(1941_film)" title="The Reluctant Dragon (1941 film)">The Reluctant Dragon</a></i>,
 <i><a href="/wiki/Dumbo" title="Dumbo">Dumbo</a></i>,
 <i><a href="/wiki/Bambi" title="Bambi">Bambi</a></i>,
 <i><a href="/wiki/Saludos_Amigos" title="Saludos Amigos">Saludos Amigos</a></i>,
 <i><a href="/wiki/Victory_Through_Air_Power_(film)" title="Victory Through Air Power (film)">Victory Through Air Power</a></i>,
 <i><a href="/wiki/The_Three_Caballeros" title=

In [52]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    elif row_data.find("br"):
        return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")

def clean_tags(soup):
    for tag in soup.find_all(["sup", "span"]):
        tag.decompose()
        
def get_info_box(url):

    r = requests.get(url)
    soup = bs(r.content)
    info_box = soup.find(class_="infobox vevent")
    info_rows = info_box.find_all("tr")
    
    clean_tags(soup)

    movie_info = {}
    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['title'] = row.find("th").get_text(" ", strip=True)
        else:
            header = row.find('th')
            if header:
                content_key = row.find("th").get_text(" ", strip=True)
                content_value = get_content_value(row.find("td"))
                movie_info[content_key] = content_value
            
    return movie_info    

In [51]:
get_info_box("https://en.wikipedia.org/wiki/One_Little_Indian_(film)")

{'title': 'One Little Indian',
 'Directed by': 'Bernard McEveety',
 'Produced by': 'Winston Hibler',
 'Written by': 'Harry Spalding',
 'Starring': ['James Garner',
  'Vera Miles',
  'Pat Hingle',
  'Morgan Woodward',
  'Jodie Foster'],
 'Music by': 'Jerry Goldsmith',
 'Cinematography': 'Charles F. Wheeler',
 'Edited by': 'Robert Stafford',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['June 20, 1973'],
 'Running time': '90 Minutes',
 'Country': 'United States',
 'Language': 'English',
 'Box office': '$2 million'}

In [53]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
soup = bs(r.content)
movies = soup.select(".wikitable.sortable i a")

base_path = "https://en.wikipedia.org/"

movie_info_list = []
for index, movie in enumerate(movies):
    if index % 10 == 0:
        print(index)
    try:
        relative_path = movie['href']
        full_path = base_path + relative_path
        title = movie['title']
        
        movie_info_list.append(get_info_box(full_path))
        
    except Exception as e:
        print(movie.get_text())
        print(e)
    

0
10
20
30
40
Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
50
60
70
80
90
100
110
120
True-Life Adventures
'NoneType' object has no attribute 'find_all'
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430


In [54]:
len(movie_info_list)

432

#### Save/Reload Movie Data

In [39]:
import json

def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [4]:
import json

def load_data(title):
    with open(title, encoding="utf-8") as f:
        return json.load(f)

In [56]:
save_data("disney_data_cleaned.json", movie_info_list)

### Task #3: Clean our data!

In [2]:
movie_info_list = load_data("disney_data_cleaned.json")

#### Subtasks
- ~~Clean up references [1]~~
- ~~Convert running time into an integer~~
- Convert dates into datetime object
- ~~Split up the long strings~~
- ~~Convert Budget & Box office to numbers~~

In [22]:
movie_info_list[-40]

{'title': 'Beauty and the Beast',
 'Directed by': 'Bill Condon',
 'Produced by': ['David Hoberman', 'Todd Lieberman'],
 'Screenplay by': ['Stephen Chbosky', 'Evan Spiliotopoulos'],
 'Based on': ["Disney 's Beauty and the Beast by Linda Woolverton",
  'Beauty and the Beast by Jeanne-Marie Leprince de Beaumont'],
 'Starring': ['Emma Watson',
  'Dan Stevens',
  'Luke Evans',
  'Kevin Kline',
  'Josh Gad',
  'Ewan McGregor',
  'Stanley Tucci',
  'Audra McDonald',
  'Gugu Mbatha-Raw',
  'Ian McKellen',
  'Emma Thompson'],
 'Music by': 'Alan Menken',
 'Cinematography': 'Tobias A. Schliessler',
 'Edited by': 'Virginia Katz',
 'Production company': ['Walt Disney Pictures', 'Mandeville Films'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release date': ['February 23, 2017 ( Spencer House )',
  'March 17, 2017 (United States)'],
 'Running time': '129 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$160–255 million',
 'Box office': '$1.264 billion',

In [5]:
print([movie.get('Running time', 'N/A') for movie in movie_info_list])


['41 minutes (74 minutes 1966 release)',
 '83 minutes',
 '88 minutes',
 '126 minutes',
 '74 minutes',
 '64 minutes',
 '70 minutes',
 '42 minutes',
 '65 min.',
 '71 minutes',
 '75 minutes',
 '94 minutes',
 '73 minutes',
 '75 minutes',
 '82 minutes',
 '68 minutes',
 '74 minutes',
 '96 minutes',
 '75 minutes',
 '84 minutes',
 '77 minutes',
 '92 minutes',
 '69 minutes',
 '81 minutes',
 ['60 minutes (VHS version)', '71 minutes (original)'],
 '127 minutes',
 '92 minutes',
 '76 minutes',
 '75 minutes',
 '73 minutes',
 '85 minutes',
 '81 minutes',
 '70 minutes',
 '90 min.',
 '80 minutes',
 '75 minutes',
 '83 minutes',
 '83 minutes',
 '72 minutes',
 '97 minutes',
 '75 minutes',
 '104 minutes',
 '93 minutes',
 '105 minutes',
 '95 minutes',
 '97 minutes',
 '134 minutes',
 '69 minutes',
 '92 minutes',
 '126 minutes',
 '79 minutes',
 '97 minutes',
 '128 minutes',
 '74 minutes',
 '91 minutes',
 '105 minutes',
 '98 minutes',
 '130 minutes',
 '89 min.',
 '93 minutes',
 '67 minutes',
 '98 minutes',
 '1

In [13]:
# "85 minutes"
def minutes_to_integer(running_time):
    if running_time == "N/A":
        return None
    
    if isinstance(running_time, list):
        return int(running_time[0].split(" ")[0])
    else: # is a string
        return int(running_time.split(" ")[0])

for movie in movie_info_list:
    movie['Running time (int)'] = minutes_to_integer(movie.get('Running time', "N/A"))



In [15]:
print([movie.get('Running time (int)', 'N/A') for movie in movie_info_list])


[41, 83, 88, 126, 74, 64, 70, 42, 65, 71, 75, 94, 73, 75, 82, 68, 74, 96, 75, 84, 77, 92, 69, 81, 60, 127, 92, 76, 75, 73, 85, 81, 70, 90, 80, 75, 83, 83, 72, 97, 75, 104, 93, 105, 95, 97, 134, 69, 92, 126, 79, 97, 128, 74, 91, 105, 98, 130, 89, 93, 67, 98, 100, 118, 103, 110, 80, 79, 91, 91, 97, 118, 139, 92, 131, 87, 116, 93, 110, 110, 131, 101, 108, 84, 78, 75, 164, 106, 110, 99, 113, 108, 112, 93, 91, 93, 100, 100, 79, 96, 113, 89, 118, 92, 88, 92, 87, 93, 93, 93, 90, 83, 96, 88, 89, 91, 93, 92, 97, 100, 100, 89, 91, 112, 115, 95, 91, 95, 104, 74, 48, 77, 104, 128, 101, 94, 104, 90, 100, 88, 93, 98, 100, 112, 84, 98, 97, 114, 96, 100, 109, 83, 90, 107, 96, 103, 91, 95, 105, 113, 80, 101, 89, 74, 90, 89, 110, 74, 93, 84, 83, 69, 77, 107, 93, 88, 108, 84, 121, 89, 104, 90, 86, 84, 108, 107, 96, 98, 105, 108, 94, 106, 102, 88, 102, 102, 97, 111, 100, 96, 98, 78, 81, 108, 89, 99, 89, 81, 92, 100, 89, 79, 91, 101, 104, 103, 86, 105, 93, 92, 98, 95, 93, 87, 93, 87, 128, 86, 95, 114, 93, 

In [44]:
print([movie.get('Budget', 'N/A') for movie in movie_info_list])

['N/A', '$1.49 million', '$2.6 million', '$2.28 million', '$600,000', '$950,000', '$858,000', 'N/A', '$788,000', 'N/A', '$1.35 million', '$2.125 million', 'N/A', '$1.5 million', '$1.5 million', 'N/A', '$2.9 million', '$1,800,000', '$3 million', 'N/A', '$4 million', '$2 million', '$300,000', '$1.8 million', 'N/A', '$5 million', 'N/A', '$4 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$700,000', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$6 million', 'under $1 million or $1,250,000', 'N/A', '$2 million', 'N/A', 'N/A', '$2.5 million', 'N/A', 'N/A', '$4 million', '$3.6 million', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', '$4.4–6 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', 'N/A', 'N/A', '

In [40]:
import re

amounts = r"thousand|million|billion"
number = r"\d+(,\d{3})*\.*\d*"

word_re = rf"\${number}(-|\sto\s|–)?({number})?\s({amounts})"
value_re = rf"\${number}"

def word_to_value(word):
    value_dict = {"thousand": 1000, "million": 1000000, "billion": 1000000000}
    return value_dict[word]

def parse_word_syntax(string):
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    word = re.search(amounts, string, flags=re.I).group().lower()
    word_value = word_to_value(word)
    return value*word_value

def parse_value_syntax(string):
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    return value

'''
money_conversion("$12.2 million") --> 12200000 ## Word syntax
money_conversion("$790,000") --> 790000        ## Value syntax
'''
def money_conversion(money):
    if money == "N/A":
        return None

    if isinstance(money, list):
        money = money[0]
        
    word_syntax = re.search(word_re, money, flags=re.I)
    value_syntax = re.search(value_re, money)

    if word_syntax:
        return parse_word_syntax(word_syntax.group())

    elif value_syntax:
        return parse_value_syntax(value_syntax.group())

    else:
        return None

In [38]:
for movie in movie_info_list:
    movie['Budget (float)'] = money_conversion(movie.get('Budget', "N/A"))
    movie['Box office (float)'] = money_conversion(movie.get('Box office', "N/A"))

$1.49 million
HERE
$418 million
HERE
$2.6 million
HERE
$164 million
HERE
$2.28 million
HERE
$76.4–$83.3 million
HERE
$600,000
$960,000 (worldwide rentals)
$950,000
$1.3 million (est. United States/Canada rentals, 1941)
HERE
$858,000
$267.4 million
HERE
$1,135,000 (worldwide rentals)
$788,000
$799,000
$3,355,000 (worldwide rentals)
$1.35 million
HERE
$3,275,000 (worldwide rentals)
$2.125 million
HERE
$65 million
HERE
$3,165,000 (worldwide rentals)
$1.5 million
HERE
$2,560,000 (worldwide rentals)
$1.5 million
HERE
$3.7 million (U.S. rental) $575,000 (foreign rental)
HERE
$1,625,000 (worldwide rentals)
$2.9 million
HERE
$263.6 million
HERE
$1,800,000
$4,100,000 (worldwide rentals)
$3 million
HERE
$5.6 million (US, 1951)
HERE
$2.1 million (US rentals)
HERE
$4 million
HERE
$87.4 million
HERE
$2 million
HERE
$1 million (US)
HERE
$300,000
$2.6 million (US)
HERE
$1.8 million
HERE
$1.75 million (US and Canadian rentals)
HERE
$5 million
HERE
$28.2 million
HERE
$2,150,000 (US)
$4 million
HERE
$18

HERE
1.31 billion
833.5 million
$175 million
HERE
$807.1 million
HERE
$100–130 million
HERE
$133.4 million
HERE
$200 million
HERE
$1.243 billion
HERE
$75 million
HERE
$197.7 million
HERE
$120–133 million
HERE
$174 million
HERE
$175 million
HERE
$529.3 million
HERE
$130 million
HERE
$349.5 million
HERE
$170 million
HERE
$353.3 million
HERE
$7.7
 million
$183 million
HERE

$200 million
HERE
$1.073 billion
HERE
$250–260 million
HERE
$1.657 billion
HERE
$185 million
HERE
$491.7 million
HERE
$60 million
HERE
$150 million
HERE
$1.450 billion
HERE
$40 million
HERE
$42 million
HERE
$175–200 million
HERE
$136.4 million
HERE
$125 million
HERE
$12.5 million (stage production)
HERE
$24 million
HERE
$200 million
HERE
$57 million
HERE
$150 million+
HERE


In [54]:
money_conversion(str(movie_info_list[-40]["Budget"]))

160000000.0

In [57]:
# Convert Dates into datetimes
print([movie.get('Release date', 'N/A') for movie in movie_info_list])

[['May 19, 1937'], ['December 21, 1937 ( Carthay Circle Theatre , Los Angeles , CA )', 'February 4, 1938 (United States)'], ['February 7, 1940 ( Center Theatre )', 'February 23, 1940 (United States)'], ['November 13, 1940'], ['June 20, 1941'], ['October 23, 1941 (New York City)', 'October 31, 1941 (U.S.)'], ['August 9, 1942 (World Premiere-London)', 'August 13, 1942 (Premiere-New York City)', 'August 21, 1942 (U.S.)'], ['August 24, 1942 (World Premiere-Rio de Janeiro)', 'February 6, 1943 (U.S. Premiere-Boston)', 'February 19, 1943 (U.S.)'], ['July 17, 1943'], ['December 21, 1944 (Mexico City)', 'February 3, 1945 (US)'], ['April 20, 1946 (Premiere-New York City)', 'August 15, 1946 (U.S.)'], ['November 12, 1946 (Premiere: Atlanta, Georgia)', 'November 20, 1946'], ['September 27, 1947'], 'May 27, 1948', ['November 29, 1948 (Chicago, Illinois)', 'January 19, 1949 (Indianapolis, Indiana)'], ['October 5, 1949'], ['February 15, 1950 (Boston)', 'March 4, 1950 (United States)'], ['June 22, 1950

In [56]:
movie_info_list[-50]

{'title': 'The Jungle Book',
 'Directed by': 'Jon Favreau',
 'Produced by': ['Jon Favreau', 'Brigham Taylor'],
 'Written by': 'Justin Marks',
 'Based on': ['The Jungle Book', 'by', 'Rudyard Kipling'],
 'Starring': ['Bill Murray',
  'Ben Kingsley',
  'Idris Elba',
  "Lupita Nyong'o",
  'Scarlett Johansson',
  'Giancarlo Esposito',
  'Christopher Walken',
  'Neel Sethi'],
 'Music by': 'John Debney',
 'Cinematography': 'Bill Pope',
 'Edited by': 'Mark Livolsi',
 'Production company': ['Walt Disney Pictures', 'Fairview Entertainment'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release date': ['April 4, 2016 ( El Capitan Theatre )',
  'April 15, 2016 (United States)'],
 'Running time': '106 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$175–177 million',
 'Box office': '$966.6 million',
 'Running time (int)': 106,
 'Budget (float)': 175000000.0,
 'Box office (float)': 966600000.0}

In [61]:
# June 28, 1950
from datetime import datetime

dates = [movie.get('Release date', 'N/A') for movie in movie_info_list]

def clean_date(date):
    return date.split("(")[0].strip()

def date_conversion(date):
    if isinstance(date, list):
        date = date[0]
        
    if date == "N/A":
        return None
        
    date_str = clean_date(date)

    fmts = ["%B %d, %Y", "%d %B %Y"]
    for fmt in fmts:
        try:
            return datetime.strptime(date_str, fmt)
        except:
            pass
    return None


In [62]:
for movie in movie_info_list:
    movie['Release date (datetime)'] = date_conversion(movie.get('Release date', 'N/A'))

In [63]:
movie_info_list[50]

{'title': 'One Hundred and One Dalmatians',
 'Directed by': ['Clyde Geronimi', 'Hamilton Luske', 'Wolfgang Reitherman'],
 'Produced by': 'Walt Disney',
 'Story by': 'Bill Peet',
 'Based on': ['The Hundred and One Dalmatians', 'by', 'Dodie Smith'],
 'Starring': ['Rod Taylor',
  'Cate Bauer',
  'Betty Lou Gerson',
  'Ben Wright',
  'Bill Lee (singing voice)',
  'Lisa Davis',
  'Martha Wentworth'],
 'Music by': 'George Bruns',
 'Edited by': ['Roy M. Brewer, Jr.', 'Donald Halliday'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Pictures Distribution',
 'Release date': ['January 25, 1961'],
 'Running time': '79 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$3.6 million',
 'Box office': '$303 million',
 'Running time (int)': 79,
 'Budget (float)': 3600000.0,
 'Box office (float)': 303000000.0,
 'Release date (datetime)': datetime.datetime(1961, 1, 25, 0, 0)}

In [7]:
import pickle

def save_data_pickle(name, data):
    with open(name, 'wb') as f:
        pickle.dump(data, f)

In [9]:
import pickle

def load_data_pickle(name):
    with open(name, 'rb') as f:
        return pickle.load(f)

In [8]:
save_data_pickle("disney_movie_data_cleaned_more.pickle", movie_info_list)

In [10]:
a = load_data_pickle("disney_movie_data_cleaned_more.pickle")

In [12]:
a == movie_info_list

True

### Task #4: Attach IMDB/Rotten Tomatoes/Metascore scores

In [13]:
movie_info_list = load_data_pickle('disney_movie_data_cleaned_more.pickle')

In [14]:
movie_info_list[-60]

{'title': 'Into the Woods',
 'Directed by': 'Rob Marshall',
 'Produced by': ['Rob Marshall',
  'John DeLuca',
  'Marc Platt',
  'Callum McDougall'],
 'Screenplay by': 'James Lapine',
 'Based on': ['Into the Woods', 'by', 'Stephen Sondheim', 'James Lapine'],
 'Starring': ['Meryl Streep',
  'Emily Blunt',
  'James Corden',
  'Anna Kendrick',
  'Chris Pine',
  'Tracey Ullman',
  'Christine Baranski',
  'Johnny Depp'],
 'Narrated by': 'James Corden',
 'Music by': 'Stephen Sondheim',
 'Cinematography': 'Dion Beebe',
 'Edited by': 'Wyatt Smith',
 'Production company': ['Walt Disney Pictures',
  'Lucamar Productions',
  'Marc Platt Productions'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release date': ['December 8, 2014 ( Ziegfeld Theatre )',
  'December 25, 2014 (United States)'],
 'Running time': '124 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$50 million',
 'Box office': '$213.1 million',
 'Running time (int)': 124,
 'Budget (float)':

In [None]:
# http://www.omdbapi.com/?apikey=[yourkey]&

In [17]:
import requests
import urllib
import os

def get_omdb_info(title):
    base_url = "http://www.omdbapi.com/?"
    parameters = {"apikey": os.environ['OMDB_API_KEY'], 't': title}
    params_encoded = urllib.parse.urlencode(parameters)
    full_url = base_url + params_encoded
    return requests.get(full_url).json()

def get_rotten_tomato_score(omdb_info):
    ratings = omdb_info.get('Ratings', [])
    for rating in ratings:
        if rating['Source'] == 'Rotten Tomatoes':
            return rating['Value']
    return None

get_omdb_info("into the woods")

{'Title': 'Into the Woods',
 'Year': '2014',
 'Rated': 'PG',
 'Released': '25 Dec 2014',
 'Runtime': '125 min',
 'Genre': 'Adventure, Comedy, Drama, Fantasy, Musical',
 'Director': 'Rob Marshall',
 'Writer': 'James Lapine (screenplay by), James Lapine (based on the musical by)',
 'Actors': 'Anna Kendrick, Daniel Huttlestone, James Corden, Emily Blunt',
 'Plot': 'A witch tasks a childless baker and his wife with procuring magical items from classic fairy tales to reverse the curse put on their family tree.',
 'Language': 'English',
 'Country': 'USA',
 'Awards': 'Nominated for 3 Oscars. Another 11 wins & 71 nominations.',
 'Poster': 'https://m.media-amazon.com/images/M/MV5BMTY4MzQ4OTY3NF5BMl5BanBnXkFtZTgwNjM5MDI3MjE@._V1_SX300.jpg',
 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '5.9/10'},
  {'Source': 'Rotten Tomatoes', 'Value': '71%'},
  {'Source': 'Metacritic', 'Value': '69/100'}],
 'Metascore': '69',
 'imdbRating': '5.9',
 'imdbVotes': '129,838',
 'imdbID': 'tt2180411',


In [18]:
for movie in movie_info_list:
    title = movie['title']
    omdb_info = get_omdb_info(title)
    movie['imdb'] = omdb_info.get('imdbRating', None)
    movie['metascore'] = omdb_info.get('Metascore', None)
    movie['rotten_tomatoes'] = get_rotten_tomato_score(omdb_info)

In [54]:
movie_info_list[-30]

{'title': 'The Nutcracker and the Four Realms',
 'Directed by': ['Lasse Hallström', 'Joe Johnston'],
 'Produced by': ['Mark Gordon', 'Larry Franco'],
 'Written by': ['Ashleigh Powell'],
 'Based on': ['" The Nutcracker and the Mouse King " by E. T. A. Hoffmann',
  'The Nutcracker by Marius Petipa'],
 'Starring': ['Keira Knightley',
  'Mackenzie Foy',
  'Eugenio Derbez',
  'Matthew Macfadyen',
  'Richard E. Grant',
  'Misty Copeland',
  'Helen Mirren',
  'Morgan Freeman'],
 'Music by': 'James Newton Howard',
 'Cinematography': 'Linus Sandgren',
 'Edited by': 'Stuart Levy',
 'Production company': ['Walt Disney Pictures', 'The Mark Gordon Company'],
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Release date': ['October 29, 2018 ( Dolby Theatre )',
  'November 2, 2018 (United States)'],
 'Running time': '99 minutes',
 'Country': 'United States',
 'Language': ['English', 'French'],
 'Budget': '$120–133 million',
 'Box office': '$174 million',
 'Running time (int)': 99,
 'Budget

In [55]:
for movie in movie_info_list:
    movie['imdb'] = float(movie['imdb'])
    movie['metascore'] = float(movie['metascore'])
    movie['rotten_tomatoes'] = float(movie['rotten_tomatoes'].strip('%'))

ValueError: could not convert string to float: 'N/A'

In [21]:
save_data_pickle('disney_movie_data_final.pickle', movie_info_list)

### Task #5: Save data as JSON & CSV

In [27]:
movie_info_list[50]

{'title': 'One Hundred and One Dalmatians',
 'Directed by': ['Clyde Geronimi', 'Hamilton Luske', 'Wolfgang Reitherman'],
 'Produced by': 'Walt Disney',
 'Story by': 'Bill Peet',
 'Based on': ['The Hundred and One Dalmatians', 'by', 'Dodie Smith'],
 'Starring': ['Rod Taylor',
  'Cate Bauer',
  'Betty Lou Gerson',
  'Ben Wright',
  'Bill Lee (singing voice)',
  'Lisa Davis',
  'Martha Wentworth'],
 'Music by': 'George Bruns',
 'Edited by': ['Roy M. Brewer, Jr.', 'Donald Halliday'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Pictures Distribution',
 'Release date': ['January 25, 1961'],
 'Running time': '79 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$3.6 million',
 'Box office': '$303 million',
 'Running time (int)': 79,
 'Budget (float)': 3600000.0,
 'Box office (float)': 303000000.0,
 'Release date (datetime)': datetime.datetime(1961, 1, 25, 0, 0),
 'imdb': '7.2',
 'metascore': '83',
 'rotten_tomatoes': '98%'}

In [28]:
movie_info_copy = [movie.copy() for movie in movie_info_list]

In [35]:
for movie in movie_info_copy:
    current_date = movie['Release date (datetime)']
    if current_date:
        movie['Release date (datetime)'] = current_date.strftime("%B %d, %Y")
    else:
        movie['Release date (datetime)'] = None

In [40]:
save_data("disney_data_final.json", movie_info_copy)

#### Convert data to CSV

In [41]:
import pandas as pd

df = pd.DataFrame(movie_info_list)

In [44]:
df.head()

Unnamed: 0,title,Production company,Release date,Running time,Country,Language,Running time (int),Budget (float),Box office (float),Release date (datetime),...,Box office,Story by,Narrated by,Cinematography,Edited by,Screenplay by,Production companies,Adaptation by,Traditional,Simplified
0,Academy Award Review of,Walt Disney Productions,"[May 19, 1937]",41 minutes (74 minutes 1966 release),United States,English,41.0,,,1937-05-19,...,,,,,,,,,,
1,Snow White and the Seven Dwarfs,Walt Disney Productions,"[December 21, 1937 ( Carthay Circle Theatre , ...",83 minutes,United States,English,83.0,1490000.0,418000000.0,1937-12-21,...,$418 million,,,,,,,,,
2,Pinocchio,Walt Disney Productions,"[February 7, 1940 ( Center Theatre ), February...",88 minutes,United States,English,88.0,2600000.0,164000000.0,1940-02-07,...,$164 million,"[Ted Sears, Otto Englander, Webb Smith, Willia...",,,,,,,,
3,Fantasia,Walt Disney Productions,"[November 13, 1940]",126 minutes,United States,English,126.0,2280000.0,83300000.0,1940-11-13,...,$76.4–$83.3 million,"[Joe Grant, Dick Huemer]",Deems Taylor,James Wong Howe,,,,,,
4,The Reluctant Dragon,Walt Disney Productions,"[June 20, 1941]",74 minutes,United States,English,74.0,600000.0,960000.0,1941-06-20,...,"$960,000 (worldwide rentals)",,,Bert Giennon,Paul Weatherwax,,,,,


In [45]:
df.to_csv("disney_movie_data_final.csv")

In [51]:
running_times = df.sort_values(['Running time (int)'],  ascending=False)
running_times.head(20)

Unnamed: 0,title,Production company,Release date,Running time,Country,Language,Running time (int),Budget (float),Box office (float),Release date (datetime),...,Box office,Story by,Narrated by,Cinematography,Edited by,Screenplay by,Production companies,Adaptation by,Traditional,Simplified
300,Pirates of the Caribbean: At World's End,,"[May 19, 2007 ( Disneyland Resort ), May 25, 2...",168 minutes,United States,English,168.0,300000000.0,961000000.0,2007-05-19,...,$961 million,,,Dariusz Wolski,"[Craig Wood, Stephen Rivkin]",,"[Walt Disney Pictures, Jerry Bruckheimer Films]",,,
86,The Happiest Millionaire,Walt Disney Productions,"[June 23, 1967, November 30, 1967]","[164 minutes, (, Los Angeles, premiere), 144 m...",United States,English,164.0,5000000.0,5000000.0,1967-06-23,...,$5 million (US/ Canada rentals),,,Edward Colman,Cotton Warburton,,,,,
397,Jagga Jasoos,"[Walt Disney Pictures India, Picture Shuru Ent...",[14 July 2017],162 minutes,India,Hindi,162.0,,,2017-07-14,...,833.5 million,Anurag Basu,,Ravi Varman,Amitabh Shukla,"[Anurag Basu, Dialogues in Rhyme:, Amitabh Bha...",,,,
390,Dangal,"[Aamir Khan Productions, Walt Disney Pictures ...","[21 December 2016 (United States), 23 December...",161 minutes,India,Hindi,161.0,,,2016-12-21,...,"[est., (, )]",,Aparshakti Khurana,Setu (Satyajit Pande),Ballu Saluja,,,,,
421,Hamilton,"[Walt Disney Pictures, 5000 Broadway Productio...","[July 3, 2020]",160 minutes,United States,English,160.0,12500000.0,,2020-07-03,...,,,,Declan Quinn,Jonah Moran,,,,,
378,ABCD 2,Walt Disney Pictures,[19 June 2015],154 minutes,India,Hindi,154.0,,,2015-06-19,...,est.,Remo D'Souza,,Vijay Kumar Arora,Manan Sagar,,,,,
294,Pirates of the Caribbean: Dead Man's Chest,,"[June 24, 2006 ( Disneyland Resort ), July 7, ...",151 minutes,United States,English,151.0,225000000.0,1066000000.0,2006-06-24,...,$1.066 billion,,,Dariusz Wolski,"[Craig Wood, Stephen Rivkin]",,"[Walt Disney Pictures, Jerry Bruckheimer Films]",,,
309,The Chronicles of Narnia: Prince Caspian,"[Walt Disney Pictures, Walden Media]","[May 7, 2008 ( New York City ), May 16, 2008 (...",150 minutes,"[United Kingdom, United States]",English,150.0,225000000.0,419700000.0,2008-05-07,...,$419.7 million,,,Karl Walter Lindenlaub,Sim Evan-Jones,"[Andrew Adamson, Christopher Markus Stephen Mc...",,,,
360,The Lone Ranger,"[Walt Disney Pictures, Jerry Bruckheimer Films...","[June 22, 2013 ( Hyperion Theatre ), July 3, 2...",149 minutes,United States,English,149.0,225000000.0,260500000.0,2013-06-22,...,$260.5 million,"[Ted Elliott, Terry Rossio, Justin Haythe]",,Bojan Bazelli,"[James Haygood, Craig Wood]","[Justin Haythe, Ted Elliott, Terry Rossio]",,,,
287,"The Chronicles of Narnia: The Lion, the Witch ...","[Walt Disney Pictures, Walden Media]","[December 7, 2005 ( Royal Film Performance ), ...",143 minutes,"[United Kingdom, United States]",English,143.0,180000000.0,745000000.0,2005-12-07,...,$745 million,,,Donald McAlpine,"[Sim Evan-Jones, Jim May]","[Ann Peacock, Andrew Adamson, Christopher Mark...",,,,
