<a href="https://colab.research.google.com/github/sid999999/API-WebScraping/blob/main/MovieInfo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Disney Dataset Creation (w/ Python BeautifulSoup)
# Scrape & clean a list of disney wikipedia pages to create a dataset to further analyze

#Beautiful soup documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/

# ***Task #1: Get Info Box (store in Python dictionary)***
Import Necessary Libraries

In [None]:
from bs4 import BeautifulSoup as bs
import requests

Load the webpage

In [None]:
r = requests.get("https://en.wikipedia.org/wiki/Toy_Story_3")
# Fail fast on an HTTP error (403/404/5xx) instead of silently parsing an
# error page, which would only surface later as a confusing NoneType error.
r.raise_for_status()

# Convert to a beautiful soup object
soup = bs(r.content)

# Print out the HTML
contents = soup.prettify()
print(contents)

In [None]:
# Get the data from the Wikipedia infobox: find the first element matching
# class_="infobox vevent" and collect every table row (<tr>) inside it.
info_box = soup.find(class_="infobox vevent")
info_rows = info_box.find_all("tr")
for row in info_rows:
    # prettify() gives the row's HTML as an indented, easier-to-read string
    print(row.prettify())

1. Strip whitespace from the beginning and end of each bit of text:

soup.get_text("|",strip=True)

In [None]:
#get the row data for just one movie
def get_content_value(row_data):
    """Extract the text of one infobox data cell.

    Returns a list with one cleaned string per ``<li>`` when the cell contains
    list items; otherwise returns a single cleaned string for the whole cell.
    Non-breaking spaces are replaced by ordinary spaces in both cases.
    """
    def _clean(node):
        # Join text fragments with a space, trim them, and normalise NBSPs.
        return node.get_text(" ", strip=True).replace("\xa0", " ")

    if not row_data.find("li"):
        return _clean(row_data)
    return [_clean(item) for item in row_data.find_all("li")]

movie_info = {}
for index, row in enumerate(info_rows):
    # The second row (index 1) is skipped outright.
    if index == 1:
        continue

    # The first row supplies the movie title.
    if index == 0:
        movie_info['title'] = row.find("th").get_text(" ", strip=True)
        continue

    # Every remaining row: header text becomes the key, data cell the value.
    content_key = row.find("th").get_text(" ", strip=True)
    content_value = get_content_value(row.find("td"))
    movie_info[content_key] = content_value

movie_info

{'Box office': '$1.067 billion [1]',
 'Budget': '$200 million [1]',
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Country': 'United States',
 'Directed by': 'Lee Unkrich',
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Edited by': 'Ken Schretzmann',
 'Language': 'English',
 'Music by': 'Randy Newman',
 'Produced by': 'Darla K. Anderson',
 'Production companies': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Release dates': ['June 12, 2010 ( 2010-06-12 ) ( Taormina Film Fest )',
  'June 18, 2010 ( 2010-06-18 ) (United States)'],
 'Running time': '103 minutes [1]',
 'Screenplay by': 'Michael Arndt',
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'title': 'Toy Story 3'}

# ***Task #2: Get info box for all movies***

In [None]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
# Fail fast on an HTTP error (403/404/5xx) instead of silently parsing an
# error page, which would only surface later as an empty movie list.
r.raise_for_status()

# Convert to a beautiful soup object
soup = bs(r.content)

# Print out the HTML
contents = soup.prettify()
print(contents)

In [None]:
#select 2 different classes
#select the i box where contains title and link of the movie
movies = soup.select(".wikitable.sortable i a")
movies[0:2]
#movies

[<i><a href="/wiki/Academy_Award_Review_of_Walt_Disney_Cartoons" title="Academy Award Review of Walt Disney Cartoons">Academy Award Review of Walt Disney Cartoons</a></i>,
 <i><a href="/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)" title="Snow White and the Seven Dwarfs (1937 film)">Snow White and the Seven Dwarfs</a></i>]

In [None]:
def get_content_value(row_data):
    """Extract the text of one infobox data cell.

    Args:
        row_data: The ``<td>`` element of an infobox row.

    Returns:
        * a list with one string per ``<li>`` if the cell contains list items;
        * a list with one string per text fragment if the cell separates its
          entries with ``<br>``;
        * otherwise a single string for the whole cell.

        Non-breaking spaces are replaced by ordinary spaces in every branch,
        so values have the same shape regardless of how the cell was marked up.
    """
    def _clean(text):
        return text.replace("\xa0", " ")

    if row_data.find("li"):
        return [_clean(li.get_text(" ", strip=True)) for li in row_data.find_all("li")]
    if row_data.find("br"):
        # stripped_strings yields each text fragment with outer whitespace removed.
        return [_clean(text) for text in row_data.stripped_strings]
    return _clean(row_data.get_text(" ", strip=True))

def clean_tags(soup):
    """Remove every ``<sup>`` and ``<span>`` element from ``soup`` in place.

    Nothing is returned; the parsed tree itself is modified.
    """
    unwanted_tags = soup.find_all(["sup", "span"])
    for unwanted in unwanted_tags:
        unwanted.decompose()
        
def get_info_box(url):
    """Scrape the infobox of one Wikipedia film page into a dictionary.

    Args:
        url: Full URL of the Wikipedia page.

    Returns:
        dict: ``'title'`` (text of the first row's header cell) plus one entry
        per later row that has both a header (``<th>``) and a data (``<td>``)
        cell. Keys are the header texts; values come from
        ``get_content_value``.

    Raises:
        requests.HTTPError: The page responded with an HTTP error status.
        ValueError: The page has no ``infobox vevent`` element, or its first
            row has no header cell to take the title from.
    """
    # A timeout keeps one stalled request from hanging a scrape of hundreds of pages.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = bs(r.content)

    info_box = soup.find(class_="infobox vevent")
    if info_box is None:
        # Name the problem instead of failing later with
        # "'NoneType' object has no attribute 'find_all'".
        raise ValueError(f"No infobox found at {url}")
    info_rows = info_box.find_all("tr")

    # Strip <sup>/<span> elements (e.g. reference markers) before reading text.
    clean_tags(soup)

    movie_info = {}
    for index, row in enumerate(info_rows):
        header = row.find("th")
        if index == 0:
            if header is None:
                raise ValueError(f"Infobox at {url} has no title header")
            movie_info['title'] = header.get_text(" ", strip=True)
            continue

        data = row.find("td")
        # Rows without a header (images) or without a data cell (section
        # headings) carry no key/value pair; skip them rather than crash and
        # lose the whole movie.
        if header is None or data is None:
            continue

        content_key = header.get_text(" ", strip=True)
        content_value = get_content_value(data)
        movie_info[content_key] = content_value

    return movie_info

In [None]:
#test the function for one movie
get_info_box("https://en.wikipedia.org/wiki/One_Little_Indian_(film)")

{'Box office': '$2 million',
 'Cinematography': 'Charles F. Wheeler',
 'Country': 'United States',
 'Directed by': 'Bernard McEveety',
 'Distributed by': 'Buena Vista Distribution',
 'Edited by': 'Robert Stafford',
 'Language': 'English',
 'Music by': 'Jerry Goldsmith',
 'Produced by': 'Winston Hibler',
 'Production company': 'Walt Disney Productions',
 'Release date': ['June 20, 1973'],
 'Running time': '90 Minutes',
 'Starring': ['James Garner',
  'Vera Miles',
  'Pat Hingle',
  'Morgan Woodward',
  'Jodie Foster'],
 'Written by': 'Harry Spalding',
 'title': 'One Little Indian'}

In [None]:
# Get all the Disney movies:
# collect every film link (title + href) from the list page, then scrape each infobox.
from urllib.parse import urljoin

r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
r.raise_for_status()
soup = bs(r.content)
movies = soup.select(".wikitable.sortable i a")

base_path = "https://en.wikipedia.org/"

# Get info for each movie
movie_info_list = []
for index, movie in enumerate(movies):
    # Lightweight progress indicator.
    if index % 10 == 0:
        print(index)
    try:
        relative_path = movie['href']
        # urljoin resolves "/wiki/..." against the base without producing the
        # doubled slash that plain string concatenation gave.
        full_path = urljoin(base_path, relative_path)

        movie_info_list.append(get_info_box(full_path))

    except Exception as e:
        # Best effort: report the failing movie and carry on with the rest.
        print(movie.get_text())
        print(e)

0
10
20
30
40
Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
50
60
70
80
90
100
110
120
130
140
The London Connection
'NoneType' object has no attribute 'find'
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
The Beatles: Get Back – The Rooftop Concert
'NoneType' object has no attribute 'find'
490
500
61
'NoneType' object has no attribute 'find_all'
All Night Long
'NoneType' object has no attribute 'find'
510
Keeper of the Lost Cities
'NoneType' object has no attribute 'find_all'
Muppet Man
'NoneType' object has no attribute 'find_all'
520
Sister Act 3
'NoneType' object has no attribute 'find'
The Thief
'NoneType' object has no attribute 'find_all'
Tom Sawyer
'NoneType' object has no attribute 'find_all'
530
Tower of Terror
'NoneType' object has no attribute 'find_all'
Tron: Ares
'NoneType' object has no attribute 'find'
FC Barc

In [None]:
len(movie_info_list)

519

In [None]:
movie_info_list[0:10]

# ***Save/Reload Movie Data***

In [None]:
import json

def save_data(title, data):
    """Write ``data`` to the file ``title`` as UTF-8 JSON.

    The output is indented by two spaces and non-ASCII characters are written
    as-is rather than escaped.
    """
    with open(title, mode='w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=2, ensure_ascii=False)

In [None]:
import json

def load_data(title):
    """Read the UTF-8 JSON file ``title`` and return the decoded Python object."""
    with open(title, encoding="utf-8") as json_file:
        raw_text = json_file.read()
    return json.loads(raw_text)

In [None]:
save_data("disney_data_cleaned.json", movie_info_list)

# ***Task #3: Clean our data!***

In [None]:
movie_info_list = load_data("disney_data_cleaned.json")

Subtasks

Clean up references [1]

Convert running time into an integer

Convert dates into datetime object

Split up the long strings

Convert Budget & Box office to numbers

In [None]:
movie_info_list[-40]

{'Based on': "Walt Disney 's Jungle Cruise",
 'Box office': '$220.9 million',
 'Budget': '$200 million',
 'Cinematography': 'Flavio Labiano',
 'Country': 'United States',
 'Directed by': 'Jaume Collet-Serra',
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Edited by': 'Joel Negron',
 'Language': 'English',
 'Music by': 'James Newton Howard',
 'Produced by': ['John Davis',
  'John Fox',
  'Beau Flynn',
  'Dwayne Johnson',
  'Dany Garcia',
  'Hiram Garcia'],
 'Production companies': ['Walt Disney Pictures',
  'Davis Entertainment',
  'Seven Bucks Productions',
  'Flynn Picture Company'],
 'Release dates': ['July 24, 2021 ( Disneyland Resort )',
  'July 30, 2021 (United States)'],
 'Running time': '128 minutes',
 'Screenplay by': ['Michael Green', 'Glenn Ficarra', 'John Requa'],
 'Starring': ['Dwayne Johnson',
  'Emily Blunt',
  'Édgar Ramírez',
  'Jack Whitehall',
  'Jesse Plemons',
  'Paul Giamatti'],
 'Story by': ['John Norville',
  'Josh Goldstein',
  'Glenn Ficarra',
  'J

running time clean


In [None]:
print([movie.get('Running time', 'N/A') for movie in movie_info_list])

['41 minutes (74 minutes 1966 release)', '83 minutes', '88 minutes', '126 minutes', '74 minutes', '64 minutes', '70 minutes', '42 minutes', '70 min', '71 minutes', '75 minutes', '94 minutes', '73 minutes', '75 minutes', '82 minutes', '68 minutes', '74 minutes', '96 minutes', '75 minutes', '84 minutes', '77 minutes', '92 minutes', '69 minutes', '81 minutes', ['60 minutes (VHS version)', '71 minutes (original)'], '127 minutes', '93 minutes', '76 minutes', '75 minutes', '73 minutes', '85 minutes', '81 minutes', '70 minutes', '90 min.', '80 minutes', '75 minutes', '83 minutes', '83 minutes', '72 minutes', '97 minutes', '75 minutes', '104 minutes', '93 minutes', '105 minutes', '95 minutes', '97 minutes', '134 minutes', '69 minutes', '92 minutes', '126 minutes', '79 minutes', '97 minutes', '128 minutes', '73 minutes', '91 minutes', '105 minutes', '98 minutes', '130 minutes', '89 minutes', '93 minutes', '67 minutes', '98 minutes', '100 minutes', '118 minutes', '103 minutes', '110 minutes', '8

In [None]:
# "85 minutes"
def minutes_to_integer(running_time):
    """Convert an infobox running time to a whole number of minutes.

    Args:
        running_time: A string such as ``"85 minutes"``, a list of such
            strings (only the first entry is used), or ``"N/A"``.

    Returns:
        int | None: The first integer found in the text, or ``None`` when the
        value is missing, empty, not text, or contains no digits.
    """
    import re  # local import: the notebook only imports `re` in a later cell

    if isinstance(running_time, list):
        # Several versions listed: keep the first; an empty list has none.
        running_time = running_time[0] if running_time else None

    if not isinstance(running_time, str):
        return None

    # Search for the first run of digits instead of assuming the number is the
    # first space-separated token, so "90min" or "approx. 95 minutes" also work.
    match = re.search(r"\d+", running_time)
    return int(match.group()) if match else None

for movie in movie_info_list:
    movie['Running time (int)'] = minutes_to_integer(movie.get('Running time', "N/A"))

In [None]:
print([movie.get('Running time (int)', 'N/A') for movie in movie_info_list])

[41, 83, 88, 126, 74, 64, 70, 42, 70, 71, 75, 94, 73, 75, 82, 68, 74, 96, 75, 84, 77, 92, 69, 81, 60, 127, 93, 76, 75, 73, 85, 81, 70, 90, 80, 75, 83, 83, 72, 97, 75, 104, 93, 105, 95, 97, 134, 69, 92, 126, 79, 97, 128, 73, 91, 105, 98, 130, 89, 93, 67, 98, 100, 118, 103, 110, 80, 74, 91, 91, 97, 118, 139, 131, 92, 87, 116, 93, 110, 110, 131, 101, 108, 84, 78, 75, 164, 106, 110, 99, 113, 108, 112, 93, 91, 93, 100, 100, 79, 96, 113, 89, 117, 92, 88, 92, 87, 93, 93, 93, 90, 83, 96, 88, 89, 91, 93, 92, 97, 100, 100, 89, None, 91, 112, 115, 95, 91, 97, 104, 74, 48, 77, 104, 128, 101, 94, 104, 90, 100, 88, 93, 98, 112, 84, 97, 97, 114, 96, 97, 109, 83, 90, 107, 96, 103, 91, 95, 105, 113, 80, 101, 90, 74, 90, 89, 110, 74, 93, 84, 83, 74, 77, 107, 93, 88, 108, 84, 121, 89, 104, 90, 86, 84, 108, 107, 96, 98, 105, 108, 94, 106, 102, 88, 102, 102, 97, 111, 100, 96, 98, 78, 81, 108, 89, 99, 89, 81, 92, 100, 89, 79, 91, 101, 104, 103, 86, 105, 75, 93, 92, 98, 95, 93, 87, 93, 87, 128, 77, 86, 95, 1

budget clean

In [None]:
print([movie.get('Budget', 'N/A') for movie in movie_info_list])

['N/A', '$1.49 million', '$2.6 million', '$2.28 million', '$600,000', '$950,000', '$858,000', 'N/A', '$788,000', 'N/A', '$1.35 million', '$2.125 million', 'N/A', '$1.5 million', '$1.5 million', 'N/A', '$2.2 million', '$1,800,000', '$3 million', 'N/A', '$4 million', '$2 million', '$300,000', '$1.8 million', 'N/A', '$5 million', 'N/A', '$4 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$700,000', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$6 million', 'under $1 million or $1,250,000', 'N/A', '$2 million', 'N/A', 'N/A', '$2.5 million', 'N/A', 'N/A', '$4 million', '$3.6 million', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', '$4.4–6 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', 'N/A', 'N/A', '

In [None]:
import re

# Magnitude words that may follow a dollar figure ("$12.2 million").
amounts = r"thousand|million|billion"
# A number with optional thousands separators and decimals ("1,250,000", "12.2").
number = r"\d+(,\d{3})*\.*\d*"

# "$<number> [<separator> [$]<number>] <magnitude word>". The optional middle
# part covers ranges such as "$4.4–6 million", "$76.4–$83.3 million" and
# "$3 to $5 million", so a range is always matched as a whole.
word_re = rf"\${number}(\s*(-|–|—|to)\s*\$?{number})?\s({amounts})"
# A plain dollar figure such as "$790,000".
value_re = rf"\${number}"

def word_to_value(word):
    """Return the multiplier for a lowercase magnitude word.

    Raises:
        KeyError: ``word`` is not "thousand", "million" or "billion".
    """
    value_dict = {"thousand": 1000, "million": 1000000, "billion": 1000000000}
    return value_dict[word]

def parse_word_syntax(string):
    """Convert a word-syntax match such as "$12.2 million" to a float.

    For a range, the first figure in the string is the one used.
    """
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    word = re.search(amounts, string, flags=re.I).group().lower()
    word_value = word_to_value(word)
    return value*word_value

def parse_value_syntax(string):
    """Convert a value-syntax match such as "$790,000" to a float."""
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    return value

def money_conversion(money):
    """Convert an infobox money value to a float number of dollars.

    money_conversion("$12.2 million") --> 12200000 ## Word syntax
    money_conversion("$790,000") --> 790000        ## Value syntax

    Args:
        money: A string, a list of strings (only the first entry is used),
            or ``"N/A"``.

    Returns:
        float | None: The parsed amount; for a range, its first figure.
        ``None`` when the value is missing, empty, not text, or contains no
        recognisable dollar amount.
    """
    if isinstance(money, list):
        # Several figures listed: keep the first; an empty list has none.
        money = money[0] if money else None

    if not isinstance(money, str) or money == "N/A":
        return None

    word_syntax = re.search(word_re, money, flags=re.I)
    value_syntax = re.search(value_re, money)

    # Word syntax is tried first: "$1 million" also matches the value pattern.
    if word_syntax:
        return parse_word_syntax(word_syntax.group())

    elif value_syntax:
        return parse_value_syntax(value_syntax.group())

    else:
        return None

In [None]:
for movie in movie_info_list:
    movie['Budget (float)'] = money_conversion(movie.get('Budget', "N/A"))
    movie['Box office (float)'] = money_conversion(movie.get('Box office', "N/A"))

In [None]:
money_conversion(str(movie_info_list[-40]["Budget"]))

200000000.0

In [None]:
# Convert Dates into datetimes
print([movie.get('Release date', 'N/A') for movie in movie_info_list])

[['May 19, 1937'], 'N/A', 'N/A', ['November 13, 1940'], ['June 27, 1941'], 'N/A', 'N/A', 'N/A', ['July 17, 1943'], 'N/A', 'N/A', 'N/A', ['September 27, 1947'], 'May 27, 1948', 'N/A', ['October 5, 1949'], 'N/A', 'N/A', 'N/A', 'N/A', ['February 5, 1953 (United States)'], ['July 23, 1953 (US)'], ['November 10, 1953'], 'N/A', ['August 17, 1954'], ['December 23, 1954'], 'May 25, 1955', ['June 22, 1955'], ['September 14, 1955'], 'December 22, 1955', 'June 8, 1956', 'July 18, 1956', ['September 4, 1956'], ['December 20, 1956'], 'June 19, 1957', 'August 28, 1957', ['December 25, 1957'], ['July 8, 1958'], ['August 12, 1958'], ['December 25, 1958'], ['January 29, 1959'], ['March 19, 1959'], 'N/A', ['November 10, 1959'], 'January 21, 1960 ( Sarasota, FL )', ['February 24, 1960'], 'May 19, 1960', 'N/A', ['November 1, 1960'], ['December 21, 1960'], ['January 25, 1961'], 'March 16, 1961', ['June 21, 1961'], ['July 12, 1961'], ['July 17, 1961'], ['December 14, 1961'], 'April 5, 1962', ['May 17, 1962'

In [None]:
movie_info_list[-50]

{'Based on': ['The One and Only Ivan', 'by', 'K. A. Applegate'],
 'Box office (float)': None,
 'Budget (float)': None,
 'Cinematography': 'Florian Ballhaus',
 'Country': 'United States',
 'Directed by': 'Thea Sharrock',
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Edited by': 'Barney Pilling',
 'Language': 'English',
 'Music by': 'Craig Armstrong',
 'Produced by': ['Angelina Jolie', 'Allison Shearmur', 'Brigham Taylor'],
 'Production companies': ['Walt Disney Pictures', 'Jolie Pas Productions'],
 'Release date': ['August 21, 2020 (United States)'],
 'Running time': '95 minutes',
 'Running time (int)': 95,
 'Screenplay by': 'Mike White',
 'Starring': ['Sam Rockwell',
  'Angelina Jolie',
  'Danny DeVito',
  'Helen Mirren',
  'Ramón Rodríguez',
  'Ariana Greenblatt',
  'Bryan Cranston'],
 'title': 'The One and Only Ivan'}

In [None]:
# June 28, 1950
from datetime import datetime

dates = [movie.get('Release date', 'N/A') for movie in movie_info_list]

def clean_date(date):
    """Drop everything from the first "(" onwards and trim surrounding whitespace."""
    before_paren, _, _ = date.partition("(")
    return before_paren.strip()

def date_conversion(date):
    """Convert an infobox release date to a ``datetime``.

    Args:
        date: A string such as ``"June 28, 1950 (United States)"``, a list of
            such strings (only the first entry is used), or ``"N/A"``.

    Returns:
        datetime | None: The parsed date, or ``None`` when the value is
        missing, empty, not text, or matches neither "Month day, year" nor
        "day Month year".
    """
    if isinstance(date, list):
        # Several dates listed: keep the first; an empty list has none.
        date = date[0] if date else None

    if not isinstance(date, str) or date == "N/A":
        return None

    # Remove a trailing "(country/festival)" note before parsing.
    date_str = clean_date(date)

    fmts = ["%B %d, %Y", "%d %B %Y"]
    for fmt in fmts:
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            # strptime reports a format mismatch with ValueError; try the next
            # format. Anything else is a real bug and should surface.
            continue
    return None

In [None]:
for movie in movie_info_list:
    # The infobox row is labelled "Release date" on some pages and
    # "Release dates" on others; check both so the plural ones are not
    # silently left without a parsed date.
    movie['Release date (datetime)'] = date_conversion(
        movie.get('Release date', movie.get('Release dates', 'N/A'))
    )

In [None]:
movie_info_list[50]

{'Based on': ['The Hundred and One Dalmatians', 'by', 'Dodie Smith'],
 'Box office': '$303 million',
 'Box office (float)': 303000000.0,
 'Budget': '$3.6 million',
 'Budget (float)': 3600000.0,
 'Country': 'United States',
 'Directed by': ['Clyde Geronimi', 'Hamilton Luske', 'Wolfgang Reitherman'],
 'Distributed by': 'Buena Vista Distribution',
 'Edited by': ['Roy M. Brewer, Jr.', 'Donald Halliday'],
 'Language': 'English',
 'Music by': 'George Bruns',
 'Produced by': 'Walt Disney',
 'Production company': 'Walt Disney Productions',
 'Release date': ['January 25, 1961'],
 'Release date (datetime)': datetime.datetime(1961, 1, 25, 0, 0),
 'Running time': '79 minutes',
 'Running time (int)': 79,
 'Starring': ['Rod Taylor',
  'Cate Bauer',
  'Betty Lou Gerson',
  'Ben Wright',
  'Bill Lee (singing voice)',
  'Lisa Davis',
  'Martha Wentworth'],
 'Story by': 'Bill Peet',
 'title': '101 Dalmatians'}

Pickle in Python is primarily used in serializing and deserializing a Python object structure.

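Pickling is the process of converting a Python object into a byte stream so that it can be stored in a file or database, used to maintain program state across sessions, or transported over a network. Unlike JSON, a pickle file preserves Python-specific objects such as `datetime` values; only load pickle files that come from a source you trust.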

In [None]:
import pickle

def save_data_pickle(name, data):
    """Pickle ``data`` into the binary file ``name`` (created or overwritten)."""
    with open(name, mode='wb') as pickle_file:
        pickle.Pickler(pickle_file).dump(data)

In [None]:
import pickle

def load_data_pickle(name):
    """Return the object stored in the pickle file ``name``.

    Unpickling can execute arbitrary code, so only use this on files that
    come from a trusted source.
    """
    with open(name, mode='rb') as pickle_file:
        return pickle.Unpickler(pickle_file).load()

In [None]:
save_data_pickle("disney_movie_data_cleaned_more.pickle", movie_info_list)

In [None]:
a = load_data_pickle("disney_movie_data_cleaned_more.pickle")

In [None]:
a == movie_info_list

True

# ***Task #4: Attach IMDB/Rotten Tomatoes/Metascore scores***

In [None]:
movie_info_list = load_data_pickle('disney_movie_data_cleaned_more.pickle')

In [None]:
movie_info_list[-60]

{'Box office (float)': None,
 'Budget': '$40 million',
 'Budget (float)': 40000000.0,
 'Cinematography': 'Ericson Core',
 'Country': 'United States',
 'Directed by': 'Ericson Core',
 'Distributed by': 'Disney+',
 'Edited by': 'Martin Pensa',
 'Language': 'English',
 'Music by': 'Mark Isham',
 'Produced by': 'Kim Zubick',
 'Production company': 'Walt Disney Pictures',
 'Release date': ['December 20, 2019 (United States)'],
 'Release date (datetime)': datetime.datetime(2019, 12, 20, 0, 0),
 'Running time': '114 minutes',
 'Running time (int)': 114,
 'Starring': ['Willem Dafoe'],
 'Written by': 'Tom Flynn',
 'title': 'Togo'}

In [None]:
# Use the OMDb web API to fetch ratings data.
# Request format: http://www.omdbapi.com/?apikey=[yourkey]&
#
# The key is read from the OMDB_API_KEY environment variable so that no
# credential is stored in the notebook. Set it before running this cell, e.g.
#     os.environ["OMDB_API_KEY"] = getpass.getpass("OMDb API key: ")
# Any key that was previously committed in this notebook should be treated as
# compromised and rotated.
import os

apikey = os.environ.get("OMDB_API_KEY", "")
if not apikey:
    print("OMDB_API_KEY is not set; OMDb lookups need a valid API key.")

In [None]:
import requests
import urllib
import os

def get_omdb_info(title):
    """Look up a movie by title on OMDb and return the decoded JSON response.

    Args:
        title: Movie title, passed as OMDb's ``t`` query parameter.

    Returns:
        The parsed JSON body of the response.
    """
    # HTTPS keeps the API key in the query string from travelling in clear text.
    base_url = "https://www.omdbapi.com/"
    parameters = {"apikey": apikey, 't': title}
    # Let requests build and percent-encode the query string; this also removes
    # the reliance on `urllib.parse` being loaded after a bare `import urllib`.
    # The timeout keeps one stalled request from hanging the enrichment loop.
    response = requests.get(base_url, params=parameters, timeout=30)
    return response.json()

def get_rotten_tomato_score(omdb_info):
    """Return the Rotten Tomatoes rating value from an OMDb response.

    Looks through ``omdb_info['Ratings']`` (treated as empty when the key is
    absent) and returns the ``'Value'`` of the first entry whose ``'Source'``
    is ``'Rotten Tomatoes'``, or ``None`` when there is no such entry.
    """
    matching_values = (
        entry['Value']
        for entry in omdb_info.get('Ratings', [])
        if entry['Source'] == 'Rotten Tomatoes'
    )
    return next(matching_values, None)
get_omdb_info("into the woods")

{'Actors': 'Anna Kendrick, Meryl Streep, Chris Pine',
 'Awards': 'Nominated for 3 Oscars. 10 wins & 74 nominations total',
 'BoxOffice': '$128,002,372',
 'Country': 'United States',
 'DVD': '24 Mar 2015',
 'Director': 'Rob Marshall',
 'Genre': 'Adventure, Comedy, Drama',
 'Language': 'English',
 'Metascore': '69',
 'Plot': 'A witch tasks a childless baker and his wife with procuring magical items from classic fairy tales to reverse the curse put on their family tree.',
 'Poster': 'https://m.media-amazon.com/images/M/MV5BMTY4MzQ4OTY3NF5BMl5BanBnXkFtZTgwNjM5MDI3MjE@._V1_SX300.jpg',
 'Production': 'N/A',
 'Rated': 'PG',
 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '5.9/10'},
  {'Source': 'Rotten Tomatoes', 'Value': '71%'},
  {'Source': 'Metacritic', 'Value': '69/100'}],
 'Released': '25 Dec 2014',
 'Response': 'True',
 'Runtime': '125 min',
 'Title': 'Into the Woods',
 'Type': 'movie',
 'Website': 'N/A',
 'Writer': 'James Lapine',
 'Year': '2014',
 'imdbID': 'tt2180411',
 '

In [None]:
# Attach OMDb scores, genre and awards to every movie (one API call per movie).
for movie in movie_info_list:
    title = movie['title']
    omdb_info = get_omdb_info(title)
    # Fields missing from the OMDb response are stored as None.
    movie.update(
        imdb=omdb_info.get('imdbRating'),
        metascore=omdb_info.get('Metascore'),
        Genre=omdb_info.get('Genre'),
        Awards=omdb_info.get('Awards'),
    )
    movie['rotten_tomatoes'] = get_rotten_tomato_score(omdb_info)

In [None]:
movie_info_list[-60]

{'Awards': '1 win & 4 nominations',
 'Box office (float)': None,
 'Budget': '$40 million',
 'Budget (float)': 40000000.0,
 'Cinematography': 'Ericson Core',
 'Country': 'United States',
 'Directed by': 'Ericson Core',
 'Distributed by': 'Disney+',
 'Edited by': 'Martin Pensa',
 'Genre': 'Adventure, Biography, Drama',
 'Language': 'English',
 'Music by': 'Mark Isham',
 'Produced by': 'Kim Zubick',
 'Production company': 'Walt Disney Pictures',
 'Release date': ['December 20, 2019 (United States)'],
 'Release date (datetime)': datetime.datetime(2019, 12, 20, 0, 0),
 'Running time': '114 minutes',
 'Running time (int)': 114,
 'Starring': ['Willem Dafoe'],
 'Written by': 'Tom Flynn',
 'imdb': '8.0',
 'metascore': '69',
 'rotten_tomatoes': '92%',
 'title': 'Togo'}

In [None]:
save_data_pickle('disney_movie_data_final.pickle', movie_info_list)

# ***Task #5: Save data as JSON & CSV***

In [None]:
movie_info_copy = [movie.copy() for movie in movie_info_list]

In [None]:
# Replace datetime objects with "Month day, year" strings so the copy can be
# written as JSON.
for movie in movie_info_copy:
    current_date = movie.get('Release date (datetime)')
    if isinstance(current_date, str):
        # Already formatted by an earlier run of this cell; leaving it alone
        # makes the cell safe to re-run.
        continue
    if current_date:
        movie['Release date (datetime)'] = current_date.strftime("%B %d, %Y")
    else:
        movie['Release date (datetime)'] = None

In [None]:
save_data("disney_data_final.json", movie_info_copy)

*Convert data to CSV*

In [None]:
import pandas as pd

df = pd.DataFrame(movie_info_list)

In [None]:
df.head()

Unnamed: 0,title,Production company,Distributed by,Release date,Running time,Country,Language,Box office,Running time (int),Budget (float),...,Layouts by,Original concept by,Created by,Original work,Owner,Music,Lyrics,Book,Basis,Productions
0,Academy Award Review of,Walt Disney Productions,United Artists,"[May 19, 1937]",41 minutes (74 minutes 1966 release),United States,English,$45.472,41.0,,...,,,,,,,,,,
1,Snow White and the Seven Dwarfs,Walt Disney Productions,RKO Radio Pictures,,83 minutes,United States,English,$418 million,83.0,1490000.0,...,,,,,,,,,,
2,Pinocchio,Walt Disney Productions,RKO Radio Pictures,,88 minutes,United States,English,$164 million,88.0,2600000.0,...,,,,,,,,,,
3,Fantasia,Walt Disney Productions,RKO Radio Pictures,"[November 13, 1940]",126 minutes,United States,English,$76.4–$83.3 million (United States and Canada),126.0,2280000.0,...,,,,,,,,,,
4,The Reluctant Dragon,Walt Disney Productions,RKO Radio Pictures,"[June 27, 1941]",74 minutes,United States,English,"$960,000 (worldwide rentals)",74.0,600000.0,...,,,,,,,,,,


In [None]:
# Write the DataFrame to CSV. With pandas' default index=True the row index is
# written as a leading unnamed column; pass index=False if that is unwanted.
df.to_csv("disney_movie_data_final.csv")