# Disney Dataset Creation w/ BeautifulSoup

In [11]:
import requests
import re
from bs4 import BeautifulSoup as bs
import pandas as pd

In [2]:
all_disney_movies_wiki = 'https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films'
url = 'https://en.wikipedia.org'

In [3]:
soup = bs(requests.get(all_disney_movies_wiki).content)

### Task #1: Scrape the infobox from Tron: Legacy wiki page (save in python dictionary)

In [4]:
# find('Tron: Legacy')
# tron = soup.select('tbody tr i')

# for match in tron:
#     print(match.find(string='Tron: Legacy'))

tron_url = soup.find('a', string='Tron: Legacy')['href']

tron_url = url + tron_url
tron_url

'https://en.wikipedia.org/wiki/Tron:_Legacy'

In [5]:
soup_tron = bs(requests.get(tron_url).content)

In [6]:
infobox = soup_tron.find(class_='infobox').tbody
info_rows = infobox.find_all('tr')
for row in info_rows:
    print(row.prettify())

<tr>
 <th class="infobox-above summary" colspan="2" style="font-size: 125%; font-style: italic;">
  Tron: Legacy
 </th>
</tr>

<tr>
 <td class="infobox-image" colspan="2">
  <a class="image" href="/wiki/File:Tron_Legacy_poster.jpg" title="A man releasing a disc upwards into the air, embraced by a woman. A beam of light descends upon the disk. In the background is a futuristic city and spaceships.">
   <img alt="A man releasing a disc upwards into the air, embraced by a woman. A beam of light descends upon the disk. In the background is a futuristic city and spaceships." class="thumbborder" data-file-height="383" data-file-width="259" decoding="async" height="325" src="//upload.wikimedia.org/wikipedia/en/thumb/c/c2/Tron_Legacy_poster.jpg/220px-Tron_Legacy_poster.jpg" srcset="//upload.wikimedia.org/wikipedia/en/c/c2/Tron_Legacy_poster.jpg 1.5x" width="220"/>
  </a>
  <div class="infobox-caption">
   Theatrical release poster
  </div>
 </td>
</tr>

<tr>
 <th class="infobox-label" scope="r

In [7]:
movie_info = {}

def get_content_value(row_data):
    if row_data.find('li'):
        x = [li.get_text(' ', strip=True).replace('\xa0', ' ') for li in row_data.find_all('li')]
        return ', '.join(x)
    else:
        return row_data.get_text(' ', strip=True).replace('\xa0', ' ')

    
for index, row in enumerate(info_rows):
    if index == 0:
        movie_info['Title'] = row.find('th').get_text(' ', strip=True)
    elif index == 1:    # First row (Movie Image)
        continue
    else:
        content_key = row.find('th').get_text(' ', strip=True)
        content_value = get_content_value(row.find('td'))
        movie_info[content_key] = content_value

movie_info


{'Title': 'Tron: Legacy',
 'Directed by': 'Joseph Kosinski',
 'Screenplay by': 'Edward Kitsis, Adam Horowitz',
 'Story by': 'Edward Kitsis, Adam Horowitz, Brian Klugman, Lee Sternthal',
 'Based on': 'Steven Lisberger, Bonnie MacBird',
 'Produced by': 'Sean Bailey, Jeffrey Silver, Steven Lisberger',
 'Starring': 'Garrett Hedlund, Jeff Bridges, Olivia Wilde, Bruce Boxleitner, James Frain, Beau Garrett, Michael Sheen',
 'Cinematography': 'Claudio Miranda',
 'Edited by': 'James Haygood',
 'Music by': 'Daft Punk',
 'Production companies': 'Walt Disney Pictures [1], Sean Bailey Productions [2]',
 'Distributed by': 'Walt Disney Studios Motion Pictures [1]',
 'Release dates': 'November 30, 2010 ( 2010-11-30 ) (Tokyo), December 17, 2010 ( 2010-12-17 ) (United States) [3]',
 'Running time': '125 minutes [4]',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$170 million [5] [6]',
 'Box office': '$400.1 million [6]'}

In [8]:
df_tron = pd.DataFrame.from_dict(movie_info, orient='index')
df_tron.transpose()

Unnamed: 0,Title,Directed by,Screenplay by,Story by,Based on,Produced by,Starring,Cinematography,Edited by,Music by,Production companies,Distributed by,Release dates,Running time,Country,Language,Budget,Box office
0,Tron: Legacy,Joseph Kosinski,"Edward Kitsis, Adam Horowitz","Edward Kitsis, Adam Horowitz, Brian Klugman, L...","Steven Lisberger, Bonnie MacBird","Sean Bailey, Jeffrey Silver, Steven Lisberger","Garrett Hedlund, Jeff Bridges, Olivia Wilde, B...",Claudio Miranda,James Haygood,Daft Punk,"Walt Disney Pictures [1], Sean Bailey Producti...",Walt Disney Studios Motion Pictures [1],"November 30, 2010 ( 2010-11-30 ) (Tokyo), Dece...",125 minutes [4],United States,English,$170 million [5] [6],$400.1 million [6]


### Task #2: Scrape infobox for all movies in List of Disney Films (save as list of dictionaries)

In [32]:
disney_movies_urls = soup.find_all('table', class_='wikitable sortable')

list_disney_movies_urls = []

for url_movie in disney_movies_urls:
    rows = url_movie.find('tbody').find_all('tr')
    for index, row in enumerate(rows):
        if index == 0:
            continue
        list_disney_movies_urls.append(row.find('td').find('a'))
        
list_disney_movies_urls = [url_movie['href'] for url_movie in list_disney_movies_urls if url_movie]

print('Length of Disney Links:', len(list_disney_movies_urls))
list_disney_movies_urls

Length of Disney Links: 554


['/wiki/Academy_Award_Review_of_Walt_Disney_Cartoons',
 '/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)',
 '/wiki/Pinocchio_(1940_film)',
 '/wiki/Fantasia_(1940_film)',
 '/wiki/The_Reluctant_Dragon_(1941_film)',
 '/wiki/Dumbo',
 '/wiki/Bambi',
 '/wiki/Saludos_Amigos',
 '/wiki/Victory_Through_Air_Power_(film)',
 '/wiki/The_Three_Caballeros',
 '/wiki/Make_Mine_Music',
 '/wiki/Song_of_the_South',
 '/wiki/Fun_and_Fancy_Free',
 '/wiki/Melody_Time',
 '/wiki/So_Dear_to_My_Heart',
 '/wiki/The_Adventures_of_Ichabod_and_Mr._Toad',
 '/wiki/Cinderella_(1950_film)',
 '/wiki/Treasure_Island_(1950_film)',
 '/wiki/Alice_in_Wonderland_(1951_film)',
 '/wiki/The_Story_of_Robin_Hood_(film)',
 '/wiki/Peter_Pan_(1953_film)',
 '/wiki/The_Sword_and_the_Rose',
 '/wiki/The_Living_Desert',
 '/wiki/Rob_Roy:_The_Highland_Rogue',
 '/wiki/The_Vanishing_Prairie',
 '/wiki/20,000_Leagues_Under_the_Sea_(1954_film)',
 '/wiki/Davy_Crockett:_King_of_the_Wild_Frontier_(film)',
 '/wiki/Lady_and_the_Tramp',
 '/wiki/The_Afr

In [44]:
from time import sleep
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry


def get_content_value(row_data):
    if row_data.find('li'):
        x = [li.get_text(' ', strip=True).replace('\xa0', ' ') for li in row_data.find_all('li')]
        return x
#         return ', '.join(x)
    elif row_data.find('br'):
        return [text for text in row_data.stripped_strings if ('(' not in text) and (',' not in text)]
    else:
        return row_data.get_text(' ', strip=True).replace('\xa0', ' ')
    
    
def clean_tag(soup):
    for tag in soup.find_all(['sup', 'span']):
        tag.decompose()

        
def get_movie_info(url):
     
    # https://stackoverflow.com/questions/23013220/max-retries-exceeded-with-url-in-requests
    session = requests.Session()
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    
    soup_movie = bs(session.get(url, timeout=1).content)
    infobox = soup_movie.find(class_='infobox').tbody
    info_rows = infobox.find_all('tr')
    
    clean_tag(soup_movie)
    
    movie_info = {}

    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['Title'] = row.find('th').get_text(' ', strip=True)
        elif index == 1:     # First row (Movie Image)
            continue
        else:
            td = row.find('td')
            th = row.find('th')
            if td and th:    # Check if 'td' and 'th' exist
                content_key = th.get_text(' ', strip=True)
                content_value = get_content_value(td)
                movie_info[content_key] = content_value
            else:
                break
            
    return movie_info
       
        
movies_infobox = []
            
for index, movie_path in enumerate(list_disney_movies_urls):
    if index % 50 == 0:
        print(index)
    movie_url = url + movie_path
    try:
        movies_infobox.append(get_movie_info(movie_url))
    except Exception as e:
        print(movie_url)
        print(e)
    
movies_infobox

0
10
20


KeyboardInterrupt: 

In [37]:
print(len(movies_infobox))

548


#### Saving in Pandas DataFrame

In [38]:
df = pd.DataFrame.from_dict(movies_infobox)
df

Unnamed: 0,Title,Production company,Distributed by,Release date,Running time,Country,Language,Box office,Directed by,Written by,...,Owner,First appearance,Last appearance,Years,Education,Known for,Nationality,Employer,Notable work,Website
0,Academy Award Review of,Walt Disney Productions,United Artists,"[May 19, 1937]",41 minutes (74 minutes 1966 release),United States,English,$45.472,,,...,,,,,,,,,,
1,Snow White and the Seven Dwarfs,Walt Disney Productions,RKO Radio Pictures,,83 minutes,United States,English,$418 million,"[David Hand, William Cottrell, Wilfred Jackson...","[Ted Sears, Richard Creedon, Otto Englander, D...",...,,,,,,,,,,
2,Pinocchio,Walt Disney Productions,RKO Radio Pictures,,88 minutes,United States,English,$164 million,"[Ben Sharpsteen, Hamilton Luske, Bill Roberts,...",,...,,,,,,,,,,
3,Fantasia,Walt Disney Productions,RKO Radio Pictures,"[November 13, 1940]",126 minutes,United States,English,$76.4–$83.3 million (United States and Canada),"[Samuel Armstrong, James Algar, Bill Roberts, ...",,...,,,,,,,,,,
4,The Reluctant Dragon,Walt Disney Productions,RKO Radio Pictures,"[June 27, 1941]",74 minutes,United States,English,"$960,000 (worldwide rentals)","[Alfred Werker, Hamilton Luske, Jack Cutting, ...","[Live-action:, Ted Sears, Al Perkins, Larry Cl...",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
543,Stephen Anderson,,,,,,,,,,...,,,,,,,,Walt Disney Animation Studios (1995–present),,
544,Suzi Yoonessi,,,,,,,,,,...,,,,,,,,,,
545,The Jungle Book,,"[Walt Disney Studios, Motion Pictures]",,106 minutes,United States,English,$966.6 million,Jon Favreau,,...,,,,,,,,,,
546,Maleficent: Mistress of Evil,,"[Walt Disney Studios, Motion Pictures]","[October 18, 2019]",119 minutes,United States,English,$491.7 million,Joachim Rønning,"[Linda Woolverton, Noah Harpster, Micah Fitzer...",...,,,,,,,,,,


In [39]:
df.to_csv('Data_All_Movies_Disney.csv', index=False)

#### Saving in JSON

In [16]:
import json


def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
        
def load_data(title):
    with open(title, encoding='utf-8') as f:
        return json.load(f)

In [41]:
save_data('Data_All_Movies_Disney.json', movies_infobox)

# Checkpoint

In [18]:
import requests
import re
from bs4 import BeautifulSoup as bs
import pandas as pd
import json
import re


def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
        
def load_data(title):
    with open(title, encoding='utf-8') as f:
        return json.load(f)

### Task #3: Clean our data!

- ~~Clean un references. e.g.: [1]~~
- ~~Convert running time to an integer~~
- Convert dates into datetime object
- ~~Split up the long strings~~
- ~~Convert Budget e Box Office to numbers~~

#### Loading JSON data

In [45]:
movies_infobox = load_data('Data_All_Movies_Disney.json')

In [46]:
movies_infobox

[{'Title': 'Academy Award Review of',
  'Production company': 'Walt Disney Productions',
  'Distributed by': 'United Artists',
  'Release date': ['May 19, 1937'],
  'Running time': '41 minutes (74 minutes 1966 release)',
  'Country': 'United States',
  'Language': 'English',
  'Box office': '$45.472'},
 {'Title': 'Snow White and the Seven Dwarfs',
  'Directed by': ['David Hand',
   'William Cottrell',
   'Wilfred Jackson',
   'Larry Morey',
   'Perce Pearce',
   'Ben Sharpsteen'],
  'Written by': ['Ted Sears',
   'Richard Creedon',
   'Otto Englander',
   'Dick Rickard',
   'Earl Hurd',
   'Merrill De Maris',
   'Dorothy Ann Blank',
   'Webb Smith'],
  'Based on': ['Snow White', 'by The', 'Brothers Grimm'],
  'Produced by': 'Walt Disney',
  'Starring': ['Adriana Caselotti',
   'Lucille La Verne',
   'Harry Stockwell',
   'Roy Atwell',
   'Pinto Colvig',
   'Otis Harlan',
   'Scotty Mattraw',
   'Billy Gilbert',
   'Eddie Collins',
   'Moroni Olsen',
   'Stuart Buchanan'],
  'Music by':

#### Running time to an integer

In [4]:
def runningtime_to_integer(running_time):
    if running_time == None:
        return None
    elif isinstance(running_time, list):
        if len(running_time) == 0:
            return None
        entry = running_time[0]
    else:
        entry = running_time
    value = int(entry.split(' ')[0])
    return value

In [5]:
for movie in movies_infobox:
    try:
        movie['Running time (int)'] = runningtime_to_integer(movie.get('Running time'))
    except Exception as e:
        print(movie.get('Running time'))
        print(e)
        break
        
movies_infobox

[{'Title': 'Academy Award Review of',
  'Production company': 'Walt Disney Productions',
  'Distributed by': 'United Artists',
  'Release date': ['May 19, 1937'],
  'Running time': '41 minutes (74 minutes 1966 release)',
  'Country': 'United States',
  'Language': 'English',
  'Box office': '$45.472',
  'Running time (int)': 41},
 {'Title': 'Snow White and the Seven Dwarfs',
  'Directed by': ['David Hand',
   'William Cottrell',
   'Wilfred Jackson',
   'Larry Morey',
   'Perce Pearce',
   'Ben Sharpsteen'],
  'Written by': ['Ted Sears',
   'Richard Creedon',
   'Otto Englander',
   'Dick Rickard',
   'Earl Hurd',
   'Merrill De Maris',
   'Dorothy Ann Blank',
   'Webb Smith'],
  'Based on': ['Snow White', 'by The', 'Brothers Grimm'],
  'Produced by': 'Walt Disney',
  'Starring': ['Adriana Caselotti',
   'Lucille La Verne',
   'Harry Stockwell',
   'Roy Atwell',
   'Pinto Colvig',
   'Otis Harlan',
   'Scotty Mattraw',
   'Billy Gilbert',
   'Eddie Collins',
   'Moroni Olsen',
   'Stua

#### Convert Budget e Box Office to numbers

In [6]:
from locale import atof, setlocale, LC_NUMERIC
setlocale(LC_NUMERIC, 'en-US')


def word_to_value(word):
    value_dict = {"thousand": 1000, "million": 1000000, "billion": 1000000000}
    return value_dict.get(word.lower(), 1)


def money_conversion(value):

    formatNumber = lambda n: float(f"{n:.3f}") if n % 1 else int(n)
    
    if value == None or value == 'unknown' or value == '':
        return None
    elif isinstance(value, list):
        if len(value) == 0:
            return None
        value = value[0]

    new_value = re.search(r'(?<=\$)\d+[\d\.\,]*', value)
    modifier = re.search(r"thousand|million|billion", value, flags=re.I)

    if new_value and modifier:
        return formatNumber(atof(new_value.group())*word_to_value(modifier.group()))
    elif new_value:
        return formatNumber(atof(new_value.group()))

In [7]:
for index, movie in enumerate(movies_infobox):
    try:
        movie['Budget (float)'] = money_conversion(movie.get('Budget'))
        movie['Box office (float)'] = money_conversion(movie.get('Box office'))
    except Exception as e:
        print(index, movie)
        print(e)
        break
        

In [8]:
[[index, movie.get('Box office')] for index, movie in enumerate(movies_infobox)]

[[0, '$45.472'],
 [1, '$418 million'],
 [2, '$164 million'],
 [3, '$76.4–$83.3 million (United States and Canada)'],
 [4, '$960,000 (worldwide rentals)'],
 [5, '>$1.3 million (est. United States/Canada rentals, 1941)'],
 [6, '$267.4 million'],
 [7, '$1.135 million (worldwide rentals)'],
 [8, '$799,000'],
 [9, '$3.355 million (worldwide rentals)'],
 [10, '$3.275 million (worldwide rentals)'],
 [11, '$65 million'],
 [12, '$3.165 million (worldwide rentals)'],
 [13, '$2.56 million (worldwide rentals)'],
 [14, '$3.7 million (U.S. rental) $575,000 (foreign rental)'],
 [15, '$1.625 million (worldwide rentals)'],
 [16, '$182 million'],
 [17, '$4,100,000 (worldwide rentals)'],
 [18, ['$2.4 million (1951, domestic)', '$3.5 million (1974, domestic)']],
 [19, '$2.1 million (US rentals)'],
 [20, '$87.4 million (United States and Canada)'],
 [21, '$1 million (US)'],
 [22, '$2.6 million (US)'],
 [23, None],
 [24, '$1.75 million (US and Canadian rentals)'],
 [25, '$28.2 million'],
 [26, '$2,150,000 (

In [9]:
[movie.get('Box office (float)') for movie in movies_infobox]

[45.472,
 418000000,
 164000000,
 76400000,
 960000,
 1300000,
 267400000.0,
 1135000,
 799000,
 3355000,
 3275000,
 65000000,
 3165000,
 2560000,
 3700000,
 1625000,
 182000000,
 4100000,
 2400000,
 2100000,
 87400000,
 1000000,
 2600000,
 None,
 1750000,
 28200000,
 2150000,
 187000000,
 2100000,
 1600000,
 1700000,
 None,
 None,
 2750000,
 None,
 1750000,
 6250000,
 None,
 1800000,
 2500000,
 51600000,
 12300000,
 None,
 None,
 1700000,
 3100000,
 None,
 3750000,
 None,
 2300000,
 None,
 40000000,
 303000000,
 25400000,
 25100000,
 None,
 None,
 4600000,
 3500000,
 5000000,
 None,
 None,
 None,
 21745500,
 22100000,
 2550000,
 3000000,
 4350000,
 4200000,
 22200000,
 1600000,
 4000000,
 2250000,
 3500000,
 103100000,
 3500000,
 1275000,
 4000000,
 28068222,
 6200000,
 22565634,
 None,
 16207116,
 3000000,
 1900000,
 4000000,
 378000000,
 None,
 5000000,
 21540050,
 2250000,
 4150000,
 3300000,
 51300000,
 1300000,
 None,
 5500000,
 None,
 18607492,
 4000000,
 191000000,
 None,
 None

#### Convert dates into datetime object

In [10]:
from datetime import datetime

In [11]:
movies = []
for index, movie in enumerate(movies_infobox):
    if movie.get('Release date'):
        movie = movie.get('Release date')
    elif movie.get('Release dates'):
        movie = movie.get('Release dates')  
    else:
        movies.append(None)
        continue
    
    print(index, movie)

movies

0 ['May 19, 1937']
1 ['December 21, 1937 ( Carthay Circle Theatre )', 'February 4, 1938 (United States)']
2 ['February 7, 1940 ( Center Theatre )', 'February 23, 1940 (United States)']
3 ['November 13, 1940']
4 ['June 27, 1941']
5 ['October 23, 1941 (New York City)', 'October 31, 1941 (U.S.)']
6 ['August 9, 1942 (World Premiere – London)', 'August 13, 1942 (Premiere – New York City)', 'August 21, 1942 (U.S.)']
7 ['August 24, 1942 (World Premiere – Rio de Janeiro)', 'February 6, 1943 (U.S. Premiere – Boston)', 'February 19, 1943 (U.S.)']
8 ['July 17, 1943']
9 ['December 21, 1944 (Mexico City)', 'February 3, 1945 (US)']
10 ['April 20, 1946 (New York City premiere)', 'August 15, 1946 (U.S.)']
11 ['November 12, 1946 (Premiere: Atlanta, Georgia)', 'November 20, 1946', 'March 30, 1947 (Stanford theater, Palo Alto, California)']
12 ['September 27, 1947']
13 May 27, 1948
15 ['October 5, 1949']
16 ['February 15, 1950 (Boston)', 'March 4, 1950 (United States)']
17 ['June 22, 1950 (World Premiere

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [12]:
def date_conversion(date):
    if date == None:
        return None
    elif isinstance(date, list):
        if len(date) == 0:
            return None
        date_str = date[0]
    else:
        date_str = date
    if date_str.find('(') != -1:
        date_str = date_str.split('(')[0].strip()
        
    fmts = ['%B %d, %Y', '%d %B %Y']
    
    for fmt in fmts:
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            pass

In [13]:
for index, movie in enumerate(movies_infobox):
    if movie.get('Release date'):
        value = movie.get('Release date')
    elif movie.get('Release dates'):
        value = movie.get('Release dates')  
    else:
        value = None
    try:
        if value:
            movie['Release date (datetime)'] = date_conversion(value)
    except Exception as e:
        print(index, movie)
        print(e)
        break

#### Saving in PICKLE

In [42]:
# Can't save in JSON (TypeError: Object of type datetime is not JSON serializable)
# save_data('Data_All_Movies_Disney_Cleaned.json', movies_infobox)

# Using Pickle
# https://stackoverflow.com/questions/11218477/how-can-i-use-pickle-to-save-a-dict-or-any-other-python-object

import pickle


def save_data_pickle(title, data):
    with open(title, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
def load_data_pickle(title):
    with open(title, 'rb') as handle:
        return pickle.load(handle)

In [43]:
save_data_pickle('Data_All_Movies_Disney_Cleaned.pickle', movies_infobox)

# Checkpoint PICKLE Cleaned Data

#### Loading in PICKLE data

In [14]:
import pickle


def save_data_pickle(title, data):
    with open(title, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
def load_data_pickle(title):
    with open(title, 'rb') as handle:
        return pickle.load(handle)


movies_infobox = load_data_pickle('Data_All_Movies_Disney_Cleaned.pickle')

### Task #4: Attach IMDB, Metascore, and Rotten Tomatoes scores to dataset (working with APIs)

#### OMDb API

In [6]:
# http://www.omdbapi.com/?apikey=[yourkey]&

In [11]:
import requests
import urllib


def get_omdb_title(title, year):
    base_url = 'https://www.omdbapi.com/?'
    parameters = {'apikey': '211a1ddb', 't': title, 'y': year}
    params_encoded = urllib.parse.urlencode(parameters)
    full_url = base_url + params_encoded
    return requests.get(full_url).json()


def replace_roman_numerals(title):  
    roman_numerals = {'VI': '6', 'V': '5', 'IV': '4', 'III': '3', 'II': '2'}
    
    for roman in roman_numerals:
        if roman in title:
            return title.replace(roman, roman_numerals[roman])

In [12]:
for index, movie in enumerate(movies_infobox):
    if index % 50 == 0:
        print('Iteration:', index)
        print()
        
    title = movie['Title']
    roman_numerals = ['II', 'III', 'IV', 'V', 'VI']
    if any(roman in title for roman in roman_numerals):
        title = replace_roman_numerals(title)
    
    year = movie.get('Release date (datetime)')
    if year:
        year = str(year.year)
    else:
        year = ''
        
    omdb_info = get_omdb_title(title, year)   
    if omdb_info['Response'] == 'True':
        try:
            ratings = omdb_info.get('Ratings')
            for rating in ratings:
                if rating.get('Source') == 'Internet Movie Database':
                    movie['IMDb'] = float(rating.get('Value').split('/')[0])
                elif rating.get('Source') == 'Rotten Tomatoes':
                    movie['Rotten Tomatoes'] = rating.get('Value')
                elif rating.get('Source') == 'Metacritic':
                    movie['Metacritic'] = int(rating.get('Value').split('/')[0])
        except Exception as e:
            print(movies_infobox[index])
            print(e)
    else:
        if omdb_info['Error'] == 'Request limit reached!':
            print('Index:', index, '|', omdb_info)
            break
        print(f'Index: {index} | Title: {title} | Year: {year}')
        print(omdb_info)
        print()

Iteration: 0

Index: 8 | Title: 5ictory Through Air Power | Year: 1943
{'Response': 'False', 'Error': 'Movie not found!'}

Index: 24 | Title: The 5anishing Prairie | Year: 1954
{'Response': 'False', 'Error': 'Movie not found!'}

Index: 45 | Title: Toby Tyler or 10 Weeks with a Circus | Year: 1960
{'Response': 'False', 'Error': 'Movie not found!'}

Index: 46 | Title: Kidnapped | Year: 1960
{'Response': 'False', 'Error': 'Movie not found!'}

Iteration: 50

Index: 59 | Title: Bon 5oyage! | Year: 1962
{'Response': 'False', 'Error': 'Movie not found!'}

Index: 75 | Title: Those Calloways | Year: 1964
{'Response': 'False', 'Error': 'Movie not found!'}

Iteration: 100

Index: 130 | Title: Escape from the Dark | Year: 
{'Response': 'False', 'Error': 'Movie not found!'}

Iteration: 150

Index: 167 | Title: The Man from Snowy River 2 | Year: 1988
{'Response': 'False', 'Error': 'Movie not found!'}

Iteration: 200

Index: 211 | Title: Homeward Bound 2: Lost in San Francisco | Year: 1996
{'Response

#### Saving in PICKLE

In [15]:
save_data_pickle('Data_All_Movies_Disney_Final.pickle', movies_infobox)

# Checkpoint PICKLE Final Data

#### Loading in PICKLE data

In [23]:
import pickle
import json


def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
        
def load_data(title):
    with open(title, encoding='utf-8') as f:
        return json.load(f)

def save_data_pickle(title, data):
    with open(title, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
def load_data_pickle(title):
    with open(title, 'rb') as handle:
        return pickle.load(handle)

In [17]:
movies_infobox = load_data_pickle('Data_All_Movies_Disney_Final.pickle')

### Task #5: Save final dataset as a JSON file and as a CSV file

#### Convert Datetime Object to string and Saving in JSON

In [19]:
movies_infobox_copy = load_data_pickle('Data_All_Movies_Disney_Final.pickle')

In [21]:
from datetime import datetime

for movie in movies_infobox_copy:
    current_date = movie.get('Release date (datetime)')
    if current_date:
        movie['Release date (datetime)'] = current_date.strftime('%B %d, %Y')
    else:
        movie['Release date (datetime)'] = None

In [25]:
save_data('Data_All_Movies_Disney_Final.json', movies_infobox_copy)

#### Convert data to Pandas DataFrame and CSV

In [28]:
import pandas as pd

df = pd.DataFrame(movies_infobox)
df.to_csv('Data_All_Movies_Disney_Final.csv')

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 548 entries, 0 to 547
Data columns (total 68 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Title                    548 non-null    object        
 1   Production company       206 non-null    object        
 2   Distributed by           518 non-null    object        
 3   Release date             331 non-null    object        
 4   Running time             505 non-null    object        
 5   Country                  467 non-null    object        
 6   Language                 501 non-null    object        
 7   Box office               405 non-null    object        
 8   Running time (int)       501 non-null    float64       
 9   Budget (float)           315 non-null    float64       
 10  Box office (float)       393 non-null    float64       
 11  Release date (datetime)  500 non-null    datetime64[ns]
 12  IMDb                     503 non-nul

In [34]:
ratings = df[['Title', 'IMDb', 'Metacritic', 'Rotten Tomatoes']]

In [50]:
ratings.sort_values('IMDb', ascending=False).head(50).reset_index().drop(columns=['index'])

Unnamed: 0,Title,IMDb,Metacritic,Rotten Tomatoes
0,The Beatles: Get Back,9.0,,
1,Spirited Away,8.6,96.0,97%
2,The Lion King,8.5,88.0,93%
3,WALL-E,8.4,95.0,95%
4,Hamilton,8.4,90.0,98%
5,Coco,8.4,81.0,97%
6,Up,8.3,88.0,98%
7,Toy Story,8.3,95.0,100%
8,Toy Story 3,8.3,92.0,98%
9,Jim Henson,8.3,,
