In [1]:
from bs4 import BeautifulSoup as bs
import requests
import json

In [2]:
#JSON 형태로 로드
def load_data(title):
    with open(title, encoding='utf-8') as f:
        return json.load(f)

#JSON 형태로 저장
def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        #json.dumps() = Python 객체를 JSON으로 변환
        #ensure_ascii - true면 모든 비 ASCII 문자가 출력 안됨, False면 문자 그대로 출력
        #indent - 인덱스폭 (문자 수) 지정
        json.dump(data, f, ensure_ascii=False, indent=2)

In [3]:
movie_info_list = load_data("Disney_data.json")

In [4]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")

#Convert to a beautifulsoup object
soup = bs(r.content, 'html.parser')

In [5]:

movies = soup.select('.wikitable.sortable i') # 이렇게하면 i를 포함하고있는 '.wikitable.sortable i' 테이블을 리턴
# movies[0:10]

### JSON 형태로 저장 시킨 파일을 로드 후 정제 작업

In [6]:

def get_content_value(row_data):
#     print(row_data)
    
    if row_data.find('li'):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all('li')]
    elif row_data.find("br"):
        return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")

    
# Clean up references (remove [1], [2] etc)
def clean_tags(soup):
    for tag in soup.find_all(["sup", "span"]):
        tag.decompose()
    
    
def get_info_box(url):
    
    r = requests.get(url)

    #Convert to a beautifulsoup object
    soup = bs(r.content, 'html.parser') #이 'soup' 이 함수 매게변수로 들어 감
    info_box = soup.find(class_="infobox vevent")
    info_rows = info_box.find_all('tr')
    
    clean_tags(soup) #이 부분이 cealn_tags() 함수를 탐
    
    movie_info = {}
    
    for idx, row in enumerate(info_rows):
        if idx == 0: # 타이틀의 인덱스는 = 0
            movie_info['title'] = row.find('th').get_text(" ", strip=True)
        else:
            header = row.find('th') 
#             print(header)
            if header:#인덱스 0을 제외하고 'th'태그가 있을 경우 if문을 타게 됨
                content_key = row.find('th').get_text(" ", strip=True)
    #             print(content_key)
                content_value = get_content_value(row.find('td')) #이 부분이 get_content_value를 탐
                movie_info[content_key] = content_value
    return movie_info

In [7]:
get_info_box("https://en.wikipedia.org/wiki/Spirited_Away")

{'title': 'Spirited Away',
 'Japanese': '',
 'Hepburn': 'Sen to Chihiro no Kamikakushi',
 'Directed by': 'Hayao Miyazaki',
 'Written by': 'Hayao Miyazaki',
 'Produced by': 'Toshio Suzuki',
 'Starring': ['Rumi Hiiragi',
  'Miyu Irino',
  'Mari Natsuki',
  'Takeshi Naito',
  'Yasuko Sawaguchi',
  'Tsunehiko Kamijō',
  'Takehiko Ono',
  'Bunta Sugawara'],
 'Cinematography': 'Atsushi Okui',
 'Edited by': 'Takeshi Seyama',
 'Music by': 'Joe Hisaishi',
 'Production company': 'Studio Ghibli',
 'Distributed by': 'Toho',
 'Release date': ['20 July 2001 (Japan)'],
 'Running time': '125 minutes',
 'Country': 'Japan',
 'Language': 'Japanese',
 'Budget': '(US$15–19.2 million)',
 'Box office': ''}

In [8]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")

#Convert to a beautifulsoup object
soup = bs(r.content, 'html.parser')


# movies = soup.select('.wikitable.sortable i')으로는 아래의 Nonetype이 있으므로 조금 더 구체적으로 가져오자
movies = soup.select('.wikitable.sortable i a') #즉 <i>태그 중 <a>를 가진 것만 들고 옴
# print(len(movies)) #NoneType 제외하고 개수를 확인 510개로 나옴


base_path = "https://en.wikipedia.org/" #기본 url을 세팅 후


movie_info_list = [] #들고 오는 내용을 담을 리스트
for index, movie in enumerate(movies):
#     if index % 10 == 0:
#         print(index)
    #디버깅용
#     if index == 10:
#         break     
    try:
        relative_path = movie['href']
        full_path = base_path + relative_path #긁어오는 'href'를 더해서 full_path 만든 후 함수에 삽입
        title = movie['title']
#         relative_path = movie.a['href'] # <i>태그 중 <a>만 들고 오니 .a는 생략
#         title = movie.a['title']
        #위의 결과들을 print 해보면 relative_path가 가져오는 'href'가 없는 부분이 있다. 여기서 try/catch가 필요
#         print(relative_path)
#         print(title)
#         print()
        
        movie_info_list.append(get_info_box(full_path)) #여기에 들어갈 url 세팅 (base_path + full_path)
        
    except Exception as e: #즉 오류부분이면 여기를타고 아래를 실행 
        print(movie.get_text()) #이 부분에서 어떤 movie에서 오류가 나오는지 보인다
        #몇몇의 movie가 테이블의 Title에서 'href'를 들고오는게아닌 'Note'부분에서 들고 온다. 
        #몇몇의 movie가 테이블의 Title에서 'href'를 들고 오지만, 링크가 아는 형태이다.
        
        #최종코드에서 돌려보면 여전히 몇몇의 movie가 find/find_all/get_text가 안된다
        print(e)

Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
True-Life Adventures
'NoneType' object has no attribute 'find_all'
The London Connection
'NoneType' object has no attribute 'find'
The Beatles: Get Back–The Rooftop Concert
'NoneType' object has no attribute 'find'
Sister Act 3
'NoneType' object has no attribute 'find'
Tower of Terror
'NoneType' object has no attribute 'find_all'
Tron: Ares
'NoneType' object has no attribute 'find'


In [9]:
#JSON 형태로 저장
def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        #json.dumps() = Python 객체를 JSON으로 변환
        #ensure_ascii - true면 모든 비 ASCII 문자가 출력 안됨, False면 문자 그대로 출력
        #indent - 인덱스폭 (문자 수) 지정
        json.dump(data, f, ensure_ascii=False, indent=2)

In [10]:
save_data("Disney_data.json", movie_info_list)

In [11]:
# Split up the long stgrings

In [12]:
movie_info_list[-8]

{'title': 'Three Men and a Baby',
 'Directed by': 'Leonard Nimoy',
 'Screenplay by': ['Jim Cruickshank', 'James Orr'],
 'Based on': ['Trois hommes et un couffin', 'by', 'Coline Serreau'],
 'Produced by': ['Ted Field', 'Robert W. Cort'],
 'Starring': ['Tom Selleck', 'Steve Guttenberg', 'Ted Danson'],
 'Cinematography': 'Adam Greenberg',
 'Edited by': 'Michael A. Stevenson',
 'Music by': 'Marvin Hamlisch',
 'Production companies': ['Touchstone Pictures',
  'Silver Screen Partners III',
  'Interscope Communications'],
 'Distributed by': 'Buena Vista Pictures',
 'Release date': ['November 25, 1987'],
 'Running time': '102 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$11 million',
 'Box office': '$240 million'}

In [37]:
print([movie.get('Running time (int)', 'N/A') for movie in movie_info_list])

[41, 83, 88, 126, 74, 64, 70, 42, 70, 71, 75, 94, 73, 75, 82, 68, 74, 96, 75, 84, 77, 92, 69, 81, 60, 127, 92, 76, 75, 73, 85, 81, 70, 90, 80, 75, 83, 83, 72, 97, 75, 104, 93, 105, 95, 97, 134, 69, 92, 126, 79, 97, 128, 73, 91, 105, 98, 130, 89, 93, 67, 98, 100, 118, 103, 110, 80, 79, 91, 91, 97, 118, 139, 131, 92, 87, 116, 93, 110, 110, 131, 101, 108, 84, 78, 75, 164, 106, 110, 99, 113, 108, 112, 93, 91, 93, 100, 100, 79, 96, 113, 89, 118, 92, 88, 92, 87, 93, 93, 93, 90, 83, 96, 88, 89, 91, 93, 92, 97, 100, 100, 89, 91, 112, 115, 95, 91, 97, 104, 74, 48, 77, 104, 128, 101, 94, 104, 90, 100, 88, 93, 98, 112, 84, 97, 97, 114, 96, 97, 109, 83, 90, 107, 96, 103, 91, 95, 105, 113, 80, 101, 90, 74, 90, 89, 110, 74, 93, 84, 83, 74, 77, 107, 93, 88, 108, 84, 121, 89, 104, 90, 86, 84, 108, 107, 96, 98, 105, 108, 94, 106, 102, 88, 102, 102, 97, 111, 100, 96, 98, 78, 81, 108, 89, 99, 89, 81, 92, 100, 89, 79, 91, 101, 104, 103, 86, 105, 75, 93, 92, 98, 95, 93, 87, 93, 87, 128, 77, 86, 95, 114, 93

In [38]:
def minute_to_interger(running_time):
    if running_time == "N/A": #'Running time'이 없는 것은 'N/A'로 전처리를 미리 했기때문에 가능
        return None
    if isinstance(running_time, list): #running_time이 list 형태이면 아래 수행
        return int(running_time[0].split(" ")[0])
        #여기서 리스트 형태중 0번째 인덱스 내용만 가져와서 저장
    else:
        return int(running_time.split(" ")[0])#'85 minutes' 형태에서 [0]번째를 split 하면 85만 남는다

for movie in movie_info_list:
    movie['Running time (int)'] = minute_to_interger(movie.get('Running time', 'N/A'))


In [39]:
movie_info_list[-8]

{'title': 'Three Men and a Baby',
 'Directed by': 'Leonard Nimoy',
 'Screenplay by': ['Jim Cruickshank', 'James Orr'],
 'Based on': ['Trois hommes et un couffin', 'by', 'Coline Serreau'],
 'Produced by': ['Ted Field', 'Robert W. Cort'],
 'Starring': ['Tom Selleck', 'Steve Guttenberg', 'Ted Danson'],
 'Cinematography': 'Adam Greenberg',
 'Edited by': 'Michael A. Stevenson',
 'Music by': 'Marvin Hamlisch',
 'Production companies': ['Touchstone Pictures',
  'Silver Screen Partners III',
  'Interscope Communications'],
 'Distributed by': 'Buena Vista Pictures',
 'Release date': ['November 25, 1987'],
 'Running time': '102 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$11 million',
 'Box office': '$240 million',
 'Running time (int)': 102,
 'Budget (float)': 11000000.0,
 'Box office (float)': 240000000.0,
 'Release date (datetime)': datetime.datetime(1987, 11, 25, 0, 0)}

In [40]:
print([movie.get('Budget', 'N/A') for movie in movie_info_list])

['N/A', '$1.49 million', '$2.6 million', '$2.28 million', '$600,000', '$950,000', '$858,000', 'N/A', '$788,000', 'N/A', '$1.35 million', '$2.125 million', 'N/A', '$1.5 million', '$1.5 million', 'N/A', '$2.2 million', '$1,800,000', '$3 million', 'N/A', '$4 million', '$2 million', '$300,000', '$1.8 million', 'N/A', '$5 million', 'N/A', '$4 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$700,000', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$6 million', 'under $1 million or $1,250,000', 'N/A', '$2 million', 'N/A', 'N/A', '$2.5 million', 'N/A', 'N/A', '$4 million', '$3.6 million', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', '$4.4–6 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', 'N/A', 'N/A', '

In [17]:
import re

In [41]:
number = r"\d+(,\d{3})*\.*\d*"
amount = r"thousand|million|billion"

word_re = rf"\${number}(-|\sto\s|–)?({number})?\s({amount})"
value_re = rf"\${number}"


In [42]:
def word_to_value(word):
    value_dict = {"thousand":1000, "million":1000000, "billion":1000000000}
    return value_dict[word]

def parse_word_syntax(string): #number = \d+(,\d{3})*\.*\d* / string = money_conversion("$790 million")
    value_string = re.search(number, string).group() # value_string = 790, string값중 number와 매칭되는 값만 반환
    value = float(value_string.replace(",", "")) #value_string 값을 소수점으로 바꾸고 replace
    word = re.search(amount, string, flags=re.I).group().lower() #string으로 넘어온값중 amount랑 매치 값만 반환
    world_value = word_to_value(word) #즉 $790 million이 넘어오면 여기서 million이 매칭되고 이 값을 word_to_value 함수를 타고 값 변환
    return value * world_value #(790 * 1000000)

def parse_value_syntax(string):
    value_string = re.search(number, string).group()#string으로 넘어오는 변수를 number로 정규식을 써서 담음
    value = float(value_string.replace(",", ""))#그 값 중 콤마를 없앰
    return value


def money_conversion(money):
    if money == 'N/A':
        return None
    
    if isinstance(money, list):
        money = money[0] #리스트 형식으로 넘어오면 0번째 인덱스만 채택
#     print(money)
    
    word_syntax = re.search(word_re, money, flags=re.I)
    value_syntax = re.search(value_re, money)
    
    if word_syntax:
        return parse_word_syntax(word_syntax.group())
        
    elif value_syntax:
        return parse_value_syntax(value_syntax.group())
    
    else:
        return None

In [43]:
print(money_conversion("N/A"))

None


In [44]:
for movie in movie_info_list:
    movie['Budget (float)'] = money_conversion(movie.get('Budget', 'N/A'))
    movie['Box office (float)'] = money_conversion(movie.get('Box office', 'N/A'))

In [46]:
movie_info_list[-60]

{'title': 'A Wrinkle in Time',
 'Directed by': 'Ava DuVernay',
 'Screenplay by': ['Jennifer Lee', 'Jeff Stockwell'],
 'Based on': ['A Wrinkle in Time', 'by', "Madeleine L'Engle"],
 'Produced by': ['Jim Whitaker', 'Catherine Hand'],
 'Starring': ['Oprah Winfrey',
  'Reese Witherspoon',
  'Mindy Kaling',
  'Gugu Mbatha-Raw',
  'Michael Peña',
  'Zach Galifianakis',
  'Chris Pine'],
 'Cinematography': 'Tobias A. Schliessler',
 'Edited by': 'Spencer Averick',
 'Music by': 'Ramin Djawadi',
 'Production companies': ['Walt Disney Pictures', 'Whitaker Entertainment'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release date': ['February 26, 2018 ( El Capitan Theatre )',
  'March 9, 2018 (United States)'],
 'Running time': '109 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$100–130 million',
 'Box office': '$133.4 million',
 'Running time (int)': 109,
 'Budget (float)': 100000000.0,
 'Box office (float)': 133400000.0,
 'Release date (datetime)':

In [23]:
money_conversion(movie_info_list[-60]['Budget'])

100000000.0

In [47]:
# Convert Dates to datetiems
print([movie.get('Release date', 'N/A') for movie in movie_info_list])

[['May 19, 1937'], ['December 21, 1937 ( Carthay Circle Theatre )'], ['February 7, 1940 ( Center Theatre )', 'February 23, 1940 (United States)'], ['November 13, 1940'], ['June 27, 1941'], ['October 23, 1941 (New York City)', 'October 31, 1941 (U.S.)'], ['August 9, 1942 (World Premiere – London)', 'August 13, 1942 (Premiere – New York City)', 'August 21, 1942 (U.S.)'], ['August 24, 1942 (World Premiere – Rio de Janeiro)', 'February 6, 1943 (U.S. Premiere – Boston)', 'February 19, 1943 (U.S.)'], ['July 17, 1943'], ['December 21, 1944 (Mexico City)', 'February 3, 1945 (US)'], ['April 20, 1946 (New York City premiere)', 'August 15, 1946 (U.S.)'], ['November 12, 1946 (Premiere: Atlanta, Georgia)', 'November 20, 1946', 'March 30, 1947 (Stanford Theatre, Palo Alto, California)'], ['September 27, 1947'], 'May 27, 1948', ['November 29, 1948 (Chicago, Illinois)', 'January 19, 1949 (Indianapolis, Indiana)'], ['October 5, 1949'], ['February 15, 1950 (Boston)', 'March 4, 1950 (United States)'], ['

In [48]:
#June 28, 1950

from datetime import datetime

dates = [movie.get('Release date', 'N/A') for movie in movie_info_list]

print(dates)

[['May 19, 1937'], ['December 21, 1937 ( Carthay Circle Theatre )'], ['February 7, 1940 ( Center Theatre )', 'February 23, 1940 (United States)'], ['November 13, 1940'], ['June 27, 1941'], ['October 23, 1941 (New York City)', 'October 31, 1941 (U.S.)'], ['August 9, 1942 (World Premiere – London)', 'August 13, 1942 (Premiere – New York City)', 'August 21, 1942 (U.S.)'], ['August 24, 1942 (World Premiere – Rio de Janeiro)', 'February 6, 1943 (U.S. Premiere – Boston)', 'February 19, 1943 (U.S.)'], ['July 17, 1943'], ['December 21, 1944 (Mexico City)', 'February 3, 1945 (US)'], ['April 20, 1946 (New York City premiere)', 'August 15, 1946 (U.S.)'], ['November 12, 1946 (Premiere: Atlanta, Georgia)', 'November 20, 1946', 'March 30, 1947 (Stanford Theatre, Palo Alto, California)'], ['September 27, 1947'], 'May 27, 1948', ['November 29, 1948 (Chicago, Illinois)', 'January 19, 1949 (Indianapolis, Indiana)'], ['October 5, 1949'], ['February 15, 1950 (Boston)', 'March 4, 1950 (United States)'], ['

In [61]:
def clean_date(date):
    return date.split("(")[0].strip()

def date_conversion(date):
    if isinstance(date, list):
        date = date[0]
        
    if date == "N/A":
        return None
    
    date_str = clean_date(date)
    fmts = ["%B %d, %Y", "%d %B %Y"]
    for fmt in fmts:
        try:    
            return datetime.strptime(date_str, fmt)
        except:
            pass
    return None

In [60]:
for movie in movie_info_list:
    movie['Release date (datetime)'] = date_conversion(movie.get('Release date', 'N/A'))

In [62]:
movie_info_list[-40]

{'title': 'Elephant',
 'Directed by': ['Mark Linfield', 'Vanessa Berlowitz'],
 'Produced by': ['Mark Linfield', 'Vanessa Berlowitz', 'Roy Conli'],
 'Narrated by': 'Meghan, Duchess of Sussex',
 'Cinematography': ['Martyn Colbeck', 'Mike Holding', 'Tom Walker'],
 'Edited by': 'Nigel Buck',
 'Music by': 'Ramin Djawadi',
 'Production company': 'Disneynature',
 'Distributed by': 'Disney+',
 'Release date': ['April 3, 2020'],
 'Running time': '89 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Running time (int)': 89,
 'Budget (float)': None,
 'Box office (float)': None,
 'Release date (datetime)': datetime.datetime(2020, 4, 3, 0, 0)}

In [52]:
import pickle

def save_data_pickle(name, data):
    with open(name, 'wb') as f:
        pickle.dump(data, f)


In [53]:
def load_data_pickle(name):
    with open(name, 'rb') as f:
        return pickle.load(f)


In [54]:
save_data_pickle('disney_movie_cleaned.pickle', movie_info_list)

In [55]:
a = load_data_pickle('disney_movie_cleaned.pickle')

In [63]:
a[5]

{'title': 'Dumbo',
 'Directed by': ['Ben Sharpsteen',
  'Norman Ferguson',
  'Wilfred Jackson',
  'Bill Roberts',
  'Jack Kinney',
  'Samuel Armstrong'],
 'Story by': ['Otto Englander', 'Joe Grant', 'Dick Huemer'],
 'Based on': ['Dumbo, the Flying Elephant',
  'by',
  'Helen Aberson',
  'Harold Pearl'],
 'Produced by': 'Walt Disney',
 'Starring': ['Edward Brophy',
  'Verna Felton',
  'Cliff Edwards',
  'Herman Bing',
  'Sterling Holloway',
  'Margaret Wright',
  'Hall Johnson Choir'],
 'Narrated by': 'John McLeish',
 'Music by': ['Frank Churchill', 'Oliver Wallace'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures',
 'Release date': ['October 23, 1941 (New York City)',
  'October 31, 1941 (U.S.)'],
 'Running time': '64 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$950,000',
 'Box office': '>$1.3 million (est. United States/Canada rentals, 1941)',
 'Running time (int)': 64,
 'Budget (float)': 950000.0,
 'Box office (

In [57]:
a == movie_info_list

True

In [58]:
movie_info_list = load_data_pickle('disney_movie_cleaned.pickle')

In [67]:
movie_info_list[-70]

{'title': 'Dangal',
 'Directed by': 'Nitesh Tiwari',
 'Written by': ['Saeed Aadil',
  'Piyush Gupta',
  'Shreyas Jain',
  'Nikhil Meharotra'],
 'Story by': ['Curation:', 'Nitesh Tiwari', 'Concept:', 'Divya V. Rao'],
 'Based on': 'Lives of Mahavir Singh Phogat and Phogat sisters',
 'Produced by': ['Aamir Khan', 'Kiran Rao', 'Siddharth Roy Kapur'],
 'Starring': ['Aamir Khan',
  'Sakshi Tanwar',
  'Fatima Sana Shaikh',
  'Zaira Wasim',
  'Sanya Malhotra',
  'Suhani Bhatnagar',
  'Aparshakti Khurana',
  'Girish Kulkarni'],
 'Narrated by': 'Aparshakti Khurana',
 'Cinematography': 'Setu',
 'Edited by': 'Ballu Saluja',
 'Music by': 'Pritam',
 'Production companies': ['Aamir Khan Productions',
  'Walt Disney Pictures India'],
 'Distributed by': 'UTV Motion Pictures',
 'Release date': ['21 December 2016 (United States)',
  '23 December 2016 (India)'],
 'Running time': '161 minutes',
 'Country': 'India',
 'Language': 'Hindi',
 'Budget': '(US$9.3 million)',
 'Box office': '(US$270 million)',
 'Ru

In [None]:
# http://www.omdbapi.com/?apikey=[30089c0e]&

In [84]:
import requests
import urllib

def get_omdb_info(title):
    base_url = "http://www.omdbapi.com/?"
    parameters = {"apikey":"30089c0e", "t":title}
    params_encoded = urllib.parse.urlencode(parameters)
#     print(params_encoded)
    full_url = base_url + params_encoded
#     print(full_url)
    return requests.get(full_url).json()

def get_rotten_tomato_score(omdb_info):
    ratings = omdb_info.get('Ratings', [])
    for rating in ratings:
#         print(rating) #테스트 용!
        if rating['Source'] == 'Rotten Tomatoes':
            return rating['Value']
    return None
        
        
info = get_omdb_info("into the woods")
get_rotten_tomato_score(info)

'71%'

In [85]:
for movie in movie_info_list:
    title = movie['title']
    omdb_info = get_omdb_info(title)
    #여기서 부터는 새로운 내용을 추가!
    movie['imdb'] = omdb_info.get('imdbRating', 'N/A')
    movie['metascore'] = omdb_info.get('Metascore', 'N/A')
    movie['rotten_tomatoes'] = get_rotten_tomato_score(omdb_info)

In [87]:
movie_info_list[-40]

{'title': 'Elephant',
 'Directed by': ['Mark Linfield', 'Vanessa Berlowitz'],
 'Produced by': ['Mark Linfield', 'Vanessa Berlowitz', 'Roy Conli'],
 'Narrated by': 'Meghan, Duchess of Sussex',
 'Cinematography': ['Martyn Colbeck', 'Mike Holding', 'Tom Walker'],
 'Edited by': 'Nigel Buck',
 'Music by': 'Ramin Djawadi',
 'Production company': 'Disneynature',
 'Distributed by': 'Disney+',
 'Release date': ['April 3, 2020'],
 'Running time': '89 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Running time (int)': 89,
 'Budget (float)': None,
 'Box office (float)': None,
 'Release date (datetime)': datetime.datetime(2020, 4, 3, 0, 0),
 'imdb': '7.2',
 'metascore': '70',
 'rotten_tomatoes': '73%'}

In [88]:
save_data_pickle('disney_movie_data_final.pickle',movie_info_list)

In [91]:
movie_info_copy = [movie.copy() for movie in movie_info_list]

In [93]:
movie_info_copy[-40]

{'title': 'Elephant',
 'Directed by': ['Mark Linfield', 'Vanessa Berlowitz'],
 'Produced by': ['Mark Linfield', 'Vanessa Berlowitz', 'Roy Conli'],
 'Narrated by': 'Meghan, Duchess of Sussex',
 'Cinematography': ['Martyn Colbeck', 'Mike Holding', 'Tom Walker'],
 'Edited by': 'Nigel Buck',
 'Music by': 'Ramin Djawadi',
 'Production company': 'Disneynature',
 'Distributed by': 'Disney+',
 'Release date': ['April 3, 2020'],
 'Running time': '89 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Running time (int)': 89,
 'Budget (float)': None,
 'Box office (float)': None,
 'Release date (datetime)': datetime.datetime(2020, 4, 3, 0, 0),
 'imdb': '7.2',
 'metascore': '70',
 'rotten_tomatoes': '73%'}

In [96]:
for movie in movie_info_copy:
    current_date = movie['Release date (datetime)']
    if current_date:
        movie['Release date (datetime)'] = current_date.strftime("%B %d, %Y")
    else:
        movie['Release date (datetime)'] = None

In [99]:
movie_info_copy[-40]

{'title': 'Elephant',
 'Directed by': ['Mark Linfield', 'Vanessa Berlowitz'],
 'Produced by': ['Mark Linfield', 'Vanessa Berlowitz', 'Roy Conli'],
 'Narrated by': 'Meghan, Duchess of Sussex',
 'Cinematography': ['Martyn Colbeck', 'Mike Holding', 'Tom Walker'],
 'Edited by': 'Nigel Buck',
 'Music by': 'Ramin Djawadi',
 'Production company': 'Disneynature',
 'Distributed by': 'Disney+',
 'Release date': ['April 3, 2020'],
 'Running time': '89 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Running time (int)': 89,
 'Budget (float)': None,
 'Box office (float)': None,
 'Release date (datetime)': 'April 03, 2020',
 'imdb': '7.2',
 'metascore': '70',
 'rotten_tomatoes': '73%'}

In [101]:
save_data("disney_data_final.json", movie_info_copy)

In [102]:
import pandas as pd

df = pd.DataFrame(movie_info_list)

In [103]:
df.head(5)

Unnamed: 0,title,Production company,Distributed by,Release date,Running time,Country,Language,Box office,Running time (int),Budget (float),...,Hepburn,Adaptation by,Animation by,Traditional,Simplified,Original title,Layouts by,Created by,Original work,Owner
0,Academy Award Review of,Walt Disney Productions,RKO Radio Pictures,"[May 19, 1937]",41 minutes (74 minutes 1966 release),United States,English,$45.472,41.0,,...,,,,,,,,,,
1,Snow White and the Seven Dwarfs,Walt Disney Productions,RKO Radio Pictures,"[December 21, 1937 ( Carthay Circle Theatre )]",83 minutes,United States,English,$418 million,83.0,1490000.0,...,,,,,,,,,,
2,Pinocchio,Walt Disney Productions,RKO Radio Pictures,"[February 7, 1940 ( Center Theatre ), February...",88 minutes,United States,English,$164 million,88.0,2600000.0,...,,,,,,,,,,
3,Fantasia,Walt Disney Productions,RKO Radio Pictures,"[November 13, 1940]",126 minutes,United States,English,$76.4–$83.3 million (United States and Canada),126.0,2280000.0,...,,,,,,,,,,
4,The Reluctant Dragon,Walt Disney Productions,RKO Radio Pictures,"[June 27, 1941]",74 minutes,United States,English,"$960,000 (worldwide rentals)",74.0,600000.0,...,,,,,,,,,,


In [104]:
df.to_csv("disney_movie_data_final.csv")

In [107]:
running_times = df.sort_values(['Running time (int)'], ascending=True)
running_times.head()

Unnamed: 0,title,Production company,Distributed by,Release date,Running time,Country,Language,Box office,Running time (int),Budget (float),...,Hepburn,Adaptation by,Animation by,Traditional,Simplified,Original title,Layouts by,Created by,Original work,Owner
285,Sacred Planet,Walt Disney Pictures,Buena Vista Pictures,"[April 22, 2004]",40 minutes,,English,"$1,108,356",40.0,,...,,,,,,,,,,
311,Roving Mars,,Buena Vista Pictures,"[January 27, 2006]",40 minutes,United States,English,$11 million,40.0,1000000.0,...,,,,,,,,,,
0,Academy Award Review of,Walt Disney Productions,RKO Radio Pictures,"[May 19, 1937]",41 minutes (74 minutes 1966 release),United States,English,$45.472,41.0,,...,,,,,,,,,,
7,Saludos Amigos,Walt Disney Productions,RKO Radio Pictures,"[August 24, 1942 (World Premiere – Rio de Jane...",42 minutes,United States,,$1.135 million (worldwide rentals),42.0,,...,,,,,,,,,,
130,A Tale of Two Critters,Walt Disney Productions,Buena Vista Distribution,"[June 20, 1977]",48 minutes,United States,English,,48.0,,...,,,,,,,,,,
