In [1]:
import requests

In [2]:
from bs4 import BeautifulSoup

# Task 1: Scrape Toy Story 3 info from wiki

In [3]:
URL = "https://en.wikipedia.org/wiki/Toy_Story_3"

In [4]:
def clean_references(soup):
    for reference in soup.find_all(["sup", "span"]):
        reference.decompose()
    return soup

In [19]:
def movie_info(movie_URL):
    source = requests.get(movie_URL).text
    soup = BeautifulSoup(source, 'lxml')
    
    info_box = soup.find('table', class_="infobox vevent")   
    if info_box is None:
        return
    
    info_box = clean_references(info_box)
    
    info_dict = {}

    title = info_box.find('tr').text
    info_dict['title'] = title

    for row in info_box.find_all('tr'):
        try:
            # get key
            label = row.find('th', class_="infobox-label").get_text(" ", strip=True)
            info_dict[label] = []

            # get value
            data = row.find('td', class_="infobox-data")
            data_list = data.find('div', class_="plainlist")
            # Non-list data
            if data_list is None:
                if data.find('br'):
                    info_dict[label] = list((filter(lambda a: a!=',', [text for text in data.stripped_strings])))
                else:
                    info_dict[label] = data.get_text(" ", strip=True).replace('\xa0', ' ')
            # list data
            else:
                for info in data_list.find_all('li'):
                    info_text = info.get_text(" ", strip=True).replace('\xa0', ' ')
                    info_dict[label].append(info_text)

        except AttributeError:
            continue
            
    return info_dict

In [6]:
info_dict = movie_info(URL)

In [7]:
info_dict

{'title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Produced by': 'Darla K. Anderson',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Music by': 'Randy Newman',
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Production companies': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release date': ['June 12, 2010 ( Taormina Film Fest )',
  'June 18, 2010 (United States)'],
 'Running time': '103 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200 million',
 'Box office': '$1.067 billion'}

In [8]:
URL2 = "https://en.wikipedia.org/wiki/The_Reluctant_Dragon_(1941_film)"

In [9]:
info_dict2 = movie_info(URL2)

In [10]:
info_dict2

{'title': 'The Reluctant Dragon',
 'Directed by': ['Alfred Werker',
  '(live action)',
  'Hamilton Luske',
  '(animation)',
  'Jack Cutting',
  'Ub Iwerks',
  'Jack Kinney',
  '(sequence directors)'],
 'Produced by': 'Walt Disney',
 'Written by': ['Live-action:',
  'Ted Sears',
  'Al Perkins',
  'Larry Clemmons',
  'Bill Cottrell',
  'Harry Clork',
  'Robert Benchley',
  'The Reluctant Dragon',
  'segment:',
  'Kenneth Grahame',
  '(original book)',
  'Erdman Penner',
  'T. Hee',
  'Baby Weems',
  'segment:',
  'Joe Grant',
  'Dick Huemer',
  'John Miller'],
 'Starring': ['Robert Benchley',
  'Frances Gifford',
  'Buddy Pepper',
  'Nana Bryant'],
 'Music by': ['Frank Churchill', 'Larry Morey'],
 'Cinematography': 'Bert Glennon',
 'Edited by': 'Paul Weatherwax',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures',
 'Release date': ['June 27, 1941'],
 'Running time': '74 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$600,

# Task 2: Scrape infobox for all movies in List of Disney Films

In [11]:
movie_list_URL = "https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films"

In [12]:
source1 = requests.get(movie_list_URL).text
soup1 = BeautifulSoup(source1, 'lxml')

In [13]:
movie = soup1.find('table', class_='wikitable sortable')

In [14]:
movie_source = movie.find('i').find('a')['href']

In [15]:
movie_source

'/wiki/Academy_Award_Review_of_Walt_Disney_Cartoons'

In [16]:
movie_info("https://en.wikipedia.org" + movie_source)

{'title': 'Academy Award Review of ',
 'Production company': 'Walt Disney Productions',
 'Release date': ['May 19, 1937'],
 'Running time': '41 minutes (74 minutes 1966 release)',
 'Country': 'United States',
 'Language': 'English',
 'Box office': '$45.472'}

In [22]:
movies_info_list = []

In [25]:
for movie_table in soup1.find_all('table', class_='wikitable sortable'):
    for movie in movie_table.find_all('i'):
        movie_title = movie.find('a')
        if movie_title is None:
            continue
        movie_source = movie_title['href']
        movie_URL = "https://en.wikipedia.org" + movie_source
        print(movie_URL)
        info_dict = movie_info(movie_URL)
        if info_dict != None:
            movies_info_list.append(info_dict)

https://en.wikipedia.org/wiki/Academy_Award_Review_of_Walt_Disney_Cartoons
https://en.wikipedia.org/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)
https://en.wikipedia.org/wiki/Pinocchio_(1940_film)
https://en.wikipedia.org/wiki/Fantasia_(1940_film)
https://en.wikipedia.org/wiki/The_Reluctant_Dragon_(1941_film)
https://en.wikipedia.org/wiki/Dumbo
https://en.wikipedia.org/wiki/Bambi
https://en.wikipedia.org/wiki/Saludos_Amigos
https://en.wikipedia.org/wiki/Victory_Through_Air_Power_(film)
https://en.wikipedia.org/wiki/The_Three_Caballeros
https://en.wikipedia.org/wiki/Make_Mine_Music
https://en.wikipedia.org/wiki/Song_of_the_South
https://en.wikipedia.org/wiki/Fun_and_Fancy_Free
https://en.wikipedia.org/wiki/Melody_Time
https://en.wikipedia.org/wiki/So_Dear_to_My_Heart
https://en.wikipedia.org/wiki/The_Adventures_of_Ichabod_and_Mr._Toad
https://en.wikipedia.org/wiki/Cinderella_(1950_film)
https://en.wikipedia.org/wiki/Treasure_Island_(1950_film)
https://en.wikipedia.org/wiki/Alice_in_

In [26]:
movies_info_list

[{'title': 'Academy Award Review of ',
  'Production company': 'Walt Disney Productions',
  'Release date': ['May 19, 1937'],
  'Running time': '41 minutes (74 minutes 1966 release)',
  'Country': 'United States',
  'Language': 'English',
  'Box office': '$45.472'},
 {'title': 'Snow White and the Seven Dwarfs',
  'Directed by': ['David Hand (supervising)',
   'William Cottrell',
   'Wilfred Jackson',
   'Larry Morey',
   'Perce Pearce',
   'Ben Sharpsteen'],
  'Produced by': 'Walt Disney',
  'Written by': ['Ted Sears',
   'Richard Creedon',
   'Otto Englander',
   'Dick Rickard',
   'Earl Hurd',
   'Merrill De Maris',
   'Dorothy Ann Blank',
   'Webb Smith'],
  'Based on': ['Snow White', 'by The', 'Brothers Grimm'],
  'Starring': ['Adriana Caselotti',
   'Lucille La Verne',
   'Harry Stockwell',
   'Roy Atwell',
   'Pinto Colvig',
   'Otis Harlan',
   'Scotty Mattraw',
   'Billy Gilbert',
   'Eddie Collins',
   'Moroni Olsen',
   'Stuart Buchanan'],
  'Music by': ['Frank Churchill', 'P

## Save Data

In [1]:
import json

def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

In [2]:
import json

def load_data(title):
    with open(title, encoding='utf-8') as f:
        return json.load(f)

In [29]:
save_data("disney_movies.json", movies_info_list)

# Task 3: Clean data

In [3]:
movies_info_list = load_data("disney_movies.json")

In [30]:
movies_info_list

[{'title': 'Academy Award Review of ',
  'Production company': 'Walt Disney Productions',
  'Release date': ['May 19, 1937'],
  'Running time': '41 minutes (74 minutes 1966 release)',
  'Country': 'United States',
  'Language': 'English',
  'Box office': '$45.472'},
 {'title': 'Snow White and the Seven Dwarfs',
  'Directed by': ['David Hand (supervising)',
   'William Cottrell',
   'Wilfred Jackson',
   'Larry Morey',
   'Perce Pearce',
   'Ben Sharpsteen'],
  'Produced by': 'Walt Disney',
  'Written by': ['Ted Sears',
   'Richard Creedon',
   'Otto Englander',
   'Dick Rickard',
   'Earl Hurd',
   'Merrill De Maris',
   'Dorothy Ann Blank',
   'Webb Smith'],
  'Based on': ['Snow White', 'by The', 'Brothers Grimm'],
  'Starring': ['Adriana Caselotti',
   'Lucille La Verne',
   'Harry Stockwell',
   'Roy Atwell',
   'Pinto Colvig',
   'Otis Harlan',
   'Scotty Mattraw',
   'Billy Gilbert',
   'Eddie Collins',
   'Moroni Olsen',
   'Stuart Buchanan'],
  'Music by': ['Frank Churchill', 'P

In [5]:
len(movies_info_list)

451

## Convert running time to integer

In [41]:
import re

def convert_running_time(movie_dict):
    """
    Convert running time to integer
    """
    try:
        running_time = movie_dict['Running time']
        if type(running_time) == str:
            movie_length = int(re.search(r'\d+', running_time).group())
        elif type(running_time) == list:
            movie_length = int(re.search(r'\d+', running_time[0]).group())
        elif type(running_time) == int:
            movie_length = running_time
    
    except KeyError:
        movie_length = None
    
    finally:
        movie_dict['Running time'] = movie_length

In [42]:
for index, movie in enumerate(movies_info_list):
    print(index)
    convert_running_time(movie)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [43]:
movies_info_list[327]

{'title': 'Hexe Lilli: Der Drache und das magische Buch',
 'Directed by': 'Stefan Ruzowitzky',
 'Music by': 'Ian Honeyman',
 'Production company': 'Walt Disney Pictures',
 'Distributed by': 'Buena Vista International Germany',
 'Country': 'Germany',
 'Language': 'German',
 'Running time': None}

In [44]:
save_data("disney_movies_integer_run_time.json", movies_info_list)

## Convert box_office and budget to float

In [22]:
movies_info_list = load_data("disney_movies_integer_run_time.json")

In [5]:
[(index, movie.get("Box office", None)) for index, movie in enumerate(movies_info_list)]

[(0, '$45.472'),
 (1, '$418 million'),
 (2, '$164 million'),
 (3, '$76.4–$83.3 million'),
 (4, '$960,000 (worldwide rentals)'),
 (5, '$1.3 million (est. United States/Canada rentals, 1941)'),
 (6, '$267.4 million'),
 (7, '$1,135,000 (worldwide rentals)'),
 (8, '$799,000'),
 (9, '$3,355,000 (worldwide rentals)'),
 (10, '$3.275 million (worldwide rentals)'),
 (11, '$65 million'),
 (12, '$3,165,000 (worldwide rentals)'),
 (13, '$2,560,000 (worldwide rentals)'),
 (14, '$3.7 million (U.S. rental) $575,000 (foreign rental)'),
 (15, '$1,625,000 (worldwide rentals)'),
 (16, '$263.6 million'),
 (17, '$4,100,000 (worldwide rentals)'),
 (18, '$5.6 million (US, 1951)'),
 (19, '$2.1 million (US rentals)'),
 (20, '$87.4 million'),
 (21, '$1 million (US)'),
 (22, '$2.6 million (US)'),
 (23, None),
 (24, '$1.75 million (US and Canadian rentals)'),
 (25, '$28.2 million'),
 (26, '$2,150,000 (US)'),
 (27, '$187 million'),
 (28, '$2.1 million (US)'),
 (29, '$1.6 million (US)'),
 (30, '$1.7 million (US)'),

In [6]:
[(index, movie.get("Budget", None)) for index, movie in enumerate(movies_info_list)]

[(0, None),
 (1, '$1.49 million'),
 (2, '$2.6 million'),
 (3, '$2.28 million'),
 (4, '$600,000'),
 (5, '$950,000'),
 (6, '$858,000'),
 (7, None),
 (8, '$788,000'),
 (9, None),
 (10, '$1.35 million'),
 (11, '$2.125 million'),
 (12, None),
 (13, '$1.5 million'),
 (14, '$1.5 million'),
 (15, None),
 (16, '$2.9 million'),
 (17, '$1,800,000'),
 (18, '$3 million'),
 (19, None),
 (20, '$4 million'),
 (21, '$2 million'),
 (22, '$300,000'),
 (23, '$1.8 million'),
 (24, None),
 (25, '$5 million'),
 (26, None),
 (27, '$4 million'),
 (28, None),
 (29, None),
 (30, None),
 (31, None),
 (32, None),
 (33, None),
 (34, '$700,000'),
 (35, None),
 (36, None),
 (37, None),
 (38, None),
 (39, None),
 (40, '$6 million'),
 (41, 'under $1 million or $1,250,000'),
 (42, None),
 (43, None),
 (44, '$2 million'),
 (45, None),
 (46, None),
 (47, '$2.5 million'),
 (48, None),
 (49, None),
 (50, None),
 (51, '$4 million'),
 (52, '$3.6 million'),
 (53, None),
 (54, None),
 (55, None),
 (56, None),
 (57, '$3 million'

In [17]:
import re 

def find_float(box_office):
    box_office_float = re.search("\d+\.?,?\d+(\,\d+)?", box_office)
    if box_office_float is None:
        return None
    else:
        box_office_float = box_office_float.group()
    box_office_float = box_office_float.replace(',', '')
    box_office_float = float(box_office_float)
    if "million" in box_office:
        box_office_float *= 1000000
        
    return box_office_float

In [18]:
find_float("under $1 million or $1,250,000")

1250000000000.0

In [21]:
find_float("960,000")

960000.0

In [19]:
def convert_box_office_budget(movie_dict):
    box_office = movie_dict.get("Box office", None)
    budget = movie_dict.get("Budget", None)
    
    if box_office is None:
        box_office_float = None
    
    elif type(box_office) == str:
        box_office_float = find_float(box_office)
        
    elif type(box_office) == float:
        box_office_float = box_office
    
    elif type(box_office) == list:
        for string in box_office:
            box_office_float = find_float(string)
            if box_office_float != None:
                break
                
    if budget is None:
        budget_float = None
    
    elif type(budget) == str:
        budget_float = find_float(budget)
        
    elif type(budget) == float:
        budget_float = budget
    
    elif type(budget) == list:
        for string in budget:
            budget_float = find_float(string)
            if budget_float != None:
                break
                
    movie_dict["Box office"] = box_office_float       
    movie_dict["Budget"] = budget_float        

In [23]:
for index, movie in enumerate(movies_info_list):
    print(index)
    convert_box_office_budget(movie)
    print(movie["Box office"])
    print(movie["Budget"])

0
45.472
None
1
418000000.0
1490000.0
2
164000000.0
2600000.0
3
76400000.0
2280000.0
4
960000.0
600000.0
5
1300000.0
950000.0
6
267399999.99999997
858000.0
7
1135000.0
None
8
799000.0
788000.0
9
3355000.0
None
10
3275000.0
1350000.0
11
65000000.0
2125000.0
12
3165000.0
None
13
2560000.0
1500000.0
14
3700000.0
1500000.0
15
1625000.0
None
16
263600000.00000003
2900000.0
17
4100000.0
1800000.0
18
5600000.0
None
19
2100000.0
None
20
87400000.0
None
21
None
None
22
2600000.0
300000.0
23
None
1800000.0
24
1750000.0
None
25
28200000.0
None
26
2150000.0
None
27
187000000.0
None
28
2100000.0
None
29
1600000.0
None
30
1700000.0
None
31
None
None
32
None
None
33
2750000.0
None
34
None
700000.0
35
1750000.0
None
36
6250000.0
None
37
None
None
38
1800000.0
None
39
2500000.0
None
40
51600000.0
None
41
12300000.0
1250000000000.0
42
2600000.0
None
43
None
None
44
1700000.0
None
45
3100000.0
None
46
None
None
47
3750000.0
2500000.0
48
None
None
49
2300000.0
None
50
None
None
51
40000000.0
None
52
30300

Fix some outliers

In [30]:
movies_info_list[41]["Budget"] = 1250000 

In [31]:
movies_info_list[41]

{'title': 'The Shaggy Dog',
 'Directed by': 'Charles Barton',
 'Produced by': ['Walt Disney', 'Bill Walsh'],
 'Written by': ['Lillie Hayward', 'Bill Walsh'],
 'Based on': ['The Hound of Florence', 'by', 'Felix Salten'],
 'Starring': ['Fred MacMurray',
  'Jean Hagen',
  'Tommy Kirk',
  'Annette Funicello',
  'Tim Considine'],
 'Narrated by': 'Paul Frees (opening only)',
 'Music by': 'Paul J. Smith',
 'Cinematography': 'Edward Colman',
 'Edited by': 'James Ballas',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['March 19, 1959'],
 'Running time': 104,
 'Country': 'United States',
 'Language': 'English',
 'Budget': 1250000,
 'Box office': 12300000.0}

In [34]:
movies_info_list[176]["Budget"] = 8700000

In [35]:
movies_info_list[176]

{'title': 'Shipwrecked',
 'Directed by': 'Nils Gaup',
 'Produced by': ['John M. Jacobsen', 'Nigel Wooll'],
 'Written by': ['O.V. Falck-Ytter',
  '(novel)',
  'Nils Gaup',
  'Bob Foss',
  'Greg Dinner',
  'Nick Thiel'],
 'Starring': ['Stian Smestad', 'Gabriel Byrne'],
 'Music by': 'Patrick Doyle',
 'Cinematography': 'Erling Thurmann-Andersen',
 'Edited by': 'Nils Pagh Andersen',
 'Production companies': ['Walt Disney Pictures', 'AB Svensk Filmindustri'],
 'Distributed by': 'Buena Vista Pictures',
 'Release date': ['3 October 1990'],
 'Running time': 93,
 'Countries': ['Norway', 'Sweden', 'United States'],
 'Languages': ['Norwegian', 'English'],
 'Budget': 8700000,
 'Box office': 15100000.0}

In [37]:
movies_info_list[287]["Budget"] = 24000000

In [42]:
movies_info_list[287]["Box office"] = 236000000

In [43]:
movies_info_list[287]

{'title': "Howl's Moving Castle",
 'Japanese': '',
 'Hepburn': 'Hauru no Ugoku Shiro',
 'Directed by': 'Hayao Miyazaki',
 'Produced by': 'Toshio Suzuki',
 'Screenplay by': 'Hayao Miyazaki',
 'Based on': ["Howl's Moving Castle", 'by', 'Diana Wynne Jones'],
 'Starring': ['Chieko Baisho', 'Takuya Kimura', 'Akihiro Miwa'],
 'Music by': 'Joe Hisaishi',
 'Cinematography': 'Atsushi Okui',
 'Edited by': 'Takeshi Seyama',
 'Production company': 'Studio Ghibli',
 'Distributed by': 'Toho',
 'Release date': ['5 September 2004 ( Venice )', '20 November 2004 (Japan)'],
 'Running time': 119,
 'Country': 'Japan',
 'Language': 'Japanese',
 'Budget': 24000000,
 'Box office': 236000000}

In [39]:
movies_info_list[329]["Budget"] = 34000000

In [40]:
movies_info_list[329]

{'title': 'Ponyo',
 'Japanese': '',
 'Hepburn': 'Gake no Ue no Ponyo',
 'Directed by': 'Hayao Miyazaki',
 'Produced by': 'Toshio Suzuki',
 'Written by': 'Hayao Miyazaki',
 'Based on': ['The Little Mermaid', 'by', 'Hans Christian Andersen'],
 'Starring': ['Tomoko Yamaguchi',
  'Kazushige Nagashima',
  'Yūki Amami',
  'George Tokoro',
  'Yuria Nara',
  'Hiroki Doi',
  'Rumi Hiiragi',
  'Akiko Yano',
  'Kazuko Yoshiyuki',
  'Tomoko Naraoka'],
 'Music by': 'Joe Hisaishi',
 'Cinematography': 'Atsushi Okui',
 'Edited by': 'Takeshi Seyama',
 'Production company': 'Studio Ghibli',
 'Distributed by': 'Toho',
 'Release date': ['July 19, 2008'],
 'Running time': 101,
 'Country': 'Japan',
 'Language': 'Japanese',
 'Budget': 34000000,
 'Box office': 203200000.0}

In [44]:
save_data("disney_movies_float_box_office_budget.json", movies_info_list)