<a href="https://colab.research.google.com/github/siglimumuni/my_projects/blob/master/Web_Scraping_with_Python_Disney_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Import all relevant libraries
from bs4 import BeautifulSoup as bs
import requests

In [None]:
def getContentvalue(row_data):
  """Extract the value of an infobox cell as plain text.

  Args:
    row_data: The cell element to read (the caller passes the result of
      row.find("td")), or None when the row has no such cell.

  Returns:
    None when row_data is None; a list of strings when the cell contains
    "li" items or "br" separated lines; otherwise a single string.
    Non-breaking spaces are replaced with regular spaces in every case.
  """
  if row_data is None:
    # Calling .find() on a missing cell is what raises
    # "'NoneType' object has no attribute 'find'".
    return None
  if row_data.find("li"):
    return [li.get_text(" ",strip=True).replace("\xa0"," ") for li in row_data.find_all("li")]
  if row_data.find("br"):
    # Normalise non-breaking spaces here too, like the other two branches.
    return [text.replace("\xa0"," ") for text in row_data.stripped_strings]
  return row_data.get_text(" ",strip=True).replace("\xa0"," ")

def clean_tags(soup):
  """Strip every "sup" and "span" element out of the parsed document.

  This drops superscript markers and the span elements used around dates.
  The elements are removed in place via decompose(); nothing is returned.
  """
  unwanted = soup.find_all(["sup","span"])
  for element in unwanted:
    element.decompose()

def get_movie_info(url, timeout=30):
  """Scrape the infobox of a film page into a dictionary.

  Args:
    url: Address of the film's page.
    timeout: Seconds to wait for the HTTP response before giving up.

  Returns:
    A dict holding the film title under "title" plus one entry per infobox
    row that has both a header and a value cell, keyed by the header text.
    Values are whatever getContentvalue returns for the value cell.

  Raises:
    requests.RequestException: If the request fails, times out or returns
      an HTTP error status.
    ValueError: If the page has no "infobox vevent" element, or the first
      infobox row has no header to use as the title.
  """
  r = requests.get(url, timeout=timeout)
  # Fail on 4xx/5xx responses instead of trying to parse an error page.
  r.raise_for_status()

  #convert into a beautiful soup object; naming the parser keeps the result
  #independent of which optional parsers happen to be installed
  soup = bs(r.content, "html.parser")

  info_box = soup.find(class_="infobox vevent")
  if info_box is None:
    raise ValueError(f"No 'infobox vevent' element found on {url}")
  info_rows = info_box.find_all("tr")

  clean_tags(soup)

  movie_info = {}

  for index, row in enumerate(info_rows):
    header = row.find("th")

    if index == 0:
      #the first row of the infobox carries the film title
      if header is None:
        raise ValueError(f"Infobox on {url} has no title header")
      movie_info["title"] = header.get_text(" ",strip=True)
      continue

    cell = row.find("td")
    if header is None or cell is None:
      #rows without both a header and a value cell hold no key/value pair
      continue

    content_key = header.get_text(" ",strip=True)
    movie_info[content_key] = getContentvalue(cell)

  return movie_info


In [None]:
#testing function on one movie
#NOTE: this performs a live HTTP request; the dict recorded below is its return value
get_movie_info("https://en.wikipedia.org/wiki/One_Little_Indian_(film)")

{'Box office': '$2 million',
 'Cinematography': 'Charles F. Wheeler',
 'Country': 'United States',
 'Directed by': 'Bernard McEveety',
 'Distributed by': 'Buena Vista Distribution',
 'Edited by': 'Robert Stafford',
 'Language': 'English',
 'Music by': 'Jerry Goldsmith',
 'Produced by': 'Winston Hibler',
 'Production company': 'Walt Disney Productions',
 'Release date': ['June 20, 1973'],
 'Running time': '90 Minutes',
 'Starring': ['James Garner',
  'Vera Miles',
  'Pat Hingle',
  'Morgan Woodward',
  'Jodie Foster'],
 'Written by': 'Harry Spalding',
 'title': 'One Little Indian'}

In [None]:
#Get info for all movies
from urllib.parse import urljoin

r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films", timeout=30)
#stop here on an HTTP error rather than scraping an error page
r.raise_for_status()

#Convert to a beautiful soup object (parser named explicitly so the result
#does not depend on which optional parsers are installed)
soup = bs(r.content, "html.parser")

movies = soup.select(".wikitable.sortable i a")
print(len(movies))

#Retrieve information for all movies
movie_info_list = []
for movie in movies:
  try:
    #urljoin copes with hrefs with or without a leading slash; plain
    #concatenation with ".../" gives a doubled slash for the former
    movie_url = urljoin("https://en.wikipedia.org/", movie["href"])
    movie_info_list.append(get_movie_info(movie_url))

  except Exception as e:
    #best effort: report the page that failed and carry on with the rest
    print(movie.get_text())
    print(e)

454
Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
True-Life Adventures
'NoneType' object has no attribute 'find_all'
The London Connection
'NoneType' object has no attribute 'find'
Better Nate Than Never
'NoneType' object has no attribute 'find_all'


In [None]:
#check how many movies have been retrieved
#(in the recorded run: 454 links found, 5 failures reported, hence 449)
len(movie_info_list)

449

In [None]:
import json

#function for saving info to a json file
def save_data(title, data):
  """Write data as pretty-printed JSON to the file named title.

  The file is written as UTF-8, with a 2-space indent and without escaping
  non-ASCII characters.
  """
  with open(title, mode='w', encoding='utf-8') as handle:
    json.dump(data, handle, indent=2, ensure_ascii=False)

#function for loading saved json file
def load_data(title):
  """Read the UTF-8 JSON file named title and return the decoded object."""
  with open(title, encoding='utf-8') as handle:
    contents = handle.read()
  return json.loads(contents)

In [None]:
#NOTE(review): despite the "cleaned" file name, in file order this runs before
#the running-time, money and date conversion cells, so it stores the raw scrape
save_data("disney_data_cleaned.json", movie_info_list)

In [None]:
#convert movie running time to integer
def minutes_to_integer(running_time):
  """Return the first number in a running-time field as an int.

  Args:
    running_time: A string such as "83 minutes", a list of such strings
      (only the first entry is used), the placeholder "N/A", or None.

  Returns:
    The first run of digits in the field as an int, or None when the field
    is missing, empty or contains no digits.
  """
  import re  #local import: in file order this cell comes before `import re`

  if isinstance(running_time, list):
    #an empty list has no first entry to read
    running_time = running_time[0] if running_time else None
  if running_time is None or running_time == "N/A":
    return None

  #search for digits rather than int()-ing the first space-separated token,
  #so values like "90–100 minutes" or "approx. 75 min" do not raise
  match = re.search(r"\d+", str(running_time))
  return int(match.group()) if match else None

#add a "Running time (int)" field to every movie; a movie without a
#"Running time" entry falls back to "N/A", which minutes_to_integer maps to None
for movie in movie_info_list:
    movie["Running time (int)"] = minutes_to_integer(movie.get("Running time", "N/A"))

In [None]:
#Using regular expressions to convert budget and box office fields to numeric (float) values
import re

#digits of an amount: optional thousands groups, then at most one decimal
#part made of a single dot followed by digits, so a match never contains a
#run of dots that float() could not parse
pattern = r"\d+(,\d{3})*(?:\.\d+)?"
amounts = r"thousand|million|billion"

#"$number[separator number] magnitude-word", e.g. "$1.49 million" or
#"$10-12 million". A second "$" after the separator is not accepted, so
#"$76.4–$83.3 million" only matches from "$83.3" onwards.
word_re = rf"\${pattern}(-|\sto\s|–)?({pattern})?\s({amounts})"
#a plain "$number" with no magnitude word, e.g. "$600,000"
value_re = rf"\${pattern}"

#function to map words to integer value
def word_to_value(word):
  """Return the multiplier for "thousand", "million" or "billion".

  Any other word raises KeyError.
  """
  powers_of_ten = {"thousand": 3, "million": 6, "billion": 9}
  return 10 ** powers_of_ten[word]

#convert "number + magnitude word" strings to a numeric value
def parse_word_syntax(string):
  """Parse text such as "$1.49 million" into a float.

  The first number found in the text is multiplied by the value of the first
  magnitude word (thousand, million or billion, matched case-insensitively).
  """
  first_number = re.search(pattern,string).group()
  base_amount = float(first_number.replace(",",""))
  magnitude = re.search(amounts,string,flags=re.I).group().lower()
  return base_amount * word_to_value(magnitude)

#convert plain digit strings to a numeric value
def parse_value_syntax(string):
  """Parse text such as "$600,000" into a float, ignoring thousands commas."""
  digits = re.search(pattern,string).group()
  return float(digits.replace(",",""))

#convert fields
def convert_money(money):
  """Convert a money field such as "$1.49 million" or "$600,000" to a float.

  Args:
    money: A string, a list of strings (only the first entry is parsed),
      the placeholder "N/A", or None.

  Returns:
    The amount as a float, or None when the field is missing, empty, not
    text, or matches neither the word syntax nor the plain value syntax.
  """
  if isinstance(money,list):
    #an empty list has no first entry; money[0] would raise IndexError
    money = money[0] if money else None

  if not isinstance(money,str) or money == "N/A":
    #None and other non-text values cannot be searched with a regex
    return None

  #the word syntax is tried first so "$2 million" is not read as a bare "$2"
  word_syntax = re.search(word_re,money,flags=re.I)
  if word_syntax:
    return parse_word_syntax(word_syntax.group())

  value_syntax = re.search(value_re,money)
  if value_syntax:
    return parse_value_syntax(value_syntax.group())

  return None


In [None]:
#iterate through movie list and add numeric copies of the budget and box office
#entries; convert_money returns a float (or None), hence the "(float)" keys
for movie in movie_info_list:
  movie["Budget (float)"] = convert_money(movie.get("Budget","N/A"))
  movie["Box office (float)"] = convert_money(movie.get("Box office","N/A"))

In [None]:
#convert date field to python datetime objects
from datetime import datetime


def clean_date(date):
  """Return the text before the first "(" with surrounding whitespace removed."""
  before_paren, _, _ = date.partition("(")
  return before_paren.strip()

def date_conversion(date):
  """Convert a release-date field to a datetime object.

  Args:
    date: A string such as "June 20, 1973" or "20 June 1973" (optionally
      followed by a parenthesised note), a list of such strings (only the
      first entry is used), the placeholder "N/A", or None.

  Returns:
    A datetime, or None when the field is missing, empty, not text, or in
    neither of the two supported layouts.
  """
  if isinstance(date,list):
    #an empty list has no first entry; date[0] would raise IndexError
    date = date[0] if date else None

  if not isinstance(date,str) or date == "N/A":
    return None

  date_str = clean_date(date)

  fmts = ['%B %d, %Y','%d %B %Y']
  for fmt in fmts:
    try:
      return datetime.strptime(date_str, fmt)
    except ValueError:
      #strptime raises ValueError on a layout mismatch; try the next layout
      continue
  return None


In [None]:
#iterate through the date fields and convert to python datetime objects;
#a movie without a "Release date" falls back to "N/A", which date_conversion maps to None
for movie in movie_info_list:
  movie["Release date (datetime)"] = date_conversion(movie.get("Release date","N/A"))

In [None]:
import pickle

#function for saving file to pickle format
def save_data_pickle(name,data):
  """Serialise data with pickle and write it to the file called name."""
  with open(name, mode='wb') as handle:
    pickle.dump(data, handle)

#function for loading saved pickle file
def load_data_pickle(name):
  """Read the pickle file called name and return the object stored in it.

  Only use this on files you created yourself: unpickling untrusted data
  can execute arbitrary code.
  """
  with open(name, mode='rb') as handle:
    raw_bytes = handle.read()
  return pickle.loads(raw_bytes)


In [None]:
#presumably pickled rather than saved as JSON because the entries now hold
#datetime objects, which json.dump cannot serialise
save_data_pickle("disney_movie_data_cleaner.pickle",movie_info_list)

In [None]:
#reload the pickle written above; `a` is not used again in the visible cells
a = load_data_pickle("disney_movie_data_cleaner.pickle")

In [None]:
#retrieve rotten tomatoes ratings from omdb database using an API key.
import requests
import urllib
import os

def get_omdb_info(title, timeout=30):
  """Look up a film by title in the OMDb API and return the decoded JSON body.

  Args:
    title: Film title, sent as the API's "t" parameter.
    timeout: Seconds to wait for the HTTP response before giving up.

  Returns:
    The decoded JSON response.

  Raises:
    requests.RequestException: If the request fails or times out.
  """
  #https so the API key is not sent in clear text
  base_url = "https://www.omdbapi.com/"
  #SECURITY: an API key should not live in source control. Set OMDB_API_KEY
  #in the environment; the literal remains only as a fallback so existing
  #runs keep working.
  api_key = os.environ.get("OMDB_API_KEY", "5da7ce82")
  parameters = {"apikey":api_key,'t':title}
  #requests builds and encodes the query string itself, so this no longer
  #depends on `import urllib` happening to expose urllib.parse
  return requests.get(base_url, params=parameters, timeout=timeout).json()

#NOTE(review): this result is discarded (a notebook only displays a cell's
#last expression); the same lookup is repeated and stored in `info` below
get_omdb_info('into the woods')

def get_rotten_tomato_score(omdb_info):
  """Return the "Value" of the first "Rotten Tomatoes" entry in the ratings.

  The ratings are read from omdb_info's 'Ratings' key (an empty list when the
  key is absent). Returns None when no entry has that source.
  """
  matching_values = (
    entry["Value"]
    for entry in omdb_info.get('Ratings',[])
    if entry["Source"] == "Rotten Tomatoes"
  )
  return next(matching_values, None)

#smoke test: look up one film and pull out its Rotten Tomatoes value
info = get_omdb_info('into the woods')
get_rotten_tomato_score(info)


In [None]:
#iterate through movies and add their IMDb, Metascore and Rotten Tomatoes ratings.
#NOTE: this makes one OMDb request per movie, looked up by the scraped title
for movie in movie_info_list:
  title = movie["title"]
  omdb_info = get_omdb_info(title)
  #keys absent from the OMDb response are stored as None
  movie["imdb"] = omdb_info.get('imdbRating',None)
  movie['metascore'] = omdb_info.get('Metascore',None)
  movie['rotten_tomatoes'] = get_rotten_tomato_score(omdb_info)

In [None]:
#converting the final list into a pandas dataframe
import pandas as pd
#one row per movie dict; the columns are the union of all keys, so a key a
#movie lacks is left empty (NaN) in that movie's row
df = pd.DataFrame(movie_info_list)

In [None]:
#preview the first rows of the dataframe
df.head()

Unnamed: 0,title,Production company,Release date,Running time,Country,Language,Box office,Running time (int),Budget (float),Box office (float),Release date (datetime),imdb,metascore,rotten_tomatoes,Directed by,Written by,Based on,Produced by,Starring,Music by,Distributed by,Budget,Story by,Narrated by,Cinematography,Edited by,Languages,Screenplay by,Countries,Production companies,Color process,Animation by,Japanese,Hepburn,Adaptation by,Traditional,Simplified
0,Academy Award Review of,Walt Disney Productions,"[May 19, 1937]",41 minutes (74 minutes 1966 release),United States,English,$45.472,41.0,,45.472,1937-05-19,7.0,,,,,,,,,,,,,,,,,,,,,,,,,
1,Snow White and the Seven Dwarfs,Walt Disney Productions,"[December 21, 1937 ( Carthay Circle Theatre )]",83 minutes,United States,English,$418 million,83.0,1490000.0,418000000.0,1937-12-21,7.6,95.0,,"[David Hand, William Cottrell, Wilfred Jackson...","[Ted Sears, Richard Creedon, Otto Englander, D...","[Snow White, by The, Brothers Grimm]",Walt Disney,"[Adriana Caselotti, Lucille La Verne, Harry St...","[Frank Churchill, Paul Smith, Leigh Harline]",RKO Radio Pictures,$1.49 million,,,,,,,,,,,,,,,
2,Pinocchio,Walt Disney Productions,"[February 7, 1940 ( Center Theatre ), February...",88 minutes,United States,English,$164 million,88.0,2600000.0,164000000.0,1940-02-07,7.4,99.0,,"[Ben Sharpsteen, Hamilton Luske, Bill Roberts,...",,"[The Adventures of Pinocchio, by, Carlo Collodi]",Walt Disney,"[Cliff Edwards, Dickie Jones, Christian Rub, W...","[Leigh Harline, Paul J. Smith]",RKO Radio Pictures,$2.6 million,"[Ted Sears, Otto Englander, Webb Smith, Willia...",,,,,,,,,,,,,,
3,Fantasia,Walt Disney Productions,"[November 13, 1940]",126 minutes,United States,English,$76.4–$83.3 million (United States and Canada),126.0,2280000.0,83300000.0,1940-11-13,7.7,96.0,95%,"[Samuel Armstrong, James Algar, Bill Roberts, ...",,,"[Walt Disney, Ben Sharpsteen]","[Leopold Stokowski, Deems Taylor]",See program,RKO Radio Pictures,$2.28 million,"[Joe Grant, Dick Huemer]",Deems Taylor,James Wong Howe,,,,,,,,,,,,
4,The Reluctant Dragon,Walt Disney Productions,"[June 27, 1941]",74 minutes,United States,English,"$960,000 (worldwide rentals)",74.0,600000.0,960000.0,1941-06-27,6.9,,,"[Alfred Werker, (live action), Hamilton Luske,...","[Live-action:, Ted Sears, Al Perkins, Larry Cl...",,Walt Disney,"[Robert Benchley, Frances Gifford, Buddy Peppe...","[Frank Churchill, Larry Morey]",RKO Radio Pictures,"$600,000",,,Bert Glennon,Paul Weatherwax,,,,,,,,,,,
