In [1]:
import numpy as np
import pandas as pd
import json
import requests
from bs4 import BeautifulSoup

In [2]:
# web scraping is something you should try to avoid!

In [3]:
# Ask httpbin to echo back the User-Agent string that `requests` sends.
r = requests.get('https://httpbin.org/user-agent')
useragent = r.json()['user-agent']
# The HTTP header is spelled 'User-Agent'. A 'useragent' key would be sent as an
# unrelated custom header and the real User-Agent would not be set from this dict.
headers = {'User-Agent': useragent}

In [5]:
# Listing of movies in theaters, sorted A-Z, requested with page=5.
url = 'https://www.rottentomatoes.com/browse/movies_in_theaters/sort:a_z?page=5'
# Without a timeout, requests will wait indefinitely on an unresponsive server.
r = requests.get(url, headers=headers, timeout=30)
r

<Response [200]>

In [7]:
r.text

'<!DOCTYPE html>\n<html lang="en" dir="ltr" xmlns="http://www.w3.org/1999/xhtml" prefix="fb: http://www.facebook.com/2008/fbml og: http://opengraphprotocol.org/schema/">\n    <head prefix="og: http://ogp.me/ns# flixstertomatoes: http://ogp.me/ns/apps/flixstertomatoes#">\n        \n        \n            <script\n                charset="UTF-8"\n                crossorigin="anonymous"\n                data-domain-script="7e979733-6841-4fce-9182-515fac69187f"\n                integrity="sha384-TKdmlzVmoD70HzftTw4WtOzIBL5mNx8mXSRzEvwrWjpIJ7FZ/EuX758yMDWXtRUN"\n                src="https://cdn.cookielaw.org/consent/7e979733-6841-4fce-9182-515fac69187f/otSDKStub.js"\n                type="text/javascript"\n            >\n            </script>\n            <script type="text/javascript">\n                function OptanonWrapper() { }\n            </script>\n        \n\n        \n            <script\n                ccpa-opt-out-ids="USP"\n                ccpa-opt-out-geo="US"\n               

In [8]:
# Parse the downloaded listing page; the parser is named explicitly.
mysoup = BeautifulSoup(r.text, features='html.parser')

In [10]:
# BeautifulSoup has no `extract_text` method: an unknown attribute is treated as a
# tag lookup that returns None, so calling it raised
# "TypeError: 'NoneType' object is not callable".
# `get_text()` is the method that returns the text content of the parsed page.
mysoup.get_text()

In [16]:
# Goal 1: get a list of hyperlinks to individual movie pages
# find() returns only the first occurrence; find_all() returns every match, collected in a list

In [19]:
# Collect every <a> tag whose data-track attribute equals 'scores'.
href_list = mysoup.find_all(name='a', attrs={'data-track': 'scores'})

In [27]:
href_list[0]['href']

'/m/a_man_of_reason'

In [36]:
# The scraped hrefs are site-relative (e.g. '/m/a_man_of_reason'), so prefix each
# one with the site root to obtain an absolute movie-page URL.
movielinks = [
    'https://www.rottentomatoes.com' + href
    for href in (tag['href'] for tag in href_list)
]
movielinks


['https://www.rottentomatoes.com/m/a_man_of_reason',
 'https://www.rottentomatoes.com/m/a_quiet_place_day_one',
 'https://www.rottentomatoes.com/m/babes',
 'https://www.rottentomatoes.com/m/bad_boys_ride_or_die',
 'https://www.rottentomatoes.com/m/banel_and_adama',
 'https://www.rottentomatoes.com/m/beverly_hills_cop_axel_f',
 'https://www.rottentomatoes.com/m/blue_lock_the_movie_episode_nagi',
 'https://www.rottentomatoes.com/m/boneyard_2024',
 'https://www.rottentomatoes.com/m/chestnut',
 'https://www.rottentomatoes.com/m/close_encounters_of_the_third_kind',
 'https://www.rottentomatoes.com/m/daddio_2023',
 'https://www.rottentomatoes.com/m/dandelion_2024',
 'https://www.rottentomatoes.com/m/dead_whisper',
 'https://www.rottentomatoes.com/m/despicable_me_4',
 'https://www.rottentomatoes.com/m/talju',
 'https://www.rottentomatoes.com/m/escape_2023',
 'https://www.rottentomatoes.com/m/evil_does_not_exist',
 'https://www.rottentomatoes.com/m/firebrand_2023',
 'https://www.rottentomatoes

In [34]:
## Extract data from one movie page

In [54]:
url = movielinks[3]
url

'https://www.rottentomatoes.com/m/bad_boys_ride_or_die'

In [55]:
r = requests.get(url, headers = headers)
# Name the parser explicitly. With no parser argument BeautifulSoup picks whichever
# parser happens to be installed (and emits a warning), so the parse tree can differ
# between machines. 'html.parser' matches the other BeautifulSoup calls in this notebook.
mysoup1 = BeautifulSoup(r.text, 'html.parser')

In [56]:
mysoup1.find('title').text.replace(' | Rotten Tomatoes', '')

'Bad Boys: Ride or Die'

In [57]:
moviedata = {}

In [108]:
mysoup1.find('title').text


'Bad Boys: Ride or Die | Rotten Tomatoes'

In [58]:
moviedata['title'] = mysoup1.find('title').text.replace(' | Rotten Tomatoes', '')
moviedata

{'title': 'Bad Boys: Ride or Die'}

In [61]:
moviedata['description'] = mysoup1.find('meta', attrs = {'name':'description'})['content']

In [65]:
moviedata['critic_score'] = mysoup1.find('rt-button', slot='criticsScore').text.strip()
moviedata['audience_score'] = mysoup1.find('rt-button', slot='audienceScore').text.strip()

In [66]:
moviedata

{'title': 'Bad Boys: Ride or Die',
 'description': "This Summer, the world's favorite Bad Boys are back with their iconic mix of edge-of-your seat action and outrageous comedy but this time with a twist: Miami's finest are now on the run.",
 'critic_score': '64%',
 'audience_score': '97%'}

In [72]:
moviedata['critic_concensus'] = mysoup1.find('div', 'consensus').p.text

In [80]:
# Each 'category-wrap' div holds one label/value pair (e.g. Director, Genre, Runtime).
cat = mysoup1.find_all('div', 'category-wrap')
# After stripping, the label and its value are separated by three newlines.
cats = [x.text.strip().split('\n\n\n') for x in cat]
for c in cats:
    # An entry that did not split into label + value would raise IndexError on c[1];
    # skip it instead of aborting the whole cell.
    if len(c) < 2:
        continue
    # Multi-valued fields put each item on its own line; collapse them onto one line.
    moviedata[c[0]] = c[1].replace('\n', '')

In [81]:
cats

[['Director', 'Adil El Arbi, \nBilall Fallah'],
 ['Producer', 'Jerry Bruckheimer, \nWill Smith, \nChad Oman, \nDoug Belgrad'],
 ['Screenwriter', 'Chris Bremner'],
 ['Distributor', 'Columbia Pictures'],
 ['Production Co',
  'Columbia Pictures, \nWestbrook Studios, \nJerry Bruckheimer Films'],
 ['Rating', 'R (Some Sexual References|Language Throughout|Strong Violence)'],
 ['Genre', 'Action, \nComedy'],
 ['Original Language', 'English'],
 ['Release Date (Theaters)', 'Jun 7, 2024, Wide'],
 ['Box Office (Gross USA)', '$177.5M'],
 ['Runtime', '1h 55m'],
 ['Sound Mix', 'Dolby Atmos, \nDolby Digital'],
 ['Aspect Ratio', 'Digital 2.39:1']]

In [94]:
# Build the spider

def moviescrape(url, timeout=30):
    """Scrape one Rotten Tomatoes movie page into a flat dict.

    Parameters
    ----------
    url : str
        Absolute URL of a movie page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    dict
        Always contains the keys 'title', 'description', 'critic_score',
        'audience_score' and 'critic_concensus'; any of these that cannot be
        found on the page is stored as ''. One further key is added for each
        'category-wrap' entry on the page (e.g. 'Director', 'Genre').

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.

    Notes
    -----
    Request headers are read from the module-level ``headers`` dict.
    """

    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were a movie record.
    r.raise_for_status()
    mysoup1 = BeautifulSoup(r.text, 'html.parser')

    def stripped_text(tag):
        # find() returns None when nothing matches; report that as "no value".
        return tag.text.strip() if tag is not None else ''

    moviedata = {}

    title_tag = mysoup1.find('title')
    if title_tag is not None:
        moviedata['title'] = title_tag.text.replace(' | Rotten Tomatoes', '')
    else:
        moviedata['title'] = ''

    description_tag = mysoup1.find('meta', attrs = {'name':'description'})
    if description_tag is not None:
        moviedata['description'] = description_tag.get('content', '')
    else:
        moviedata['description'] = ''

    moviedata['critic_score'] = stripped_text(mysoup1.find('rt-button', slot='criticsScore'))
    moviedata['audience_score'] = stripped_text(mysoup1.find('rt-button', slot='audienceScore'))

    # Some pages have no critics consensus. Check for the missing elements explicitly
    # rather than using a bare `except:`, which would also hide unrelated errors
    # (including KeyboardInterrupt). The key's spelling is kept as-is because it
    # becomes a column name downstream.
    consensus_div = mysoup1.find('div', 'consensus')
    consensus_p = consensus_div.p if consensus_div is not None else None
    moviedata['critic_concensus'] = consensus_p.text if consensus_p is not None else ''

    # Each 'category-wrap' div is one label/value pair separated by three newlines.
    cat = mysoup1.find_all('div', 'category-wrap')
    cats = [x.text.strip().split('\n\n\n') for x in cat]
    for c in cats:
        # Skip entries that did not split into label + value (c[1] would raise IndexError).
        if len(c) < 2:
            continue
        # Multi-valued fields put each item on its own line; collapse them onto one line.
        moviedata[c[0]] = c[1].replace('\n', '')

    return moviedata
    

In [100]:
# Scrape every movie page in list order; each call issues one HTTP request.
rt_data = list(map(moviescrape, movielinks))

In [101]:
rt_dataframe = pd.DataFrame.from_records(rt_data)

In [102]:
rt_dataframe

Unnamed: 0,title,description,critic_score,audience_score,critic_concensus,Director,Producer,Screenwriter,Distributor,Production Co,Genre,Original Language,Release Date (Theaters),Release Date (Streaming),Runtime,Rating,Box Office (Gross USA),Sound Mix,Aspect Ratio,Rerelease Date (Theaters)
0,A Man of Reason,After 10 years in the prison on behalf of his ...,14%,15%,,Jung Woo-sung,Song Dae-chan,"Jung Hae-sin, Jung Woo-sung",Epic Pictures,Studio Take,"Action, Drama, Mystery & Thriller",Korean,"Jul 5, 2024, Limited","Jul 9, 2024",1h 37m,,,,,
1,A Quiet Place: Day One,Experience the day the world went quiet.,87%,73%,Grounded in raw humanity by Lupita Nyong'o and...,Michael Sarnoski,"Michael Bay, Andrew Form, Brad Fuller, John Kr...",Michael Sarnoski,Paramount Pictures,"Platinum Dunes, Sunday Night","Horror, Sci-Fi, Mystery & Thriller",English,"Jun 28, 2024, Wide",,1h 40m,PG-13 (Violent Content/Bloody Images|Terror),$94.0M,"Dolby Atmos, Dolby Digital",Digital 2.39:1,
2,Babes,BABES follows inseparable childhood friends Ed...,91%,79%,Offering an uproariously unvarnished look at m...,Pamela Adlon,"Ilana Glazer, Josh Rabinowitz, Susie Fox, Ashl...","Ilana Glazer, Josh Rabinowitz",Neon,"FilmNation Entertainment, Starrpix, Range Medi...",Comedy,English,"May 24, 2024, Wide","Jul 2, 2024",1h 44m,R (Language Throughout|Some Drug Use|Sexual Ma...,$3.7M,,,
3,Bad Boys: Ride or Die,"This Summer, the world's favorite Bad Boys are...",64%,97%,Will Smith and Martin Lawrence remain good com...,"Adil El Arbi, Bilall Fallah","Jerry Bruckheimer, Will Smith, Chad Oman, Doug...",Chris Bremner,Columbia Pictures,"Columbia Pictures, Westbrook Studios, Jerry Br...","Action, Comedy",English,"Jun 7, 2024, Wide",,1h 55m,R (Some Sexual References|Language Throughout|...,$177.5M,"Dolby Atmos, Dolby Digital",Digital 2.39:1,
4,Banel & Adama,Banel and Adama are fiercely in love. The youn...,95%,,,Ramata-Toulaye Sy,"Éric Névé, Maud Leclair, Andrey Samoute Diarra...",Ramata-Toulaye Sy,Kino Lorber,"La Chauve Souris, arte France Cinéma, Astou Films",Drama,French (France),"Jun 7, 2024, Limited","Aug 6, 2024",1h 27m,,$38.5K,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,The Strangers: Chapter 1,After their car breaks down in an eerie small ...,21%,44%,Lacking the chilling suspense of the original ...,Renny Harlin,"Courtney Solomon, Mark Canton, Christopher Mil...","Alan R. Cohen, Alan Freedland, Bryan Bertino",Lionsgate,Fifth Element Productions,"Horror, Mystery & Thriller",English,"May 17, 2024, Wide","Jun 7, 2024",1h 31m,R (Language|Horror Violence|Brief Drug Use),$35.1M,,Digital 2.39:1,
72,The Watchers (2024),"The Watchers follows Mina, a 28-year old artis...",32%,52%,Heavy on eerie mood but undone by an obtuse sc...,Ishana Shyamalan,"M. Night Shyamalan, Ashwin Rajan, Nimitt Mankad",Ishana Shyamalan,Warner Bros. Pictures,"New Line Cinema, Inimitable Pictures, Blinding...","Horror, Mystery & Thriller",English,"Jun 7, 2024, Wide","Jun 28, 2024",1h 42m,PG-13 (Violence|Terror|Some Thematic Elements),$19.1M,Dolby Digital,Flat (1.85:1),
73,Thelma (2024),Inspired by a real-life experience of director...,99%,83%,A stellar showcase for the talented June Squib...,Josh Margolin,"Zoë Worth, Chris Kaye, Nicholas Weinstock, Ben...",Josh Margolin,Magnolia Pictures,"Bandwagon, Zurich Avenue, Invention Studios","Comedy, Action",English,"Jun 21, 2024, Wide",,1h 37m,PG-13 (Strong Language),$6.9M,,,
74,Touch (2024),A romantic and thrilling story that spans seve...,100%,,,Baltasar Kormákur,"Baltasar Kormákur, Agnes Johansen, Mike Goodridge","Baltasar Kormákur, Ólaf Ólafsson",Focus Features,"RVK Studios, Good Chaos","Romance, Drama",English,"Jul 12, 2024, Limited",,2h 1m,R (Some Sexuality),,,,


In [4]:
# NOTE(review): BeautifulSoup parses the string it is given as markup; it does not
# download anything. Passing a URL therefore produces a document whose only content
# is the URL text itself (see the recorded output below). To parse the actual page,
# fetch it first, e.g. BeautifulSoup(requests.get(url, headers=headers).text, 'html.parser').
BeautifulSoup("http://www.google.com")



  BeautifulSoup("http://www.google.com")


http://www.google.com