In [1]:
# load packages
import requests
from bs4 import BeautifulSoup

In [2]:
#the URL of the site
base_site = "https://editorial.rottentomatoes.com/guide/140-essential-action-movies-to-watch-now/"

In [3]:
# sending a request to the webpage
response = requests.get(base_site)
response.status_code

200

In [4]:
# get the HTML from the webpage
html = response.content

In [5]:
# convert the HTML to a BeatifulSoup object
soup = BeautifulSoup(html, 'lxml')

In [6]:
# Exporting the HTML to a file
with open('Rotten_tomatoes_page_2_LXML_Parser.html', 'wb') as file:
    file.write(soup.prettify('utf-8'))

## Finding an element containing all the data

In [7]:
# Find all div tags on the webpage containing the information we want to scrape
divs = soup.find_all("div", {"class": "col-sm-18 col-full-xs countdown-item-content"})

# Extracting the title, year and score of each movie

In [8]:
# The title, year and score of each movie are contained in the 'h2' tags

In [9]:
# Extracting all 'h2' tags
headings = [div.find("h2") for div in divs]

In [10]:
# Inspecting the text inside the headings
[heading.text for heading in headings][0]

'Running Scared (1986)  59%58%'

In [11]:
#  notice that:

# The movie title is in the 'a' tag
# The year is in a 'span' with class 'start-year'
# The score is in a 'span' with class 'tMeterScore'

## Title

In [12]:
# Let's check all heading links
[heading.find('a') for heading in headings][0]

<a href="https://www.rottentomatoes.com/m/1018009-running_scared">Running Scared</a>

In [13]:
# Obtaining the movie titles from the links
movie_names = [heading.find('a').string for heading in headings]
movie_names[:5]

['Running Scared', 'Equilibrium', 'Hero', 'Road House', 'Unstoppable']

## Year

In [14]:
# Extracting the year string
years = [heading.find("span", class_ = 'start-year').string for heading in headings]

In [15]:
# Updating years with stripped values
years = [int(year.strip('()')) for year in years]
years[:5]

[1986, 2002, 2002, 1989, 2010]

## Score

In [16]:
# Filtering only the spans containing the score
[heading.find("span", class_ = 'tMeterScore') for heading in headings][0]

<span class="tMeterScore">59%</span>

In [17]:
# Extracting the score string
scores = [int(heading.find("span", class_ = 'tMeterScore').string.strip('%')) for heading in headings]
scores[:5]

[59, 40, 94, 39, 87]

## Critics Consensus

In [18]:
# The critics consensus is located inside a 'div' tag with the class 'info critics-consensus'
# This can be found inside the original 'div's we scraped
#divs

In [19]:
# Getting the 'div' tags containing the critics consensus
consensus = [div.find("div", {"class": "info critics-consensus"}) for div in divs]

In [20]:
# Defining the phrase to be removed 
common_phrase = 'Critics Consensus: '

In [21]:
# Define a variable to store the length
common_len = len(common_phrase)

In [22]:
# Cleaning the list of the common phrase
consensus_text = [con.text[common_len:] for con in consensus]

In [23]:
# We can add if-else logic to only truncate the string in case it starts with the common phrase
consensus_text = [con.text[common_len:] if con.text.startswith(common_phrase) else con.text for con in consensus ]
consensus_text[0]

'Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.'

## Directors

In [24]:
# Extracting all director divs
directors = [div.find("div", class_ = 'director') for div in divs]

In [25]:
# The director's name can be found as the string of a link

# Obtaining all director links
[director.find("a") for director in directors][0]

<a class="" href="//www.rottentomatoes.com/celebrity/peter_hyams">Peter Hyams</a>

In [26]:
final_directors = [director.find("a").string for director in directors]
final_directors[:5]

['Peter Hyams', 'Kurt Wimmer', 'Zhang Yimou', 'Rowdy Herrington', 'Tony Scott']

## Cast info

In [27]:
cast_info = [div.find("div", class_ = 'cast') for div in divs]

In [28]:
# Each cast member's name is the string of a link
# There are multiple cast members for a movie

In [29]:
# Initialize the list of all cast memners
cast = []
for c in cast_info:
    cast_links = c.find_all('a')
    cast_names = [link.string for link in cast_links]
    
    cast.append(", ".join(cast_names)) 

cast[:5]

['Gregory Hines, Billy Crystal, Jimmy Smits, Steven Bauer',
 'Christian Bale, Emily Watson, Taye Diggs, Angus Macfadyen',
 'Jet Li, Tony Leung Chiu Wai, Maggie Cheung Man-yuk, Donnie Yen',
 'Patrick Swayze, Kelly Lynch, Sam Elliott, Ben Gazzara',
 'Denzel Washington, Chris Pine, Rosario Dawson, Kevin Dunn']

## Synopsis

In [30]:
# The synopsis is located inside a 'div' tag with the class 'info synopsis'
synopsis = [div.find('div', class_='synopsis') for div in divs]

In [31]:
# Extracting the text
synopsis_text = [syn.contents[1].strip() for syn in synopsis]
synopsis_text[:5]

['Ray and Danny (Gregory Hines, Billy Crystal) are two Chicago police detectives hot on the trail of drug kingpin Julio...',
 'In a futuristic world, a regime has eliminated war by suppressing emotions: books, art and music are strictly forbidden and...',
 'In this visually arresting martial arts epic set in ancient China, an unnamed fighter (Jet Li) is being honored for...',
 'The Double Deuce is the meanest, loudest and rowdiest bar south of the Mason-Dixon Line, and Dalton (Patrick Swayze) has...',
 'When a massive, unmanned locomotive roars out of control, the threat is more ominous than just a derailment. The train...']

# Representing the data in structured form

In [32]:
# load the pandas package
import pandas as pd

## Creating a Data Frame

In [33]:
movies_info = pd.DataFrame()
movies_info["Movie Title"] = movie_names
movies_info["Year"] = years
movies_info["Score"] = scores
movies_info["Director"] = final_directors
movies_info["Synopsis"] = synopsis_text  
movies_info["Cast"] = cast
movies_info["Consensus"] = consensus_text

pd.set_option('display.max_colwidth', -1)
movies_info.head(5)

  pd.set_option('display.max_colwidth', -1)


Unnamed: 0,Movie Title,Year,Score,Director,Synopsis,Cast,Consensus
0,Running Scared,1986,59,Peter Hyams,"Ray and Danny (Gregory Hines, Billy Crystal) are two Chicago police detectives hot on the trail of drug kingpin Julio...","Gregory Hines, Billy Crystal, Jimmy Smits, Steven Bauer","Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining."
1,Equilibrium,2002,40,Kurt Wimmer,"In a futuristic world, a regime has eliminated war by suppressing emotions: books, art and music are strictly forbidden and...","Christian Bale, Emily Watson, Taye Diggs, Angus Macfadyen",Equilibrium is a reheated mishmash of other sci-fi movies.
2,Hero,2002,94,Zhang Yimou,"In this visually arresting martial arts epic set in ancient China, an unnamed fighter (Jet Li) is being honored for...","Jet Li, Tony Leung Chiu Wai, Maggie Cheung Man-yuk, Donnie Yen","With death-defying action sequences and epic historic sweep, Hero offers everything a martial arts fan could ask for."
3,Road House,1989,39,Rowdy Herrington,"The Double Deuce is the meanest, loudest and rowdiest bar south of the Mason-Dixon Line, and Dalton (Patrick Swayze) has...","Patrick Swayze, Kelly Lynch, Sam Elliott, Ben Gazzara",Whether Road House is simply bad or so bad it's good depends largely on the audience's fondness for Swayze -- and tolerance for violently cheesy action.
4,Unstoppable,2010,87,Tony Scott,"When a massive, unmanned locomotive roars out of control, the threat is more ominous than just a derailment. The train...","Denzel Washington, Chris Pine, Rosario Dawson, Kevin Dunn","As fast, loud, and relentless as the train at the center of the story, Unstoppable is perfect popcorn entertainment -- and director Tony Scott's best movie in years."


## Exporting the data to CSV (comma-separated values) and excel files

In [34]:
# Write data to excel file
movies_info.to_excel("movies_info.xlsx", index = False, header = True)

In [35]:
# or write data to CSV file
movies_info.to_csv("movies_info.csv", index = False, header = True)