In [1]:
# load packages
import requests
from bs4 import BeautifulSoup

In [2]:
#the URL of the site
base_site = "https://editorial.rottentomatoes.com/guide/140-essential-action-movies-to-watch-now/"

In [3]:
# sending a request to the webpage
response = requests.get(base_site)
response.status_code

200

In [4]:
# get the HTML from the webpage
html = response.content

In [5]:
# convert the HTML to a BeatifulSoup object
soup = BeautifulSoup(html, 'lxml')

In [6]:
# Exporting the HTML to a file
with open('Rotten_tomatoes_page_2_LXML_Parser.html', 'wb') as file:
    file.write(soup.prettify('utf-8'))

## Finding an element containing all the data

In [7]:
# Find all div tags on the webpage containing the information we want to scrape
divs = soup.find_all("div", {"class": "col-sm-18 col-full-xs countdown-item-content"})

# Extracting the title, year and score of each movie

In [8]:
# The title, year and score of each movie are contained in the 'h2' tags

In [9]:
# Extracting all 'h2' tags
headings = [div.find("h2") for div in divs]

In [10]:
# Inspecting the text inside the headings
[heading.text for heading in headings][0]

'Running Scared (1986)  59%58%'

In [None]:
#  notice that:

# The movie title is in the 'a' tag
# The year is in a 'span' with class 'start-year'
# The score is in a 'span' with class 'tMeterScore'

## Title

In [11]:
# Let's check all heading links
[heading.find('a') for heading in headings][0]

<a href="https://www.rottentomatoes.com/m/1018009-running_scared">Running Scared</a>

In [12]:
# Obtaining the movie titles from the links
movie_names = [heading.find('a').string for heading in headings]
movie_names

['Running Scared',
 'Equilibrium',
 'Hero',
 'Road House',
 'Unstoppable',
 'Shaft',
 'The Villainess',
 'Highlander',
 'Die Hard 2',
 'National Treasure',
 'The Protector',
 'Revenge',
 'El Mariachi',
 'A Touch of Zen',
 'Top Gun',
 'Con Air',
 'The Expendables 2',
 'The Mummy',
 'Mr. & Mrs. Smith',
 'Rush Hour',
 'The Equalizer',
 'Captain America: Civil War',
 'Air Force One',
 'Bloodsport',
 'Blade',
 'Bad Boys',
 'Die Hard With a Vengeance',
 'The Running Man',
 'Code of Silence',
 "Shoot 'Em Up",
 'Crank',
 'Machete',
 'Drive',
 'Batman',
 'Under Siege',
 'Independence Day',
 'Bullitt',
 'Wanted',
 'Superman: The Movie',
 'Ronin',
 'They Live',
 'Cliffhanger',
 "Marvel's the Avengers",
 'Hot Fuzz',
 'The Warriors',
 'Starship Troopers',
 'Elite Squad 2',
 'Point Break',
 'The Long Kiss Goodnight',
 'The Guest',
 'Taken',
 '300',
 'True Lies',
 'Demolition Man',
 'Hardcore Henry',
 'Police Story',
 'Brotherhood of the Wolf',
 'Kingsman: The Secret Service',
 'The Fifth Element',
 

## Year

In [13]:
# Extracting the year string
years = [heading.find("span", class_ = 'start-year').string for heading in headings]

In [14]:
# Updating years with stripped values
years = [int(year.strip('()')) for year in years]
years

[1986,
 2002,
 2002,
 1989,
 2010,
 1971,
 2017,
 1986,
 1990,
 2004,
 2005,
 2017,
 1992,
 1971,
 1986,
 1997,
 2012,
 1999,
 2005,
 1998,
 2014,
 2016,
 1997,
 1988,
 1998,
 1995,
 1995,
 1987,
 1985,
 2007,
 2006,
 2010,
 2011,
 1989,
 1992,
 1996,
 1968,
 2008,
 1978,
 1998,
 1988,
 1993,
 2012,
 2007,
 1979,
 1997,
 2010,
 1991,
 1996,
 2014,
 2008,
 2006,
 1994,
 1993,
 2015,
 1985,
 2001,
 2014,
 1997,
 1986,
 2017,
 1995,
 2004,
 1984,
 2003,
 2004,
 1993,
 1981,
 2000,
 2004,
 2010,
 1992,
 1989,
 2004,
 1986,
 2008,
 2018,
 2017,
 1964,
 1976,
 2017,
 1972,
 2014,
 2003,
 1971,
 2015,
 1990,
 1992,
 1971,
 2014,
 2003,
 1993,
 2018,
 2010,
 1995,
 2002,
 2019,
 2012,
 2002,
 2008,
 1997,
 1985,
 2008,
 2011,
 2011,
 1987,
 1996,
 1987,
 2017,
 2006,
 2017,
 1994,
 1989,
 2014,
 1973,
 1985,
 1982,
 2015,
 1984,
 2000,
 2003,
 1994,
 1994,
 1994,
 2014,
 2000,
 1987,
 2007,
 1990,
 1981,
 1995,
 2011,
 2018,
 1981,
 1986,
 1992,
 1999,
 1991,
 1988,
 2015]

## Score

In [15]:
# Filtering only the spans containing the score
[heading.find("span", class_ = 'tMeterScore') for heading in headings][0]

<span class="tMeterScore">59%</span>

In [16]:
# Extracting the score string
scores = [int(heading.find("span", class_ = 'tMeterScore').string.strip('%')) for heading in headings]
scores

[59,
 40,
 94,
 39,
 87,
 87,
 85,
 70,
 69,
 46,
 53,
 93,
 91,
 97,
 58,
 58,
 67,
 61,
 59,
 61,
 60,
 90,
 79,
 46,
 57,
 42,
 58,
 67,
 70,
 67,
 62,
 71,
 93,
 73,
 79,
 67,
 98,
 71,
 93,
 69,
 85,
 67,
 91,
 91,
 88,
 68,
 91,
 70,
 68,
 92,
 59,
 61,
 70,
 62,
 51,
 93,
 73,
 75,
 71,
 75,
 79,
 80,
 80,
 83,
 85,
 86,
 91,
 86,
 88,
 93,
 95,
 89,
 88,
 90,
 93,
 94,
 91,
 94,
 99,
 96,
 93,
 83,
 90,
 82,
 96,
 81,
 89,
 92,
 89,
 91,
 85,
 96,
 96,
 87,
 75,
 90,
 94,
 79,
 84,
 86,
 92,
 85,
 94,
 93,
 78,
 80,
 68,
 91,
 89,
 94,
 92,
 100,
 98,
 82,
 95,
 67,
 86,
 94,
 100,
 79,
 85,
 74,
 94,
 84,
 86,
 97,
 80,
 92,
 83,
 94,
 88,
 87,
 97,
 96,
 98,
 94,
 88,
 93,
 94,
 97]

## Critics Consensus

In [17]:
# The critics consensus is located inside a 'div' tag with the class 'info critics-consensus'
# This can be found inside the original 'div's we scraped
#divs

In [18]:
# Getting the 'div' tags containing the critics consensus
consensus = [div.find("div", {"class": "info critics-consensus"}) for div in divs]

In [19]:
# Defining the phrase to be removed 
common_phrase = 'Critics Consensus: '

In [20]:
# Define a variable to store the length
common_len = len(common_phrase)

In [None]:
# Cleaning the list of the common phrase
consensus_text = [con.text[common_len:] for con in consensus]

In [21]:
# We can add if-else logic to only truncate the string in case it starts with the common phrase
consensus_text = [con.text[common_len:] if con.text.startswith(common_phrase) else con.text for con in consensus ]
consensus_text[0]

'Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.'

## Directors

In [22]:
# Extracting all director divs
directors = [div.find("div", class_ = 'director') for div in divs]

In [23]:
# The director's name can be found as the string of a link

# Obtaining all director links
[director.find("a") for director in directors][0]

<a class="" href="//www.rottentomatoes.com/celebrity/peter_hyams">Peter Hyams</a>

In [24]:
final_directors = [director.find("a").string for director in directors]
final_directors

['Peter Hyams',
 'Kurt Wimmer',
 'Zhang Yimou',
 'Rowdy Herrington',
 'Tony Scott',
 'Gordon Parks',
 'Jeong Byeong-gil',
 'Russell Mulcahy',
 'Renny Harlin',
 'Jon Turteltaub',
 'Prachya Pinkaew',
 'Coralie Fargeat',
 'Robert Rodriguez',
 'King Hu',
 'Tony Scott',
 'Simon West',
 'Simon West',
 'Stephen Sommers',
 'Doug Liman',
 'Brett Ratner',
 'Antoine Fuqua',
 'Anthony Russo',
 'Wolfgang Petersen',
 'Newt Arnold',
 'Stephen Norrington',
 'Michael Bay',
 'John McTiernan',
 'Paul Michael Glaser',
 'Andrew Davis',
 'Michael Davis',
 'Mark Neveldine',
 'Robert Rodriguez',
 'Nicolas Winding Refn',
 'Tim Burton',
 'Andrew Davis',
 'Roland Emmerich',
 'Peter Yates',
 'Timur Bekmambetov',
 'Richard Donner',
 'John Frankenheimer',
 'John Carpenter',
 'Renny Harlin',
 'Joss Whedon',
 'Edgar Wright',
 'Walter Hill',
 'Paul Verhoeven',
 'José Padilha',
 'Kathryn Bigelow',
 'Renny Harlin',
 'Adam Wingard',
 'Pierre Morel',
 'Zack Snyder',
 'James Cameron',
 'Marco Brambilla',
 'Ilya Naishuller'

## Cast info

In [25]:
cast_info = [div.find("div", class_ = 'cast') for div in divs]

In [26]:
# Each cast member's name is the string of a link
# There are multiple cast members for a movie

In [27]:
# Initialize the list of all cast memners
cast = []
for c in cast_info:
    cast_links = c.find_all('a')
    cast_names = [link.string for link in cast_links]
    
    cast.append(", ".join(cast_names)) 

cast

['Gregory Hines, Billy Crystal, Jimmy Smits, Steven Bauer',
 'Christian Bale, Emily Watson, Taye Diggs, Angus Macfadyen',
 'Jet Li, Tony Leung Chiu Wai, Maggie Cheung Man-yuk, Donnie Yen',
 'Patrick Swayze, Kelly Lynch, Sam Elliott, Ben Gazzara',
 'Denzel Washington, Chris Pine, Rosario Dawson, Kevin Dunn',
 'Richard Roundtree, Moses Gunn, Christopher St. John, Charles Cioffi',
 'Kim Ok-bin, Shin Ha-kyun, Sung-joon, Kim Seo-hyung',
 'Christopher Lambert, Sean Connery, Roxanne Hart, Clancy Brown',
 'Bruce Willis, Bonnie Bedelia, William Atherton, Reginald VelJohnson',
 'Nicolas Cage, Diane Kruger, Justin Bartha, Sean Bean',
 'Tony Jaa, Johnny Nguyen, Nathan Jones, Petchtai Wongkamlao',
 'Matilda Lutz, Kevin Janssens, Vincent Colombe, Guillaume Bouchède',
 'Carlos Gallardo, Consuelo Gómez, Reinol Martinez, Peter Marquardt',
 'Feng Hsu, Chun Shih, Pai Ying, Roy Chiao',
 'Tom Cruise, Kelly McGillis, Anthony Edwards, Val Kilmer',
 'Nicolas Cage, John Cusack, John Malkovich, Steve Buscemi',


## Synopsis

In [28]:
# The synopsis is located inside a 'div' tag with the class 'info synopsis'
synopsis = [div.find('div', class_='synopsis') for div in divs]

In [29]:
# Extracting the text
synopsis_text = [syn.contents[1].strip() for syn in synopsis]
synopsis_text

['Ray and Danny (Gregory Hines, Billy Crystal) are two Chicago police detectives hot on the trail of drug kingpin Julio...',
 'In a futuristic world, a regime has eliminated war by suppressing emotions: books, art and music are strictly forbidden and...',
 'In this visually arresting martial arts epic set in ancient China, an unnamed fighter (Jet Li) is being honored for...',
 'The Double Deuce is the meanest, loudest and rowdiest bar south of the Mason-Dixon Line, and Dalton (Patrick Swayze) has...',
 'When a massive, unmanned locomotive roars out of control, the threat is more ominous than just a derailment. The train...',
 'John Shaft (Richard Roundtree) is the ultimate in suave black detectives. He first finds himself up against Bumpy (Moses Gunn),...',
 'Honed from childhood to be an elite assassin, Sook-hee embarks on a rampage of violence and revenge to finally earn...',
 'When the mystical Russell Nash (Christopher Lambert) kills a man in a sword fight in a New York City parkin

# Representing the data in structured form

In [30]:
# load the pandas package
import pandas as pd

## Creating a Data Frame

In [31]:
movies_info = pd.DataFrame()
movies_info["Movie Title"] = movie_names
movies_info["Year"] = years
movies_info["Score"] = scores
movies_info["Director"] = final_directors
movies_info["Synopsis"] = synopsis_text  
movies_info["Cast"] = cast
movies_info["Consensus"] = consensus_text

pd.set_option('display.max_colwidth', -1)
movies_info

  pd.set_option('display.max_colwidth', -1)


Unnamed: 0,Movie Title,Year,Score,Director,Synopsis,Cast,Consensus
0,Running Scared,1986,59,Peter Hyams,"Ray and Danny (Gregory Hines, Billy Crystal) are two Chicago police detectives hot on the trail of drug kingpin Julio...","Gregory Hines, Billy Crystal, Jimmy Smits, Steven Bauer","Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining."
1,Equilibrium,2002,40,Kurt Wimmer,"In a futuristic world, a regime has eliminated war by suppressing emotions: books, art and music are strictly forbidden and...","Christian Bale, Emily Watson, Taye Diggs, Angus Macfadyen",Equilibrium is a reheated mishmash of other sci-fi movies.
2,Hero,2002,94,Zhang Yimou,"In this visually arresting martial arts epic set in ancient China, an unnamed fighter (Jet Li) is being honored for...","Jet Li, Tony Leung Chiu Wai, Maggie Cheung Man-yuk, Donnie Yen","With death-defying action sequences and epic historic sweep, Hero offers everything a martial arts fan could ask for."
3,Road House,1989,39,Rowdy Herrington,"The Double Deuce is the meanest, loudest and rowdiest bar south of the Mason-Dixon Line, and Dalton (Patrick Swayze) has...","Patrick Swayze, Kelly Lynch, Sam Elliott, Ben Gazzara",Whether Road House is simply bad or so bad it's good depends largely on the audience's fondness for Swayze -- and tolerance for violently cheesy action.
4,Unstoppable,2010,87,Tony Scott,"When a massive, unmanned locomotive roars out of control, the threat is more ominous than just a derailment. The train...","Denzel Washington, Chris Pine, Rosario Dawson, Kevin Dunn","As fast, loud, and relentless as the train at the center of the story, Unstoppable is perfect popcorn entertainment -- and director Tony Scott's best movie in years."
...,...,...,...,...,...,...,...
135,Hard-Boiled,1992,94,John Woo,A cop who loses his partner in a shoot-out with gun smugglers goes on a mission to catch them. In...,"Chow Yun-Fat, Bowie Lam, Philip Chan, Tony Leung Chiu Wai","Boasting impactful action as well as surprising emotional resonance, Hard Boiled is a powerful thriller that hits hard in more ways than one."
136,The Matrix,1999,88,Andy Wachowski,"Neo (Keanu Reeves) believes that Morpheus (Laurence Fishburne), an elusive figure considered to be the most dangerous man alive, can...","Keanu Reeves, Laurence Fishburne, Carrie-Anne Moss, Hugo Weaving","Thanks to the Wachowskis' imaginative vision, The Matrix is a smartly crafted combination of spectacular action and groundbreaking special effects."
137,Terminator 2: Judgment Day,1991,93,James Cameron,"In this sequel set eleven years after ""The Terminator,"" young John Connor (Edward Furlong), the key to civilization's victory over...","Arnold Schwarzenegger, Linda Hamilton, Edward Furlong, Robert Patrick","T2 features thrilling action sequences and eye-popping visual effects, but what takes this sci-fi/ action landmark to the next level is the depth of the human (and cyborg) characters."
138,Die Hard,1988,94,John McTiernan,New York City policeman John McClane (Bruce Willis) is visiting his estranged wife (Bonnie Bedelia) and two daughters on Christmas...,"Bruce Willis, Alan Rickman, Bonnie Bedelia, Reginald VelJohnson",Its many imitators (and sequels) have never come close to matching the taut thrills of the definitive holiday action classic.


## Exporting the data to CSV (comma-separated values) and excel files

In [32]:
# Write data to excel file
movies_info.to_excel("movies_info.xlsx", index = False, header = True)

In [33]:
# or write data to CSV file
movies_info.to_csv("movies_info.csv", index = False, header = True)