In [40]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [41]:
# I am going to look at the best movies of 2017
# A timeout makes the cell fail fast instead of hanging forever if the server never answers
result = requests.get("https://www.rottentomatoes.com/top/bestofrt/?year=2017", timeout=30)

In [42]:
# Let's check our status code: the Response repr shows it in brackets (200 means the request succeeded)
result

<Response [200]>

In [43]:
# Peek at the Content-Type response header to confirm the server sent back HTML
result.headers["Content-Type"]

'text/html;charset=UTF-8'

In [44]:
# Build a BeautifulSoup parse tree from the raw response body so we can search the page.
# "html.parser" selects the parser that ships with Python, so nothing extra has to be installed.
soup = BeautifulSoup(result.content, "html.parser")

In [45]:
# prettify() returns the parsed page as one indented string.
# As the last expression of the cell it is shown as a repr (literal \n escapes);
# wrap it in print(...) to see real line breaks.
soup.prettify()

u'<!DOCTYPE html>\n<html lang="en" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengraphprotocol.org/schema/">\n <head prefix="og: http://ogp.me/ns# flixstertomatoes: http://ogp.me/ns/apps/flixstertomatoes#">\n  <script src="//cdn.optimizely.com/js/594670329.js">\n  </script>\n  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>\n  <meta content="width=device-width,initial-scale=1" name="viewport">\n   <meta content="VPPXtECgUUeuATBacnqnCm4ydGO99reF-xgNklSbNbc" name="google-site-verification"/>\n   <meta content="034F16304017CA7DCF45D43850915323" name="msvalidate.01"/>\n   <link href="https://staticv2-4.rottentomatoes.com/static/images/iphone/apple-touch-icon.png" rel="apple-touch-icon"/>\n   <link href="https://staticv2-4.rottentomatoes.com/static/images/icons/favicon.ico" rel="shortcut icon" type="image/x-icon"/>\n   <link href="https://staticv2-4.rottentomatoes.com/static/styles/css/rt_main.css" rel="stylesheet"/>\n   <script id="jsonLdSchema" typ

In [46]:
# soup.title is the page's <title> tag; .contents[0] is its first child, i.e. the title text
soup.title.contents[0]

u'Top 100 Movies of 2017 - Rotten Tomatoes'

In [47]:
# Beautiful Soup has two main search methods: find_all returns every match, find returns only the first.
# Let's explore them both, starting with find_all to collect every <table> on the page.

tables = soup.find_all("table")

In [48]:
# Looks like find_all matched more tables than the one we are after.
# Print each one followed by a "#####" separator so the boundaries between tables are easy to spot.
for table in tables:
    print(table)
    print("#####")

<table class="movie_list tv_list" id="tv-list-21">
<tr class="tv_show_tr tvTopListTitle">
<td class="left_col">
<a href="https://www.rottentomatoes.com/tv/black_lightning/s02">
<span class="icon tiny fresh"></span>
<span class="tMeterScore">93%</span>
</a>
</td>
<td class="middle_col">
<a href="https://www.rottentomatoes.com/tv/black_lightning/s02">Black Lightning</a>
</td>
</tr><tr class="tv_show_tr tvTopListTitle">
<td class="left_col">
<a href="https://www.rottentomatoes.com/tv/i_am_the_night/s01">
<span class="icon tiny certified_fresh"></span>
<span class="tMeterScore">75%</span>
</a>
</td>
<td class="middle_col">
<a href="https://www.rottentomatoes.com/tv/i_am_the_night/s01">I Am the Night</a>
</td>
</tr><tr class="tv_show_tr tvTopListTitle">
<td class="left_col">
<a href="https://www.rottentomatoes.com/tv/the_passage/s01">
<span class="icon tiny rotten"></span>
<span class="tMeterScore">58%</span>
</a>
</td>
<td class="middle_col">
<a href="https://www.rottentomatoes.com/tv/the_

In [49]:
# Spot-check one match by position: in this response index 3 is the "Top-Box-Office" sidebar table,
# not the ranking we want, so picking a table by index is fragile.
tables[3]

<table class="movie_list" id="Top-Box-Office">\n<tr class="">\n<td class="left_col">\n<a href="/m/if_beale_street_could_talk">\n<span class="icon tiny certified_fresh"></span>\n<span class="tMeterScore">94%</span>\n</a>\n</td>\n<td class="middle_col">\n<a href="/m/if_beale_street_could_talk">If Beale Street Could Talk</a>\n</td>\n<td class="right_col right">\n</td>\n</tr><tr class="">\n<td class="left_col">\n<a href="/m/spider_man_into_the_spider_verse">\n<span class="icon tiny certified_fresh"></span>\n<span class="tMeterScore">97%</span>\n</a>\n</td>\n<td class="middle_col">\n<a href="/m/spider_man_into_the_spider_verse">Spider-Man: Into the Spider-Verse</a>\n</td>\n<td class="right_col right">\n</td>\n</tr><tr class="">\n<td class="left_col">\n<a href="/m/they_shall_not_grow_old">\n<span class="icon tiny certified_fresh"></span>\n<span class="tMeterScore">99%</span>\n</a>\n</td>\n<td class="middle_col">\n<a href="/m/they_shall_not_grow_old">They Shall Not Grow Old</a>\n</td>\n<td cl

In [50]:
# Narrow the search: find() returns only the first <table> whose CSS class is "table",
# which is the ranking table we are after. class_ is Beautiful Soup's keyword spelling of the
# "class" attribute filter (plain `class` is a reserved word in Python).
table = soup.find("table", class_="table")

In [51]:
# Inspect the match: it should be the ranking table (Rank / Rating / Title / No. of Reviews)
table

<table class="table">\n<thead>\n<tr>\n<th>Rank</th>\n<th><span class="hidden-xs">Rating</span><span class="visible-xs">Tomatometer</span></th>\n<th>Title</th>\n<th class="right hidden-xs">No. of Reviews</th>\n</tr>\n</thead>\n<tr>\n<td class="bold">1.</td>\n<td>\n<span class="tMeterIcon tiny">\n<span class="icon tiny certified_fresh"></span>\n<span class="tMeterScore">\xa099%</span>\n</span>\n</td>\n<td>\n<a class="unstyled articleLink" href="/m/lady_bird">\n            Lady Bird (2017)</a>\n</td>\n<td class="right hidden-xs">355</td>\n</tr>\n<tr>\n<td class="bold">2.</td>\n<td>\n<span class="tMeterIcon tiny">\n<span class="icon tiny certified_fresh"></span>\n<span class="tMeterScore">\xa098%</span>\n</span>\n</td>\n<td>\n<a class="unstyled articleLink" href="/m/get_out">\n            Get Out (2017)</a>\n</td>\n<td class="right hidden-xs">345</td>\n</tr>\n<tr>\n<td class="bold">3.</td>\n<td>\n<span class="tMeterIcon tiny">\n<span class="icon tiny certified_fresh"></span>\n<span class="

In [52]:
# We still have a lot of information so we need to go another layer deeper:
# grab every <tr> in the table (this includes the header row inside <thead>)
rows = table.find_all("tr")

In [53]:
# Let's see what we got. rows[0] is the header row, so rows[1] is the first movie.
rows[1]

<tr>\n<td class="bold">1.</td>\n<td>\n<span class="tMeterIcon tiny">\n<span class="icon tiny certified_fresh"></span>\n<span class="tMeterScore">\xa099%</span>\n</span>\n</td>\n<td>\n<a class="unstyled articleLink" href="/m/lady_bird">\n            Lady Bird (2017)</a>\n</td>\n<td class="right hidden-xs">355</td>\n</tr>

In [54]:
# Drill into a single row: the score lives in <span class="tMeterScore">, and contents[0] is its text.
# Note the text starts with a non-breaking space (\xa0) that will need cleaning up.
rows[1].find("span", {"class": "tMeterScore"}).contents[0]

u'\xa099%'

In [55]:
# We are almost there. Now we need to loop through and find only the contents we want

df_list = []

# rows[0] is the header row; every row after it is a movie, including the last one,
# so slice with [1:] rather than [1:-1] (which silently dropped the final movie).
for row in rows[1:]:
    score_tag = row.find("span", {"class": "tMeterScore"})
    title_tag = row.find("a", {"class": "unstyled articleLink"})
    reviews_tag = row.find("td", {"class": "right hidden-xs"})
    if score_tag is None or title_tag is None or reviews_tag is None:
        # find() returns None when nothing matches; a row missing any of the
        # three cells is not a movie row, so skip it instead of crashing.
        continue
    # The score text starts with a non-breaking space and the title with a newline
    # plus indentation; normalise the \xa0 and strip so the saved values are clean.
    score = score_tag.get_text().replace(u'\xa0', u' ').strip()
    title = title_tag.get_text().strip()
    reviews = reviews_tag.get_text().strip()
    df_list.append({"score": score, "title": title, "reviews": reviews})

# Build the frame once from the collected records; `columns` fixes the column order.
df = pd.DataFrame(df_list, columns=['title', "score", "reviews"])

In [56]:
# Display the scraped results: one row per movie with its title, score and review count
df

Unnamed: 0,title,score,reviews
0,Lady Bird (2017),99%,355
1,Get Out (2017),98%,345
2,Wonder Woman (2017),93%,417
3,Coco (2017),97%,316
4,Dunkirk (2017),92%,423
5,The Big Sick (2017),98%,279
6,Star Wars: The Last Jedi (2017),91%,429
7,Logan (2017),93%,381
8,The Shape of Water (2017),92%,401
9,Thor: Ragnarok (2017),92%,381


In [57]:
# Let's save our data to a CSV file that any spreadsheet program can open.
# NOTE: by default to_csv also writes the DataFrame index as the first column;
# pass index=False if only title/score/reviews should end up in the file.

df.to_csv("Good_Movies_2017.csv", encoding="utf-8")