In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from time import sleep
from random import randint

# First example of multipage scraping

In [2]:
# We use the 2nd results page because it has the "next page" tag in it
# (note the `start=51` parameter in the URL).
url = 'https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,2021-12-31&user_rating=7.0,10.0&start=51&ref_=adv_nxt'

In [3]:
# Fetch the sample results page. The timeout (in seconds) keeps a stalled
# connection from blocking the kernel forever; requests has no default timeout.
response = requests.get(url, timeout=30)
# Show the HTTP status code as the cell output.
response.status_code

200

In [4]:
soup=BeautifulSoup(response.content, 'html.parser')

In [5]:
# Preview only the beginning of the indented HTML: printing the whole page
# floods the notebook with output and hides the cells that follow.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <script type="text/javascript">
   var IMDbTimer={starttime: new Date().getTime(),pt:'java'};
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <title>
   Feature Film,
Released between 1990-01-01 and 2021-12-31,
User Rating between 7 and 10
(Sorted by Popularity Ascending) - IMDb
  </title>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   if (typeof uex == 'function') {
      uex("ld", "LoadTitle", {wb: 1});
   

In [6]:
# To build the list of page URLs we need the `start` offset of every page.
# The offsets are 1, 51, 101, 151, ... (one step of 50 per page).
first_rank=1      # offset of the first page
rank_limit=2250   # stop before this value (max of 2250 results)
page_gap=50       # gap between two consecutive page offsets
iterations=range(first_rank,rank_limit,page_gap)

In [8]:
# Build (and print) one results-page URL per offset in `iterations`.
# The fixed "start=51" of the sample URL becomes a placeholder that is
# filled with the offset produced by the iteration.
url_template=('https://www.imdb.com/search/title/?title_type=feature'
              '&release_date=1990-01-01,2021-12-31&user_rating=7.0,10.0'
              '&start={}&ref_=adv_nxt')
for i in iterations:
    url=url_template.format(i)
    print(url)

https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,2021-12-31&user_rating=7.0,10.0&start=1&ref_=adv_nxt
https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,2021-12-31&user_rating=7.0,10.0&start=51&ref_=adv_nxt
https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,2021-12-31&user_rating=7.0,10.0&start=101&ref_=adv_nxt
https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,2021-12-31&user_rating=7.0,10.0&start=151&ref_=adv_nxt
https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,2021-12-31&user_rating=7.0,10.0&start=201&ref_=adv_nxt
https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,2021-12-31&user_rating=7.0,10.0&start=251&ref_=adv_nxt
https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,2021-12-31&user_rating=7.0,10.0&start=301&ref_=adv_nxt
https://www.imdb.com/search/title/?title_type=feature&release_date=1990-

# Create sleep process

In [9]:
# Demo of a fixed pause between steps: we pause between requests so the
# website doesn't detect us as a bot.
pause_seconds=3
for i in range(5):
    print(i)
    sleep(pause_seconds)  # sleep for 3 seconds

0
1
2
3
4


In [10]:
# Demo of a randomised pause: varying the delay makes the request pattern
# look less mechanical than a fixed interval.
for i in range(5):
    print(i)
    wait_time=randint(1,6)  # whole seconds, both ends inclusive
    # The spaces around the number were missing, which printed "for2seconds".
    print('now I will sleep for '+str(wait_time)+' seconds, goodnight')
    sleep(wait_time)

0
now I will sleep for2seconds, goodnight
1
now I will sleep for4seconds, goodnight
2
now I will sleep for2seconds, goodnight
3
now I will sleep for3seconds, goodnight
4
now I will sleep for6seconds, goodnight


# We get our response.content from the website (i.e., scrape all pages)

In [11]:
from tqdm.notebook import tqdm

In [16]:
# Download every results page and keep its raw HTML in `pages` for parsing later.
request_timeout=30  # seconds; requests waits forever on a stalled connection without it
pages = []
for i in tqdm(iterations):
    start_at=str(i)
    url='https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,2021-12-31&user_rating=7.0,10.0&start='+start_at+'&ref_=adv_nxt'
    response=requests.get(url, timeout=request_timeout) #get the page
    print('status_code: '+str(response.status_code)) #get the status code
    if response.ok:
        pages.append(response.content) #collect the page html in a list
    else:
        # An error page contains no movie entries; storing it would only
        # hide the failure from the parsing step.
        print('skipping page starting at '+start_at+' (request failed)')
    #have a nap (after failures too, so a struggling site is not hammered)
    wait_time=randint(1,4)
    print('now I sleep for '+str(wait_time))
    sleep(wait_time)

  0%|          | 0/45 [00:00<?, ?it/s]

status_code: 200
now I sleep for 1
status_code: 200
now I sleep for 2
status_code: 200
now I sleep for 2
status_code: 200
now I sleep for 1
status_code: 200
now I sleep for 4
status_code: 200
now I sleep for 1
status_code: 200
now I sleep for 3
status_code: 200
now I sleep for 2
status_code: 200
now I sleep for 3
status_code: 200
now I sleep for 3
status_code: 200
now I sleep for 1
status_code: 200
now I sleep for 4
status_code: 200
now I sleep for 2
status_code: 200
now I sleep for 2
status_code: 200
now I sleep for 2
status_code: 200
now I sleep for 3
status_code: 200
now I sleep for 3
status_code: 200
now I sleep for 4
status_code: 200
now I sleep for 4
status_code: 200
now I sleep for 3
status_code: 200
now I sleep for 1
status_code: 200
now I sleep for 1
status_code: 200
now I sleep for 2
status_code: 200
now I sleep for 4
status_code: 200
now I sleep for 4
status_code: 200
now I sleep for 2
status_code: 200
now I sleep for 1
status_code: 200
now I sleep for 3
status_code: 200
now

In [18]:
#title selector

#main > div > div.lister.list.detail.sub-list > div > div:nth-child(1) > div.lister-item-content > h3 > a
#main > div > div.lister.list.detail.sub-list > div > div:nth-child(2) > div.lister-item-content > h3 > a

# the only thing that changes is the child number, thus we can remove all the previous part of the selector

In [None]:
#synopsis selector

#main > div > div.lister.list.detail.sub-list > div > div:nth-child(1) > div.lister-item-content > p:nth-child(4)
#main > div > div.lister.list.detail.sub-list > div > div:nth-child(2) > div.lister-item-content > p:nth-child(4)

# Collect the content we want from pages []

In [19]:
# Parse only the first downloaded page, to check that the selectors work
# before processing every page.
soup=BeautifulSoup(pages[0], 'html.parser')

In [20]:
soup


<!DOCTYPE html>

<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>
<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
</script>
<script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>
<title>Feature Film,
Released between 1990-01-01 and 2021-12-31,
User Rating between 7 and 10
(Sorted by Popularity Ascending) - IMDb</title>
<script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>
<script>
    if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
</script>
<script>
    if (typeof uex == 'function') {
      uex("ld", "LoadTitle", {wb: 1});
    }
</script>
<link href="https://www.imdb.com/

In [22]:
# Show one result: [0] takes the first element matched by the selector and
# get_text(strip=True) returns its text without surrounding whitespace.
soup.select('div:nth-child(1) > div.lister-item-content > h3 > a')[0].get_text(strip=True)

'Doctor Strange (Doctor Extraño)'

In [26]:
# To select all the films on the page at once, use the short selector 'h3 > a'.
# The previous command could also be repeated with an iterator over the child
# number, but this is the faster solution here.
soup.select('h3 > a')

[<a href="/title/tt1211837/">Doctor Strange (Doctor Extraño)</a>,
 <a href="/title/tt10872600/">Spider-Man: No Way Home</a>,
 <a href="/title/tt4513678/">Cazafantasmas: Más allá</a>,
 <a href="/title/tt1189340/">El inocente (The Lincoln Lawyer)</a>,
 <a href="/title/tt0499549/">Avatar</a>,
 <a href="/title/tt1160419/">Dune</a>,
 <a href="/title/tt0325980/">Piratas del Caribe: La maldición de la Perla Negra</a>,
 <a href="/title/tt0120669/">Miedo y asco en Las Vegas</a>,
 <a href="/title/tt6398184/">Downton Abbey</a>,
 <a href="/title/tt8367814/">The Gentlemen: Los señores de la mafia</a>,
 <a href="/title/tt2382320/">Sin tiempo para morir</a>,
 <a href="/title/tt4154796/">Vengadores: Endgame</a>,
 <a href="/title/tt11271038/">Licorice Pizza</a>,
 <a href="/title/tt6467266/">¡Canta! 2</a>,
 <a href="/title/tt0111161/">Cadena perpetua</a>,
 <a href="/title/tt2953050/">Encanto</a>,
 <a href="/title/tt8772262/">Midsommar</a>,
 <a href="/title/tt10366460/">CODA: Los sonidos del silencio</a>

In [25]:
soup.select('div:nth-child(1) > div.lister-item-content > p:nth-child(4)')[0].get_text(strip=True)

'While on a journey of physical and spiritual healing, a brilliant neurosurgeon is drawn into the world of the mystic arts.'

In [27]:
soup.select('p:nth-child(4)')

[<p class="text-muted">
 While on a journey of physical and spiritual healing, a brilliant neurosurgeon is drawn into the world of the mystic arts.</p>,
 <p class="text-muted">
 With Spider-Man's identity now revealed, Peter asks Doctor Strange for help. When a spell goes wrong, dangerous foes from other worlds start to appear, forcing Peter to discover what it truly means to be Spider-Man.</p>,
 <p class="text-muted">
 When a single mom and her two kids arrive in a small town, they begin to discover their connection to the original Ghostbusters and the secret legacy their grandfather left behind.</p>,
 <p class="text-muted">
 A lawyer defending a wealthy man begins to believe his client is guilty of more than just one crime.</p>,
 <p class="text-muted">
 A paraplegic Marine dispatched to the moon Pandora on a unique mission becomes torn between following his orders and protecting the world he feels is his home.</p>,
 <p class="text-muted">
 A noble family becomes embroiled in a war fo

In [None]:
# for each set of results (page) create soup, 
# select relevant data from soup 
# create two lists - titles and synopsis
# create df 

In [53]:
len(pages)

45

In [54]:
# Parse every downloaded page and collect one title and one synopsis per movie.
# `titles` and `synopsis` always grow together, so position k in both lists
# refers to the same movie.
page_parsed=[]
titles=[]
synopsis=[]

for page_html in tqdm(pages):
    page_soup=BeautifulSoup(page_html,'html.parser')
    page_parsed.append(page_soup)
    # One 'div.lister-item-content' block per movie on the page.
    for movie in page_soup.select('div.lister-item-content'):
        # select_one returns None instead of raising when nothing matches,
        # so an entry with a missing field no longer aborts the whole run.
        title_tag=movie.select_one('h3 > a')
        synopsis_tag=movie.select_one('p:nth-child(4)')
        titles.append(title_tag.get_text() if title_tag is not None else None)
        synopsis.append(synopsis_tag.get_text(strip=True) if synopsis_tag is not None else None)

  0%|          | 0/45 [00:00<?, ?it/s]

In [55]:
len(titles)

2250

In [56]:
titles

['Doctor Strange (Doctor Extraño)',
 'Spider-Man: No Way Home',
 'Cazafantasmas: Más allá',
 'El inocente (The Lincoln Lawyer)',
 'Avatar',
 'Dune',
 'Piratas del Caribe: La maldición de la Perla Negra',
 'Miedo y asco en Las Vegas',
 'Downton Abbey',
 'The Gentlemen: Los señores de la mafia',
 'Sin tiempo para morir',
 'Vengadores: Endgame',
 'Licorice Pizza',
 '¡Canta! 2',
 'Cadena perpetua',
 'Encanto',
 'Midsommar',
 'CODA: Los sonidos del silencio',
 'El caballero oscuro',
 'El callejón de las almas perdidas',
 'El escuadrón suicida',
 'American Psycho',
 'La peor persona del mundo',
 'Free Guy',
 'Interstellar',
 'The Innocents',
 'Black Phone',
 'Infiltrados',
 'El lobo de Wall Street',
 'Jurassic Park (Parque Jurásico)',
 'Forrest Gump',
 'Harry Potter y la piedra filosofal',
 'Shang-Chi y la leyenda de los Diez Anillos',
 'Despierta la furia',
 'Titanic',
 'No mires arriba',
 'Érase una vez en... Hollywood',
 'Más allá del tiempo',
 'Vengadores: Infinity War',
 'Puñales por la

In [57]:
len(synopsis)

2250

In [58]:
synopsis

['While on a journey of physical and spiritual healing, a brilliant neurosurgeon is drawn into the world of the mystic arts.',
 "With Spider-Man's identity now revealed, Peter asks Doctor Strange for help. When a spell goes wrong, dangerous foes from other worlds start to appear, forcing Peter to discover what it truly means to be Spider-Man.",
 'When a single mom and her two kids arrive in a small town, they begin to discover their connection to the original Ghostbusters and the secret legacy their grandfather left behind.',
 'A lawyer defending a wealthy man begins to believe his client is guilty of more than just one crime.',
 'A paraplegic Marine dispatched to the moon Pandora on a unique mission becomes torn between following his orders and protecting the world he feels is his home.',
 "A noble family becomes embroiled in a war for control over the galaxy's most valuable asset while its heir becomes troubled by visions of a dark future.",
 'Blacksmith Will Turner teams up with ecc

In [59]:
# Assemble the two scraped lists into a table with one row per movie.
movie_columns={'title':titles,'synopsis':synopsis}
movies_all=pd.DataFrame(movie_columns)

In [60]:
movies_all.head()

Unnamed: 0,title,synopsis
0,Doctor Strange (Doctor Extraño),While on a journey of physical and spiritual h...
1,Spider-Man: No Way Home,"With Spider-Man's identity now revealed, Peter..."
2,Cazafantasmas: Más allá,When a single mom and her two kids arrive in a...
3,El inocente (The Lincoln Lawyer),A lawyer defending a wealthy man begins to bel...
4,Avatar,A paraplegic Marine dispatched to the moon Pan...


In [61]:
# Rank is the 1-based row position: rows are in the order they were scraped.
movies_all=movies_all.assign(rank=movies_all.index+1)

In [62]:
movies_all.head()

Unnamed: 0,title,synopsis,rank
0,Doctor Strange (Doctor Extraño),While on a journey of physical and spiritual h...,1
1,Spider-Man: No Way Home,"With Spider-Man's identity now revealed, Peter...",2
2,Cazafantasmas: Más allá,When a single mom and her two kids arrive in a...,3
3,El inocente (The Lincoln Lawyer),A lawyer defending a wealthy man begins to bel...,4
4,Avatar,A paraplegic Marine dispatched to the moon Pan...,5


# Example 2 - Scraping presidents of the US from wikipedia

- the list of presidents page
- from the list we take the name or href to next page - this gives a url
- go to the presidents own page
- pull information from the table in that page
- assemble a data frame

interesting data: name, birthdate, party...

In [28]:
url = 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States'

In [29]:
# Fetch the list-of-presidents page. The timeout (in seconds) prevents a
# stalled connection from blocking the kernel; requests has no default timeout.
response = requests.get(url, timeout=30)
# Show the HTTP status code as the cell output.
response.status_code

200

In [30]:
soup=BeautifulSoup(response.content,'html.parser')

In [None]:
#for the name:
#mw-content-text > div.mw-parser-output > table > tbody > tr:nth-child(1) > td:nth-child(3) > b > a
#mw-content-text > div.mw-parser-output > table > tbody > tr:nth-child(3) > td:nth-child(3) > b > a

In [35]:
# Collect the name link found in the third cell of each table row.
# Despite the name, this holds <a> tags rather than URL strings; the
# relative URL is read from each tag's 'href' attribute.
presi_urls=soup.select('td:nth-child(3) > b > a')

In [44]:
soup.select('td:nth-child(3) > b > a')[1]['href'] #just to understand the next step

'/wiki/John_Adams'

In [37]:
presi_urls

[<a href="/wiki/George_Washington" title="George Washington">George Washington</a>,
 <a href="/wiki/John_Adams" title="John Adams">John Adams</a>,
 <a href="/wiki/Thomas_Jefferson" title="Thomas Jefferson">Thomas Jefferson</a>,
 <a href="/wiki/James_Madison" title="James Madison">James Madison</a>,
 <a href="/wiki/James_Monroe" title="James Monroe">James Monroe</a>,
 <a href="/wiki/John_Quincy_Adams" title="John Quincy Adams">John Quincy Adams</a>,
 <a href="/wiki/Andrew_Jackson" title="Andrew Jackson">Andrew Jackson</a>,
 <a href="/wiki/Martin_Van_Buren" title="Martin Van Buren">Martin Van Buren</a>,
 <a href="/wiki/William_Henry_Harrison" title="William Henry Harrison">William Henry Harrison</a>,
 <a href="/wiki/John_Tyler" title="John Tyler">John Tyler</a>,
 <a href="/wiki/James_K._Polk" title="James K. Polk">James K. Polk</a>,
 <a href="/wiki/Zachary_Taylor" title="Zachary Taylor">Zachary Taylor</a>,
 <a href="/wiki/Millard_Fillmore" title="Millard Fillmore">Millard Fillmore</a>,
 

In [36]:
len(presi_urls)

46

In [40]:
# Turn each relative href (e.g. '/wiki/John_Adams') into an absolute
# Wikipedia URL by prefixing the site root.
wiki_root='https://en.wikipedia.org'
links=[wiki_root+anchor['href'] for anchor in presi_urls]

In [41]:
links

['https://en.wikipedia.org/wiki/George_Washington',
 'https://en.wikipedia.org/wiki/John_Adams',
 'https://en.wikipedia.org/wiki/Thomas_Jefferson',
 'https://en.wikipedia.org/wiki/James_Madison',
 'https://en.wikipedia.org/wiki/James_Monroe',
 'https://en.wikipedia.org/wiki/John_Quincy_Adams',
 'https://en.wikipedia.org/wiki/Andrew_Jackson',
 'https://en.wikipedia.org/wiki/Martin_Van_Buren',
 'https://en.wikipedia.org/wiki/William_Henry_Harrison',
 'https://en.wikipedia.org/wiki/John_Tyler',
 'https://en.wikipedia.org/wiki/James_K._Polk',
 'https://en.wikipedia.org/wiki/Zachary_Taylor',
 'https://en.wikipedia.org/wiki/Millard_Fillmore',
 'https://en.wikipedia.org/wiki/Franklin_Pierce',
 'https://en.wikipedia.org/wiki/James_Buchanan',
 'https://en.wikipedia.org/wiki/Abraham_Lincoln',
 'https://en.wikipedia.org/wiki/Andrew_Johnson',
 'https://en.wikipedia.org/wiki/Ulysses_S._Grant',
 'https://en.wikipedia.org/wiki/Rutherford_B._Hayes',
 'https://en.wikipedia.org/wiki/James_A._Garfield',


In [None]:
# info box label, info box data

In [45]:
url = 'https://en.wikipedia.org/wiki/Joe_Biden'

In [63]:
# Fetch a single president's page. The timeout (in seconds) prevents a
# stalled connection from blocking the kernel; requests has no default timeout.
response=requests.get(url, timeout=30)

In [64]:
response.status_code

200

In [49]:
soup=BeautifulSoup(response.content,'html.parser')

In [65]:
# Locate the info-box table by its two CSS classes. Searching for the exact
# attribute string 'infobox vcard' only matches when the class attribute is
# exactly that text; the CSS selector still matches when the table carries
# extra classes or lists them in another order.
# NOTE(review): presumably the info box on these pages has additional
# classes - confirm against the live page.
soup.select_one('table.infobox.vcard')

In [None]:
# Finally we just need to put that into a df