In [None]:
# 1. import libraries
from bs4 import BeautifulSoup
import requests

In [None]:
# 2. url of the 'second' results page (note start=51 in the query string)
url = (
    "https://www.imdb.com/search/title/"
    "?title_type=feature"
    "&release_date=1990-01-01,1992-12-31"
    "&user_rating=7.5,"
    "&start=51"
    "&ref_=adv_nxt"
)

In [None]:
# 3. download html with a get request
# An explicit timeout is passed: without one, requests waits indefinitely
# when the server never answers, which would hang this cell.
response = requests.get(url, timeout=10)
response.status_code  # 200 status code means OK!

In [None]:
# 4.1. build the 'soup': parse the raw bytes of the response as html
html_bytes = response.content
soup = BeautifulSoup(html_bytes, "html.parser")
# 4.2. display the parsed document to check that the html looks as expected
soup

In [None]:
# Build and print the url of every results page: the `start` query
# parameter advances in steps of 50 (1, 51, 101, ..., 601).
iterations = range(1, 631, 50)

for i in iterations:
    start_at = str(i)
    url = (
        "https://www.imdb.com/search/title/?title_type=feature"
        "&release_date=1990-01-01,1992-12-31&user_rating=7.5,"
        f"&start={start_at}&ref_=adv_nxt"
    )
    print(url)

Respectful scraping:

There we have it, all the URLs we need! Before starting with the actual scraping, though, there's something we need to note when sending massive, automated requests to websites: it's rude. 

We just have 13 of them, which is not too many, but it's still a good practice to let a few seconds pass in between requests. 

Some pages don't like being scraped and will block your IP if they detect it's sending automated requests. Others might have a small server for the traffic they handle, and sending too many requests might crash the site. The `sleep` function from Python's `time` module will help us with that. 

Here's how it works, waiting 2 seconds between each iteration in a for loop:

In [None]:
# we need a few more tools for this one:
# - randint draws a random integer (used further down for random wait times)
# - time.sleep pauses execution for the given number of seconds
from random import randint
import time
time.sleep(2)  # pause for 2 seconds

In [None]:
# basic use of sleep in a for loop: print each number, then pause for a fixed time
pause_seconds = 2
for i in range(5):
    print(i)
    time.sleep(pause_seconds)

In [None]:
# more human use of sleep: the pause length changes randomly on every iteration

for i in range(5):
    print(i)
    wait_time = randint(1, 4)  # random whole number of seconds, 1 to 4 inclusive
    print(f"I will sleep for {wait_time} seconds.")
    time.sleep(wait_time)

To make this more interactive:
- you can split the scripts below into separate cells and run them yourself, or rewrite them in your own Python style

In [None]:
# Assembling the script to send and store multiple requests:
# one GET request per results page, with a random pause between requests.

pages = []

for i in iterations:
    # assemble the url of the results page that starts at item i:
    start_at = str(i)
    url = f"https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1992-12-31&user_rating=7.5,&start={start_at}&ref_=adv_nxt"

    # download html with a get request; the explicit timeout stops a single
    # unresponsive request from hanging the whole loop forever
    response = requests.get(url, timeout=10)

    # monitor the process by printing the status code
    print(f"Status code: {response.status_code}")

    # store response into "pages" list (kept whatever its status code is)
    pages.append(response)

    # respectful nap: random pause of 1 to 4 seconds before the next request
    wait_time = randint(1, 4)
    print(f"I will sleep for {wait_time} second/s.")
    time.sleep(wait_time)

In [None]:
# If you print the object `pages` after running the code above, you'll just see
# the response code messages. The html code is still accessible, though, and
# you can parse it the same way we've always done:

first_page = pages[0]
BeautifulSoup(first_page.content, "html.parser")

In [None]:
# Parse just the first page, for testing purposes
soup = BeautifulSoup(pages[0].content, "html.parser")

# Full selector of the first movie title, as copied from Chrome Dev Tools
first_title_selector = "#main > div > div.lister.list.detail.sub-list > div > div:nth-child(1) > div.lister-item-content > h3 > a"
soup.select(first_title_selector)

# Trimmed selector: without the leading path and nth-child(1) it grabs all the titles
all_titles_selector = "div.lister-item-content > h3 > a"
soup.select(all_titles_selector)

In [None]:
# Paste the Selector of the first movie's synopsis copied from Chrome Dev Tools
soup.select("#main > div > div.lister.list.detail.sub-list > div > div:nth-child(1) > div.lister-item-content  > p:nth-child(4)")

# Trim the selection: now it matches the p:nth-child(4) element inside every div.lister-item-content (the synopses)
soup.select("div.lister-item-content > p:nth-child(4)")

One of the ugliest things about the code above is that the HTML element containing the synopsis does not have any combination of tag and attribute that makes it unique. We've had to use select("p:nth-child(4)") and simply grab the <p> element that is the 4th child of its parent. Not very elegant, and it may break if the page layout changes, but for now it works.

We have noticed how both the title and the synopsis are children of div.lister-item-content. That will make our looping task a bit simpler.

There are many approaches to do this. The one we'll follow is:

Loop through the pages we collected, parse them ("create the soup") and store the parsed pages in a list.
For each parsed page, select the "blocks of HTML elements" that contain all the information of each movie (the title, the synopsis and other stuff).
For each one of the "blocks" we collected in the previous step:
Get the movie titles and store them in a list
Get the synopsis and store them in a list
Here's the code that does that:

In [None]:
# Parse every downloaded page and collect, movie by movie, its title and synopsis.
# Both lists are appended to in the same iteration, so titles[k] and
# synopsis[k] always refer to the same movie.
pages_parsed = []
titles = []
synopsis = []

for page in pages:
    # parse the page ("create the soup") and keep the parsed version
    page_soup = BeautifulSoup(page.content, "html.parser")
    pages_parsed.append(page_soup)
    # select only the blocks that hold the info about each movie
    movies_html = page_soup.select("div.lister-item-content")
    # for each movie, store its title and synopsis into the lists;
    # [0] takes the first match and raises IndexError if there is none
    for movie in movies_html:
        titles.append(movie.select("h3 > a")[0].get_text())
        synopsis.append(movie.select("p:nth-child(4)")[0].get_text().strip())