## How To Web Scrape the IMDb Website with Beautiful Soup

In [17]:
# import modules for requesting, parsing, and analyzing the scraped data
import json
import random
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup as BS
from tqdm.notebook import trange, tqdm

pd.set_option("display.max_rows", 50)


In [18]:
# request header sent with every page fetch so that movie titles come back
# in English rather than in a localized translation
headers = {
    "Accept-Language": "en-US, en;q=0.5",
}


In [19]:
# start every storage list for the scraped data empty; each name gets its own
# independent list object
(
    titles,
    imdb_ids,
    pg_ratings,
    years,
    runtimes,
    genres,
    imdb_ratings,
    metascores,
    votes,
    gross_us,
    names,
) = ([] for _ in range(11))


In [21]:
# Iterate through each results page (50 movies per page) of the IMDb top 1000,
# sorted by US gross box office to get a dataset with mostly American films.
# Every scraped field is appended to its own list; optional fields that are
# missing from a listing are stored as '--' so all lists keep the same length
# and can later be combined into one DataFrame.
_MISSING = '--'


def _text_or_missing(tag):
    """Return the text of ``tag``, or the '--' placeholder when nothing was found."""
    return tag.text if tag is not None else _MISSING


# tqdm creates the progress bar for the web scrape
for n in tqdm(range(1, 1000, 50), desc='Download Progress:'):
    # Build the URL of the page whose first movie is at position n
    url = 'https://www.imdb.com/search/title/?groups=top_1000&sort=boxoffice_gross_us,desc&start={}&ref_=adv_nxt'.format(n)
    # Request the page; the timeout keeps a stalled connection from hanging the notebook
    results = requests.get(url, headers=headers, timeout=30)
    # Stop on an HTTP error instead of silently parsing an error page
    results.raise_for_status()
    # Create an instance of BeautifulSoup to parse results
    soup = BS(results.text, "html.parser")
    # Find the div containers in the HTML that hold the wanted information
    movie_div = soup.find_all('div', class_='lister-item mode-advanced')
    if not movie_div:
        # Without this check a changed or blocked page would just add zero rows
        raise RuntimeError('No movie containers found at {}'.format(url))
    # Pause for a random 3-12 seconds between page requests
    time.sleep(random.randint(3, 12))

    # for each movie container found on this page
    for div in movie_div:
        # title: text of the <a> tag nested inside the <h3> tag
        # (attribute notation only returns the first instance of a tag)
        titles.append(div.h3.a.text)
        # IMDb id: 'data-tconst' attribute of the first <img> tag
        imdb_ids.append(div.find('img')['data-tconst'])
        # year: text of the span with class lister-item-year inside the <h3> tag
        years.append(div.h3.find('span', class_='lister-item-year').text)
        # pg rating and runtime are optional: look each one up once and fall
        # back to the placeholder when the span is absent
        pg_ratings.append(_text_or_missing(div.find('span', class_='certificate')))
        runtimes.append(_text_or_missing(div.find('span', class_='runtime')))
        # genre: text of the span with class genre
        genres.append(div.find('span', class_='genre').text)
        # IMDb rating: text of the first <strong> tag in the container
        imdb_ratings.append(div.strong.text)
        # metascore is optional as well
        metascores.append(_text_or_missing(div.find('span', class_='metascore')))
        # votes and US gross: all span tags whose name attribute is 'nv'.
        # The first is the vote count; a second one, when present, is the gross.
        nv = div.find_all('span', attrs={'name': 'nv'})
        votes.append(nv[0].text)
        gross_us.append(nv[1].text if len(nv) > 1 else _MISSING)
        



HBox(children=(HTML(value='Download Progress:'), FloatProgress(value=0.0, max=20.0), HTML(value='')))




In [91]:
# Create a Pandas dataframe with one column per list of scraped values
movies = pd.DataFrame(
    dict(
        movie=titles,
        year=years,
        pg_rating=pg_ratings,
        imdb_id=imdb_ids,
        runtime=runtimes,
        genre=genres,
        metascore=metascores,
        imdb_rating=imdb_ratings,
        votes=votes,
        gross_us=gross_us,
    )
)

In [92]:
# Clean the scraped text columns. Columns are cast to str before the .str
# operations so this cell can be re-run after the columns are already converted.
def _first_int(column):
    """Return the first run of digits in each value as a number (NaN if none)."""
    # raw string: '\d' in a normal string literal is an invalid escape sequence
    digits = column.astype(str).str.extract(r'(\d+)', expand=False)
    return pd.to_numeric(digits, errors='coerce')


# keep only the numeric part of the IMDb id
movies['imdb_id'] = _first_int(movies['imdb_id']).astype(int)
# keep only the digits of the year text
movies['year'] = _first_int(movies['year']).astype(int)
# runtime may hold the '--' placeholder, so use the nullable integer dtype
# instead of plain int, which cannot represent a missing value
movies['runtime'] = _first_int(movies['runtime']).astype('Int64')
# strip trailing whitespace, then convert; non-numeric values become NaN
movies['metascore'] = pd.to_numeric(movies['metascore'].astype(str).str.rstrip(), errors='coerce')
# remove the thousands separators before converting the vote counts
movies['votes'] = movies['votes'].astype(str).str.replace(',', '', regex=False).astype(int)
# drop the leading '$' and trailing 'M'; values stay strings
movies['gross_us'] = movies['gross_us'].str.lstrip('$').str.rstrip('M')
# drop surrounding newlines and trailing whitespace from the genre text
movies['genre'] = movies['genre'].str.strip('\n').str.rstrip()


In [95]:
# Save the movies dataframe as a CSV file in the current working directory
movies.to_csv(path_or_buf='top_1000_by_us_box_office.csv')


In [None]:
# Potential follow-up web scrape (action feature films sorted by user rating):
# https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=action&sort=user_rating,desc&start=51&ref_=adv_nxt
