In [1]:
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from time import sleep
from random import randint
from IPython.core.display import clear_output
from time import time
import re

In [2]:
# First page of the IMDb title search that the crawl starts from.
# The query string is written one parameter per line; adjacent string
# literals are concatenated by Python into a single URL.
url = (
    r"https://www.imdb.com/search/title/"
    r"?title_type=feature"
    r"&release_date=1990-01-01,"
    r"&num_votes=1000,"
    r"&languages=en"
    r"&sort=num_votes,desc"
    r"&count=250"
)

In [3]:
def get_search_html(url, timeout=30):
    """Download ``url`` and parse the response body with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        a stalled connection would block the whole crawl indefinitely.

    Returns
    -------
    BeautifulSoup
        Parsed document built with the ``'html.parser'`` backend.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.exceptions.Timeout
        If no response arrives within ``timeout`` seconds.
    """
    response = get(url, timeout=timeout)
    # Fail loudly on error responses: parsing an error page would yield no
    # result rows and no "next" link, silently truncating the crawl.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

In [4]:
def get_next_url(soup):
    """Return the absolute URL of the next result page, or ``None``.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed search-result page.

    Returns
    -------
    str or None
        Absolute URL taken from the ``lister-page-next next-page`` anchor, or
        ``None`` when the page has no such anchor (or the anchor has no
        ``href``), which callers treat as "last page reached".
    """
    from urllib.parse import urljoin

    next_link = soup.find('a', class_="lister-page-next next-page")
    if next_link is None:
        return None

    href = next_link.get('href')
    if not href:
        return None

    # urljoin copes with root-relative ("/search/..."), relative and absolute
    # hrefs alike; plain concatenation produced "https://www.imdb.com//search/...".
    return urljoin("https://www.imdb.com/", href)

In [5]:
def _text_or_nan(tag):
    """Return ``tag.text``, or NaN when the tag was not found."""
    return np.nan if tag is None else tag.text


def _parse_movie(movie):
    """Turn one search-result container into a record dict (NaN for missing fields)."""
    header = movie.h3
    if header is None:
        # No heading at all: neither title nor year can be read.
        title = year = np.nan
    else:
        title = _text_or_nan(header.a)
        year = _text_or_nan(header.find('span', class_='lister-item-year'))

    runtime = _text_or_nan(movie.find('span', class_='runtime'))

    genre_tag = movie.find('span', class_='genre')
    genre = np.nan if genre_tag is None else genre_tag.text.strip()

    # Every step of the ratings lookup may be absent, so guard each one
    # instead of chaining .find() calls on a possible None.
    imdb_rating = np.nan
    ratings_bar = movie.find('div', class_='ratings-bar')
    if ratings_bar is not None:
        rating_box = ratings_bar.find('div', class_='ratings-imdb-rating')
        if rating_box is not None and rating_box.strong is not None:
            imdb_rating = float(rating_box.strong.text)

    meta_tag = movie.find('span', class_='metascore')
    # int() tolerates the surrounding whitespace in the tag text.
    meta_rating = np.nan if meta_tag is None else int(meta_tag.text)

    votes_tag = movie.find('span', {"name": "nv"})
    if votes_tag is None:
        votes = np.nan
    else:
        votes = int(votes_tag["data-value"].strip().replace(',', ''))

    return {
        'movie': title,
        'year': year,
        'runtime': runtime,
        'genre': genre,
        'imdb': imdb_rating,
        'meta': meta_rating,
        'imdb_votes': votes,
    }


def make_db(url):
    """Crawl paginated search results and collect the movies into a DataFrame.

    Starting at ``url``, each page is fetched with ``get_search_html`` and the
    following page is located with ``get_next_url`` until no next page is
    reported. Only result containers that include a ``ratings-metascore``
    block are kept; for the others a notice is printed.

    Parameters
    ----------
    url : str or None
        Address of the first result page. ``None`` yields an empty frame.

    Returns
    -------
    pandas.DataFrame
        One row per kept movie with the columns ``movie``, ``year``,
        ``runtime``, ``genre``, ``imdb``, ``meta`` and ``imdb_votes``.
        Fields that could not be found are NaN.
    """
    columns = ['movie', 'year', 'runtime', 'genre', 'imdb', 'meta', 'imdb_votes']
    start_time = time()
    request_count = 0

    # One dict per movie keeps all fields of a row together, so a missing
    # value can never leave the columns with different lengths.
    records = []

    while url is not None:
        soup = get_search_html(url)
        url = get_next_url(soup)

        for movie in soup.find_all('div', class_='lister-item mode-advanced'):
            if movie.find('div', class_='ratings-metascore') is None:
                print("No Metacritic Score available for this movie!!")
                continue
            records.append(_parse_movie(movie))

        # Throttle between requests; nothing to wait for after the last page.
        if url is not None:
            sleep(randint(2, 5))

        request_count += 1
        elapsed_time = time() - start_time
        # A coarse clock can report zero elapsed time on a very fast first page.
        frequency = request_count / elapsed_time if elapsed_time > 0 else float('inf')
        print('Request:{}; Frequency: {} requests/s'.format(request_count, frequency))
        clear_output(wait=True)

    return pd.DataFrame(records, columns=columns)

    

In [6]:
# Run the crawl starting from `url`; make_db follows the "next page" links
# until none is left and returns all collected rows as one DataFrame.
imdb_meta_df = make_db(url)

No Metacritic Score available for this movie!!
No Metacritic Score available for this movie!!
No Metacritic Score available for this movie!!
No Metacritic Score available for this movie!!
No Metacritic Score available for this movie!!
No Metacritic Score available for this movie!!
No Metacritic Score available for this movie!!
No Metacritic Score available for this movie!!
No Metacritic Score available for this movie!!
No Metacritic Score available for this movie!!
No Metacritic Score available for this movie!!
Request:60; Frequency: 0.11099970214989238 requests/s


In [9]:
# Overwrite the 'year' column with the first match of the capture group:
# a digit followed by one or more word characters. Text that does not start
# with a digit (e.g. surrounding punctuation) is therefore dropped.
imdb_meta_df['year'] = imdb_meta_df['year'].str.extract(r'([0-9]\w+)')

In [11]:
# Persist the table to 'imdb_meta.csv' in the current working directory;
# index=False keeps the DataFrame's row index out of the file.
imdb_meta_df.to_csv('imdb_meta.csv', index=False)