In [1]:
import os
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [2]:
# Path to the chromedriver executable. Set the CHROMEDRIVER_PATH environment
# variable to point at a different binary; the machine-specific default below
# is kept so existing setups keep working unchanged.
chromedriver = os.environ.get(
    "CHROMEDRIVER_PATH",
    "/Users/rawanawad/Downloads/chromedriver",
)
os.environ["webdriver.chrome.driver"] = chromedriver

In [3]:
# Start a Chrome WebDriver session using the chromedriver path set earlier.
# `driver` is read as a module-level global by get_imdb_movies_data.
driver = webdriver.Chrome(chromedriver)

In [4]:
def clean_movie(movie):
    """Convert one IMDb search-result item into a dictionary of cleaned fields.

    Each field is extracted independently, so an item that lacks one value
    (for example no certificate, metascore, or gross figure) still yields a
    record holding its remaining fields instead of being dropped entirely.

    Parameters
    ----------
    movie : object
        Element for a single ``lister-item``. It must provide the
        ``find_element_by_class_name`` lookup used below (a Selenium
        WebElement when called from ``get_movies_from_page``).

    Returns
    -------
    dict
        Keys, in order: ``index`` (str), ``name`` (str), ``year`` (str),
        ``certificate`` (str), ``runtime_in_min`` (int), ``genre`` (list of
        str), ``rating`` (float), ``ratings_metascore`` (int), ``vote`` (int)
        and ``gross`` (int). A field that is missing from the item or cannot
        be parsed is ``None``.
    """

    def attempt(extract):
        # Best-effort per field: a failed lookup or conversion only blanks
        # that one field. Exception (not BaseException) is caught so that
        # KeyboardInterrupt can still stop a long scrape.
        try:
            return extract()
        except Exception:
            return None

    def header():
        return movie.find_element_by_class_name('lister-item-header')

    def text_of(class_name):
        return movie.find_element_by_class_name(class_name).text

    def parse_year():
        raw = header().find_element_by_class_name('lister-item-year').text
        # Defensive: the label may hold more than the bare year, e.g. a
        # disambiguator like "(I) (2014)" (not visible in this notebook's
        # output). Keep the four-digit year when one is present.
        found = re.search(r'\d{4}', raw)
        return found.group(0) if found else raw.strip('()')

    def parse_metascore():
        box = movie.find_element_by_class_name('ratings-metascore')
        return int(box.find_element_by_tag_name('span').text.strip())

    def parse_nv(position):
        # Position 0 is read as the vote count and position 1 as the gross.
        section = movie.find_element_by_class_name('sort-num_votes-visible')
        raw = section.find_elements_by_name('nv')[position].get_attribute('data-value')
        return int(raw.replace(',', ''))

    def parse_rating():
        node = movie.find_element_by_class_name('ratings-imdb-rating')
        return float(node.get_attribute('data-value'))

    # Ordered (key, extractor) pairs; the order fixes the column order of the
    # DataFrame built from these records.
    extractors = (
        ('index', lambda: header().find_element_by_class_name('lister-item-index').text.strip('.')),
        ('name', lambda: header().find_element_by_tag_name('a').text.strip()),
        ('year', parse_year),
        ('certificate', lambda: text_of('certificate').strip()),
        ('runtime_in_min', lambda: int(text_of('runtime').strip(' min'))),
        # Strip each entry so "Action, Crime" does not become ['Action', ' Crime'].
        ('genre', lambda: [part.strip() for part in text_of('genre').split(',')]),
        ('rating', parse_rating),
        ('ratings_metascore', parse_metascore),
        ('vote', lambda: parse_nv(0)),
        ('gross', lambda: parse_nv(1)),
    )
    return {field: attempt(extract) for field, extract in extractors}

In [5]:
def get_movies_from_page(page):
    """Return a cleaned record for every movie item found on ``page``.

    ``page`` must expose ``find_elements_by_class_name``; each element with
    the ``lister-item`` class is passed through ``clean_movie`` and the
    results are returned as a list in page order.
    """
    raw_items = page.find_elements_by_class_name("lister-item")
    return [clean_movie(raw_item) for raw_item in raw_items]

In [6]:
def get_imdb_movies_data(num_of_pages, pause_seconds=1):
    """Scrape up to ``num_of_pages`` IMDb search-result pages.

    Starts from the feature-film listing sorted by number of votes and
    follows the "next" link from page to page using the module-level
    ``driver``.

    Parameters
    ----------
    num_of_pages : int
        Maximum number of result pages to visit. Zero or a negative value
        visits nothing.
    pause_seconds : int or float, optional
        Seconds to wait after each page load before reading it (default 1,
        the pause the notebook originally used).

    Returns
    -------
    list of dict
        The records from ``get_movies_from_page`` for every visited page,
        in visiting order. If a page has no "next" link, scraping stops
        early and the records gathered so far are returned.
    """
    current_page = 'https://www.imdb.com/search/title/?title_type=feature&sort=num_votes,desc'
    entire_movies = []
    for page_number in range(num_of_pages):
        driver.get(current_page)
        time.sleep(pause_seconds)  # pause to be sure page has loaded
        entire_movies.extend(get_movies_from_page(driver))

        # No need to look for a next link once the last requested page is done.
        if page_number == num_of_pages - 1:
            break

        # The plural lookup returns an empty list instead of raising, so a
        # missing link ends the crawl without discarding collected records.
        next_links = driver.find_elements_by_class_name("lister-page-next")
        if not next_links:
            break
        next_url = next_links[0].get_attribute('href')
        if not next_url:
            break
        current_page = next_url
    return entire_movies

In [7]:
# Scrape 25 result pages through the live browser session. Each page is
# loaded over the network and followed by a pause, so this cell is slow.
imdb_movies_data = get_imdb_movies_data(25)

In [8]:
# Number of movie records collected across all scraped pages.
len(imdb_movies_data)

1250

In [9]:
# One row per scraped movie dictionary; the dictionary keys become the columns.
df = pd.DataFrame(imdb_movies_data)

In [10]:
# Display the assembled table for a visual sanity check.
df

Unnamed: 0,index,name,year,certificate,runtime_in_min,genre,rating,ratings_metascore,vote,gross
0,1,The Shawshank Redemption,1994,R,142.0,[Drama],9.3,80.0,2473258.0,28341469.0
1,2,The Dark Knight,2008,PG-13,152.0,"[Action, Crime, Drama]",9.0,84.0,2427277.0,534858444.0
2,3,Inception,2010,PG-13,148.0,"[Action, Adventure, Sci-Fi]",8.8,74.0,2178023.0,292576195.0
3,4,Fight Club,1999,R,139.0,[Drama],8.8,66.0,1947366.0,37030102.0
4,5,Pulp Fiction,1994,R,154.0,"[Crime, Drama]",8.9,94.0,1914649.0,107928762.0
...,...,...,...,...,...,...,...,...,...,...
1245,1246,Road Trip,2000,R,93.0,[Comedy],6.4,55.0,163565.0,68540777.0
1246,1247,Terminator: Dark Fate,2019,R,128.0,"[Action, Adventure, Sci-Fi]",6.2,54.0,163493.0,62253077.0
1247,1248,Miss Peregrine's Home for Peculiar Children,2016,PG-13,127.0,"[Adventure, Drama, Family]",6.7,57.0,163366.0,87242834.0
1248,1249,King Arthur,2004,PG-13,126.0,"[Action, Adventure, Drama]",6.3,46.0,163332.0,51882244.0


In [11]:
# Save the scraped table as JSON; the relative path resolves against the
# notebook's current working directory.
df.to_json('imdb_movies_dataframe.json')