In [24]:
import requests
from bs4 import BeautifulSoup

# Prepare CSV file: output path, field separator and header row
topRatedCSV = "topRatedMovies.csv"
sep = ";"
# Build the header from the same separator the data rows use
CSV_headers = sep.join(["tmdb_id", "title", "year", "runtime", "rating", "cast", "genres"]) + "\n"
# Explicit UTF-8 so titles and cast names with non-ASCII characters are written
# the same way on every platform instead of depending on the locale encoding.
# The handle stays open on purpose: later cells append rows and then close it.
f = open(topRatedCSV, "w+", encoding="utf-8")
f.write(CSV_headers)

# Get the top rated movies from the top rated list
def scrapeTopList(headers, url, n_pages):
    """Collect movie ids from the first ``n_pages`` pages of a paginated list.

    Args:
        headers: HTTP headers sent with every request.
        url: Base URL of the list; ``?page=<n>`` is appended for each page.
        n_pages: Number of pages to fetch, starting at page 1.

    Returns:
        List of id strings in page order. Each id is the third
        ``/``-separated segment of the ``href`` of an ``a.image`` link
        found inside the page's ``div.media_items`` container.

    Raises:
        requests.exceptions.RequestException: On a timeout, connection
            problem, or a 4xx/5xx response.
        RuntimeError: If a page does not contain ``div.media_items``.
    """
    topMoviesList = []

    for page in range(1, n_pages + 1):
        page_url = url + '?page=' + str(page)
        # Without a timeout a stalled connection would block the cell forever.
        res_topRated = requests.get(page_url, headers=headers, timeout=30)
        print(page_url, res_topRated.status_code)
        # Surface HTTP errors directly instead of a confusing parse failure below.
        res_topRated.raise_for_status()

        topRated_html = BeautifulSoup(res_topRated.content, "html.parser")
        container = topRated_html.find("div", {"class": "media_items"})
        if container is None:
            raise RuntimeError("No 'media_items' container found on " + page_url)

        # Iterate the links directly; looking each one up again by index is
        # quadratic and yields the same element.
        for link in container.find_all('a', {"class": "image"}):  # type: ignore
            movie_id = link.get('href').split('/')[2]
            topMoviesList.append(movie_id)

    return topMoviesList

# Convert runtime to runtime in minutes
def runtime_convertion(movieRuntime):
    """Return a runtime string such as "2h 55m" as a whole number of minutes.

    Recognised shapes (case-insensitive, surrounding whitespace ignored):
    hours and minutes ("2h55m"), hours only ("2h"), minutes only ("45m"),
    or a bare number that is taken as minutes.

    Raises:
        ValueError: If a numeric part cannot be parsed by ``int``.
    """
    text = movieRuntime.strip().lower()
    has_hours = "h" in text
    has_minutes = "m" in text

    if has_hours and has_minutes:
        # Exactly one "h" is expected here; the unpacking fails otherwise.
        hour_part, remainder = text.split("h")
        minute_part = remainder.split("m")[0]
        return int(hour_part) * 60 + int(minute_part)

    if has_hours:
        return int(text.split("h")[0]) * 60

    if has_minutes:
        return int(text.split("m")[0])

    # No unit marker at all: the value is already in minutes.
    return int(text)

# Scrape movie details from movie page
def scrapeMovieDetails(headers, movie_id):
    """Scrape one movie page and append its details as a row to the CSV file.

    The row is written to the module-level file object ``f`` and joined with
    the module-level separator ``sep``; both must exist before calling.

    Row layout: ``"tmdb_id"<sep>"title"<sep>year<sep>runtime<sep>rating<sep>cast<sep>genres``.
    Cast and genre entries are joined with ``|`` and have spaces replaced by
    underscores (cast names also have ``-`` replaced).

    Args:
        headers: HTTP headers sent with the request.
        movie_id: Id string appended to the movie base URL.

    Raises:
        requests.exceptions.RequestException: On a timeout, connection
            problem, or a 4xx/5xx response.
        AttributeError, IndexError: If an expected element is missing
            from the page.
    """
    url = 'https://www.themoviedb.org/movie/' + movie_id
    # Without a timeout a stalled connection would block the whole loop.
    res_moviePage = requests.get(url, headers=headers, timeout=30)
    print(url, res_moviePage.status_code)
    # Surface HTTP errors directly instead of a confusing parse failure below.
    res_moviePage.raise_for_status()
    moviePage_html = BeautifulSoup(res_moviePage.content, "html.parser")

    movieName = moviePage_html.find('div', {"class": "title"}).find('a').get_text() # type: ignore
    # Takes the four characters before the last one of the release-date text.
    movieYear = moviePage_html.select_one('span.release_date').get_text(strip=True)[-5:-1]
    runtimeText = moviePage_html.find('span', {"class": "runtime"}).get_text()
    movieRuntime = runtime_convertion(runtimeText.strip().replace('\n', '').replace(' ', ''))
    # Keep only the integer part of the percentage.
    movieRating = moviePage_html.find('div', {"class": "user_score_chart"}).get('data-percent').split('.')[0]

    movieGenresTags = moviePage_html.find_all('span', {"class": "genres"})[0].find_all('a')
    movieGenresString = '|'.join(genre.get_text().replace(' ', '_') for genre in movieGenresTags)

    movieCastTags = moviePage_html.find_all('img', {"class": "profile"})
    # Images without an alt attribute carry no name; skip them instead of crashing.
    castNames = [castTag.get('alt') for castTag in movieCastTags]
    movieCastString = '|'.join(
        castName.replace(' ', '_').replace('-', '_')
        for castName in castNames
        if castName is not None
    )

    # Double any quote inside the title so the quoted field stays well-formed.
    quotedName = '"' + movieName.replace('"', '""') + '"'
    row = sep.join([
        f'"{movie_id}"',
        quotedName,
        movieYear,
        str(movieRuntime),
        movieRating,
        movieCastString,
        movieGenresString,
    ])
    print(row, file=f)

In [25]:
# Collect the ids of the top rated movies
n_pages = 25 # 1 page = 20 movies
url = 'https://www.themoviedb.org/movie/top-rated'
# User-agent header sent with every request
headers = dict([
    ('user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 13.3; rv:109.0) Gecko/20100101 Firefox/113.0'),
])
topList = scrapeTopList(headers=headers, url=url, n_pages=n_pages)

https://www.themoviedb.org/movie/top-rated?page=1 200
https://www.themoviedb.org/movie/top-rated?page=2 200
https://www.themoviedb.org/movie/top-rated?page=3 200
https://www.themoviedb.org/movie/top-rated?page=4 200
https://www.themoviedb.org/movie/top-rated?page=5 200
https://www.themoviedb.org/movie/top-rated?page=6 200
https://www.themoviedb.org/movie/top-rated?page=7 200
https://www.themoviedb.org/movie/top-rated?page=8 200
https://www.themoviedb.org/movie/top-rated?page=9 200
https://www.themoviedb.org/movie/top-rated?page=10 200
https://www.themoviedb.org/movie/top-rated?page=11 200
https://www.themoviedb.org/movie/top-rated?page=12 200
https://www.themoviedb.org/movie/top-rated?page=13 200
https://www.themoviedb.org/movie/top-rated?page=14 200
https://www.themoviedb.org/movie/top-rated?page=15 200
https://www.themoviedb.org/movie/top-rated?page=16 200
https://www.themoviedb.org/movie/top-rated?page=17 200
https://www.themoviedb.org/movie/top-rated?page=18 200
https://www.themovi

In [26]:
# Scrape details from each movie page and write them to the CSV file
try:
    for movie_id in topList:
        scrapeMovieDetails(headers, movie_id)
finally:
    # Close the CSV file even when a scrape fails, so the rows written so far
    # are flushed to disk and the handle is not leaked.
    f.close()

https://www.themoviedb.org/movie/238 200
https://www.themoviedb.org/movie/278 200
https://www.themoviedb.org/movie/240 200
https://www.themoviedb.org/movie/19404 200
https://www.themoviedb.org/movie/424 200
https://www.themoviedb.org/movie/129 200
https://www.themoviedb.org/movie/389 200
https://www.themoviedb.org/movie/372058 200
https://www.themoviedb.org/movie/496243 200
https://www.themoviedb.org/movie/155 200
https://www.themoviedb.org/movie/497 200
https://www.themoviedb.org/movie/680 200
https://www.themoviedb.org/movie/429 200
https://www.themoviedb.org/movie/13 200
https://www.themoviedb.org/movie/372754 200
https://www.themoviedb.org/movie/122 200
https://www.themoviedb.org/movie/769 200
https://www.themoviedb.org/movie/346 200
https://www.themoviedb.org/movie/696374 200
https://www.themoviedb.org/movie/11216 200
https://www.themoviedb.org/movie/637 200
https://www.themoviedb.org/movie/995133 200
https://www.themoviedb.org/movie/12477 200
https://www.themoviedb.org/movie/311 