In [5]:
# 1. Establish a connection to the webpage - "https://www.themoviedb.org/movie"
import requests

tmdb_url = 'https://www.themoviedb.org/movie'

# Browser-like User-Agent sent with the request (presumably needed so the site
# serves its regular HTML page -- TODO confirm).
needed_headers = {
  'User-Agent': "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
}

# Stays an empty string when the request below does not succeed.
page_source = ''

# 1a - the timeout keeps this cell from hanging forever on an unresponsive server
response = requests.get(tmdb_url, headers=needed_headers, timeout=30)
status_code = response.status_code
if response.ok: # 1b
  page_source = response.text # 1c
  print(response.encoding) # 1d
  print(type(page_source)) # 1d
  print(page_source[:200]) # 1d
else:
  # Report the failure instead of silently leaving page_source empty
  print(f"Request to {tmdb_url} failed with status code {status_code}")


utf-8
<class 'str'>
<!DOCTYPE html>
<html lang="en" class="no-js">
  <head>
    <title>Popular Movies &#8212; The Movie Database (TMDB)</title>
    <meta http-equiv="cleartype" content="on">
    <meta charset="utf-8">
  


In [6]:
pip install validators

Collecting validators
  Obtaining dependency information for validators from https://files.pythonhosted.org/packages/3a/0c/785d317eea99c3739821718f118c70537639aa43f96bfa1d83a71f68eaf6/validators-0.22.0-py3-none-any.whl.metadata
  Downloading validators-0.22.0-py3-none-any.whl.metadata (4.7 kB)
Downloading validators-0.22.0-py3-none-any.whl (26 kB)
Installing collected packages: validators
Successfully installed validators-0.22.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [6]:
# 2. Parse the content of HTML response using the BeautifulSoup library and execute the tasks specified in the guidelines mentioned below
from bs4 import BeautifulSoup
import requests
import validators

soup = BeautifulSoup(page_source, features="html.parser")

# find() returns None when no element matches (for example when page_source is
# still empty), so check before reading .text to get a meaningful error.
title_element = soup.find(class_='title')
if title_element is None:
  raise Exception("Could not find the page title element")
page_title = title_element.text
print(page_title)


def fetch_parse_url(url, timeout=30):
  """Download ``url`` and return its body parsed with BeautifulSoup.

  Args:
    url: Address to fetch; it is checked with ``validators.url`` first.
    timeout: Seconds handed to ``requests.get`` so that an unresponsive
      server cannot block the notebook indefinitely.

  Returns:
    A ``BeautifulSoup`` object built from the response bytes using the
    ``html.parser`` backend.

  Raises:
    Exception: ``'Invalid url'`` when validation fails,
      ``'HTTP Error: <status code>'`` when ``raise_for_status()`` reports an
      error response, and ``'Error fetching url'`` for any other requests
      failure. The last two keep the requests exception as ``__cause__``.
  """
  if not validators.url(url):
    raise Exception('Invalid url')

  needed_headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
  }

  try:
    response = requests.get(url, headers=needed_headers, timeout=timeout)
    response.raise_for_status()
  except requests.exceptions.HTTPError as e:
    raise Exception(f"HTTP Error: {e.response.status_code}") from e
  except requests.exceptions.RequestException as e:
    # Covers connection errors, timeouts and every other requests failure
    raise Exception("Error fetching url") from e

  return BeautifulSoup(response.content, 'html.parser')


# Exercise fetch_parse_url with one address expected to work and one expected
# to fail; either outcome is reported with print so the cell keeps running.
working_url = "https://www.scrapethissite.com/pages/simple/"
broken_url = "https://www.cannotscrapethissite.com/"
for demo_url in (working_url, broken_url):
  try:
    result = fetch_parse_url(demo_url)
    print(f"Content fetched successfully for url: {demo_url}")
  except Exception as e:
    print(f"failed to fetch: {e}")


Popular Movies

Content fetched successfully for url: https://www.scrapethissite.com/pages/simple/
failed to fetch: Error fetching url


In [62]:
# 3. Extract content from TMDB website
tmdb_content_soup = None  # remains None when the fetch below fails
try:
  tmdb_content_soup = fetch_parse_url(tmdb_url) # 3a
except Exception as fetch_error:
  # The failure is only reported here; the name keeps its None placeholder
  print(f"Cannot fetch TMDB content: {fetch_error}")

# not printing as html output is too long
# print(tmdb_content_soup.text)

In [63]:
# 3 Extract from TMDB website continued...

# Guard against a failed fetch (tmdb_content_soup is None) and against a page
# without the results container, so the error names the real problem instead
# of surfacing as an AttributeError on None.
if tmdb_content_soup is None:
  raise Exception("TMDB content is not available because the fetch did not succeed")
media_results = tmdb_content_soup.find(id="media_results")
if media_results is None:
  raise Exception("Could not find the media_results container")

movie_containers = media_results.find_all("div", class_="card style_1")
first_movie_container = None
if len(movie_containers) > 0:
  first_movie_container = movie_containers[0]
  print(first_movie_container.prettify()[:200]) # 3b printing only 200 characters as html output is too long
else:
  raise Exception("Could not find the first movie container")

# Each lookup below re-raises with a task-specific message and keeps the
# original error attached as the cause for debugging.
try:
  first_movie_content = first_movie_container.find("div", class_="content")
  first_movie_title = first_movie_content.find('h2').find('a').text
  print("------- First movie title --------")
  print(first_movie_title) # 3c
except Exception as e:
  raise Exception("Could not find the first movie title") from e

try:
  first_movie_rating = first_movie_content.find("div", class_="user_score_chart").get("data-percent")
  print("------- First movie rating --------")
  print(first_movie_rating) # 3d
except Exception as e:
  raise Exception("Could not find the first movie rating") from e

try:
  # Reuses the content div located for the title instead of searching again
  first_movie_path = first_movie_content.find('h2').find('a').get('href')
  print("------- First movie path --------")
  print(first_movie_path) # 3e
except Exception as e:
  raise Exception("Could not find the first movie path") from e
  

<div class="card style_1">
 <div class="image">
  <div class="wrapper glyphicons_v2 picture grey no_image_holder">
   <a class="image" href="/movie/1072790" title="Anyone But You">
    <img alt="Anyon
------- First movie title --------
Anyone But You
------- First movie rating --------
68.94999999999999
------- First movie path --------
/movie/1072790


In [1]:
# 4. Write user-defined functions
import time

def is_float(s):
  """Tell whether ``s`` can be converted with ``float()``.

  Args:
    s: Any value, typically a string read from an HTML attribute.

  Returns:
    True when ``float(s)`` succeeds, False otherwise. Values that float()
    rejects with TypeError (such as None or a list) also give False instead
    of raising. Note that float() accepts strings like 'nan' and 'inf'.
  """
  try:
    float(s)
  except (TypeError, ValueError):
    return False
  return True

# 4a
def get_titles(page_soup):
  """Collect the movie titles shown on a listing page.

  For every ``div`` with class ``card style_1`` inside the element whose id is
  ``media_results``, the ``title`` attribute of the link found under the
  card's ``content`` div and ``h2`` is read.

  Args:
    page_soup: Parsed listing page offering ``find``/``find_all``.

  Returns:
    List with one title per card, in page order.

  Raises:
    Exception: When any lookup fails; the original error is the cause.
  """
  try:
    cards = page_soup.find(id="media_results").find_all("div", class_="card style_1")
    return [
      card.find("div", class_="content").find('h2').find('a')['title']
      for card in cards
    ]
  except Exception as e:
    raise Exception("Couldnot get movie titles") from e

# 4b
def get_user_ratings(page_soup):
  """Collect the user rating of every movie card on a listing page.

  The rating is the ``data-percent`` attribute of the card's
  ``div.user_score_chart``. It is kept as the original string when
  ``is_float`` accepts it; otherwise the entry is ``"not rated"``.

  Args:
    page_soup: Parsed listing page offering ``find``/``find_all``.

  Returns:
    List of strings, one per card, in page order.

  Raises:
    Exception: When the cards cannot be located; the original error is the
      cause.
  """
  ratings = []
  try:
    elements = page_soup.find(id="media_results").find_all("div", class_="card style_1")
    for element in elements:
      chart = element.find("div", class_="user_score_chart")
      # A card without a score chart counts as unrated rather than aborting
      # the whole page, which keeps the list aligned with the other columns.
      rating = chart.get("data-percent", 'n/a') if chart is not None else 'n/a'
      ratings.append(rating if is_float(rating) else "not rated")
  except Exception as e:
    raise Exception("Couldnot get movie ratings") from e
  return ratings

# 4c
def get_movie_urls(page_soup):
  """Collect the link target of every movie card on a listing page.

  For each ``div`` with class ``card style_1`` inside the ``media_results``
  element, the ``href`` of the link under the card's ``content`` div and
  ``h2`` is read with ``.get`` (so a link without ``href`` contributes None).

  Args:
    page_soup: Parsed listing page offering ``find``/``find_all``.

  Returns:
    List of ``href`` values, one per card, in page order.

  Raises:
    Exception: When any lookup fails; the original error is the cause.
  """
  try:
    cards = page_soup.find(id="media_results").find_all("div", class_="card style_1")
    return [
      card.find("div", class_="content").find('h2').find('a').get('href')
      for card in cards
    ]
  except Exception as e:
    raise Exception("Could not get movie URLs") from e

def get_movie_pages(urls, base_url='https://www.themoviedb.org', delay=2):
  """Download and parse the page behind each relative movie URL.

  Args:
    urls: Iterable of paths (such as ``/movie/1072790``) that are appended
      to ``base_url``.
    base_url: Prefix joined in front of every path.
    delay: Seconds to sleep after each download, to space out the requests.

  Returns:
    List of objects returned by ``fetch_parse_url``, in the order of ``urls``.

  Raises:
    Exception: When any page cannot be fetched; the original error is kept
      as the cause, matching the other helpers in this notebook.
  """
  page_soups = []
  try:
    for url in urls:
      page_soups.append(fetch_parse_url(base_url + url))
      time.sleep(delay)
  except Exception as e:
    raise Exception('Could not get movie pages') from e
  return page_soups

# 4d - modified for efficiency
def get_movies_genres(page_soups):
  """Extract the genre names from each parsed movie detail page.

  The genres are the texts of the links inside ``span.genres``.

  Args:
    page_soups: Iterable of parsed detail pages offering ``find``.

  Returns:
    List with one list of genre strings per page, in input order. A page
    without a ``span.genres`` element contributes an empty list, so the
    result always has one entry per page.

  Raises:
    Exception: When a page cannot be queried; the original error is the cause.
  """
  genres = []
  try:
    for page_soup in page_soups:
      genres_span = page_soup.find('span', class_='genres')
      # A missing genres section must not abort the whole batch
      genre_links = genres_span.find_all('a') if genres_span is not None else []
      genres.append([link.text for link in genre_links])
  except Exception as e:
    raise Exception("Could not get genres") from e
  return genres

# 4e - modified for efficiency
def get_movies_cast(page_soups):
  """Extract the cast names from each parsed movie detail page.

  Cast cards are the ``li.card`` items of the ``ol`` with class
  ``people scroller`` inside the element whose id is ``cast_scroller``; the
  text of the first ``<p>`` in each card is taken as the name.

  Args:
    page_soups: Iterable of parsed detail pages offering ``find``.

  Returns:
    List with one list of names per page, in input order. A page without a
    cast section contributes an empty list and a card without a ``<p>`` is
    skipped, so the result always has one entry per page.

  Raises:
    Exception: When a page cannot be queried; the original error is the cause.
  """
  movies_cast = []
  try:
    for page_soup in page_soups:
      scroller = page_soup.find(id='cast_scroller')
      people_list = scroller.find('ol', class_='people scroller') if scroller is not None else None
      # A missing cast section must not abort the whole batch
      cast_cards = people_list.find_all('li', class_='card') if people_list is not None else []
      movie_cast = []
      for cast_card in cast_cards:
        name_tag = cast_card.find('p')
        if name_tag is not None:
          movie_cast.append(name_tag.text)
      movies_cast.append(movie_cast)
  except Exception as e:
    raise Exception("Could not get movies cast") from e
  return movies_cast


In [2]:
# 5. Write a user-defined function that returns a pandas DataFrame with the following data
import pandas as pd

def get_movies_df(page_soup):
  """Assemble the movies of one listing page into a DataFrame.

  Titles and ratings come from the listing page itself. Genres and cast are
  read from the detail pages, which are downloaded once through
  ``get_movie_pages`` and shared by both extractors.

  Args:
    page_soup: Parsed listing page.

  Returns:
    DataFrame with the columns ``title``, ``rating``, ``genres`` and ``cast``.
  """
  movies = pd.DataFrame()
  movies['title'] = get_titles(page_soup)
  movies['rating'] = get_user_ratings(page_soup)
  detail_pages = get_movie_pages(get_movie_urls(page_soup))
  for column, extract in (('genres', get_movies_genres), ('cast', get_movies_cast)):
    movies[column] = extract(detail_pages)
  return movies

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [8]:
# 6. Scraping the data and combining the dataframes
import pandas as pd
import os

# Save the data frame to a csv file, append if already exists
def save_df(file_path, df):
  """Append ``df`` to the CSV at ``file_path``, creating the file if needed.

  The header row is written only when the file does not exist yet; the
  frame's index is written as the first column (``to_csv`` default).

  Args:
    file_path: Destination CSV path.
    df: DataFrame to write.
  """
  write_header = not os.path.isfile(file_path)
  df.to_csv(file_path, mode='a', header=write_header)

# Fetch one movies listing page, identified by the base URL and the page number, and save the extracted data to a CSV file inside a folder
def scrape_save_movies_page(base_url, dir_path, base_csv_name, page):
  """Scrape one listing page and store its movies as a CSV file.

  Args:
    base_url: Listing URL; page 1 is fetched from it unchanged and any other
      page gets ``?page=<page>`` appended.
    dir_path: Folder for the CSV; created (including parents) when missing.
    base_csv_name: File name prefix; the file is ``<prefix>_<page>.csv``.
    page: Page number to scrape.

  Returns:
    The DataFrame produced by ``get_movies_df`` for that page.
  """
  url = base_url if page == 1 else f"{base_url}?page={page}"
  page_soup = fetch_parse_url(url)
  df = get_movies_df(page_soup)
  # exist_ok avoids the check-then-create race of a separate exists() test
  os.makedirs(dir_path, exist_ok=True)
  save_df(os.path.join(dir_path, f"{base_csv_name}_{page}.csv"), df)
  return df

# Fetch pages 1 to 5 by calling scrape_save_movies_page, concatenate the data frames into a single data frame and return it.
def scrape_movie_pages(base_url, dir_path, first_page=1, last_page=5, base_csv_name='tmtb_db'):
  """Scrape a range of listing pages and combine them into one DataFrame.

  Args:
    base_url: Listing URL passed on to ``scrape_save_movies_page``.
    dir_path: Folder in which the per-page CSV files are stored.
    first_page: First page number to scrape (inclusive).
    last_page: Last page number to scrape (inclusive).
    base_csv_name: File name prefix of the per-page CSV files.

  Returns:
    Concatenation of all per-page DataFrames with a fresh 0..n-1 index.

  Raises:
    ValueError: When ``last_page`` is smaller than ``first_page``, because
      there would be nothing to concatenate.
  """
  if last_page < first_page:
    raise ValueError("last_page must not be smaller than first_page")
  dfs = [
    scrape_save_movies_page(base_url, dir_path, base_csv_name, page)
    for page in range(first_page, last_page + 1)
  ]
  return pd.concat(dfs, ignore_index=True)
    
base_url = 'https://www.themoviedb.org/movie'
dir_path = './movies_data'
movie_df = scrape_movie_pages(base_url, dir_path)

# combined csv - written in overwrite mode so that re-running this cell
# replaces the file instead of appending a second copy of every row
movie_df.to_csv("tmtb_db_all.csv")

In [9]:
# Bare last expression so Jupyter shows the frame with its rich table display
movie_df.head()

            title rating                                          genres  \
0     Land of Bad  71.99                         [Action, Thriller, War]   
1  Anyone But You  68.92                               [Comedy, Romance]   
2       Migration  76.38  [Animation, Action, Adventure, Comedy, Family]   
3     The Marvels  62.56            [Science Fiction, Adventure, Action]   
4       No Way Up   57.0                      [Action, Horror, Thriller]   

                                                cast  
0  [Liam Hemsworth, Russell Crowe, Luke Hemsworth...  
1  [Sydney Sweeney, Glen Powell, Alexandra Shipp,...  
2  [Kumail Nanjiani, Elizabeth Banks, Caspar Jenn...  
3  [Brie Larson, Teyonah Parris, Iman Vellani, Za...  
4  [Sophie McIntosh, Will Attenborough, Jeremias ...  


In [10]:
# Bare last expression so Jupyter shows the summary with its rich table display
movie_df.describe()

              title rating              genres  \
count           100    100                 100   
unique          100     94                  78   
top     Land of Bad   61.0  [Action, Thriller]   
freq              1      3                   5   

                                                     cast  
count                                                 100  
unique                                                100  
top     [Liam Hemsworth, Russell Crowe, Luke Hemsworth...  
freq                                                    1  


In [11]:
# info() prints its report itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   100 non-null    object
 1   rating  100 non-null    object
 2   genres  100 non-null    object
 3   cast    100 non-null    object
dtypes: object(4)
memory usage: 3.3+ KB
None
