# Part B : Scraping data from The Movie Database (TMDB) 

## 1. Establish a connection to the webpage

In [1]:
# 1a

import requests

url = "https://www.themoviedb.org/movie"

needed_headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"}

response = requests.get(("https://www.themoviedb.org/movie"),headers = needed_headers)

In [2]:
#1b

if response.status_code == 200:
    print("Request was successful. Status code:", response.status_code)
else:
    print("Request failed. Status code:", response.status_code)

Request was successful. Status code: 200


In [3]:
#1c

web_content = response.text
print("Content of the webpage:")
print(web_content)

Content of the webpage:
<!DOCTYPE html>
<html lang="en" class="no-js">
  <head>
    <title>Popular Movies &#8212; The Movie Database (TMDB)</title>
    <meta http-equiv="cleartype" content="on">
    <meta charset="utf-8">
    <meta name="keywords" content="Movies, TV Shows, Streaming, Reviews, API, Actors, Actresses, Photos, User Ratings, Synopsis, Trailers, Teasers, Credits, Cast">
    <meta name="mobile-web-app-capable" content="yes">
    <meta name="apple-mobile-web-app-capable" content="yes">
    <meta name="viewport" content="width=device-width,initial-scale=1">
      <meta name="description" content="The Movie Database (TMDB) is a popular, user editable database for movies and TV shows.">
    <meta name="msapplication-TileImage" content="/assets/2/v4/icons/mstile-144x144-30e7905a8315a080978ad6aeb71c69222b72c2f75d26dab1224173a96fecc962.png">
<meta name="msapplication-TileColor" content="#032541">
<meta name="theme-color" content="#032541">
<link rel="apple-touch-icon" sizes="180x1

In [4]:
#1d

print("First 200 characters of the content:")
print(web_content[:200])

First 200 characters of the content:
<!DOCTYPE html>
<html lang="en" class="no-js">
  <head>
    <title>Popular Movies &#8212; The Movie Database (TMDB)</title>
    <meta http-equiv="cleartype" content="on">
    <meta charset="utf-8">
  


## 2. Parse the content of HTML response using the BeautifulSoup library

In [5]:
#2a

from bs4 import BeautifulSoup
soup = BeautifulSoup(web_content, 'html.parser')

print(soup.prettify())

<!DOCTYPE html>
<html class="no-js" lang="en">
 <head>
  <title>
   Popular Movies — The Movie Database (TMDB)
  </title>
  <meta content="on" http-equiv="cleartype"/>
  <meta charset="utf-8"/>
  <meta content="Movies, TV Shows, Streaming, Reviews, API, Actors, Actresses, Photos, User Ratings, Synopsis, Trailers, Teasers, Credits, Cast" name="keywords"/>
  <meta content="yes" name="mobile-web-app-capable"/>
  <meta content="yes" name="apple-mobile-web-app-capable"/>
  <meta content="width=device-width,initial-scale=1" name="viewport"/>
  <meta content="The Movie Database (TMDB) is a popular, user editable database for movies and TV shows." name="description"/>
  <meta content="/assets/2/v4/icons/mstile-144x144-30e7905a8315a080978ad6aeb71c69222b72c2f75d26dab1224173a96fecc962.png" name="msapplication-TileImage"/>
  <meta content="#032541" name="msapplication-TileColor"/>
  <meta content="#032541" name="theme-color"/>
  <link href="/assets/2/apple-touch-icon-57ed4b3b0450fd5e9a0c20f34e814b

In [6]:
#2b

print("The title of the webpage is :", soup.title.string)

The title of the webpage is : Popular Movies — The Movie Database (TMDB)


In [7]:
#2c
import requests
from bs4 import BeautifulSoup

def get_BS(url):
  needed_headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"}

  try:
    res = requests.get(url,headers=needed_headers)
    res.raise_for_status()
    soup = BeautifulSoup(res.content,"html.parser")
    return soup

  except requests.exceptions.RequestException as e:
    print("Error : ", e)
    return None


In [8]:
#1st Test case: Working url
working_url = "https://www.google.com"
soup_working = get_BS(working_url)

print("Test Case 1:\n",soup_working.prettify())


Test Case 1:
 <!DOCTYPE html>
<html itemscope="" itemtype="http://schema.org/WebPage" lang="en-IN">
 <head>
  <meta charset="utf-8"/>
  <meta content="origin" name="referrer"/>
  <meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"/>
  <title>
   Google
  </title>
  <script nonce="f034ZlhBZBxdteuHkqNh1w">
   (function(){var _g={kEI:'dkDmZNzIMdDlseMP7vWtmA4',kEXPI:'31',kBL:'Atu7',kOPI:89978449};(function(){var a;(null==(a=window.google)?0:a.stvsc)?google.kEI=_g.kEI:window.google=_g;}).call(this);})();(function(){google.sn='webhp';google.kHL='en-IN';})();(function(){
var h=this||self;function l(){return void 0!==window.google&&void 0!==window.google.kOPI&&0!==window.google.kOPI?window.google.kOPI:null};var m,n=[];function p(a){for(var b;a&&(!a.getAttribute||!(b=a.getAttribute("eid")));)a=a.parentNode;return b||m}function q(a){for(var b=null;a&&(!a.getAttribute||!(b=a.getAttribute("leid")));)a=a.parentNode;return b}function r(a){/^http:/i.test(a)&&

In [9]:
#2nd Test case: url with 404 response
invalid_url = "https://www.google.com/nonexistant"
soup_invalid = get_BS(invalid_url)

Error :  404 Client Error: Not Found for url: https://www.google.com/nonexistant


## 3. Extract the content of the webpage

In [10]:
#3a
working_url = "https://www.themoviedb.org/movie"
soup = get_BS(working_url)
print("Test Case 1:\n",soup.prettify())


Test Case 1:
 <!DOCTYPE html>
<html class="no-js" lang="en">
 <head>
  <title>
   Popular Movies — The Movie Database (TMDB)
  </title>
  <meta content="on" http-equiv="cleartype"/>
  <meta charset="utf-8"/>
  <meta content="Movies, TV Shows, Streaming, Reviews, API, Actors, Actresses, Photos, User Ratings, Synopsis, Trailers, Teasers, Credits, Cast" name="keywords"/>
  <meta content="yes" name="mobile-web-app-capable"/>
  <meta content="yes" name="apple-mobile-web-app-capable"/>
  <meta content="width=device-width,initial-scale=1" name="viewport"/>
  <meta content="The Movie Database (TMDB) is a popular, user editable database for movies and TV shows." name="description"/>
  <meta content="/assets/2/v4/icons/mstile-144x144-30e7905a8315a080978ad6aeb71c69222b72c2f75d26dab1224173a96fecc962.png" name="msapplication-TileImage"/>
  <meta content="#032541" name="msapplication-TileColor"/>
  <meta content="#032541" name="theme-color"/>
  <link href="/assets/2/apple-touch-icon-57ed4b3b0450fd5e

In [11]:
#3b
div_card = soup.find("div", class_="card style_1")
print(div_card.contents)

['\n', <div class="image">
<div class="wrapper">
<a class="image" href="/movie/976573" title="Elemental">
<img alt="" class="poster" loading="lazy" src="/t/p/w220_and_h330_face/6oH378KUfCEitzJkm07r97L0RsZ.jpg" srcset="/t/p/w220_and_h330_face/6oH378KUfCEitzJkm07r97L0RsZ.jpg 1x, /t/p/w440_and_h660_face/6oH378KUfCEitzJkm07r97L0RsZ.jpg 2x"/>
</a>
</div>
<div class="options" data-id="976573" data-media-type="movie" data-object-id="62825b8b0d5d854560ddaf84">
<a class="no_click" href="#"><div class="glyphicons_v2 circle-more white"></div></a>
</div>
</div>, '\n', <div class="content">
<div class="consensus tight">
<div class="outer_ring">
<div class="user_score_chart 62825b8b0d5d854560ddaf84" data-bar-color="#21d07a" data-percent="78.0" data-track-color="#204529">
<div class="percent">
<span class="icon icon-r78"></span>
</div>
</div>
</div>
</div>
<h2><a href="/movie/976573" title="Elemental">Elemental</a></h2>
<p>Jun 14, 2023</p>
</div>, '\n', <div class="hover 976573"></div>, '\n']


In [12]:
a = div_card.h2.a
data_percent = div_card.find("div", class_="user_score_chart")["data-percent"]

print("Details of the first Movie:")
#3c
print("Movie Name:", a.get_text())
#3d
print(f"User Rating:{data_percent}%")
#3e
print(a["href"])

Details of the first Movie:
Movie Name: Elemental
User Rating:78.0%
/movie/976573


## 4. Write user defined functions for each subsection

In [13]:
#4a,4b

def movie_info(soup):
  div_cards = soup.find_all("div", class_="card style_1")
  movie_title_list = []
  ratings_list = []
  links_list = []

  for i in range(0, len(div_cards)):
    div = div_cards[i]
    title = div.h2.a
    rate = div.find("div", class_="user_score_chart")["data-percent"]
    link = title["href"]
    movie_title_list.append(title.get_text())
    ratings_list.append(rate)
    links_list.append(link)

  return movie_title_list, ratings_list,links_list

mov,rate,links = movie_info(soup)
print(mov) #List of Movie Titles(4a)
print(rate) #List of User Ratings(4b)
print(links) #List of href links

['Elemental', 'Heart of Stone', 'Spider-Man: Across the Spider-Verse', 'Transformers: Rise of the Beasts', 'Barbie', 'The Flash', 'Meg 2: The Trench', 'The Flood', 'Cobweb', 'No Hard Feelings', 'Fast X', 'Insidious: The Red Door', 'Babylon 5: The Road Home', 'Soulcatcher', 'Hidden Strike', 'The Little Mermaid', 'Dampyr', 'Resident Evil: Death Island', 'Guardians of the Galaxy Vol. 3', 'Zom 100: Bucket List of the Dead']
['78.0', '70', '85.0', '75.0', '74.0', '70', '69.0', '69.0', '68.0', '71.0', '73.0', '69.0', '73.0', '65.0', '71.0', '66.0', '66.0', '77.0', '80', '67.0']
['/movie/976573', '/movie/724209', '/movie/569094', '/movie/667538', '/movie/346698', '/movie/298618', '/movie/615656', '/movie/1006462', '/movie/709631', '/movie/884605', '/movie/385687', '/movie/614479', '/movie/1121575', '/movie/1149381', '/movie/457332', '/movie/447277', '/movie/644124', '/movie/1083862', '/movie/447365', '/movie/1070514']


In [14]:
#4c
def html_content(links):

  html_content_list = []
  for link in links:
    html_con = get_BS("https://www.themoviedb.org"+link)
    html_content_list.append(html_con)
  return html_content_list

html_list = html_content(links)

In [15]:
#4d
def get_movie_genres(html_list):
  genre = []

  for index, html_content in enumerate(html_list, start=1):
      sub_list = []
      genre_span = html_content.find("span", class_="genres")
      a_tags = genre_span.find_all("a")

      for a_tag in a_tags:
        sub_list.append(a_tag.get_text())

      genre.append(sub_list)

  return genre

genre = get_movie_genres(html_list)

print(genre)

[['Animation', 'Comedy', 'Family', 'Fantasy', 'Romance'], ['Thriller', 'Action'], ['Animation', 'Action', 'Adventure'], ['Action', 'Adventure', 'Science Fiction'], ['Comedy', 'Adventure', 'Fantasy'], ['Action', 'Adventure', 'Science Fiction'], ['Action', 'Science Fiction', 'Horror'], ['Action', 'Horror', 'Thriller'], ['Horror'], ['Comedy', 'Romance'], ['Action', 'Crime', 'Thriller'], ['Horror', 'Mystery', 'Thriller'], ['Animation', 'Science Fiction', 'Action', 'Adventure'], ['Action', 'Adventure', 'Thriller'], ['Action', 'Adventure', 'Thriller', 'Comedy'], ['Adventure', 'Family', 'Fantasy', 'Romance'], ['Horror', 'Fantasy', 'Action'], ['Animation', 'Action', 'Horror'], ['Science Fiction', 'Adventure', 'Action'], ['Comedy', 'Horror', 'Action']]


In [16]:
#4e

def get_movie_cast(html_list):

  cast_list =[]
  for index, html_content in enumerate(html_list, start=1):
    sub_list = []

    casts = html_content.find_all("li", class_="card")
    for cast in casts:
      p = cast.p.get_text()
      sub_list.append(p)


    cast_list.append(sub_list)
  return cast_list


cast = get_movie_cast(html_list)
print(cast)

[['Leah Lewis', 'Mamoudou Athie', 'Ronnie del Carmen', 'Shila Ommi', 'Wendi McLendon-Covey', "Catherine O'Hara", 'Mason Wertheimer', 'Ronobir Lahiri', 'Wilma Bonet'], ['Gal Gadot', 'Jamie Dornan', 'Alia Bhatt', 'Sophie Okonedo', 'Matthias Schweighöfer', 'Paul Ready', 'Jing Lusi', 'Enzo Cilenti', 'Joe Reisig'], ['Shameik Moore', 'Hailee Steinfeld', 'Jason Schwartzman', 'Oscar Isaac', 'Brian Tyree Henry', 'Luna Lauren Velez', 'Jake Johnson', 'Issa Rae', 'Karan Soni'], ['Anthony Ramos', 'Dominique Fishback', 'Peter Cullen', 'Ron Perlman', 'Peter Dinklage', 'Michelle Yeoh', 'Pete Davidson', 'Liza Koshy', 'Cristo Fernández'], ['Margot Robbie', 'Ryan Gosling', 'America Ferrera', 'Kate McKinnon', 'Ariana Greenblatt', 'Michael Cera', 'Will Ferrell', 'Issa Rae', 'Alexandra Shipp'], ['Ezra Miller', 'Sasha Calle', 'Michael Keaton', 'Michael Shannon', 'Ron Livingston', 'Maribel Verdú', 'Kiersey Clemons', 'Antje Traue', 'Saoirse-Monica Jackson'], ['Jason Statham', 'Wu Jing', 'Shuya Sophia Cai', 'Se

 ## 5. Write an user defined function that returns a pandas data frame

In [17]:
import pandas as pd
import numpy as np


def data_from_page(page_soup):
    mov_page, rate_page,links = movie_info(page_soup)
    html_content_list = html_content(links)
    genre_page = get_movie_genres(html_content_list)
    cast_page = get_movie_cast(html_content_list)

    data = {
        "Title": mov_page,
        "User Rating": rate_page,
        "Genres": genre_page,
        "Cast": cast_page
    }
    movie_dataframe = pd.DataFrame(data)
    return movie_dataframe


## 6. Scraping the data and combining the dataframes

In [18]:
#6a
def data_from_multiple_pages():
    base_url = "https://www.themoviedb.org/movie"
    data_frames = []
    no_pages = 5
    for i in range(1,no_pages+1):
        page_soup = get_BS(base_url+f"?page={i}")
        movie_dataframe = data_from_page(page_soup)
        data_frames.append(movie_dataframe)
    return data_frames
        

In [20]:
#6b
data_frames = data_from_multiple_pages()
combined_dataframe = pd.concat(data_frames, ignore_index=True)
combined_dataframe['User Rating'] = combined_dataframe['User Rating'].replace('0', np.nan)
print(combined_dataframe)

# Export the combined DataFrame to a CSV file
combined_dataframe.to_csv('movie_data_rishab.csv', index=False)
print("successfully combined and csv exported!!")

                                  Title User Rating  \
0                             Elemental        78.0   
1                        Heart of Stone          70   
2   Spider-Man: Across the Spider-Verse        85.0   
3      Transformers: Rise of the Beasts        75.0   
4                                Barbie        74.0   
..                                  ...         ...   
95                             Remember        74.0   
96       The Last Voyage of the Demeter        75.0   
97                              Ghosted        71.0   
98                            Quicksand         NaN   
99                           Black Adam          70   

                                           Genres  \
0   [Animation, Comedy, Family, Fantasy, Romance]   
1                              [Thriller, Action]   
2                  [Animation, Action, Adventure]   
3            [Action, Adventure, Science Fiction]   
4                    [Comedy, Adventure, Fantasy]   
..                   