In [8]:
# Importing libraries needed to scrape
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
# This will be heavily commented as you can imagine it's easy to forget a python library lol

In [9]:
needed_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

res = requests.get("https://www.themoviedb.org/movie", headers=needed_headers)
res.status_code
# An exit status of 200 means OK :)


200

In [10]:
# Creating an instance of the BeautifulSoup class
# also im going to be using lxml instead html.parser lets see how that goes
soup = BeautifulSoup(res.text, "lxml")

# We won't call soup for our own sanity!

In [11]:
web_content = soup.get_text().strip()
# using .strip() right now gets rid of trailing whitespaces but not leading??? wtf? will figure out
print(web_content[:15])

Popular Movies 


### Extracting the title of the web page using various methods:

In [12]:
# Extracting title
soup.title


<title>Popular Movies — The Movie Database (TMDB)</title>

In [13]:
# Another method of fetching the title
soup.select("title")[0].getText()

'Popular Movies — The Movie Database (TMDB)'

### We will use functions to help ease our workflow:

In [14]:
# Generalizing initial task with the help of functional stuff
def vaccum(url: str) -> BeautifulSoup:
    try:
        # I am not a robot lmao
        needed_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        res_func = requests.get(url,headers=needed_headers)

        # Raise exception if status code is 5xx or 4xx
        res_func.raise_for_status()

        return BeautifulSoup(res_func.text, "lxml")
    
    except requests.exceptions.RequestException as e:
        # Catch all request-related errors 
        print(f"An Error occured!: {e}")
        return None
    
    finally:
        print("Excecution FIN")

In [15]:
# test case 1: a working url
vaccumed = vaccum("https://www.themoviedb.org/movie")
# This works, won't call it for obvious reasons haha

Excecution FIN


In [16]:
# test case 2: malformed/incorrect URLs
vaccum("https://www.themovipepepe.org/what")

An Error occured!: HTTPSConnectionPool(host='www.themovipepepe.org', port=443): Max retries exceeded with url: /what (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000242ABA3C980>: Failed to resolve 'www.themovipepepe.org' ([Errno 11001] getaddrinfo failed)"))
Excecution FIN


In [17]:
# Displaying the name of the first movie
first_movie = soup.select(".content h2")[3].getText()
first_movie

'Sonic the Hedgehog 3'

In [18]:
# Extracting User rating of the first movie
span_class = soup.select_one(".user_score_chart .icon")['class'][1]
span_class
perc1 = span_class.split("icon-r")[1]
print(f"User score of the movie: {first_movie} is {perc1}%")




User score of the movie: Sonic the Hedgehog 3 is 79%


In [19]:
# we're gonna use regex for extracting the part of the url following the string “https://www.themoviedb.org/” 
text = str(soup.select(".content h2"))
match = re.search(r'movie/\d\d\d\d\d\d',text)
match.group()
# This is going to help us later (i think)

'movie/939243'

In [20]:
# Grabbing the title of all the movies
def title_grabber(soup: BeautifulSoup) -> list:
    titles = []
    movies = soup.select(".content h2")[3:]
    for movie in movies:
        a_tag = movie.find("a", title = True)
        if a_tag:
            titles.append(a_tag["title"])
    return titles
    
        
title_grabber(vaccumed)

['Sonic the Hedgehog 3',
 'Kraven the Hunter',
 'बैक इन ऐक्शन',
 'Nosferatu',
 'Mufasa: The Lion King',
 'Moana 2',
 'Devara: Part 1',
 'Venom: The Last Dance',
 'Gladiator II',
 'The Gardener',
 'Wicked',
 'Kingdom IV: Return of the Great General',
 'The Substance',
 'Alarum',
 'River of Blood',
 'यॉर फ़ॉल्ट',
 'Werewolves',
 'Aftermath',
 'Red One',
 'The Lord of the Rings: The War of the Rohirrim']

In [21]:
# Grabbing the user ratings of all the movies in a page
def rating_grabber(soup: BeautifulSoup) -> list:
    user_score = []
    scores = soup.select(".content .user_score_chart")

    for score in scores:
        percent = score["data-percent"]
        user_score.append(percent)

    return user_score
rating_grabber(vaccumed)

['79',
 '66',
 '67',
 '66',
 '75',
 '70',
 '72',
 '68',
 '68',
 '49',
 '69',
 '71',
 '71',
 '59',
 '65',
 '72',
 '63',
 '63',
 '71',
 '66']

In [22]:
soup.select(".content h2")

[<h2>Sort</h2>,
 <h2>Where To Watch <span>71</span></h2>,
 <h2>Filters</h2>,
 <h2><a href="/movie/939243-sonic-the-hedgehog-3" title="Sonic the Hedgehog 3">Sonic the Hedgehog 3</a></h2>,
 <h2><a href="/movie/539972-kraven-the-hunter" title="Kraven the Hunter">Kraven the Hunter</a></h2>,
 <h2><a href="/movie/993710-back-in-action" title="बैक इन ऐक्शन">बैक इन ऐक्शन</a></h2>,
 <h2><a href="/movie/426063-nosferatu" title="Nosferatu">Nosferatu</a></h2>,
 <h2><a href="/movie/762509-mufasa-the-lion-king" title="Mufasa: The Lion King">Mufasa: The Lion King</a></h2>,
 <h2><a href="/movie/1241982-moana-2" title="Moana 2">Moana 2</a></h2>,
 <h2><a href="/movie/811941-part-1" title="Devara: Part 1">Devara: Part 1</a></h2>,
 <h2><a href="/movie/912649-venom-the-last-dance" title="Venom: The Last Dance">Venom: The Last Dance</a></h2>,
 <h2><a href="/movie/558449-gladiator-ii" title="Gladiator II">Gladiator II</a></h2>,
 <h2><a href="/movie/1255788-le-jardinier" title="The Gardener">The Gardener</a><

In [23]:
# Extracting the HTML golddd of all the individual pages of movies
def html_content(soup: BeautifulSoup) -> list:
    my_content = str(soup.select(".content h2"))
    my_list = []

    stuff = re.findall(r"movie/\d\d\d\d\d\d", my_content)
    my_list.extend(stuff)

    return my_list
   
html_content(vaccumed)
    

['movie/939243',
 'movie/539972',
 'movie/993710',
 'movie/426063',
 'movie/762509',
 'movie/124198',
 'movie/811941',
 'movie/912649',
 'movie/558449',
 'movie/125578',
 'movie/402431',
 'movie/124132',
 'movie/933260',
 'movie/124928',
 'movie/122206',
 'movie/115659',
 'movie/970450',
 'movie/108101',
 'movie/845781',
 'movie/839033']

In [24]:
soup.select(".multi_select")[-2]

<ul class="multi_select text" id="with_genres" name="with_genres[]">
<li data-value="28"><a class="no_click" href="/discover/movie?with_genres=28">Action</a></li>
<li data-value="12"><a class="no_click" href="/discover/movie?with_genres=12">Adventure</a></li>
<li data-value="16"><a class="no_click" href="/discover/movie?with_genres=16">Animation</a></li>
<li data-value="35"><a class="no_click" href="/discover/movie?with_genres=35">Comedy</a></li>
<li data-value="80"><a class="no_click" href="/discover/movie?with_genres=80">Crime</a></li>
<li data-value="99"><a class="no_click" href="/discover/movie?with_genres=99">Documentary</a></li>
<li data-value="18"><a class="no_click" href="/discover/movie?with_genres=18">Drama</a></li>
<li data-value="10751"><a class="no_click" href="/discover/movie?with_genres=10751">Family</a></li>
<li data-value="14"><a class="no_click" href="/discover/movie?with_genres=14">Fantasy</a></li>
<li data-value="36"><a class="no_click" href="/discover/movie?with_ge

In [25]:
# Grabbing the genres of the movies
def genre_grabber(soup: BeautifulSoup) -> list:
    genres = soup.select(".multi_select")[-2]
    my_genres = []
    for gen in genres.find_all("li"): # found them in list item using inspect
        a_tag = gen.find("a")
        if a_tag:
            my_genres.append(a_tag.text.strip())
    return my_genres
genre_grabber(vaccumed)
    

['Action',
 'Adventure',
 'Animation',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Mystery',
 'Romance',
 'Science Fiction',
 'TV Movie',
 'Thriller',
 'War',
 'Western']

In [41]:
res = requests.get("https://www.themoviedb.org/939243", headers=needed_headers)
soup.select(".people")

ConnectTimeout: HTTPSConnectionPool(host='www.themoviedb.org', port=443): Max retries exceeded with url: /939243 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x00000242AEC931D0>, 'Connection to www.themoviedb.org timed out. (connect timeout=None)'))

In [None]:
# Grabbing the casts of all the movies
def cast_grabber(soup: BeautifulSoup) -> list:
    
    
    pass
    