In [1]:
# Importing libraries needed to scrape
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
# This will be heavily commented as you can imagine it's easy to forget a python library lol

In [2]:
needed_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

res = requests.get("https://www.themoviedb.org/movie", headers=needed_headers)
res.status_code
# An exit status of 200 means OK :)


200

In [3]:
# Creating an instance of the BeautifulSoup class
# also im going to be using lxml instead html.parser lets see how that goes
soup = BeautifulSoup(res.text, "lxml")

# We won't call soup for our own sanity!

In [4]:
web_content = soup.get_text().strip()
# using .strip() right now gets rid of trailing whitespaces but not leading??? wtf? will figure out
print(web_content[:15])

Popular Movies 


### Extracting the title of the web page using various methods:

In [5]:
# Extracting title
soup.title


<title>Popular Movies — The Movie Database (TMDB)</title>

In [7]:
# Another method of fetching the title
soup.select("title")[0].getText()

'Popular Movies — The Movie Database (TMDB)'

### We will use functions to help ease our workflow:

In [9]:
# Generalizing initial task with the help of functions..
def vaccum(url: str) -> BeautifulSoup:
    try:
        # I am not a robot
        needed_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        res_func = requests.get(url,headers=needed_headers)

        # Raise exception if status code is 5xx or 4xx
        res_func.raise_for_status()

        return BeautifulSoup(res_func.text, "lxml")
    
    except requests.exceptions.RequestException as e:
        # Catch all request-related errors 
        print(f"An Error occured!: {e}")
        return None
    
    finally:
        print("Excecution FIN")

In [10]:
# test case 1: a working url
vaccumed = vaccum("https://www.themoviedb.org/movie")
# This works, won't call it for obvious reasons

Excecution FIN


In [None]:
# test case 2: malformed/incorrect URLS
vaccum("https://www.themovipepepe.org/what")

An Error occured!: HTTPSConnectionPool(host='www.themovipepepe.org', port=443): Max retries exceeded with url: /what (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002F7067E4C10>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Excecution FIN


In [8]:
# Displaying the name of the first movie
first_movie = soup.select(".content h2")[3].getText()
first_movie

'Red One'

In [9]:
#  Extracting User rating of the first movie
span_class = soup.select_one(".user_score_chart .icon")['class'][1]
span_class
perc1 = span_class.split("icon-r")[1]
print(f"User score of the movie: {first_movie} is {perc1}%")




User score of the movie: Red One is 70%


In [10]:
# we're gonna use regex for extracting the part of the url following the string “https://www.themoviedb.org/” 
text = str(soup.select(".content h2"))
match = re.search(r'movie/\d\d\d\d\d\d',text)
match.group()
# This is going to help us later



'movie/845781'

In [26]:
# Titles of all the movies on the page as a Python list
def title_grabber(soup: BeautifulSoup) -> list:
    titles = []
    movies = soup.select(".content h2")[3:]
    for movie in movies:
        a_tag = movie.find("a", title = True)
        if a_tag:
            titles.append(a_tag["title"])
    return titles
    
        
title_grabber(vaccumed)

['Red One',
 'Venom: The Last Dance',
 'Moana 2',
 'Mufasa: The Lion King',
 'Sonic the Hedgehog 3',
 'Elevation',
 'The Price of Money: A Largo Winch Adventure',
 'कैरी-ऑन',
 'Kraven the Hunter',
 'Absolution',
 'Gladiator II',
 'The Wild Robot',
 'Heretic',
 'वह यादगार क्रिसमस',
 'Armor',
 'The Substance',
 'Terrifier 3',
 'My Fault',
 'Miraculous World, London: At the Edge of Time',
 'Weekend in Taipei']

In [17]:
# User ratings of all the movies on the page as a Python list 
def rating_grabber(soup: BeautifulSoup) -> list:
    user_score = []
    scores = soup.select(".content .user_score_chart")

    for score in scores:
        percent = score["data-percent"]
        user_score.append(percent)

    return user_score
rating_grabber(vaccumed)

['70',
 '68',
 '71',
 '70',
 '70',
 '79',
 '61',
 '57',
 '63',
 '58',
 '61',
 '67',
 '84',
 '55',
 '73',
 '72',
 '77',
 '75',
 '79',
 '72']

In [18]:
soup.select(".content h2")

[<h2>Sort</h2>,
 <h2>Where To Watch <span>103</span></h2>,
 <h2>Filters</h2>,
 <h2><a href="/movie/845781-red-one" title="Red One">Red One</a></h2>,
 <h2><a href="/movie/912649-venom-the-last-dance" title="Venom: The Last Dance">Venom: The Last Dance</a></h2>,
 <h2><a href="/movie/762509-mufasa-the-lion-king" title="Mufasa: The Lion King">Mufasa: The Lion King</a></h2>,
 <h2><a href="/movie/1241982-moana-2" title="Moana 2">Moana 2</a></h2>,
 <h2><a href="/movie/1005331-carry-on" title="Carry-On">Carry-On</a></h2>,
 <h2><a href="/movie/939243-sonic-the-hedgehog-3" title="Sonic the Hedgehog 3">Sonic the Hedgehog 3</a></h2>,
 <h2><a href="/movie/1043905-dirty-angels" title="Dirty Angels">Dirty Angels</a></h2>,
 <h2><a href="/movie/1000075-largo-winch-le-prix-de-l-argent" title="The Price of Money: A Largo Winch Adventure">The Price of Money: A Largo Winch Adventure</a></h2>,
 <h2><a href="/movie/1035048-elevation" title="Elevation">Elevation</a></h2>,
 <h2><a href="/movie/539972-kraven-th

In [24]:
# HTML content of all the individual pages of movies collected into a Python list
def html_content(soup: BeautifulSoup) -> list:
    my_content = str(soup.select(".content h2"))
    my_list = []

    stuff = re.findall(r"movie/\d\d\d\d\d\d", my_content)
    my_list.extend(stuff)

    return my_list
   
html_content(vaccumed)
    

['movie/845781',
 'movie/912649',
 'movie/762509',
 'movie/124198',
 'movie/100533',
 'movie/939243',
 'movie/104390',
 'movie/100007',
 'movie/103504',
 'movie/539972',
 'movie/974453',
 'movie/558449',
 'movie/118491',
 'movie/118238',
 'movie/645757',
 'movie/933260',
 'movie/533535',
 'movie/129965',
 'movie/101058',
 'movie/113819']

In [7]:
soup.select(".multi_select")[-2]


<ul class="multi_select text" id="with_genres" name="with_genres[]">
<li data-value="28"><a class="no_click" href="/discover/movie?with_genres=28">Action</a></li>
<li data-value="12"><a class="no_click" href="/discover/movie?with_genres=12">Adventure</a></li>
<li data-value="16"><a class="no_click" href="/discover/movie?with_genres=16">Animation</a></li>
<li data-value="35"><a class="no_click" href="/discover/movie?with_genres=35">Comedy</a></li>
<li data-value="80"><a class="no_click" href="/discover/movie?with_genres=80">Crime</a></li>
<li data-value="99"><a class="no_click" href="/discover/movie?with_genres=99">Documentary</a></li>
<li data-value="18"><a class="no_click" href="/discover/movie?with_genres=18">Drama</a></li>
<li data-value="10751"><a class="no_click" href="/discover/movie?with_genres=10751">Family</a></li>
<li data-value="14"><a class="no_click" href="/discover/movie?with_genres=14">Fantasy</a></li>
<li data-value="36"><a class="no_click" href="/discover/movie?with_ge

In [13]:
# Genres of all the movies on the page as a Python list 
def genre_grabber(soup: BeautifulSoup) -> list:
    genres = soup.select(".multi_select")[-2]
    my_genres = []
    for gen in genres.find_all("li"):
        a_tag = gen.find("a")
        if a_tag:
            my_genres.append(a_tag.text.strip())
    return my_genres
genre_grabber(vaccumed)
    

['Action',
 'Adventure',
 'Animation',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Mystery',
 'Romance',
 'Science Fiction',
 'TV Movie',
 'Thriller',
 'War',
 'Western']

In [None]:
soup.select(".")

In [None]:
# Casts
def cast_grabber(soup: BeautifulSoup) -> list:
    pass
    