In [2]:
# Importing libraries needed to scrape
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import re
# This is going to be heavily commented, its easy to forget a python library lol

In [5]:
needed_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

res = requests.get("https://www.themoviedb.org/movie", headers=needed_headers)
res.status_code
# An exit status of 200 means OK :)


200

In [6]:
# Creating an instance of the BeautifulSoup class
# also im going to be using lxml instead of the conventional html.parser lets see how that goes
soup = BeautifulSoup(res.text, "lxml")

# won't call soup for my own sanity

In [7]:
web_content = soup.get_text().strip()
# using .strip() right now gets rid of trailing whitespaces but not leading??? wtf? will figure out
print(web_content[:15])

Popular Movies 


### Extracting the title of the web page using various methods:

In [8]:
# Extracting title
soup.title

<title>Popular Movies — The Movie Database (TMDB)</title>

In [9]:
# Another method of fetching the title
soup.select("title")[0].getText()

'Popular Movies — The Movie Database (TMDB)'

### Using functions to help ease our workflow:

In [10]:
# Generalizing initial task with the help of functional stuff
def vaccum(url: str) -> BeautifulSoup:
    try:
        # I am not a robot lmao
        needed_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        res_func = requests.get(url,headers=needed_headers)

        # Raise exception if status code is 5xx or 4xx
        res_func.raise_for_status()

        return BeautifulSoup(res_func.text, "lxml")
    
    except requests.exceptions.RequestException as e:
        # Catch all request-related errors 
        print(f"An Error occured!: {e}")
        return None
    
    finally:
        print("Excecution FIN")

In [11]:
# test case 1: a working url
vaccumed = vaccum("https://www.themoviedb.org/movie")
# This works, won't call it for obvious reasons haha

Excecution FIN


In [12]:
# test case 2: malformed/incorrect URLs
vaccum("https://www.themovipepepe.org/what")

An Error occured!: HTTPSConnectionPool(host='www.themovipepepe.org', port=443): Max retries exceeded with url: /what (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000025E58059CA0>: Failed to resolve 'www.themovipepepe.org' ([Errno 11001] getaddrinfo failed)"))
Excecution FIN


In [13]:
# Displaying the name of the first movie
first_movie = soup.select(".content h2")[3].getText()
first_movie

'A Minecraft Movie'

In [14]:
# Extracting User rating of the first movie
span_class = soup.select_one(".user_score_chart .icon")['class'][1]
span_class
perc1 = span_class.split("icon-r")[1]
print(f"User score of the movie: {first_movie} is {perc1}%")

User score of the movie: A Minecraft Movie is 65%


In [15]:
# we're gonna use regex for extracting the part of the url following the string “https://www.themoviedb.org/” 
text = str(soup.select(".content h2"))
match = re.search(r'movie/\d\d\d\d\d\d',text)
match.group()
# This is going to help us later (i think)

'movie/950387'

### Grabbing the title of all the movies

In [16]:
def title_grabber(soup: BeautifulSoup) -> list:
    titles = []
    movies = soup.select(".content h2")[3:]
    for movie in movies:
        a_tag = movie.find("a", title = True)
        if a_tag:
            titles.append(a_tag["title"])
    return titles
    
        
title_grabber(vaccumed)

['A Minecraft Movie',
 'Final Destination Bloodlines',
 'Snow White',
 'A Working Man',
 'Warfare',
 'The Great Escape',
 'Tin Soldier',
 'Bambi: A Life in the Woods',
 'Thunderbolts*',
 'Last Bullet',
 'Conjuring the Cult',
 'Brave Citizen',
 'Exterritorial',
 'Mission: Impossible - The Final Reckoning',
 "The Haunting at Saint Joseph's",
 'Captain America: Brave New World',
 'In the Lost Lands',
 'Rust',
 'From the Shadows',
 'Sinners']

### Grabbing the user ratings of all the movies in a page

In [17]:
def rating_grabber(soup: BeautifulSoup) -> list:
    user_score = []
    scores = soup.select(".content .user_score_chart")

    for score in scores:
        # edge case
        if score is None:
            user_score.append("No rating")
        else:
            percent = score["data-percent"]
            user_score.append(percent)
        
        

    return user_score
rating_grabber(vaccumed)

['65',
 '72',
 '43',
 '65',
 '72',
 '0',
 '54',
 '58',
 '75',
 '67',
 '57',
 '71',
 '67',
 '76',
 '44',
 '61',
 '63',
 '64',
 '31',
 '76']

In [18]:
soup.select(".content h2")

[<h2>Sort</h2>,
 <h2>Where To Watch <span>279</span></h2>,
 <h2>Filters</h2>,
 <h2><a href="/movie/950387-a-minecraft-movie" title="A Minecraft Movie">A Minecraft Movie</a></h2>,
 <h2><a href="/movie/574475-final-destination-bloodlines" title="Final Destination Bloodlines">Final Destination Bloodlines</a></h2>,
 <h2><a href="/movie/447273-snow-white" title="Snow White">Snow White</a></h2>,
 <h2><a href="/movie/1197306-a-working-man" title="A Working Man">A Working Man</a></h2>,
 <h2><a href="/movie/1241436-warfare" title="Warfare">Warfare</a></h2>,
 <h2><a href="/movie/1480799-the-great-escape" title="The Great Escape">The Great Escape</a></h2>,
 <h2><a href="/movie/977294-tin-soldier" title="Tin Soldier">Tin Soldier</a></h2>,
 <h2><a href="/movie/1094473-bambi-l-histoire-d-une-vie-dans-les-bois" title="Bambi: A Life in the Woods">Bambi: A Life in the Woods</a></h2>,
 <h2><a href="/movie/986056-thunderbolts" title="Thunderbolts*">Thunderbolts*</a></h2>,
 <h2><a href="/movie/1144430-bal

### Extracting the HTML gold of all the individual pages of movies

In [19]:
def html_content(soup: BeautifulSoup) -> list:
    my_content = str(soup.select(".content h2"))
    my_list_of_contents = []

    stuff = re.findall(r"movie/\d\d\d\d\d\d", my_content)
    my_list_of_contents.extend(stuff)

    return my_list_of_contents
   
html_content(vaccumed)
    

['movie/950387',
 'movie/574475',
 'movie/447273',
 'movie/119730',
 'movie/124143',
 'movie/148079',
 'movie/977294',
 'movie/109447',
 'movie/986056',
 'movie/114443',
 'movie/135997',
 'movie/897160',
 'movie/123306',
 'movie/575265',
 'movie/109207',
 'movie/822119',
 'movie/324544',
 'movie/710258',
 'movie/118076',
 'movie/123341']

In [20]:
soup.select(".multi_select")[-2]

<ul class="multi_select text" id="with_genres" name="with_genres[]">
<li data-value="28"><a class="no_click" href="/discover/movie?with_genres=28">Action</a></li>
<li data-value="12"><a class="no_click" href="/discover/movie?with_genres=12">Adventure</a></li>
<li data-value="16"><a class="no_click" href="/discover/movie?with_genres=16">Animation</a></li>
<li data-value="35"><a class="no_click" href="/discover/movie?with_genres=35">Comedy</a></li>
<li data-value="80"><a class="no_click" href="/discover/movie?with_genres=80">Crime</a></li>
<li data-value="99"><a class="no_click" href="/discover/movie?with_genres=99">Documentary</a></li>
<li data-value="18"><a class="no_click" href="/discover/movie?with_genres=18">Drama</a></li>
<li data-value="10751"><a class="no_click" href="/discover/movie?with_genres=10751">Family</a></li>
<li data-value="14"><a class="no_click" href="/discover/movie?with_genres=14">Fantasy</a></li>
<li data-value="36"><a class="no_click" href="/discover/movie?with_ge

### Grabbing the genres of the movies

In [21]:
def genre_grabber(soup: BeautifulSoup) -> list:
    genres = soup.select(".multi_select")[-2]
    my_genres = []
    for gen in genres.find_all("li"): # found them in list item (li) using inspect
        a_tag = gen.find("a") # because its in a hyerplink?
        if a_tag:
            my_genres.append(a_tag.text.strip())
    return my_genres
genre_grabber(vaccumed)
    

['Action',
 'Adventure',
 'Animation',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Mystery',
 'Romance',
 'Science Fiction',
 'TV Movie',
 'Thriller',
 'War',
 'Western']

In [22]:
# ok so what I plan to do is that
# I already have a list of all the movie content IDs
# gonna iterate through them to create a new list object which will be my inputs for my cast func
html_content(vaccumed)
load = []
for info in html_content(vaccumed):
    load.append('https://www.themoviedb.org/' + info)
load

['https://www.themoviedb.org/movie/950387',
 'https://www.themoviedb.org/movie/574475',
 'https://www.themoviedb.org/movie/447273',
 'https://www.themoviedb.org/movie/119730',
 'https://www.themoviedb.org/movie/124143',
 'https://www.themoviedb.org/movie/148079',
 'https://www.themoviedb.org/movie/977294',
 'https://www.themoviedb.org/movie/109447',
 'https://www.themoviedb.org/movie/986056',
 'https://www.themoviedb.org/movie/114443',
 'https://www.themoviedb.org/movie/135997',
 'https://www.themoviedb.org/movie/897160',
 'https://www.themoviedb.org/movie/123306',
 'https://www.themoviedb.org/movie/575265',
 'https://www.themoviedb.org/movie/109207',
 'https://www.themoviedb.org/movie/822119',
 'https://www.themoviedb.org/movie/324544',
 'https://www.themoviedb.org/movie/710258',
 'https://www.themoviedb.org/movie/118076',
 'https://www.themoviedb.org/movie/123341']

### Grabbing the casts of alll the movies

In [23]:
def cast_grabber(soup: BeautifulSoup) -> list: 

    cast_list = []

    def castisize():
        cards = soup.select("li.card")

        # this loop creates a beautifulsoup instance of every link request
        for card in cards:
              name = card.find("p").get_text(strip=True)
              print(name)
              cast_list.append(name)
        
    castisize()
    return cast_list  

# all the leetcoding makes sense now, epic use of hashmap 
all_casts = {}
for link in load:
     res = requests.get(link,headers=needed_headers)
     soup = BeautifulSoup(res.text,"lxml")
     all_casts[link] = cast_grabber(soup)



Jason Momoa
Jack Black
Sebastian Eugene Hansen
Emma Myers
Danielle Brooks
Jennifer Coolidge
Rachel House
Allan Henry
Bram Scott-Breheny
Kaitlyn Santa Juana
Teo Briones
Rya Kihlstedt
Richard Harmon
Owen Patrick Joyner
Anna Lore
Brec Bassinger
Tony Todd
Andrew Tinpo Lee
Rachel Zegler
Gal Gadot
Andrew Burnap
Jeremy Swift
Jason Kravits
Martin Klebba
George Salazar
Tituss Burgess
Andy Grotelueschen
Scott Eastwood
Robert De Niro
Jamie Foxx
John Leguizamo
Nora Arnezeder
Shamier Anderson
Yul Vazquez
Rita Ora
Saïd Taghmaoui
Florence Pugh
Sebastian Stan
Julia Louis-Dreyfus
Lewis Pullman
David Harbour
Wyatt Russell
Hannah John-Kamen
Olga Kurylenko
Geraldine Viswanathan
Shin Hye-sun
Lee Jun-young
Park Jung-woo
Park Hyuk-kwon
Cha Chung-hwa
Lee Chan-hyeong
Bae Hyeon-jun
Cha Woo-min
Kim Sang-woo
Tom Cruise
Hayley Atwell
Ving Rhames
Simon Pegg
Esai Morales
Pom Klementieff
Henry Czerny
Holt McCallany
Janet McTeer
Anthony Mackie
Harrison Ford
Danny Ramirez
Shira Haas
Tim Blake Nelson
Carl Lumbly
Giancar

In [24]:
# iteratin through the items of the dict to get clean result
for url, cast in all_casts.items():
     print(f"Cast for {url} -> {cast}")

Cast for https://www.themoviedb.org/movie/950387 -> ['Jason Momoa', 'Jack Black', 'Sebastian Eugene Hansen', 'Emma Myers', 'Danielle Brooks', 'Jennifer Coolidge', 'Rachel House', 'Allan Henry', 'Bram Scott-Breheny']
Cast for https://www.themoviedb.org/movie/574475 -> ['Kaitlyn Santa Juana', 'Teo Briones', 'Rya Kihlstedt', 'Richard Harmon', 'Owen Patrick Joyner', 'Anna Lore', 'Brec Bassinger', 'Tony Todd', 'Andrew Tinpo Lee']
Cast for https://www.themoviedb.org/movie/447273 -> ['Rachel Zegler', 'Gal Gadot', 'Andrew Burnap', 'Jeremy Swift', 'Jason Kravits', 'Martin Klebba', 'George Salazar', 'Tituss Burgess', 'Andy Grotelueschen']
Cast for https://www.themoviedb.org/movie/119730 -> []
Cast for https://www.themoviedb.org/movie/124143 -> []
Cast for https://www.themoviedb.org/movie/148079 -> []
Cast for https://www.themoviedb.org/movie/977294 -> ['Scott Eastwood', 'Robert De Niro', 'Jamie Foxx', 'John Leguizamo', 'Nora Arnezeder', 'Shamier Anderson', 'Yul Vazquez', 'Rita Ora', 'Saïd Taghma

In [None]:
# next need a user defined function that returns a pandas data frame with the above data
def bamboo():
    