In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import re

In [3]:
needed_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

res = requests.get("https://www.themoviedb.org/movie", headers=needed_headers)
res.status_code
# An exit status of 200 means OK 

200

In [4]:
# Creating an instance of the BeautifulSoup class
# also im going to be using lxml instead of the conventional html.parser lets see how that goes
soup = BeautifulSoup(res.text, "lxml")

# won't call soup for my own sanity

In [5]:
web_content = soup.get_text().strip()
# using .strip() right now gets rid of trailing whitespaces but not leading??? wtf? will figure out
print(web_content[:15])

Popular Movies 


### Extracting the title of the web page using various methods:

In [6]:
# Extracting title
soup.title

<title>Popular Movies — The Movie Database (TMDB)</title>

In [7]:
# Another method of fetching the title
soup.select("title")[0].getText()

'Popular Movies — The Movie Database (TMDB)'

### Using functions to help ease our workflow:

In [8]:
def vaccum(url: str) -> BeautifulSoup:
    try:
        # I am not a robot lmao
        needed_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        res_func = requests.get(url,headers=needed_headers)

        # Raise exc if status code is 5xx or 4xx
        res_func.raise_for_status()

        return BeautifulSoup(res_func.text, "lxml")
    
    except requests.exceptions.RequestException as e:
        # Catch all request-related errors 
        print(f"An Error occured!: {e}")
        return None
    
    finally:
        print("Excecution FIN")

In [9]:
# test case 1: a working url
vaccumed = vaccum("https://www.themoviedb.org/movie")
# This works, won't call it for obvious reasons 

Excecution FIN


In [10]:
# test case 2: malformed/incorrect URLs
vaccum("https://www.themovipepepe.org/what")

An Error occured!: HTTPSConnectionPool(host='www.themovipepepe.org', port=443): Max retries exceeded with url: /what (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001B9EE5AEF00>: Failed to resolve 'www.themovipepepe.org' ([Errno 11001] getaddrinfo failed)"))
Excecution FIN


In [11]:
# Displaying the name of the first movie
first_movie = soup.select(".content h2")[3].getText()
first_movie

'Predator: Killer of Killers'

In [12]:
# Extracting User rating of the first movie
span_class = soup.select_one(".user_score_chart .icon")['class'][1]
span_class
perc1 = span_class.split("icon-r")[1]
print(f"User score of the movie: {first_movie} is {perc1}%")

User score of the movie: Predator: Killer of Killers is 80%


In [13]:
# gonna use regex for extracting the part of the url following the string “https://www.themoviedb.org/” 
text = str(soup.select(".content h2"))
match = re.search(r'movie/\d\d\d\d\d\d',text)
match.group()
# This is going to help us later (i think)

'movie/137643'

### Grabbing the title of all the movies

In [14]:
def title_grabber(soup: BeautifulSoup) -> list:
    titles = []
    # edge case
    if soup is None:
        return titles
    movies = soup.select(".content h2")[3:]
    for movie in movies:
        a_tag = movie.find("a", title = True)
        if a_tag:
            titles.append(a_tag["title"])
    return titles
    
        
title_grabber(vaccumed)

['Predator: Killer of Killers',
 'The Accountant²',
 'Lilo & Stitch',
 'Mikaela',
 'Sinners',
 'K.O.',
 'STRAW',
 'Shadow Force',
 'A Minecraft Movie',
 'Final Destination Bloodlines',
 'A Working Man',
 'Ballerina',
 'Warrior Queen',
 'The Ugly Stepsister',
 'Mission: Impossible - The Final Reckoning',
 'The Last Stand of Ellen Cole',
 'How to Train Your Dragon',
 'Woodwalkers',
 'The Accountant',
 'Thor: Ragnarok']

### Grabbing the user ratings of all the movies in a page

In [15]:
def rating_grabber(soup: BeautifulSoup) -> list:
    user_score = []
    scores = soup.select(".content .user_score_chart")

    for score in scores:
        # edge case
        if score is None:
            user_score.append("No rating")
        else:
            percent = score["data-percent"]
            user_score.append(percent)
        
        

    return user_score
rating_grabber(vaccumed)

['80',
 '72',
 '71',
 '63',
 '75',
 '69',
 '83',
 '63',
 '65',
 '70',
 '67',
 '71',
 '34',
 '71',
 '72',
 '82',
 '76',
 '61',
 '71',
 '76']

In [16]:
soup.select(".content h2")

[<h2>Sort</h2>,
 <h2>Where To Watch <span>280</span></h2>,
 <h2>Filters</h2>,
 <h2><a href="/movie/1376434-predator-killer-of-killers" title="Predator: Killer of Killers">Predator: Killer of Killers</a></h2>,
 <h2><a href="/movie/870028-the-accountant" title="The Accountant²">The Accountant²</a></h2>,
 <h2><a href="/movie/552524-lilo-stitch" title="Lilo &amp; Stitch">Lilo &amp; Stitch</a></h2>,
 <h2><a href="/movie/1315988-mikaela" title="Mikaela">Mikaela</a></h2>,
 <h2><a href="/movie/1233413-sinners" title="Sinners">Sinners</a></h2>,
 <h2><a href="/movie/1450599-k-o" title="K.O.">K.O.</a></h2>,
 <h2><a href="/movie/1426776-straw" title="STRAW">STRAW</a></h2>,
 <h2><a href="/movie/757725-shadow-force" title="Shadow Force">Shadow Force</a></h2>,
 <h2><a href="/movie/950387-a-minecraft-movie" title="A Minecraft Movie">A Minecraft Movie</a></h2>,
 <h2><a href="/movie/574475-final-destination-bloodlines" title="Final Destination Bloodlines">Final Destination Bloodlines</a></h2>,
 <h2><a h

### Extracting the HTML content of all the individual pages of movies


In [17]:
def html_content(soup: BeautifulSoup) -> list:
    my_content = str(soup.select(".content h2"))
    my_list_of_contents = []

    stuff = re.findall(r"movie/\d\d\d\d\d\d", my_content)
    my_list_of_contents.extend(stuff)

    return my_list_of_contents
   
html_content(vaccumed)
    

['movie/137643',
 'movie/870028',
 'movie/552524',
 'movie/131598',
 'movie/123341',
 'movie/145059',
 'movie/142677',
 'movie/757725',
 'movie/950387',
 'movie/574475',
 'movie/119730',
 'movie/541671',
 'movie/128412',
 'movie/575265',
 'movie/141177',
 'movie/108719',
 'movie/124189',
 'movie/302946',
 'movie/284053']

In [18]:
soup.select(".multi_select")[-2]

<ul class="multi_select text" id="with_genres" name="with_genres[]">
<li data-value="28"><a class="no_click" href="/discover/movie?with_genres=28">Action</a></li>
<li data-value="12"><a class="no_click" href="/discover/movie?with_genres=12">Adventure</a></li>
<li data-value="16"><a class="no_click" href="/discover/movie?with_genres=16">Animation</a></li>
<li data-value="35"><a class="no_click" href="/discover/movie?with_genres=35">Comedy</a></li>
<li data-value="80"><a class="no_click" href="/discover/movie?with_genres=80">Crime</a></li>
<li data-value="99"><a class="no_click" href="/discover/movie?with_genres=99">Documentary</a></li>
<li data-value="18"><a class="no_click" href="/discover/movie?with_genres=18">Drama</a></li>
<li data-value="10751"><a class="no_click" href="/discover/movie?with_genres=10751">Family</a></li>
<li data-value="14"><a class="no_click" href="/discover/movie?with_genres=14">Fantasy</a></li>
<li data-value="36"><a class="no_click" href="/discover/movie?with_ge

### Grabbing the genres of the movies

In [19]:
def genre_grabber(soup: BeautifulSoup) -> list:
    genres = soup.select(".multi_select")[-2]
    my_genres = []
    for gen in genres.find_all("li"): # found them in list item (li) using inspect
        a_tag = gen.find("a") # because its in a hyerplink?
        if a_tag:
            my_genres.append(a_tag.text.strip())
    my_genres.append(np.nan)
    return my_genres
genre_grabber(vaccumed)
    

['Action',
 'Adventure',
 'Animation',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Mystery',
 'Romance',
 'Science Fiction',
 'TV Movie',
 'Thriller',
 'War',
 'Western',
 nan]

In [20]:
# ok so what I plan to do is that
# I already have a list of all the movie content IDs
# gonna iterate through them to create a new list object which will be my inputs for my cast func
html_content(vaccumed)
load = []
for info in html_content(vaccumed):
    load.append('https://www.themoviedb.org/' + info)
load

['https://www.themoviedb.org/movie/137643',
 'https://www.themoviedb.org/movie/870028',
 'https://www.themoviedb.org/movie/552524',
 'https://www.themoviedb.org/movie/131598',
 'https://www.themoviedb.org/movie/123341',
 'https://www.themoviedb.org/movie/145059',
 'https://www.themoviedb.org/movie/142677',
 'https://www.themoviedb.org/movie/757725',
 'https://www.themoviedb.org/movie/950387',
 'https://www.themoviedb.org/movie/574475',
 'https://www.themoviedb.org/movie/119730',
 'https://www.themoviedb.org/movie/541671',
 'https://www.themoviedb.org/movie/128412',
 'https://www.themoviedb.org/movie/575265',
 'https://www.themoviedb.org/movie/141177',
 'https://www.themoviedb.org/movie/108719',
 'https://www.themoviedb.org/movie/124189',
 'https://www.themoviedb.org/movie/302946',
 'https://www.themoviedb.org/movie/284053']

### Grabbing the casts of alll the movies

In [21]:
def cast_grabber(soup: BeautifulSoup) -> list: 

    cast_list = []

    def castisize():
        cards = soup.select("li.card")

        # this loop creates a beautifulsoup instance of every link request
        for card in cards:
              name = card.find("p").get_text(strip=True)
              print(name)
              cast_list.append(name)
        
    castisize()
    return cast_list  

# my epic use of hashmap 
all_casts = {}
for link in load:
     res = requests.get(link,headers=needed_headers)
     soup = BeautifulSoup(res.text,"lxml")
     all_casts[link] = cast_grabber(soup)

Suzuki Mint
Eiji Nakamura
Shou Nishino
Ryôichi Inaba
Hirokazu Iijima
Takahiro Tsutsumi
Masahiko Itoh
Yuuki Kawanishi
Kazuya Nomura
Ben Affleck
Jon Bernthal
Cynthia Addai-Robinson
J.K. Simmons
Allison Robertson
Alison Wright
Daniella Pineda
Robert Morgan
Grant Harvey
Maia Kealoha
Sydney Agudong
Chris Sanders
Zach Galifianakis
Billy Magnussen
Courtney B. Vance
Amy Hill
Tia Carrere
Kaipo Dudoit
Rie Nakagawa
Yuri Yamashina
Kiyoshi Abe
Chizuyu Azami
Moeko Ezawa
Toshihiko Oda
Hiroshi Chō
Akira Takahashi
Ikunosuke Koizumi
Amélie Daure
Gianpaolo Lupori
Stefan Morawietz
Kerry Washington
Omar Sy
Jahleel Kamara
Mark Strong
Da'Vine Joy Randolph
Method Man
Ed Quinn
Natalia Reyes
Marvin Jones III
Jason Momoa
Jack Black
Sebastian Eugene Hansen
Emma Myers
Danielle Brooks
Jennifer Coolidge
Rachel House
Allan Henry
Bram Scott-Breheny
Kaitlyn Santa Juana
Teo Briones
Rya Kihlstedt
Richard Harmon
Owen Patrick Joyner
Anna Lore
Brec Bassinger
Tony Todd
Andrew Tinpo Lee
Ana de Armas
Anjelica Huston
Keanu Reev

In [22]:
all_casts.values()

dict_values([['Suzuki Mint', 'Eiji Nakamura', 'Shou Nishino', 'Ryôichi Inaba', 'Hirokazu Iijima', 'Takahiro Tsutsumi', 'Masahiko Itoh', 'Yuuki Kawanishi', 'Kazuya Nomura'], ['Ben Affleck', 'Jon Bernthal', 'Cynthia Addai-Robinson', 'J.K. Simmons', 'Allison Robertson', 'Alison Wright', 'Daniella Pineda', 'Robert Morgan', 'Grant Harvey'], ['Maia Kealoha', 'Sydney Agudong', 'Chris Sanders', 'Zach Galifianakis', 'Billy Magnussen', 'Courtney B. Vance', 'Amy Hill', 'Tia Carrere', 'Kaipo Dudoit'], [], ['Rie Nakagawa', 'Yuri Yamashina', 'Kiyoshi Abe', 'Chizuyu Azami', 'Moeko Ezawa', 'Toshihiko Oda', 'Hiroshi Chō', 'Akira Takahashi', 'Ikunosuke Koizumi'], ['Amélie Daure', 'Gianpaolo Lupori', 'Stefan Morawietz'], [], ['Kerry Washington', 'Omar Sy', 'Jahleel Kamara', 'Mark Strong', "Da'Vine Joy Randolph", 'Method Man', 'Ed Quinn', 'Natalia Reyes', 'Marvin Jones III'], ['Jason Momoa', 'Jack Black', 'Sebastian Eugene Hansen', 'Emma Myers', 'Danielle Brooks', 'Jennifer Coolidge', 'Rachel House', 'All

In [23]:
# iteratin through the items of the dict to get clean result
for url, cast in all_casts.items():
     print(f"Cast for {url} -> {cast}")

Cast for https://www.themoviedb.org/movie/137643 -> ['Suzuki Mint', 'Eiji Nakamura', 'Shou Nishino', 'Ryôichi Inaba', 'Hirokazu Iijima', 'Takahiro Tsutsumi', 'Masahiko Itoh', 'Yuuki Kawanishi', 'Kazuya Nomura']
Cast for https://www.themoviedb.org/movie/870028 -> ['Ben Affleck', 'Jon Bernthal', 'Cynthia Addai-Robinson', 'J.K. Simmons', 'Allison Robertson', 'Alison Wright', 'Daniella Pineda', 'Robert Morgan', 'Grant Harvey']
Cast for https://www.themoviedb.org/movie/552524 -> ['Maia Kealoha', 'Sydney Agudong', 'Chris Sanders', 'Zach Galifianakis', 'Billy Magnussen', 'Courtney B. Vance', 'Amy Hill', 'Tia Carrere', 'Kaipo Dudoit']
Cast for https://www.themoviedb.org/movie/131598 -> []
Cast for https://www.themoviedb.org/movie/123341 -> ['Rie Nakagawa', 'Yuri Yamashina', 'Kiyoshi Abe', 'Chizuyu Azami', 'Moeko Ezawa', 'Toshihiko Oda', 'Hiroshi Chō', 'Akira Takahashi', 'Ikunosuke Koizumi']
Cast for https://www.themoviedb.org/movie/145059 -> ['Amélie Daure', 'Gianpaolo Lupori', 'Stefan Morawie

In [24]:
values = [item for value in all_casts.values() for item in value] # nested comprehension
values
    

['Suzuki Mint',
 'Eiji Nakamura',
 'Shou Nishino',
 'Ryôichi Inaba',
 'Hirokazu Iijima',
 'Takahiro Tsutsumi',
 'Masahiko Itoh',
 'Yuuki Kawanishi',
 'Kazuya Nomura',
 'Ben Affleck',
 'Jon Bernthal',
 'Cynthia Addai-Robinson',
 'J.K. Simmons',
 'Allison Robertson',
 'Alison Wright',
 'Daniella Pineda',
 'Robert Morgan',
 'Grant Harvey',
 'Maia Kealoha',
 'Sydney Agudong',
 'Chris Sanders',
 'Zach Galifianakis',
 'Billy Magnussen',
 'Courtney B. Vance',
 'Amy Hill',
 'Tia Carrere',
 'Kaipo Dudoit',
 'Rie Nakagawa',
 'Yuri Yamashina',
 'Kiyoshi Abe',
 'Chizuyu Azami',
 'Moeko Ezawa',
 'Toshihiko Oda',
 'Hiroshi Chō',
 'Akira Takahashi',
 'Ikunosuke Koizumi',
 'Amélie Daure',
 'Gianpaolo Lupori',
 'Stefan Morawietz',
 'Kerry Washington',
 'Omar Sy',
 'Jahleel Kamara',
 'Mark Strong',
 "Da'Vine Joy Randolph",
 'Method Man',
 'Ed Quinn',
 'Natalia Reyes',
 'Marvin Jones III',
 'Jason Momoa',
 'Jack Black',
 'Sebastian Eugene Hansen',
 'Emma Myers',
 'Danielle Brooks',
 'Jennifer Coolidge',


In [25]:
len(title_grabber(vaccumed))

20

In [26]:
# next need a user defined function that returns a pandas data frame with the above data
def bamboo():
    mydata = {"Cast" : values[0:20], "Genre" : genre_grabber(vaccumed),"User ratings" : rating_grabber(vaccumed), "Title" : title_grabber(vaccumed)}
    df = pd.DataFrame(data=mydata)
    return df.head(10)

bamboo()

Unnamed: 0,Cast,Genre,User ratings,Title
0,Suzuki Mint,Action,80,Predator: Killer of Killers
1,Eiji Nakamura,Adventure,72,The Accountant²
2,Shou Nishino,Animation,71,Lilo & Stitch
3,Ryôichi Inaba,Comedy,63,Mikaela
4,Hirokazu Iijima,Crime,75,Sinners
5,Takahiro Tsutsumi,Documentary,69,K.O.
6,Masahiko Itoh,Drama,83,STRAW
7,Yuuki Kawanishi,Family,63,Shadow Force
8,Kazuya Nomura,Fantasy,65,A Minecraft Movie
9,Ben Affleck,History,70,Final Destination Bloodlines


In [28]:
# exporting to csv
def absolute_cinema():
    all_titles = []
    all_genres = []
    all_ratings = []

    for page in range(1, 6):
        myurl = f"https://www.themoviedb.org/movie?page={page}"
        titles = title_grabber(vaccum(myurl))
        genres = genre_grabber(vaccum(myurl))
        ratings = rating_grabber(vaccum(myurl))
        all_titles.extend(titles)
        all_genres.extend(genres)
        all_ratings.extend(ratings)
    book = { "Genre" : all_genres ,"User ratings" : all_ratings, "Title" : all_titles}
    df = pd.DataFrame(data=book)
    return df
    

    
absolute_cinema()

Excecution FIN
Excecution FIN
Excecution FIN
Excecution FIN
Excecution FIN
Excecution FIN
Excecution FIN
Excecution FIN
Excecution FIN
Excecution FIN
Excecution FIN
Excecution FIN
Excecution FIN
Excecution FIN
Excecution FIN


Unnamed: 0,Genre,User ratings,Title
0,Action,80,Predator: Killer of Killers
1,Adventure,72,The Accountant²
2,Animation,71,Lilo & Stitch
3,Comedy,63,Mikaela
4,Crime,75,Sinners
...,...,...,...
95,TV Movie,64,Fountain of Youth
96,Thriller,53,Rosario
97,War,54,Fear Street: Prom Queen
98,Western,78,How to Train Your Dragon


In [1]:
pwd

'c:\\Users\\ther0\\Documents\\repos\\scraping_corns'