In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import re

In [2]:
needed_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

res = requests.get("https://www.themoviedb.org/movie", headers=needed_headers)
res.status_code
# An exit status of 200 means OK 

200

In [3]:
# Creating an instance of the BeautifulSoup class
# also im going to be using lxml instead of the conventional html.parser lets see how that goes
soup = BeautifulSoup(res.text, "lxml")

# won't call soup for my own sanity

In [4]:
web_content = soup.get_text().strip()
# using .strip() right now gets rid of trailing whitespaces but not leading??? wtf? will figure out
print(web_content[:15])

Popular Movies 


### Extracting the title of the web page using various methods:

In [5]:
# Extracting title
soup.title

<title>Popular Movies — The Movie Database (TMDB)</title>

In [6]:
# Another method of fetching the title
soup.select("title")[0].getText()

'Popular Movies — The Movie Database (TMDB)'

### Using functions to help ease our workflow:

In [7]:
def vaccum(url: str) -> BeautifulSoup:
    try:
        # I am not a robot lmao
        needed_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        res_func = requests.get(url,headers=needed_headers)

        # Raise exc if status code is 5xx or 4xx
        res_func.raise_for_status()

        return BeautifulSoup(res_func.text, "lxml")
    
    except requests.exceptions.RequestException as e:
        # Catch all request-related errors 
        print(f"An Error occured!: {e}")
        return None
    
    finally:
        print("Excecution FIN")

In [8]:
# test case 1: a working url
vaccumed = vaccum("https://www.themoviedb.org/movie")
# This works, won't call it for obvious reasons 

Excecution FIN


In [9]:
# test case 2: malformed/incorrect URLs
vaccum("https://www.themovipepepe.org/what")

An Error occured!: HTTPSConnectionPool(host='www.themovipepepe.org', port=443): Max retries exceeded with url: /what (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001CB7D5E12E0>: Failed to resolve 'www.themovipepepe.org' ([Errno 11001] getaddrinfo failed)"))
Excecution FIN


In [61]:
# Displaying the name of the first movie
first_movie = soup.select(".content h2")[3].getText()
first_movie

IndexError: list index out of range

In [55]:
# Extracting User rating of the first movie
span_class = soup.select_one(".user_score_chart .icon")['class'][1]
span_class
perc1 = span_class.split("icon-r")[1]
print(f"User score of the movie: {first_movie} is {perc1}%")

User score of the movie: द कोन्जूरिंग: लास्ट राइट्स is 52%


In [12]:
# gonna use regex for extracting the part of the url following the string “https://www.themoviedb.org/” 
text = str(soup.select(".content h2"))
match = re.search(r'movie/\d\d\d\d\d\d',text)
match.group()
# This is going to help us later (i think)

'movie/941109'

### Grabbing the title of all the movies

In [13]:
def title_grabber(soup: BeautifulSoup) -> list:
    titles = []
    # edge case
    if soup is None:
        return titles
    movies = soup.select(".content h2")[3:]
    for movie in movies:
        a_tag = movie.find("a", title = True)
        if a_tag:
            titles.append(a_tag["title"])
    return titles
    
        
title_grabber(vaccumed)

['प्ले डर्टी',
 'द कोन्जूरिंग: लास्ट राइट्स',
 'Marco',
 'Primitive War',
 'The Toxic Avenger Unrated',
 'Demon Slayer: Kimetsu no Yaiba Infinity Castle',
 'द फ़ैंटॅस्टिक 4: फ़र्स्ट स्टेप्स',
 'The Lost Princess',
 'Holy Night: Demon Hunters',
 'वॉर ऑफ़ द वर्ल्ड्स',
 'Mantis',
 'Django Undisputed',
 'Assassin',
 'Prisoner of War',
 'Fight Another Day',
 'One Battle After Another',
 'द लॉस्ट बस',
 'Valiant One',
 'सुपरमैन',
 'The Man in My Basement']

### Grabbing the user ratings of all the movies in a page

In [14]:
def rating_grabber(soup: BeautifulSoup) -> list:
    user_score = []
    scores = soup.select(".content .user_score_chart")

    for score in scores:
        # edge case
        if score is None:
            user_score.append("No rating")
        else:
            percent = score["data-percent"]
            user_score.append(percent)
        
        

    return user_score
rating_grabber(vaccumed)

['65',
 '67',
 '64',
 '70',
 '63',
 '78',
 '72',
 '0',
 '66',
 '44',
 '63',
 '61',
 '42',
 '71',
 '51',
 '78',
 '70',
 '63',
 '75',
 '52']

In [15]:
soup.select(".content h2")

[<h2>Sort</h2>,
 <h2>Where To Watch <span>78</span></h2>,
 <h2>Filters</h2>,
 <h2><a href="/movie/941109-play-dirty" title="प्ले डर्टी">प्ले डर्टी</a></h2>,
 <h2><a href="/movie/1038392-the-conjuring-last-rites" title="द कोन्जूरिंग: लास्ट राइट्स">द कोन्जूरिंग: लास्ट राइट्स</a></h2>,
 <h2><a href="/movie/1186350" title="Marco">Marco</a></h2>,
 <h2><a href="/movie/1257009-primitive-war" title="Primitive War">Primitive War</a></h2>,
 <h2><a href="/movie/338969-the-toxic-avenger-unrated" title="The Toxic Avenger Unrated">The Toxic Avenger Unrated</a></h2>,
 <h2><a href="/movie/1311031" title="Demon Slayer: Kimetsu no Yaiba Infinity Castle">Demon Slayer: Kimetsu no Yaiba Infinity Castle</a></h2>,
 <h2><a href="/movie/617126-the-fantastic-4-first-steps" title="द फ़ैंटॅस्टिक 4: फ़र्स्ट स्टेप्स">द फ़ैंटॅस्टिक 4: फ़र्स्ट स्टेप्स</a></h2>,
 <h2><a href="/movie/1086910-the-lost-princess" title="The Lost Princess">The Lost Princess</a></h2>,
 <h2><a href="/movie/793387" title="Holy Night: Demon Hu

### Extracting the HTML content of all the individual pages of movies


In [16]:
def html_content(soup: BeautifulSoup) -> list:
    my_content = str(soup.select(".content h2"))
    my_list_of_contents = []

    stuff = re.findall(r"movie/\d\d\d\d\d\d", my_content)
    my_list_of_contents.extend(stuff)

    return my_list_of_contents
   
html_content(vaccumed)
    

['movie/941109',
 'movie/103839',
 'movie/118635',
 'movie/125700',
 'movie/338969',
 'movie/131103',
 'movie/617126',
 'movie/108691',
 'movie/793387',
 'movie/755898',
 'movie/126731',
 'movie/135788',
 'movie/155281',
 'movie/132880',
 'movie/105300',
 'movie/105486',
 'movie/123647',
 'movie/100964',
 'movie/106147',
 'movie/957121']

In [17]:
soup.select(".multi_select")[-2]

<ul class="multi_select text" id="with_genres" name="with_genres[]">
<li data-value="28"><a class="no_click" href="/discover/movie?with_genres=28">Action</a></li>
<li data-value="12"><a class="no_click" href="/discover/movie?with_genres=12">Adventure</a></li>
<li data-value="16"><a class="no_click" href="/discover/movie?with_genres=16">Animation</a></li>
<li data-value="35"><a class="no_click" href="/discover/movie?with_genres=35">Comedy</a></li>
<li data-value="80"><a class="no_click" href="/discover/movie?with_genres=80">Crime</a></li>
<li data-value="99"><a class="no_click" href="/discover/movie?with_genres=99">Documentary</a></li>
<li data-value="18"><a class="no_click" href="/discover/movie?with_genres=18">Drama</a></li>
<li data-value="10751"><a class="no_click" href="/discover/movie?with_genres=10751">Family</a></li>
<li data-value="14"><a class="no_click" href="/discover/movie?with_genres=14">Fantasy</a></li>
<li data-value="36"><a class="no_click" href="/discover/movie?with_ge

### Grabbing the genres of the movies

In [18]:
def genre_grabber(soup: BeautifulSoup) -> list:
    genres = soup.select(".multi_select")[-2]
    my_genres = []
    for gen in genres.find_all("li"): # found them in list item (li) using inspect
        a_tag = gen.find("a") # because its in a hyerplink?
        if a_tag:
            my_genres.append(a_tag.text.strip())
    my_genres.append(np.nan)
    return my_genres
genre_grabber(vaccumed)
    

['Action',
 'Adventure',
 'Animation',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Mystery',
 'Romance',
 'Science Fiction',
 'TV Movie',
 'Thriller',
 'War',
 'Western',
 nan]

In [19]:
# ok so what I plan to do is that
# I already have a list of all the movie content IDs
# gonna iterate through them to create a new list object which will be my inputs for my cast func
html_content(vaccumed)
load = []
for info in html_content(vaccumed):
    load.append('https://www.themoviedb.org/' + info)
load

['https://www.themoviedb.org/movie/941109',
 'https://www.themoviedb.org/movie/103839',
 'https://www.themoviedb.org/movie/118635',
 'https://www.themoviedb.org/movie/125700',
 'https://www.themoviedb.org/movie/338969',
 'https://www.themoviedb.org/movie/131103',
 'https://www.themoviedb.org/movie/617126',
 'https://www.themoviedb.org/movie/108691',
 'https://www.themoviedb.org/movie/793387',
 'https://www.themoviedb.org/movie/755898',
 'https://www.themoviedb.org/movie/126731',
 'https://www.themoviedb.org/movie/135788',
 'https://www.themoviedb.org/movie/155281',
 'https://www.themoviedb.org/movie/132880',
 'https://www.themoviedb.org/movie/105300',
 'https://www.themoviedb.org/movie/105486',
 'https://www.themoviedb.org/movie/123647',
 'https://www.themoviedb.org/movie/100964',
 'https://www.themoviedb.org/movie/106147',
 'https://www.themoviedb.org/movie/957121']

### Grabbing the casts of alll the movies

In [20]:
def cast_grabber(soup: BeautifulSoup) -> list: 

    cast_list = []

    def castisize():
        cards = soup.select("li.card")

        # this loop creates a beautifulsoup instance of every link request
        for card in cards:
              name = card.find("p").get_text(strip=True)
              print(name)
              cast_list.append(name)
        
    castisize()
    return cast_list  

# my epic use of hashmap 
all_casts = {}
for link in load:
     res = requests.get(link,headers=needed_headers)
     soup = BeautifulSoup(res.text,"lxml")
     all_casts[link] = cast_grabber(soup)

मार्क वाह्ल्बर्ग
LaKeith Stanfield
Rosa Salazar
कीगन-माइकल की
Claire Lovering
Chai Hansen
Chukwudi Iwuji
Tony Shalhoub
Nat Wolff
Jens Jørn Spottag
Bodil Jørgensen
Marie Bach Hansen
Thomas Ernst
Laura Winther Møller
Mia Ejlerskov
Jesper Riefensthal
जार्न हेनरिक्सन
Mads Wille
Aleksandr Sokurov
Venus Seye
Mame Ndoumbé Diop
Ndiagne Dia
Mariama Balde
Awa Sène Sarr
Tabata Ndiaye
पीटर डिंक्लेज
Jacob Tremblay
Taylour Paige
Julia Davis
Jonny Coyne
Elijah Wood
Kevin Bacon
David Yow
Sarah Niles
पेद्रो पास्कल
वैनेसा कर्बी
Ebon Moss-Bachrach
Joseph Quinn
Ralph Ineson
जूलिया गार्नर
Paul Walter Hauser
Natasha Lyonne
Sarah Niles
Don Lee
Seohyun
David Lee
Gyeong Su-jin
Jung Ji-so
Choi Gwang-il
Ryu Seung-su
Jeon Ik-ryoung
Song Yo-sep
आइस क्यूब
Eva Longoria
क्लार्क ग्रेग
Iman Benson
Henry Hunter Hall
Devon Bostick
Andrea Savage
Nicole Pulliam
Michael O'Neill
Judy Canova
Stephen Dunne
George Cleveland
June Vincent
Irene Ryan
Roy Barcroft
Allen Jenkins
George Chandler
Elizabeth Slifer
Corey Hawkins
विलेम ड

In [21]:
all_casts.values()

dict_values([['मार्क वाह्ल्बर्ग', 'LaKeith Stanfield', 'Rosa Salazar', 'कीगन-माइकल की', 'Claire Lovering', 'Chai Hansen', 'Chukwudi Iwuji', 'Tony Shalhoub', 'Nat Wolff'], ['Jens Jørn Spottag', 'Bodil Jørgensen', 'Marie Bach Hansen', 'Thomas Ernst', 'Laura Winther Møller', 'Mia Ejlerskov', 'Jesper Riefensthal', 'जार्न हेनरिक्सन', 'Mads Wille'], ['Aleksandr Sokurov'], ['Venus Seye', 'Mame Ndoumbé Diop', 'Ndiagne Dia', 'Mariama Balde', 'Awa Sène Sarr', 'Tabata Ndiaye'], ['पीटर डिंक्लेज', 'Jacob Tremblay', 'Taylour Paige', 'Julia Davis', 'Jonny Coyne', 'Elijah Wood', 'Kevin Bacon', 'David Yow', 'Sarah Niles'], [], ['पेद्रो पास्कल', 'वैनेसा कर्बी', 'Ebon Moss-Bachrach', 'Joseph Quinn', 'Ralph Ineson', 'जूलिया गार्नर', 'Paul Walter Hauser', 'Natasha Lyonne', 'Sarah Niles'], [], ['Don Lee', 'Seohyun', 'David Lee', 'Gyeong Su-jin', 'Jung Ji-so', 'Choi Gwang-il', 'Ryu Seung-su', 'Jeon Ik-ryoung', 'Song Yo-sep'], ['आइस क्यूब', 'Eva Longoria', 'क्लार्क ग्रेग', 'Iman Benson', 'Henry Hunter Hall', 

In [22]:
# iteratin through the items of the dict to get clean result
for url, cast in all_casts.items():
     print(f"Cast for {url} -> {cast}")

Cast for https://www.themoviedb.org/movie/941109 -> ['मार्क वाह्ल्बर्ग', 'LaKeith Stanfield', 'Rosa Salazar', 'कीगन-माइकल की', 'Claire Lovering', 'Chai Hansen', 'Chukwudi Iwuji', 'Tony Shalhoub', 'Nat Wolff']
Cast for https://www.themoviedb.org/movie/103839 -> ['Jens Jørn Spottag', 'Bodil Jørgensen', 'Marie Bach Hansen', 'Thomas Ernst', 'Laura Winther Møller', 'Mia Ejlerskov', 'Jesper Riefensthal', 'जार्न हेनरिक्सन', 'Mads Wille']
Cast for https://www.themoviedb.org/movie/118635 -> ['Aleksandr Sokurov']
Cast for https://www.themoviedb.org/movie/125700 -> ['Venus Seye', 'Mame Ndoumbé Diop', 'Ndiagne Dia', 'Mariama Balde', 'Awa Sène Sarr', 'Tabata Ndiaye']
Cast for https://www.themoviedb.org/movie/338969 -> ['पीटर डिंक्लेज', 'Jacob Tremblay', 'Taylour Paige', 'Julia Davis', 'Jonny Coyne', 'Elijah Wood', 'Kevin Bacon', 'David Yow', 'Sarah Niles']
Cast for https://www.themoviedb.org/movie/131103 -> []
Cast for https://www.themoviedb.org/movie/617126 -> ['पेद्रो पास्कल', 'वैनेसा कर्बी', 'Eb

In [None]:
values_in_a_list = [item for value in all_casts.values() for item in value] # nested comprehension
values_in_a_list
# this is kinda obsolete but nice to see all listed
    

['मार्क वाह्ल्बर्ग',
 'LaKeith Stanfield',
 'Rosa Salazar',
 'कीगन-माइकल की',
 'Claire Lovering',
 'Chai Hansen',
 'Chukwudi Iwuji',
 'Tony Shalhoub',
 'Nat Wolff',
 'Jens Jørn Spottag',
 'Bodil Jørgensen',
 'Marie Bach Hansen',
 'Thomas Ernst',
 'Laura Winther Møller',
 'Mia Ejlerskov',
 'Jesper Riefensthal',
 'जार्न हेनरिक्सन',
 'Mads Wille',
 'Aleksandr Sokurov',
 'Venus Seye',
 'Mame Ndoumbé Diop',
 'Ndiagne Dia',
 'Mariama Balde',
 'Awa Sène Sarr',
 'Tabata Ndiaye',
 'पीटर डिंक्लेज',
 'Jacob Tremblay',
 'Taylour Paige',
 'Julia Davis',
 'Jonny Coyne',
 'Elijah Wood',
 'Kevin Bacon',
 'David Yow',
 'Sarah Niles',
 'पेद्रो पास्कल',
 'वैनेसा कर्बी',
 'Ebon Moss-Bachrach',
 'Joseph Quinn',
 'Ralph Ineson',
 'जूलिया गार्नर',
 'Paul Walter Hauser',
 'Natasha Lyonne',
 'Sarah Niles',
 'Don Lee',
 'Seohyun',
 'David Lee',
 'Gyeong Su-jin',
 'Jung Ji-so',
 'Choi Gwang-il',
 'Ryu Seung-su',
 'Jeon Ik-ryoung',
 'Song Yo-sep',
 'आइस क्यूब',
 'Eva Longoria',
 'क्लार्क ग्रेग',
 'Iman Benson',
 

In [None]:
# but what we need is cast members grouped together with movies
values = list(all_casts.values())
values

[['मार्क वाह्ल्बर्ग',
  'LaKeith Stanfield',
  'Rosa Salazar',
  'कीगन-माइकल की',
  'Claire Lovering',
  'Chai Hansen',
  'Chukwudi Iwuji',
  'Tony Shalhoub',
  'Nat Wolff'],
 ['Jens Jørn Spottag',
  'Bodil Jørgensen',
  'Marie Bach Hansen',
  'Thomas Ernst',
  'Laura Winther Møller',
  'Mia Ejlerskov',
  'Jesper Riefensthal',
  'जार्न हेनरिक्सन',
  'Mads Wille'],
 ['Aleksandr Sokurov'],
 ['Venus Seye',
  'Mame Ndoumbé Diop',
  'Ndiagne Dia',
  'Mariama Balde',
  'Awa Sène Sarr',
  'Tabata Ndiaye'],
 ['पीटर डिंक्लेज',
  'Jacob Tremblay',
  'Taylour Paige',
  'Julia Davis',
  'Jonny Coyne',
  'Elijah Wood',
  'Kevin Bacon',
  'David Yow',
  'Sarah Niles'],
 [],
 ['पेद्रो पास्कल',
  'वैनेसा कर्बी',
  'Ebon Moss-Bachrach',
  'Joseph Quinn',
  'Ralph Ineson',
  'जूलिया गार्नर',
  'Paul Walter Hauser',
  'Natasha Lyonne',
  'Sarah Niles'],
 [],
 ['Don Lee',
  'Seohyun',
  'David Lee',
  'Gyeong Su-jin',
  'Jung Ji-so',
  'Choi Gwang-il',
  'Ryu Seung-su',
  'Jeon Ik-ryoung',
  'Song Yo-sep'

In [43]:
len(title_grabber(vaccumed))

20

In [None]:
# next need a user defined function that returns a pandas data frame with the above data
def bamboo():
    mydata = {
        "Cast": values[0:20],  
        "Genre": genre_grabber(vaccumed)[0:20], 
        "User ratings": rating_grabber(vaccumed)[0:20], 
        "Title": title_grabber(vaccumed)[0:20]
    }
    df = pd.DataFrame(data=mydata)
    
    df = df.explode('Cast')

    # take care of NaN and str bullshitery
    df = df[df['Cast'].notna()]
    df = df[df['Cast'].apply(lambda x: isinstance(x, str))]

    df = df.groupby('Title').agg({
        'Cast': lambda x: ', '.join(x),
        'Genre': 'first',
        'User ratings': 'first'
    }).reset_index()
    
    return df.head(10)

bamboo()

Unnamed: 0,Title,Cast,Genre,User ratings
0,Assassin,"Judy Canova, Stephen Dunne, George Cleveland, ...",Mystery,42
1,Holy Night: Demon Hunters,"Don Lee, Seohyun, David Lee, Gyeong Su-jin, Ju...",Fantasy,66
2,Marco,Aleksandr Sokurov,Animation,64
3,Primitive War,"Venus Seye, Mame Ndoumbé Diop, Ndiagne Dia, Ma...",Comedy,70
4,The Man in My Basement,"Corey Hawkins, विलेम डाफ़ो, Anna Diop, Jonatha...",,52
5,The Toxic Avenger Unrated,"पीटर डिंक्लेज, Jacob Tremblay, Taylour Paige, ...",Crime,63
6,द कोन्जूरिंग: लास्ट राइट्स,"Jens Jørn Spottag, Bodil Jørgensen, Marie Bach...",Adventure,67
7,द फ़ैंटॅस्टिक 4: फ़र्स्ट स्टेप्स,"पेद्रो पास्कल, वैनेसा कर्बी, Ebon Moss-Bachrac...",Drama,72
8,प्ले डर्टी,"मार्क वाह्ल्बर्ग, LaKeith Stanfield, Rosa Sala...",Action,65
9,वॉर ऑफ़ द वर्ल्ड्स,"आइस क्यूब, Eva Longoria, क्लार्क ग्रेग, Iman B...",History,44


In [None]:
# exporting to csv
def absolute_cinema():
    all_titles = []
    all_genres = []
    all_ratings = []

    for page in range(1, 6):
        myurl = f"https://www.themoviedb.org/movie?page={page}"
        titles = title_grabber(vaccum(myurl))
        genres = genre_grabber(vaccum(myurl))
        ratings = rating_grabber(vaccum(myurl))
        all_titles.extend(titles)
        all_genres.extend(genres)
        all_ratings.extend(ratings)
    book = { "Genre" : all_genres ,"User ratings" : all_ratings, "Title" : all_titles}
    df = pd.DataFrame(data=book)
    df.to_csv("newfile.csv",index=True)
    return df
    
absolute_cinema()

Excecution FIN
Excecution FIN
Excecution FIN
Excecution FIN
Excecution FIN
Excecution FIN
Excecution FIN
Excecution FIN
Excecution FIN
Excecution FIN
Excecution FIN
Excecution FIN
Excecution FIN
Excecution FIN
Excecution FIN


Unnamed: 0,Genre,User ratings,Title
0,Action,65,प्ले डर्टी
1,Adventure,67,द कोन्जूरिंग: लास्ट राइट्स
2,Animation,64,Marco
3,Comedy,78,Demon Slayer: Kimetsu no Yaiba Infinity Castle
4,Crime,63,The Toxic Avenger Unrated
...,...,...,...
95,TV Movie,60,Smurfs
96,Thriller,59,Get Fast
97,War,66,The Balloonist
98,Western,64,Eenie Meanie
