Following this tutorial: https://www.geeksforgeeks.org/scrape-imdb-movie-rating-and-details-using-python/


In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

In [129]:
# Downloading imdb top 250 movie's data
#url = 'http://www.imdb.com/chart/top'
url = 'https://www.imdb.com/search/title/?title_type=tv_series&release_date=1989-01-01,2022-12-31&genres=animation&view=advanced'
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

In [142]:
titles = [a.text for a in soup.select('.lister-item-header')]
titles

['\n1.\nCyberpunk: Edgerunners\n(2022– )\n',
 '\n2.\nRick and Morty\n(2013– )\n',
 "\n3.\nDon't Hug Me I'm Scared\n(2022– )\n",
 '\n4.\nFamily Guy\n(1999– )\n',
 '\n5.\nThe Simpsons\n(1989– )\n',
 '\n6.\nAttack on Titan\n(2013–2023)\n',
 '\n7.\nOne Piece\n(1999– )\n',
 '\n8.\nStar Trek: Lower Decks\n(2020– )\n',
 '\n9.\nArcane\n(2021– )\n',
 '\n10.\nArcher\n(2009– )\n',
 "\n11.\nBob's Burgers\n(2011– )\n",
 '\n12.\nBoJack Horseman\n(2014–2020)\n',
 '\n13.\nHarley Quinn\n(2019– )\n',
 '\n14.\nLove, Death & Robots\n(2019– )\n',
 '\n15.\nSouth Park\n(1997– )\n',
 '\n16.\nDemon Slayer: Kimetsu no Yaiba\n(2019– )\n',
 '\n17.\nAvatar: The Last Airbender\n(2005–2008)\n',
 '\n18.\nJujutsu Kaisen\n(2020– )\n',
 '\n19.\nLittle Demon\n(2022– )\n',
 '\n20.\nStar Wars: The Clone Wars\n(2008–2020)\n',
 '\n21.\nAmerican Dad!\n(2005– )\n',
 '\n22.\nInvincible\n(2021– )\n',
 '\n23.\nFuturama\n(1999–2023)\n',
 '\n24.\nSpongeBob SquarePants\n(1999– )\n',
 '\n25.\nPantheon\n(2022– )\n',
 '\n26.\nMy Hero A

In [327]:
uniqueId = [a.attrs.get('data-value') for a in soup.select('div.ratings-imdb-ratings span[data-tconst]')]

uniqueId

[]

In [214]:
imdbRatings = [a.text for a in soup("strong")]
imdbRatings.pop(0)
imdbRatings.pop(0) #pop the first two values because they are column titles, not actual ratings
imdbRatings

50

In [213]:
runYears = [a.text for a in soup.select("span.lister-item-year.text-muted.unbold")]
runYears

50

In [212]:
tvRatings = [a.text for a in soup.select("span.certificate")]
tvRatings

50

In [303]:
runtimes= []
for a in soup.select("p.text-muted"):
    if len(a.contents) > 1:
        if "runtime" in str(a.contents[5]):
            runtimes.append(a.contents[5].text)
        else:
            runtimes.append(a.contents[4])            
runtimes

#no run time listed for Star Wars Bad Batch and Cars on the Road
#need to figure out how if to put null value, or manually enter episode lengths

['24 min',
 '23 min',
 '23 min',
 '22 min',
 '22 min',
 '24 min',
 '24 min',
 '25 min',
 '41 min',
 '22 min',
 '22 min',
 '25 min',
 '23 min',
 '15 min',
 '22 min',
 '24 min',
 '23 min',
 '24 min',
 '26 min',
 '23 min',
 '22 min',
 '50 min',
 '22 min',
 '23 min',
 '41 min',
 '24 min',
 '22 min',
 '24 min',
 '30 min',
 '7 min',
 '24 min',
 '24 min',
 '7 min',
 '11 min',
 '24 min',
 '24 min',
 '23 min',
 '24 min',
 '22 min',
 '30 min',
 '24 min',
 '24 min',
 '22 min',
 '24 min',
 '\n',
 '\n',
 '30 min',
 '325 min',
 '14 min',
 '23 min']

In [147]:
genres = [a.text for a in soup.select("span.genre")]
genres

['\nAnimation, Action, Adventure            ',
 '\nAnimation, Adventure, Comedy            ',
 '\nAnimation, Comedy, Horror            ',
 '\nAnimation, Comedy            ',
 '\nAnimation, Comedy            ',
 '\nAnimation, Action, Adventure            ',
 '\nAnimation, Action, Adventure            ',
 '\nAnimation, Action, Adventure            ',
 '\nAnimation, Action, Adventure            ',
 '\nAnimation, Action, Comedy            ',
 '\nAnimation, Comedy            ',
 '\nAnimation, Comedy, Drama            ',
 '\nAnimation, Action, Adventure            ',
 '\nAnimation, Short, Action            ',
 '\nAnimation, Comedy            ',
 '\nAnimation, Action, Adventure            ',
 '\nAnimation, Action, Adventure            ',
 '\nAnimation, Action, Adventure            ',
 '\nAnimation, Comedy, Horror            ',
 '\nAnimation, Action, Adventure            ',
 '\nAnimation, Comedy            ',
 '\nAnimation, Action, Adventure            ',
 '\nAnimation, Adventure, Comedy      

In [308]:
# create a empty list for storing
# movie information
list = []
 
# Iterating over movies to extract
# each movie's details
for index in range(0, len(titles)):
  
    title_string = titles[index]
    #"\n3.\nDon't Hug Me I'm Scared\n(2022– )\n",
    show = title_string.split('\n')
    tv_show_title = show[2]
    
    years = show[3].split('–')
    startYear = years[0].strip('(')
    endYear = years[-1].strip(')') #gets the last item in the list (for some reason just asking for years[1] was wrong?)
    
    if "min" in runtimes[index]:
        runtimes[index] = runtimes[index].strip(' min')
    
    showGenres = genres[index].strip()
    genreList = showGenres.split(',') #not sure what data format is best for the genres, we can have a string with all three, or a list of all three
    
    data = { "TV Show Title": tv_show_title,
            "IMDB Rating": imdbRatings[index],
            "Start Year": startYear,
            "End Year": endYear,
            "TV Rating": tvRatings[index],
            "Episode Run Time (min)": runtimes[index],
            "Top Genres": genreList,               
            }
    
    list.append(data)
    
list

[{'TV Show Title': 'Cyberpunk: Edgerunners',
  'IMDB Rating': '8.5',
  'Start Year': '2022',
  'End Year': ' ',
  'TV Rating': 'TV-MA',
  'Episode Run Time (min)': '24',
  'Top Genres': ['Animation', ' Action', ' Adventure']},
 {'TV Show Title': 'Rick and Morty',
  'IMDB Rating': '9.2',
  'Start Year': '2013',
  'End Year': ' ',
  'TV Rating': 'TV-MA',
  'Episode Run Time (min)': '23',
  'Top Genres': ['Animation', ' Adventure', ' Comedy']},
 {'TV Show Title': "Don't Hug Me I'm Scared",
  'IMDB Rating': '8.9',
  'Start Year': '2022',
  'End Year': ' ',
  'TV Rating': 'TV-14',
  'Episode Run Time (min)': '23',
  'Top Genres': ['Animation', ' Comedy', ' Horror']},
 {'TV Show Title': 'Family Guy',
  'IMDB Rating': '8.2',
  'Start Year': '1999',
  'End Year': ' ',
  'TV Rating': 'TV-MA',
  'Episode Run Time (min)': '22',
  'Top Genres': ['Animation', ' Comedy']},
 {'TV Show Title': 'The Simpsons',
  'IMDB Rating': '8.7',
  'Start Year': '1989',
  'End Year': ' ',
  'TV Rating': 'TV-14',
  

In [314]:
for show in list:
    print(show['TV Rating'], '-', show['TV Show Title'], '('+show['Start Year'] +
          ') -', 'Genres:', show['Top Genres'], show['IMDB Rating'])

TV-MA - Cyberpunk: Edgerunners (2022) - Genres: ['Animation', ' Action', ' Adventure'] 8.5
TV-MA - Rick and Morty (2013) - Genres: ['Animation', ' Adventure', ' Comedy'] 9.2
TV-14 - Don't Hug Me I'm Scared (2022) - Genres: ['Animation', ' Comedy', ' Horror'] 8.9
TV-MA - Family Guy (1999) - Genres: ['Animation', ' Comedy'] 8.2
TV-14 - The Simpsons (1989) - Genres: ['Animation', ' Comedy'] 8.7
TV-MA - Attack on Titan (2013) - Genres: ['Animation', ' Action', ' Adventure'] 9.0
TV-14 - One Piece (1999) - Genres: ['Animation', ' Action', ' Adventure'] 8.9
TV-14 - Star Trek: Lower Decks (2020) - Genres: ['Animation', ' Action', ' Adventure'] 7.4
TV-14 - Arcane (2021) - Genres: ['Animation', ' Action', ' Adventure'] 9.0
TV-MA - Archer (2009) - Genres: ['Animation', ' Action', ' Comedy'] 8.6
TV-14 - Bob's Burgers (2011) - Genres: ['Animation', ' Comedy'] 8.2
TV-MA - BoJack Horseman (2014) - Genres: ['Animation', ' Comedy', ' Drama'] 8.8
TV-MA - Harley Quinn (2019) - Genres: ['Animation', ' Act

In [315]:
#saving the list as dataframe
#then converting into .csv file
df = pd.DataFrame(list)
df.to_csv('imdb_50_animated_shows.csv',index=False)

In [318]:
df.head()

Unnamed: 0,TV Show Title,IMDB Rating,Start Year,End Year,TV Rating,Episode Run Time (min),Top Genres
0,Cyberpunk: Edgerunners,8.5,2022,,TV-MA,24,"[Animation, Action, Adventure]"
1,Rick and Morty,9.2,2013,,TV-MA,23,"[Animation, Adventure, Comedy]"
2,Don't Hug Me I'm Scared,8.9,2022,,TV-14,23,"[Animation, Comedy, Horror]"
3,Family Guy,8.2,1999,,TV-MA,22,"[Animation, Comedy]"
4,The Simpsons,8.7,1989,,TV-14,22,"[Animation, Comedy]"
