Following this tutorial: https://www.geeksforgeeks.org/scrape-imdb-movie-rating-and-details-using-python/


In [339]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

In [340]:
# Downloading imdb top 250 movie's data
#url = 'http://www.imdb.com/chart/top'
# 50 per page URL
#url = 'https://www.imdb.com/search/title/?title_type=tv_series&release_date=1989-01-01,2022-12-31&genres=animation&view=advanced'
#250 per page URL
url = 'https://www.imdb.com/search/title/?title_type=tv_series&release_date=1989-01-01,2022-12-31&genres=animation&count=250'
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

In [354]:
titles = [a.text for a in soup.select('.lister-item-header')]
len(titles)

250

In [355]:
uniqueId = [a.attrs.get('data-tconst') for a in soup.select('span.userRatingValue')]

len(uniqueId)

241

In [356]:
imdbRatings = [a.text for a in soup("strong")]
imdbRatings.pop(0)
imdbRatings.pop(0) #pop the first two values because they are column titles, not actual ratings
len(imdbRatings)

241

In [357]:
runYears = [a.text for a in soup.select("span.lister-item-year.text-muted.unbold")]
len(runYears)

250

In [358]:
tvRatings = [a.text for a in soup.select("span.certificate")]
len(tvRatings)

243

In [360]:
runtimes= []
for a in soup.select("p.text-muted"):
    if len(a.contents) >= 5:
        if "runtime" in str(a.contents[5]):
            runtimes.append(a.contents[5].text)
        else:
            runtimes.append(a.contents[4])            
len(runtimes)

#no run time listed for some shows
#need to figure out how if to put null value, or manually enter episode lengths

249

In [361]:
genres = [a.text for a in soup.select("span.genre")]
len(genres)

250

In [353]:
# create a empty list for storing
# movie information
list = []
 
# Iterating over movies to extract
# each movie's details
for index in range(0, len(titles)):
  
    title_string = titles[index]
    #"\n3.\nDon't Hug Me I'm Scared\n(2022– )\n",
    show = title_string.split('\n')
    tv_show_title = show[2]
    
    years = show[3].split('–')
    startYear = years[0].strip('(')
    endYear = years[-1].strip(')') #gets the last item in the list (for some reason just asking for years[1] was wrong?)
    
    if "min" in runtimes[index]:
        runtimes[index] = runtimes[index].strip(' min')
    
    showGenres = genres[index].strip()
    genreList = showGenres.split(',') #not sure what data format is best for the genres, we can have a string with all genres, or a list of all genres
    genreList.pop(0) #gets rid of 'Animation' genre that is always first in the list
    
    data = { "TV Show Title": tv_show_title,
            "Unique ID": uniqueId[index],
            "IMDB Rating": imdbRatings[index],
            "Start Year": startYear,
            "End Year": endYear,
            "TV Rating": tvRatings[index],
            "Episode Run Time (min)": runtimes[index],
            "Top Genres": genreList,               
            }
    
    list.append(data)
    
list

[{'TV Show Title': 'Cyberpunk: Edgerunners',
  'Top Genres': [' Action', ' Adventure']},
 {'TV Show Title': 'Rick and Morty', 'Top Genres': [' Adventure', ' Comedy']},
 {'TV Show Title': "Don't Hug Me I'm Scared",
  'Top Genres': [' Comedy', ' Horror']},
 {'TV Show Title': 'Family Guy', 'Top Genres': [' Comedy']},
 {'TV Show Title': 'The Simpsons', 'Top Genres': [' Comedy']},
 {'TV Show Title': 'Attack on Titan', 'Top Genres': [' Action', ' Adventure']},
 {'TV Show Title': 'One Piece', 'Top Genres': [' Action', ' Adventure']},
 {'TV Show Title': 'Star Trek: Lower Decks',
  'Top Genres': [' Action', ' Adventure']},
 {'TV Show Title': 'Arcane', 'Top Genres': [' Action', ' Adventure']},
 {'TV Show Title': 'Archer', 'Top Genres': [' Action', ' Comedy']},
 {'TV Show Title': "Bob's Burgers", 'Top Genres': [' Comedy']},
 {'TV Show Title': 'BoJack Horseman', 'Top Genres': [' Comedy', ' Drama']},
 {'TV Show Title': 'Harley Quinn', 'Top Genres': [' Action', ' Adventure']},
 {'TV Show Title': 'Lo

In [None]:
for show in list:
    print(show['TV Rating'], '-', show['TV Show Title'], '('+show['Start Year'] +
          ') -', 'Genres:', show['Top Genres'], show['IMDB Rating'])

In [None]:
#saving the list as dataframe
#then converting into .csv file
df = pd.DataFrame(list)
df.to_csv('imdb_250_animated_shows.csv',index=False)

In [None]:
df.head()