Following this tutorial: https://www.geeksforgeeks.org/scrape-imdb-movie-rating-and-details-using-python/


In [243]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

In [244]:
# Downloading imdb top 250 movie's data

# 50 per page URL
#url = 'https://www.imdb.com/search/title/?title_type=tv_series&release_date=1989-01-01,2022-12-31&genres=animation&view=advanced'

#250 per page URL
#page 1
#url = 'https://www.imdb.com/search/title/?title_type=tv_series&release_date=1989-01-01,2022-12-31&genres=animation&count=250'
#page 2
#url = 'https://www.imdb.com/search/title/?title_type=tv_series&release_date=1989-01-01,2022-12-31&genres=animation&count=250&start=251&ref_=adv_nxt'
#page 3
#url = 'https://www.imdb.com/search/title/?title_type=tv_series&release_date=1989-01-01,2022-12-31&genres=animation&count=250&start=501&ref_=adv_nxt'
#page 4
#url='https://www.imdb.com/search/title/?title_type=tv_series&release_date=1989-01-01,2022-12-31&genres=animation&count=250&start=751&ref_=adv_nxt'
#page 5
#url='https://www.imdb.com/search/title/?title_type=tv_series&release_date=1989-01-01,2022-12-31&genres=animation&count=250&start=751&ref_=adv_nxt'
#... and so on
#we should probably figure out how to automate this, 
#because otherwise we have to manyally run this file 44 times

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

In [245]:
titles = [a.text for a in soup.select('.lister-item-header')]
len(titles)

250

In [246]:
uniqueId = [a.attrs.get('data-tconst') for a in soup.select('div.ribbonize')]

len(uniqueId)

250

In [247]:
imdbRatings = []

for a in soup.select('div.lister-item'):
    if a.find('strong'):
        imdbRatings.append(a.find('strong').text)
    else:
        imdbRatings.append('N/A')
        
len(imdbRatings)

250

In [248]:
runYears = [a.text for a in soup.select("span.lister-item-year.text-muted.unbold")]
len(runYears)

250

In [249]:
tvRatings = []

for a in soup.select('p.text-muted'):
    if a.find('span'):
        if a.find('span', class_='certificate'):
            tvRatings.append(a.select('span.certificate')[0].text)
        else:
            tvRatings.append('N/A')
len(tvRatings)


250

In [250]:
runtimes= []          
    
for a in soup.select('p.text-muted'):
    if a.find('span'):
        if a.find('span', class_='runtime'):
            runtimes.append(a.select('span.runtime')[0].text)
        else:
            runtimes.append('N/A')    
len(runtimes)

#need to decide if we will manually enter episode lengths

250

In [251]:
genres = [a.text for a in soup.select("span.genre")]
len(genres)

250

In [252]:
# create a empty list for storing
# movie information
list = []
 
# Iterating over movies to extract
# each movie's details
for index in range(0, len(titles)):
  
    title_string = titles[index]
    #"\n3.\nDon't Hug Me I'm Scared\n(2022– )\n",
    show = title_string.split('\n')
    tv_show_title = show[2]
    
    years = show[3].split('–')
    startYear = years[0].strip('(')
    endYear = years[-1].strip(')') #gets the last item in the list (for some reason just asking for years[1] was wrong?)
    
    if "min" in runtimes[index]:
        runtimes[index] = runtimes[index].strip(' min')
    
    showGenres = genres[index].strip()
    genreList = showGenres.split(',') #not sure what data format is best for the genres, we can have a string with all genres, or a list of all genres
    genreList.pop(0) #gets rid of 'Animation' genre that is always first in the list
    
    data = { "TV Show Title": tv_show_title,
            "Unique ID": uniqueId[index],
            "IMDB Rating": imdbRatings[index],
            "Start Year": startYear,
            "End Year": endYear,
            "TV Rating": tvRatings[index],
            "Episode Run Time (min)": runtimes[index],
            "Top Genres": genreList,               
            }
    
    list.append(data)
    
list

[{'TV Show Title': 'Goof Troop',
  'Unique ID': 'tt0103428',
  'IMDB Rating': '6.8',
  'Start Year': '1992',
  'End Year': '1993',
  'TV Rating': 'TV-Y',
  'Episode Run Time (min)': '30',
  'Top Genres': [' Adventure', ' Comedy']},
 {'TV Show Title': 'My Little Pony: Tell Your Tale',
  'Unique ID': 'tt18270744',
  'IMDB Rating': '5.8',
  'Start Year': '2022',
  'End Year': ' ',
  'TV Rating': 'TV-Y',
  'Episode Run Time (min)': '5',
  'Top Genres': [' Short', ' Adventure']},
 {'TV Show Title': 'SuperMansion',
  'Unique ID': 'tt4843640',
  'IMDB Rating': '7.6',
  'Start Year': '2015',
  'End Year': '2019',
  'TV Rating': 'TV-14',
  'Episode Run Time (min)': '30',
  'Top Genres': [' Comedy']},
 {'TV Show Title': 'All Grown Up!',
  'Unique ID': 'tt0387714',
  'IMDB Rating': '5.4',
  'Start Year': '2003',
  'End Year': '2008',
  'TV Rating': 'TV-Y',
  'Episode Run Time (min)': '30',
  'Top Genres': [' Adventure', ' Comedy']},
 {'TV Show Title': 'Booba',
  'Unique ID': 'tt6581428',
  'IMDB 

In [253]:
for show in list:
    print(show['TV Rating'], '-', show['TV Show Title'], '('+show['Start Year'] +
          ') -', 'Genres:', show['Top Genres'], show['IMDB Rating'])

TV-Y - Goof Troop (1992) - Genres: [' Adventure', ' Comedy'] 6.8
TV-Y - My Little Pony: Tell Your Tale (2022) - Genres: [' Short', ' Adventure'] 5.8
TV-14 - SuperMansion (2015) - Genres: [' Comedy'] 7.6
TV-Y - All Grown Up! (2003) - Genres: [' Adventure', ' Comedy'] 5.4
TV-Y7 - Booba (2014) - Genres: [' Comedy', ' Family'] 7.1
TV-Y7 - Transformers: Robots in Disguise (2014) - Genres: [' Action', ' Adventure'] 6.0
TV-14 - Gyakkyô burai Kaiji (2007) - Genres: [' Action', ' Thriller'] 8.2
TV-14 - Saturday Morning All Star Hits! (2021) - Genres: [' Comedy'] 7.5
N/A - Genshin Impact (2019) - Genres: [] 6.3
TV-Y7 - Xiaolin Showdown (2003) - Genres: [' Action', ' Adventure'] 7.5
TV-14 - Snow White with the Red Hair (2015) - Genres: [' Drama', ' Fantasy'] 7.7
TV-Y7 - He-Man and the Masters of the Universe (2002) - Genres: [' Action', ' Adventure'] 7.5
TV-PG - TheOdd1sOut (2014) - Genres: [' Biography', ' Comedy'] 8.1
TV-Y - Back to the Future (1991) - Genres: [' Adventure', ' Family'] 6.3
TV-M

In [254]:

#saving the list as dataframe
#then converting into .csv file
df = pd.DataFrame(list)

#write to new (or write over old) csv file
#df.to_csv('data/imdb_animated_shows.csv', index=False)

#append preexising csv file ?
df.to_csv('data/imdb_animated_shows.csv', mode='a', index=False, header=False)
