In [35]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

In [36]:
#scrapes data from the IMDB search result page
#formats data into a list to be returned
def getPageData(url):
    """Scrape one IMDB search-result page into a list of show records.

    Every field is read from inside its own ``div.lister-item`` element, so a
    show that lacks a rating, certificate, runtime, year or genre only affects
    its own record and cannot shift the values of the shows that follow it.

    Parameters
    ----------
    url : str
        Address of an IMDB search-result page.

    Returns
    -------
    list of dict
        One dict per show, in page order, with the keys "TV Show Title",
        "Unique ID", "IMDB Rating", "Start Year", "End Year", "TV Rating",
        "Episode Run Time (min)" and "Top Genres".

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with an error status. Failing here is deliberate:
        an error page would otherwise parse to zero rows and silently drop a
        whole page of shows.
    requests.exceptions.Timeout
        If the server does not respond within 30 seconds.
    """
    response = requests.get(url, timeout=30) #a timeout keeps a stalled request from hanging the scrape
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser") #reads the page's text as parsable html

    shows = []
    for item in soup.select('div.lister-item'):
        header = item.select_one('.lister-item-header')
        titleLink = header.find('a') if header is not None else None
        #the div with the ribbonize class carries the show's id in its 'data-tconst' attribute
        ribbon = item.select_one('div.ribbonize')

        startYear, endYear = _parseRunYears(_textOr(item.select_one('span.lister-item-year'), ''))

        shows.append({
            "TV Show Title": _textOr(titleLink, ''),
            "Unique ID": ribbon.attrs.get('data-tconst') if ribbon is not None else None,
            #the IMDB rating is the text of the item's strong tag, when it has one
            "IMDB Rating": _textOr(item.find('strong'), 'N/A'),
            "Start Year": startYear,
            "End Year": endYear,
            "TV Rating": _textOr(item.select_one('p.text-muted span.certificate'), 'N/A'),
            "Episode Run Time (min)": _formatRuntime(_textOr(item.select_one('p.text-muted span.runtime'), 'N/A')),
            "Top Genres": _parseGenres(_textOr(item.select_one('span.genre'), '')),
        })

    return shows


def _textOr(tag, default):
    """Return the tag's text, or `default` when the tag was not found."""
    return tag.text if tag is not None else default


def _parseRunYears(yearText):
    """Split a year label such as "(2000–2006)" into (start year, end year) strings.

    Only a parenthesised group that starts with four digits is read, so a
    disambiguation marker like "(I)" in front of the years is ignored.
    A single-year label "(2019)" returns that year for both values.
    An ongoing show "(2022– )" keeps whatever follows the dash (a blank).
    A label without any year returns two empty strings.
    """
    match = re.search(r'\((\d{4})(?:[–-]([^)]*))?', yearText)
    if match is None:
        return '', ''
    startYear, endYear = match.group(1), match.group(2)
    if endYear is None: #no dash at all: the show started and ended in the same year
        endYear = startYear
    return startYear, endYear


def _formatRuntime(runtimeText):
    """Drop the " min" unit from a runtime such as "22 min"; other text passes through."""
    return runtimeText.strip(' min') if "min" in runtimeText else runtimeText


def _parseGenres(genreText):
    """Split a comma separated genre string into a list without the 'Animation' entry.

    'Animation' is removed wherever it appears instead of assuming it is first.
    The remaining entries keep the spacing they had in the page text.
    """
    return [genre for genre in genreText.strip().split(',')
            if genre.strip() not in ('', 'Animation')]


In [38]:
#adds data to the end of the csv file
def appendCSV(list, path='data/imdb_animated_shows.csv'):
    """Append show records to the end of an existing CSV file.

    Parameters
    ----------
    list : list of dict
        Records to append, converted to rows with ``pd.DataFrame``. The name
        shadows the builtin but is kept so existing callers are unaffected.
    path : str, optional
        CSV file to append to. Defaults to the notebook's data file.

    No header row is written, so the file is expected to already exist with
    its header in place. An empty set of records leaves the file untouched.
    """
    rows = pd.DataFrame(list)
    if rows.empty:
        #nothing was scraped for this page, so there is nothing to append
        return
    rows.to_csv(path, mode='a', index=False, header=False)

In [39]:
#checks the shape of the csv file
def checkCSV(path='data/imdb_animated_shows.csv'):
    """Print the shape (rows, columns) of a CSV file.

    Parameters
    ----------
    path : str, optional
        CSV file to inspect. Defaults to the notebook's data file.

    The file is read with ``pd.read_csv``, so its first line is taken as the
    header and is not counted as a row. Nothing is returned.
    """
    da = pd.read_csv(path)
    print("Current Shape: ", da.shape)

In [40]:
#Scrape plan for this cell:
#  - page 1 is read from the plain search url and creates the csv (header included)
#  - each following page is requested with a "start" offset (251, 501, ...) and appended
#The search has 45 pages in total (numberOfPages=45), but after the 39th page the url format
#changes, because the results go from starting at 9750 to starting at 10000.
#Only 39 offset pages can therefore be automated here; the remaining pages have to be
#added manually (the original note counts 6 of them).
numberOfPages = 39
currentPage = 1
url = "https://www.imdb.com/search/title/?title_type=tv_series&release_date=1989-01-01,2022-12-31&genres=animation&count=250&ref_=adv_prv"

#page 1: scraped rows -> dataframe -> new csv file (an old file at this path is overwritten)
firstPageRows = getPageData(url)
df = pd.DataFrame(firstPageRows)
df.to_csv('data/imdb_animated_shows.csv', index=False)

checkCSV() #should only have 249 shows

while currentPage <= numberOfPages:
    #number of the first result shown on the next page
    firstResult = (250 * currentPage) + 1
    url = "https://www.imdb.com/search/title/?title_type=tv_series&release_date=1989-01-01,2022-12-31&genres=animation&count=250&start=" + str(firstResult) + "&ref_=adv_nxt"
    #scrape that page, then add its formatted rows to the end of the csv created above
    nextPageRows = getPageData(url)
    appendCSV(nextPageRows)
    currentPage += 1


In [41]:
checkCSV() #expect 9,999 rows: page 1 plus the 39 pages fetched with a start offset

Current Shape:  (9999, 8)


In [42]:
#getting the data from the last 5 pages that have special urls
#the last 5 result pages are addressed with an "after" token instead of a "start" offset,
#so their urls are listed by hand and scraped in order
cursorPageUrls = [
    "https://www.imdb.com/search/title/?title_type=tv_series&release_date=1989-01-01,2022-12-31&genres=animation&count=250&after=WzE1NjM3MzAsInR0MDQ2MzgwOCIsMTAwMDBd&ref_=adv_nxt",
    "https://www.imdb.com/search/title/?title_type=tv_series&release_date=1989-01-01,2022-12-31&genres=animation&count=250&after=WzE3MTYyNzAsInR0MTU0NjIzMjYiLDEwMjUwXQ%3D%3D&ref_=adv_nxt",
    "https://www.imdb.com/search/title/?title_type=tv_series&release_date=1989-01-01,2022-12-31&genres=animation&count=250&after=WzE4NjA0NTEsInR0NzIzMjk1OCIsMTA1MDBd&ref_=adv_nxt",
    "https://www.imdb.com/search/title/?title_type=tv_series&release_date=1989-01-01,2022-12-31&genres=animation&count=250&after=WzIwMjM1MTEsInR0MTI5MjkzOTQiLDEwNzUwXQ%3D%3D&ref_=adv_nxt",
    "https://www.imdb.com/search/title/?title_type=tv_series&release_date=1989-01-01,2022-12-31&genres=animation&count=250&after=WzIxOTY1NzIsInR0MTAxNDYzMDIiLDExMDAwXQ%3D%3D&ref_=adv_nxt",
]
for cursorPageUrl in cursorPageUrls:
    #scrape the page and add its rows to the end of the csv file
    appendCSV(getPageData(cursorPageUrl))


In [43]:
checkCSV() #expect all 11,147 rows once the 5 manually listed pages have been appended

Current Shape:  (11147, 8)
