Importing required libraries

In [47]:
import pandas as pd
from requests import get
from bs4 import BeautifulSoup

Getting the URL and parsing the HTML content

In [48]:
# Season 1 episode list, used below to explore the page structure
url = 'https://www.imdb.com/title/tt1442437/episodes?season=1'
# A timeout keeps a stalled connection from hanging the notebook indefinitely
response = get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of parsing an error page as if it were the episode list
response.raise_for_status()
html_soup = BeautifulSoup(response.text, 'html.parser')
type(html_soup)

bs4.BeautifulSoup

Episodes Information

In [49]:
# Every <div class="info"> on the page is treated as one episode container
episode_containers = html_soup.find_all('div', attrs={'class': 'info'})

Episode Title

In [50]:
# The title is stored in the `title` attribute of the container's first <a> tag
episode_containers[0].find('a')['title']

'Pilot'

Episode Number

In [51]:
# The episode number is the `content` attribute of the container's first <meta> tag
episode_containers[0].find('meta')['content']

'1'

Episode AirDate

In [52]:
# Air date text, with surrounding whitespace removed
episode_containers[0].find('div', attrs={'class': 'airdate'}).get_text().strip()

'23 Sep. 2009'

Episode rating on IMDb

In [53]:
# Rating as displayed on the page (still a string at this point)
episode_containers[0].find('span', attrs={'class': 'ipl-rating-star__rating'}).get_text()

'8.2'

Episode votes

In [54]:
# Vote count as displayed on the page, e.g. '(3,490)' -- cleaned up later
episode_containers[0].find('span', attrs={'class': 'ipl-rating-star__total-votes'}).get_text()

'(3,490)'

Episode Summary

In [55]:
# Plot summary text, with surrounding whitespace removed
episode_containers[0].find('div', attrs={'class': 'item_description'}).get_text().strip()

'Jay must adapt to his young new wife, Gloria and her son. Meanwhile, Claire deals with her own family. Mitchell and Cam decide to adopt a baby.'

Gathering information on all episodes 

In [56]:
# List that the loop populates with one record (a list of fields) per episode
mofa_episodes = []

# For every season in the series -- the range depends on the show (seasons 1 to 11 here)
for sn in range(1, 12):
    # Request the season's episode page; the timeout keeps a stalled
    # connection from hanging the notebook indefinitely
    response = get('https://www.imdb.com/title/tt1442437/episodes?season=' + str(sn), timeout=30)

    # Stop on a 4xx/5xx reply: parsing an error page would find no episode
    # containers and silently drop the whole season from the dataset
    response.raise_for_status()

    # Parse the content of the response with BeautifulSoup
    page_html = BeautifulSoup(response.text, 'html.parser')

    # Select all the episode containers from the season's page
    episode_containers = page_html.find_all('div', class_='info')

    # For each episode in the season
    for episodes in episode_containers:
        # Get the info of each episode on the page
        season = sn
        episode_number = episodes.meta['content']
        title = episodes.a['title']
        airdate = episodes.find('div', class_='airdate').text.strip()
        rating = episodes.find('span', class_='ipl-rating-star__rating').text
        total_votes = episodes.find('span', class_='ipl-rating-star__total-votes').text
        desc = episodes.find('div', class_='item_description').text.strip()

        # Compile the episode info in the column order used when the DataFrame is built
        episode_data = [season, episode_number, title, airdate, rating, total_votes, desc]

        # Append the episode info to the complete dataset
        mofa_episodes.append(episode_data)

Preparing the dataframe

In [57]:
# Turn the scraped records into a DataFrame; column order matches the
# order in which each record's fields were collected
mofa_episodes = pd.DataFrame(
    data=mofa_episodes,
    columns=[
        'season',
        'episode_number',
        'title',
        'airdate',
        'rating',
        'total_votes',
        'desc',
    ],
)

mofa_episodes.head(5)

Unnamed: 0,season,episode_number,title,airdate,rating,total_votes,desc
0,1,1,Pilot,23 Sep. 2009,8.2,"(3,490)","Jay must adapt to his young new wife, Gloria a..."
1,1,2,The Bicycle Thief,30 Sep. 2009,8.2,"(2,887)","After buying a new bike for his son, Phil ends..."
2,1,3,Come Fly with Me,7 Oct. 2009,8.0,"(2,553)",Jay takes Phil on a model-airplane excursion w...
3,1,4,The Incident,14 Oct. 2009,8.2,"(2,608)",Jay's ex-wife shows up for the first time foll...
4,1,5,Coal Digger,21 Oct. 2009,8.3,"(2,509)",A conflict between Manny and Luke spreads to t...


### Data Cleaning

Converting number of votes to integer

In [58]:
def remove_str(votes):
    """Return ``votes`` with every comma and parenthesis removed.

    For example, ``'(3,490)'`` becomes ``'3490'``. The result is still a
    string; converting it to a number is left to the caller.
    """
    # A single translation table deletes all three punctuation characters
    return votes.translate(str.maketrans('', '', ',()'))


# Cast to str first so this cell can be re-run safely: once the column holds
# integers, calling remove_str on it directly would fail because int has no .replace
mofa_episodes['total_votes'] = (
    mofa_episodes['total_votes'].astype(str).apply(remove_str).astype(int)
)

mofa_episodes.head()

Unnamed: 0,season,episode_number,title,airdate,rating,total_votes,desc
0,1,1,Pilot,23 Sep. 2009,8.2,3490,"Jay must adapt to his young new wife, Gloria a..."
1,1,2,The Bicycle Thief,30 Sep. 2009,8.2,2887,"After buying a new bike for his son, Phil ends..."
2,1,3,Come Fly with Me,7 Oct. 2009,8.0,2553,Jay takes Phil on a model-airplane excursion w...
3,1,4,The Incident,14 Oct. 2009,8.2,2608,Jay's ex-wife shows up for the first time foll...
4,1,5,Coal Digger,21 Oct. 2009,8.3,2509,A conflict between Manny and Luke spreads to t...


Converting rating from string to float

In [59]:
# Ratings were scraped as text such as '8.2'; store them as 64-bit floats
mofa_episodes['rating'] = mofa_episodes['rating'].astype('float64')

Converting datatype of airdate to timestamp

In [60]:
# NOTE(review): no explicit format is passed, so parsing relies on pandas'
# own inference -- confirm it handles every scraped date string
mofa_episodes['airdate'] = pd.to_datetime(mofa_episodes['airdate'])

# Summary of column dtypes and non-null counts after the conversions
mofa_episodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 7 columns):
season            250 non-null int64
episode_number    250 non-null object
title             250 non-null object
airdate           250 non-null datetime64[ns]
rating            250 non-null float64
total_votes       250 non-null int32
desc              250 non-null object
dtypes: datetime64[ns](1), float64(1), int32(1), int64(1), object(3)
memory usage: 12.8+ KB


Saving the final dataset as an Excel file

In [62]:
# Write the cleaned dataset to disk; index=False leaves the row index out of the sheet
mofa_episodes.to_excel(
    excel_writer='Modern_Family_Episodes_IMDb_Ratings.xlsx',
    index=False,
)