In [1]:
# Scraping IMDB website using Python 

# If the data you are looking for is only available on a web page, web scraping is the way to collect it.
# We will scrape all the Game of Thrones episodes from the IMDb website.
# Importing required libraries and modules
# Using the IMDB website 
from requests import get
url = 'https://www.imdb.com/title/tt0944947/episodes?season=8'
# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = get(url, timeout=30)
# Fail loudly on an HTTP error status instead of carrying an error page into
# the parsing cells below.
response.raise_for_status()
# Preview the first 350 characters of the returned HTML.
print(response.text[:350])
# Requesting from the server the content of the web page by using get(), and store the server’s response in the variable response.


 










<!DOCTYPE html>
<html
    xmlns:og="http://ogp.me/ns#"
    xmlns:fb="http://www.facebook.com/2008/fbml">
    <head>
         
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">

    <meta name="apple-itunes-app" content="app-id=342792525, app-argument=imdb:///title/tt0944947?src=mdot">



     


In [2]:
# View the page source with "Inspect element" (or just press F12) for a better understanding of its structure.

In [3]:
# Parse response.text by creating a BeautifulSoup object and assigning it to html_soup.
# The 'html.parser' argument selects Python's built-in HTML parser.
from bs4 import BeautifulSoup
html_soup = BeautifulSoup(response.text, 'html.parser')

In [4]:
# Ignoring warning
# NOTE(review): with no category or module given, this filter silences every
# warning raised from here on, so deprecation notices from the libraries used
# below will not be shown either. Consider narrowing it to a specific category.
import warnings
warnings.filterwarnings('ignore')

In [5]:
# Grab every <div class="info" ...> </div> on the page; there is one for each episode.
# find_all() returns a ResultSet object (episode_containers here), which is a list containing all the matching div tags.
episode_containers = html_soup.find_all('div', class_='info')

In [6]:
# A tag can be indexed like a dictionary: the keys are the HTML attribute names and the values are the attribute values.
# Extracting the title of the first episode from the `title` attribute of the container's <a> tag.
episode_containers[0].a['title']

'Winterfell'

In [7]:
# The episode number is stored in the container's <meta> tag, under the `content` attribute.
# Note that it comes back as a string ('1'), not an integer.
episode_containers[0].meta['content']

'1'

In [8]:
# Extracting the airdate of the first episode; .strip() trims the whitespace around the tag's text.
episode_containers[0].find('div', class_='airdate').text.strip()

'15 Apr. 2019'

In [9]:
# Extracting the IMDb rating of the first episode (returned as text, e.g. '7.5').
episode_containers[0].find('span', class_='ipl-rating-star__rating').text

'7.5'

In [10]:
# Extracting the total number of votes; the text keeps its parentheses and comma, e.g. '(123,184)'.
episode_containers[0].find('span', class_='ipl-rating-star__total-votes').text

'(123,184)'

In [11]:
# Extracting the episode description, with the surrounding whitespace stripped.
episode_containers[0].find('div', class_='item_description').text.strip()

'Jon and Daenerys arrive in Winterfell and are met with skepticism. Sam learns about the fate of his family. Cersei gives Euron the reward he aims for. Theon follows his heart.'

In [12]:
# Accumulates one record (a list of fields) per scraped episode.
community_episodes = []

# For every season in the series - the range depends on the show
for sn in range(1, 9):
    # Request the season's episode page from the server, just as we did earlier.
    # The timeout stops a stalled connection from hanging the notebook.
    response = get('https://www.imdb.com/title/tt0944947/episodes?season=' + str(sn), timeout=30)
    # Stop on an HTTP error rather than parsing an error page, which would
    # yield no containers and leave that season silently missing from the dataset.
    response.raise_for_status()

    # Parse the content of the request with BeautifulSoup
    page_html = BeautifulSoup(response.text, 'html.parser')

    # Select all the episode containers from the season's page
    episode_containers = page_html.find_all('div', class_='info')

    # For each episode in each season
    for episodes in episode_containers:
        # Getting the info of each episode on the page.
        # NOTE(review): find() returns None when a tag is absent (for example an
        # episode that has no rating yet), in which case the .text access below
        # raises AttributeError - confirm every container carries all fields.
        season = sn
        episode_number = episodes.meta['content']
        title = episodes.a['title']
        airdate = episodes.find('div', class_='airdate').text.strip()
        rating = episodes.find('span', class_='ipl-rating-star__rating').text
        total_votes = episodes.find('span', class_='ipl-rating-star__total-votes').text
        desc = episodes.find('div', class_='item_description').text.strip()
        # Compiling the episode info
        episode_data = [season, episode_number, title, airdate, rating, total_votes, desc]

        # Append the episode info to the complete dataset
        community_episodes.append(episode_data)

In [13]:
# Creating a dataFrame to gather all info at one place.
import pandas as pd 
# Assemble the scraped records into a table; column labels follow the field order of each record.
community_episodes = pd.DataFrame(
    data=community_episodes,
    columns=[
        'season', 'episode_number', 'title', 'airdate',
        'rating', 'total_votes', 'desc',
    ],
)
# Peek at the first rows of the DataFrame.
community_episodes.head()

Unnamed: 0,season,episode_number,title,airdate,rating,total_votes,desc
0,1,1,Winter Is Coming,22 Aug. 2011,9.1,"(39,455)",Eddard Stark is torn between his family and an...
1,1,2,The Kingsroad,22 Aug. 2011,8.8,"(29,932)","While Bran recovers from his fall, Ned takes o..."
2,1,3,Lord Snow,1 May 2011,8.7,"(28,306)",Jon begins his training with the Night's Watch...
3,1,4,"Cripples, Bastards, and Broken Things",8 May 2011,8.8,"(26,861)",Eddard investigates Jon Arryn's murder. Jon be...
4,1,5,The Wolf and the Lion,15 May 2011,9.1,"(27,945)",Catelyn has captured Tyrion and plans to bring...


In [14]:
# Now time for some data cleaning.
# As you can see, the total votes were extracted with their parentheses (and comma separators), so we need to remove those.
community_episodes['total_votes'].unique()

array(['(39,455)', '(29,932)', '(28,306)', '(26,861)', '(27,945)',
       '(27,658)', '(28,154)', '(26,221)', '(36,994)', '(32,464)',
       '(25,364)', '(23,961)', '(23,682)', '(22,913)', '(23,098)',
       '(24,126)', '(23,501)', '(23,220)', '(40,400)', '(28,657)',
       '(25,293)', '(22,941)', '(23,162)', '(31,552)', '(23,603)',
       '(23,473)', '(22,700)', '(23,279)', '(88,286)', '(26,339)',
       '(29,789)', '(46,567)', '(25,187)', '(24,202)', '(23,426)',
       '(43,086)', '(26,063)', '(46,681)', '(39,389)', '(38,703)',
       '(26,531)', '(23,474)', '(22,748)', '(23,347)', '(23,624)',
       '(26,716)', '(25,774)', '(88,195)', '(38,720)', '(37,539)',
       '(36,370)', '(42,744)', '(30,560)', '(32,883)', '(64,349)',
       '(30,376)', '(29,503)', '(33,560)', '(186,896)', '(130,185)',
       '(46,837)', '(40,454)', '(42,181)', '(79,926)', '(40,236)',
       '(58,686)', '(58,258)', '(123,184)', '(121,360)', '(202,165)',
       '(154,373)', '(179,760)', '(224,197)'], dtype=object)

In [15]:
# Function to remove the parentheses and comma separators from a vote-count string
def remove_str(votes):
    """Return ``votes`` with every comma and parenthesis removed.

    Example: ``'(39,455)'`` becomes ``'39455'``.
    """
    return votes.replace(',', '').replace('(', '').replace(')', '')

In [16]:
# Strip the punctuation and convert the column to integers. Casting to str first
# keeps this cell re-runnable: once the column holds integers, remove_str (which
# calls .replace on each value) would otherwise fail on a second execution.
community_episodes['total_votes'] = (
    community_episodes['total_votes'].astype(str).apply(remove_str).astype(int)
)

# Checking if done successfully
community_episodes.head()

Unnamed: 0,season,episode_number,title,airdate,rating,total_votes,desc
0,1,1,Winter Is Coming,22 Aug. 2011,9.1,39455,Eddard Stark is torn between his family and an...
1,1,2,The Kingsroad,22 Aug. 2011,8.8,29932,"While Bran recovers from his fall, Ned takes o..."
2,1,3,Lord Snow,1 May 2011,8.7,28306,Jon begins his training with the Night's Watch...
3,1,4,"Cripples, Bastards, and Broken Things",8 May 2011,8.8,26861,Eddard investigates Jon Arryn's murder. Jon be...
4,1,5,The Wolf and the Lion,15 May 2011,9.1,27945,Catelyn has captured Tyrion and plans to bring...


In [17]:
# The rating was scraped as text, so cast the column to a floating-point type.
community_episodes['rating'] = community_episodes['rating'].astype(float)

In [18]:
# Converting the airdate column into a real datetime type.
# The scraped strings mix two styles: abbreviated months carry a trailing period
# ('22 Aug. 2011') while short month names do not ('1 May 2011'). pandas 2.x
# infers a single format from the first value and applies it to the whole
# column, so the periods are dropped and one explicit format is used instead.
# The dtype guard makes the cell safe to re-run after the column is converted.
if not pd.api.types.is_datetime64_any_dtype(community_episodes['airdate']):
    airdate_text = community_episodes['airdate'].str.replace('.', '', regex=False)
    community_episodes['airdate'] = pd.to_datetime(airdate_text, format='%d %b %Y')
community_episodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73 entries, 0 to 72
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   season          73 non-null     int64         
 1   episode_number  73 non-null     object        
 2   title           73 non-null     object        
 3   airdate         73 non-null     datetime64[ns]
 4   rating          73 non-null     float64       
 5   total_votes     73 non-null     int32         
 6   desc            73 non-null     object        
dtypes: datetime64[ns](1), float64(1), int32(1), int64(1), object(3)
memory usage: 3.8+ KB


In [19]:
# Re-inspect the first rows: the airdate column is now displayed as year-month-day dates.
community_episodes.head()

Unnamed: 0,season,episode_number,title,airdate,rating,total_votes,desc
0,1,1,Winter Is Coming,2011-08-22,9.1,39455,Eddard Stark is torn between his family and an...
1,1,2,The Kingsroad,2011-08-22,8.8,29932,"While Bran recovers from his fall, Ned takes o..."
2,1,3,Lord Snow,2011-05-01,8.7,28306,Jon begins his training with the Night's Watch...
3,1,4,"Cripples, Bastards, and Broken Things",2011-05-08,8.8,26861,Eddard investigates Jon Arryn's murder. Jon be...
4,1,5,The Wolf and the Lion,2011-05-15,9.1,27945,Catelyn has captured Tyrion and plans to bring...


In [20]:
# Finally, save the cleaned dataset as a CSV file (index=False leaves the row index out of the file).
community_episodes.to_csv(path_or_buf='Game_Of_Thrones_Episodes_IMDb_info.csv', index=False)


In [21]:
# End of the script.