### 1. Imports

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

### 2. Create a soup object

In [2]:
# Read the sample page as UTF-8 explicitly (its <meta charset> declares utf-8)
# so that decoding does not depend on the platform's default encoding.
with open("html-code.html", encoding="utf-8") as file:
    soup = BeautifulSoup(file,"html.parser")

In [3]:
type(soup)

bs4.BeautifulSoup

### 3. Basics of soup object
- prettify()
- individual tags:
    - title
    - a
    - p
- text
- name
- parent
- children
- descendants
- get_text()
- find()
- find_all()
- get()/square bracket notation

In [4]:
print(soup.prettify())

<head>
 <meta charset="utf-8"/>
 <title>
  Three Little Sisters
 </title>
</head>
<body>
 <h1>
  Welcome to the Story
 </h1>
 <p>
  Once upon a time there were three little sisters, and their names were
  <a class="sister" href="https://example.com/elsie" id="link1">
   Elsie
  </a>
  ,
  <a class="sister" href="https://example.com/lacie" id="link2">
   Lacie
  </a>
  , and
  <a class="sister" href="https://example.com/tillie" id="link3">
   Tillie
  </a>
  ;
      and they lived at the bottom of a well.
 </p>
 <p class="story">
  ...
 </p>
</body>



In [5]:
soup.title

<title>Three Little Sisters</title>

In [6]:
soup.a

<a class="sister" href="https://example.com/elsie" id="link1">Elsie</a>

In [7]:
soup.p

<p>
      Once upon a time there were three little sisters, and their names were 
      <a class="sister" href="https://example.com/elsie" id="link1">Elsie</a>, 
      <a class="sister" href="https://example.com/lacie" id="link2">Lacie</a>, and 
      <a class="sister" href="https://example.com/tillie" id="link3">Tillie</a>;
      and they lived at the bottom of a well.
    </p>

In [8]:
# .text: the text inside the tag, without the surrounding markup
soup.title.text

'Three Little Sisters'

In [9]:
# .name: the tag's own name, as a string
soup.title.name

'title'

In [10]:
# .parent: the element that directly contains this tag (here, <head>)
soup.title.parent

<head>
<meta charset="utf-8"/>
<title>Three Little Sisters</title>
</head>

In [11]:
# .children: an iterator over the tag's direct child nodes; it displays as an
# iterator object, so loop over it to see the nodes themselves
soup.body.children

<list_iterator at 0x2b48da49a80>

In [12]:
for child in soup.body.children:
    print(child)



<h1>Welcome to the Story</h1>


<p>
      Once upon a time there were three little sisters, and their names were 
      <a class="sister" href="https://example.com/elsie" id="link1">Elsie</a>, 
      <a class="sister" href="https://example.com/lacie" id="link2">Lacie</a>, and 
      <a class="sister" href="https://example.com/tillie" id="link3">Tillie</a>;
      and they lived at the bottom of a well.
    </p>


<p class="story">...</p>




In [13]:
# .descendants: a generator that walks every nested node (tags and their text),
# not just the direct children
soup.body.descendants

<generator object Tag.descendants at 0x000002B48DA0AEA0>

In [14]:
for descendant in soup.body.descendants:
    print(descendant)



<h1>Welcome to the Story</h1>
Welcome to the Story


<p>
      Once upon a time there were three little sisters, and their names were 
      <a class="sister" href="https://example.com/elsie" id="link1">Elsie</a>, 
      <a class="sister" href="https://example.com/lacie" id="link2">Lacie</a>, and 
      <a class="sister" href="https://example.com/tillie" id="link3">Tillie</a>;
      and they lived at the bottom of a well.
    </p>

      Once upon a time there were three little sisters, and their names were 
      
<a class="sister" href="https://example.com/elsie" id="link1">Elsie</a>
Elsie
, 
      
<a class="sister" href="https://example.com/lacie" id="link2">Lacie</a>
Lacie
, and 
      
<a class="sister" href="https://example.com/tillie" id="link3">Tillie</a>
Tillie
;
      and they lived at the bottom of a well.
    


<p class="story">...</p>
...




In [15]:
# get_text(): method form for extracting a tag's text; gives the same string as .text here
soup.title.get_text()

'Three Little Sisters'

In [16]:
# find(): returns the first matching tag (the same element as soup.a here)
soup.find('a')

<a class="sister" href="https://example.com/elsie" id="link1">Elsie</a>

In [17]:
# find_all(): returns a list-like collection of every matching tag
soup.find_all('a')

[<a class="sister" href="https://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="https://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="https://example.com/tillie" id="link3">Tillie</a>]

In [18]:
# get(): read an attribute's value from a tag; soup.a['class'] is the
# square-bracket form. 'class' comes back as a list of class names.
soup.a.get('class')

['sister']

### 4. Fetching webpage with Requests
- URL = https://www.bbc.com/sport/football/premier-league/top-scorers

### GET request

In [19]:
url = "https://www.bbc.com/sport/football/premier-league/top-scorers"

# Always pass a timeout: without one, requests.get() can block indefinitely
# if the server stops responding.
response = requests.get(url, timeout=10)

### check for errors

In [20]:
response.raise_for_status()

In [21]:
# raise_for_status() returns None when the response is not an error status,
# so printing its result shows "None".
print(response.raise_for_status())

None


### status code

In [22]:
response.status_code

200

### text (string format)

In [23]:
response.text

'<!DOCTYPE html><html lang="en-GB" class="no-js"><head><meta charSet="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" /><title data-rh="true">Premier League Top Scorers - BBC Sport</title><meta data-rh="true" name="description" content="Premier League top scorers. Showing assists, time on pitch and the shots on and off target."/><meta data-rh="true" name="theme-color" content="#FFFFFF"/><meta data-rh="true" property="og:description" content="Premier League top scorers. Showing assists, time on pitch and the shots on and off target."/><meta data-rh="true" property="og:image" content="https://static.files.bbci.co.uk/core/website/assets/static/sport/bbc-sport-logo.0da9386782.png"/><meta data-rh="true" property="og:site_name" content="BBC Sport"/><meta data-rh="true" property="og:title" content="Premier League Top Scorers - BBC Sport"/><meta data-rh="true" property="og:type" content="article"/><meta data-rh="true" property="og:url" content="https://www.bbc.co.u

### content (binary format)

In [24]:
response.content

b'<!DOCTYPE html><html lang="en-GB" class="no-js"><head><meta charSet="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" /><title data-rh="true">Premier League Top Scorers - BBC Sport</title><meta data-rh="true" name="description" content="Premier League top scorers. Showing assists, time on pitch and the shots on and off target."/><meta data-rh="true" name="theme-color" content="#FFFFFF"/><meta data-rh="true" property="og:description" content="Premier League top scorers. Showing assists, time on pitch and the shots on and off target."/><meta data-rh="true" property="og:image" content="https://static.files.bbci.co.uk/core/website/assets/static/sport/bbc-sport-logo.0da9386782.png"/><meta data-rh="true" property="og:site_name" content="BBC Sport"/><meta data-rh="true" property="og:title" content="Premier League Top Scorers - BBC Sport"/><meta data-rh="true" property="og:type" content="article"/><meta data-rh="true" property="og:url" content="https://www.bbc.co.

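- Either `response.content` or `response.text` can be used to create the soup object.
- With `response.text`, Requests has already decoded the bytes into a string, so we must be certain the encoding it used is correct.
- With `response.content`, we pass the raw bytes and let Beautiful Soup handle most of the encoding detection.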


In [25]:
# soup object
soup = BeautifulSoup(response.content, "html.parser")

In [26]:
# Evaluating prettify() as a bare expression displays the string's repr
# (with literal \n escapes); wrap it in print() to see the indented markup.
soup.prettify()

'<!DOCTYPE html>\n<html class="no-js" lang="en-GB">\n <head>\n  <meta charset="utf-8"/>\n  <meta content="width=device-width, initial-scale=1" name="viewport"/>\n  <title data-rh="true">\n   Premier League Top Scorers - BBC Sport\n  </title>\n  <meta content="Premier League top scorers. Showing assists, time on pitch and the shots on and off target." data-rh="true" name="description"/>\n  <meta content="#FFFFFF" data-rh="true" name="theme-color"/>\n  <meta content="Premier League top scorers. Showing assists, time on pitch and the shots on and off target." data-rh="true" property="og:description"/>\n  <meta content="https://static.files.bbci.co.uk/core/website/assets/static/sport/bbc-sport-logo.0da9386782.png" data-rh="true" property="og:image"/>\n  <meta content="BBC Sport" data-rh="true" property="og:site_name"/>\n  <meta content="Premier League Top Scorers - BBC Sport" data-rh="true" property="og:title"/>\n  <meta content="article" data-rh="true" property="og:type"/>\n  <meta conten

### 5. HTML parsing with Beautiful Soup: A Mini Project
- scrape data from public website
- organize the data as a dataframe
- export the data as an excel sheet

In [27]:
# Scrape the BBC top-scorers table and collect one row of statistics per player
# into `df_players`.
# NOTE(review): the "ssrcss-..." class names look auto-generated and the
# stats[...] positions depend on the table's column order; both presumably stop
# matching if the BBC changes its markup -- confirm against the live page.
try:
    # A timeout keeps the cell from hanging if the server never replies.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
except requests.RequestException as e:
    # Network failure or HTTP error status: report it and skip the parsing.
    # `df_players` is only defined when the request succeeds.
    print(e)
else:
    soup = BeautifulSoup(response.content,"html.parser")
    players = soup.find('tbody').find_all('tr',class_='ssrcss-dhlz6k-TableRowBody e1icz100')

    player_names = []
    team_names = []
    goals = []
    assists = []
    num_matches = []
    shots = []

    for player in players:
        player_name = player.find('div',class_="ssrcss-m6ah29-PlayerName e1n8xy5b1").get_text(strip=True)
        team_name = player.find('div',class_="ssrcss-qvpga1-TeamsSummary e1n8xy5b0").get_text(strip=True)
        goals_scored = int(player.find('div',class_="ssrcss-8k20kk-CellWrapper ef9ipf0").get_text(strip=True))

        # The remaining numeric cells share one class; pick them out by position.
        stats = player.find_all('div',class_="ssrcss-150z8d-CellWrapper ef9ipf0")
        assists_made = int(stats[0].get_text(strip=True))
        matches_played = int(stats[2].get_text(strip=True))
        shots_taken = int(stats[-3].get_text(strip=True))

        player_names.append(player_name)
        team_names.append(team_name)
        goals.append(goals_scored)
        assists.append(assists_made)
        num_matches.append(matches_played)
        shots.append(shots_taken)

    # Column order of the resulting frame follows the key order here.
    data = {
        'player': player_names,
        'team': team_names,
        'goals': goals,
        'assists': assists,
        'matches': num_matches,
        'shots': shots
    }
    df_players = pd.DataFrame(data)


In [28]:
df_players.head()

Unnamed: 0,player,team,goals,assists,matches,shots
0,Mohamed Salah,Liverpool,27,17,31,108
1,E. Haaland,Man City,21,3,28,102
2,A. Isak,Newcastle,20,5,27,75
3,C. Wood,Nottm Forest,18,3,29,53
4,B. Mbeumo,Brentford,16,5,31,67


In [29]:
df_players.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   player   28 non-null     object
 1   team     28 non-null     object
 2   goals    28 non-null     int64 
 3   assists  28 non-null     int64 
 4   matches  28 non-null     int64 
 5   shots    28 non-null     int64 
dtypes: int64(4), object(2)
memory usage: 1.4+ KB


In [30]:
# Writing .xlsx goes through the optional openpyxl package; without it this call
# raises ModuleNotFoundError, so install openpyxl into the kernel's environment first.
df_players.to_excel('EPL Top Scorers.xlsx',index=False)

ModuleNotFoundError: No module named 'openpyxl'